Shell Scripting — Advanced AWK: Arrays & Data Structures (May 2026)

Shell Scripting Advanced AWK: Arrays & Data Structures

AWK arrays are associative by nature — any string can be a key. Master grouping, counting, deduplication, frequency tables, multi-dimensional arrays, and sorting with PROCINFO to build powerful data aggregation pipelines.

AWK arrays are the secret weapon that makes AWK genuinely powerful for data analysis. They are associative — keys can be numbers or strings — and they grow on first assignment. With arrays you can group records, count occurrences, deduplicate, build lookup tables, and join multiple datasets without any external tools.

AWK
# ── Array assignment and access ───────────────────────────
# Keys may be numbers or strings; an element is created on
# first assignment, and for-in visits keys in arbitrary order.
awk 'BEGIN {
  arr[1]      = "numeric key"
  arr["x"]    = "string key"
  arr["host"] = "prod-01"
  for (key in arr) print key, "→", arr[key]
}'

# ── Count occurrences — the fundamental AWK pattern ───────
# count[] springs into existence on first ++; one key per
# distinct value of field 1, value = number of occurrences.
awk '{ count[$1]++ }
END {
  for (word in count)
    printf "%-20s %d\n", word, count[word]
}' words.txt

# ── Count log levels ──────────────────────────────────────
# Verbose form: one scalar counter per level, chained if/else.
awk '{
  if      ($3 == "ERROR") errors++
  else if ($3 == "WARN")  warns++
  else if ($3 == "INFO")  infos++
}
END {
  # +0 coerces an unset counter to numeric 0 for levels never seen
  printf "ERROR: %d  WARN: %d  INFO: %d\n", errors+0, warns+0, infos+0
}' app.log

# Shorter: use array key for level
awk '{ level[$3]++ }
END { for (l in level) printf "%-8s %d\n", l, level[l] }' app.log

# ── Check if key exists ───────────────────────────────────
# (key in arr) tests membership WITHOUT creating the key;
# referencing arr[key] in a condition silently creates it.
awk '{ if ($1 in seen) next; seen[$1]=1; print }'   # deduplicate

# ── Delete a key ─────────────────────────────────────────
awk '{ delete seen[$1] }'   # 'delete seen' (no subscript) empties the whole array
AWK
# access.log: IP METHOD URL STATUS BYTES TIME
# 10.0.1.5 GET /api/users 200 1842 0.12

# ── Sum bytes per IP ──────────────────────────────────────
# Two parallel arrays keyed by IP: running byte total and hit count.
awk '{ bytes[$1] += $5; hits[$1]++ }
END {
  printf "%-18s %10s %8s\n", "IP", "BYTES", "HITS"
  for (ip in bytes)
    printf "%-18s %10d %8d\n", ip, bytes[ip], hits[ip]
}' access.log

# ── Count requests per status code ────────────────────────
awk '{ status[$4]++ }
END {
  for (s in status)
    printf "HTTP %s: %d requests\n", s, status[s]
}' access.log

# ── Average response time per endpoint ────────────────────
# Accumulate sum and count separately; divide once at END.
awk '{ sum[$3] += $6; cnt[$3]++ }
END {
  for (url in sum)
    printf "%-40s avg=%.3fs n=%d\n", url, sum[url]/cnt[url], cnt[url]
}' access.log

# ── Top N by count ────────────────────────────────────────
# Emit "count key" pairs; sort -rn ranks them, head trims to N.
awk '{ count[$1]++ }
END {
  for (k in count) print count[k], k
}' access.log | sort -rn | head -10

# ── Per-minute request rate ───────────────────────────────
awk '{
  # Assume timestamp is $7: 2026-05-01T10:14:02
  split($7, ts, "T")     # ts[2] = "10:14:02"
  split(ts[2], hms, ":")
  minute = hms[1] ":" hms[2]   # e.g. "10:14" — one bucket per minute
  rpm[minute]++
}
END {
  for (m in rpm) print m, rpm[m]
}' access.log | sort
AWK
# ── Multi-dimensional: array[key1,key2] ───────────────────
# SUBSEP (default \034) joins the keys internally
awk '{ hits[$1,$4]++ }   # hits[IP, status_code]
END {
  for (key in hits) {
    split(key, part, SUBSEP)   # recover the two key components
    printf "IP=%-15s STATUS=%s HITS=%d\n", part[1], part[2], hits[key]
  }
}' access.log

# ── Lookup table: load reference data into array ──────────
# servers.csv: hostname,region,tier      (comma-separated)
# metrics.log: hostname cpu mem          (whitespace-separated)
# NOTE: the two files use different separators. Setting FS=","
# globally would stop metrics.log lines from splitting at all
# ($1 would be the whole line, $2/$3 empty). Parse the CSV with
# split() instead and keep the default whitespace FS.
awk '
FNR==NR {          # First file: servers.csv
  split($0, f, ",")          # manual CSV split; FS stays whitespace
  region[f[1]] = f[2]
  tier[f[1]]   = f[3]
  next
}
{                  # Second file: metrics.log
  host = $1
  printf "%-15s region=%-8s tier=%s cpu=%s mem=%s\n",
    host, region[host], tier[host], $2, $3
}' servers.csv metrics.log

# ── Deduplication preserving order ────────────────────────
awk '!seen[$0]++'           # one of AWK's most famous one-liners
# !seen[$0]++ evaluates to true the FIRST time a line appears
# because seen[$0] is 0 (falsy) and then gets incremented
# (post-increment negates the OLD value, so repeats are falsy and not printed)

# Deduplicate on specific field
awk '!seen[$1]++'           # unique on first field only
awk '!seen[$2,$4]++'        # unique on fields 2 and 4 combined
AWK
# ── Sort by key (gawk PROCINFO) ───────────────────────────
# gawk-only: PROCINFO["sorted_in"] sets the traversal order for
# every for-in loop that follows (plain awk iterates in arbitrary order).
gawk '{ count[$1]++ }
END {
  PROCINFO["sorted_in"] = "@ind_str_asc"   # sort by key alphabetically
  for (k in count) printf "%-20s %d\n", k, count[k]
}' data.txt

# ── Sort options ──────────────────────────────────────────
# @ind_str_asc   — keys alphabetically ascending
# @ind_str_desc  — keys alphabetically descending
# @ind_num_asc   — keys numerically ascending
# @val_num_desc  — values numerically descending (top N)
# @val_str_asc   — values alphabetically ascending

# ── Top 5 IPs by request count ───────────────────────────
gawk '{ count[$1]++ }
END {
  PROCINFO["sorted_in"] = "@val_num_desc"   # iterate highest count first
  rank = 0
  for (ip in count) {
    if (++rank > 5) break
    printf "#%-2d %-18s %d reqs\n", rank, ip, count[ip]
  }
}' access.log

# ── Portable sort (pipe to sort for non-gawk) ────────────
# POSIX-safe alternative to PROCINFO: print "count key" pairs
# and let sort(1)/head(1) do the ranking outside AWK.
awk '{ count[$1]++ }
END { for (k in count) print count[k], k }' data.txt \
  | sort -rn | head -5
Legend: aggregated result · high value · lookup-enriched
awk — arrays and aggregation
vriddh@prod-01:~/scripts$ awk '{level[$3]++} END{for(l in level) printf "%-8s %d\n",l,level[l]}' app.log | sort
ERROR 47
INFO 1842
WARN 128
vriddh@prod-01:~/scripts$ awk '{count[$1]++} END{for(k in count) print count[k],k}' access.log | sort -rn | head -3
4821 10.0.1.42
2104 10.0.1.17
1893 10.0.1.5
vriddh@prod-01:~/scripts$ awk '!seen[$0]++' duplicates.txt | wc -l
842
✔ AWK array rules — Test key existence with if (key in array) — never with if (array[key]) which creates the key. Use !seen[$0]++ for order-preserving deduplication. Use two-file technique (FNR==NR for first file, bare block for second) to join datasets. For top-N, pipe to sort -rn | head -N for portability, or use gawk's PROCINFO["sorted_in"] to sort inside AWK.