AWK arrays are the secret weapon that makes AWK genuinely powerful for data analysis. They are associative — keys can be numbers or strings — and they grow on first assignment. With arrays you can group records, count occurrences, deduplicate, build lookup tables, and join multiple datasets without any external tools.
1
AWK array basics — associative by default
AWK
# ── Array assignment and access ───────────────────────────
awk 'BEGIN {
    arr[1]      = "numeric key"     # integer keys work…
    arr["x"]    = "string key"      # …and so do strings
    arr["host"] = "prod-01"
    for (key in arr) print key, "→", arr[key]
}'
# ── Count occurrences — the fundamental AWK pattern ───────
awk '{ freq[$1]++ }
END {
    for (w in freq)
        printf "%-20s %d\n", w, freq[w]
}' words.txt
# ── Count log levels ──────────────────────────────────────
awk '$3 == "ERROR" { e++ }
$3 == "WARN"  { w++ }
$3 == "INFO"  { i++ }
END {
    # +0 forces a numeric 0 when a counter was never incremented
    printf "ERROR: %d WARN: %d INFO: %d\n", e+0, w+0, i+0
}' app.log
# Shorter: use array key for level
awk '{ lvl[$3]++ }
END { for (k in lvl) printf "%-8s %d\n", k, lvl[k] }' app.log
# ── Check if key exists ───────────────────────────────────
awk '$1 in seen { next } { seen[$1] = 1; print }' # deduplicate
# ── Delete a key ─────────────────────────────────────────
awk '{ delete seen[$1] }'
2
Grouping and aggregation
AWK
# access.log: IP METHOD URL STATUS BYTES TIME
# 10.0.1.5 GET /api/users 200 1842 0.12
# ── Sum bytes per IP ──────────────────────────────────────
awk '{ total[$1] += $5; reqs[$1]++ }
END {
    printf "%-18s %10s %8s\n", "IP", "BYTES", "HITS"
    for (ip in total)
        printf "%-18s %10d %8d\n", ip, total[ip], reqs[ip]
}' access.log
# ── Count requests per status code ────────────────────────
awk '{ byStatus[$4]++ }
END {
    for (code in byStatus)
        printf "HTTP %s: %d requests\n", code, byStatus[code]
}' access.log
# ── Average response time per endpoint ────────────────────
awk '{ t[$3] += $6; n[$3]++ }
END {
    for (url in t)
        printf "%-40s avg=%.3fs n=%d\n", url, t[url]/n[url], n[url]
}' access.log
# ── Top N by count ────────────────────────────────────────
awk '{ freq[$1]++ }
END {
    for (ip in freq) print freq[ip], ip
}' access.log | sort -rn | head -10
# ── Per-minute request rate ───────────────────────────────
awk '{
    # $7 carries an ISO timestamp: 2026-05-01T10:14:02
    split($7, dt, "T")            # dt[2] = "10:14:02"
    split(dt[2], clock, ":")      # clock[1]=HH, clock[2]=MM
    rpm[clock[1] ":" clock[2]]++
}
END {
    for (m in rpm) print m, rpm[m]
}' access.log | sort
3
Multi-dimensional arrays and lookup tables
AWK
# ── Multi-dimensional: array[key1,key2] ───────────────────
# SUBSEP (default \034) joins the keys internally
awk '{ hits[$1,$4]++ } # hits[IP, status_code]
END {
    for (combo in hits) {
        split(combo, parts, SUBSEP)   # recover the individual keys
        printf "IP=%-15s STATUS=%s HITS=%d\n",
            parts[1], parts[2], hits[combo]
    }
}' access.log
# ── Lookup table: load reference data into array ──────────
# servers.csv: hostname,region,tier   (comma-separated)
# metrics.log: hostname cpu mem       (space-separated)
# The two files use DIFFERENT separators, so do not set FS=","
# globally: it would also apply to metrics.log, leaving its
# $2/$3 empty. Split the CSV lines manually instead and let the
# default whitespace FS handle the second file.
awk 'FNR==NR {                        # First file: servers.csv
    split($0, f, ",")
    region[f[1]] = f[2]
    tier[f[1]]   = f[3]
    next
}
{                                     # Second file: metrics.log
    host = $1
    printf "%-15s region=%-8s tier=%s cpu=%s mem=%s\n",
        host, region[host], tier[host], $2, $3
}' servers.csv metrics.log
# ── Deduplication preserving order ────────────────────────
awk '!seen[$0]++' # one of AWK's most famous one-liners
# !seen[$0]++ evaluates to true the FIRST time a line appears
# because seen[$0] is 0 (falsy) and then gets incremented
# Deduplicate on specific field
awk '!seen[$1]++' # unique on first field only
awk '!seen[$2,$4]++' # unique on fields 2 and 4 combined
4
Sorting arrays in AWK (gawk)
AWK
# ── Sort by key (gawk PROCINFO) ───────────────────────────
gawk '{ freq[$1]++ }
END {
    # PROCINFO["sorted_in"] controls for-in iteration order (gawk only)
    PROCINFO["sorted_in"] = "@ind_str_asc"   # keys, alphabetical
    for (key in freq) printf "%-20s %d\n", key, freq[key]
}' data.txt
# ── Sort options ──────────────────────────────────────────
# @ind_str_asc — keys alphabetically ascending
# @ind_str_desc — keys alphabetically descending
# @ind_num_asc — keys numerically ascending
# @val_num_desc — values numerically descending (top N)
# @val_str_asc — values alphabetically ascending
# ── Top 5 IPs by request count ───────────────────────────
gawk '{ freq[$1]++ }
END {
    PROCINFO["sorted_in"] = "@val_num_desc"
    rank = 0
    for (addr in freq) {
        printf "#%-2d %-18s %d reqs\n", ++rank, addr, freq[addr]
        if (rank >= 5) break
    }
}' access.log
# ── Portable sort (pipe to sort for non-gawk) ────────────
awk '{ freq[$1]++ }
END { for (key in freq) print freq[key], key }' data.txt \
    | sort -rn | head -5
Terminal output
Key
Aggregated result
High value
Lookup enriched
vriddh@prod-01:~/scripts$ awk '{level[$3]++} END{for(l in level) printf "%-8s %d\n",l,level[l]}' app.log | sort
ERROR 47
INFO 1842
WARN 128
vriddh@prod-01:~/scripts$ awk '{count[$1]++} END{for(k in count) print count[k],k}' access.log | sort -rn | head -3
4821 10.0.1.42
2104 10.0.1.17
1893 10.0.1.5
vriddh@prod-01:~/scripts$ awk '!seen[$0]++' duplicates.txt | wc -l
842
█
✔ AWK array rules — Test key existence with
if (key in array) — never with if (array[key]), which silently creates the key and also misreports keys whose stored value is empty or zero. Use !seen[$0]++ for order-preserving deduplication. Use the two-file technique (FNR==NR for the first file, a bare block for the second) to join datasets. For top-N, pipe to sort -rn | head -N for portability, or use gawk's PROCINFO["sorted_in"] to sort inside AWK.