AWK's string functions cover everything from simple substitution to complex regex extraction. The combination of match(), substr(), split(), and sprintf() lets you parse nearly any structured text format — including multi-part log lines, URLs, IP addresses, timestamps, and JSON-like data — without reaching for Python or Perl.
1
sub and gsub — find and replace
AWK
# sub(regex, replacement, target) — replace FIRST match
# gsub(regex, replacement, target) — replace ALL matches
# Default target is $0 (whole line)
# ── Basic substitution ────────────────────────────────────
awk '{ sub(/ERROR/, "CRITICAL"); print }' # first ERROR only
awk '{ gsub(/ERROR/, "CRITICAL"); print }' # all ERRORs
awk '{ gsub(/ +/, " "); print }' # collapse multiple spaces
awk '{ gsub(/^[[:space:]]+|[[:space:]]+$/, ""); print }' # trim
# ── Replace on specific field ─────────────────────────────
awk '{
gsub(/[0-9]+/, "***", $3) # redact numbers in field 3
print
}'
# ── & in replacement = matched text ──────────────────────
awk '{ gsub(/[0-9]+/, "[&]"); print }' # wrap numbers in brackets
# Input: "Request 42 from 10.0.1.5"
# Output: "Request [42] from [10].[0].[1].[5]"
# ── gensub (gawk only) — capture groups ───────────────────
gawk '{ print gensub(/([0-9]+)/, "(\\1)", "g") }'
# gensub(regex, replacement, how, target)
# how: "g" = all, "1" = first, "2" = second match...
# \\1 refers to capture group 1
# Swap two fields: "word1 word2" → "word2 word1"
gawk '{ print gensub(/^([^ ]+) ([^ ]+)/, "\\2 \\1", 1) }'
2
match — locate patterns precisely
AWK
# match(string, regex) — sets RSTART and RLENGTH
# RSTART: position of match (0 if no match)
# RLENGTH: length of match (-1 if no match)
# ── Extract IP address from log line ──────────────────────
awk '{ if (match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/))
print "IP:", substr($0, RSTART, RLENGTH) }' app.log
# ── Extract timestamp ─────────────────────────────────────
# (note: {n} interval expressions require a POSIX-compliant awk — gawk, BWK awk, or mawk ≥ 1.3.4)
awk '{ if (match($0, /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/))
print substr($0, RSTART, RLENGTH), "→", $NF }' app.log
# ── Extract URL from access log ───────────────────────────
awk '{ if (match($0, /GET|POST|PUT|DELETE/)) {
method = substr($0, RSTART, RLENGTH)
match($0, /\/[^ "]+/)
path = substr($0, RSTART, RLENGTH)
print method, path
}
}' access.log
# ── gawk match with array (capture groups) ────────────────
gawk '{ if (match($0, /user=([^ ]+) action=([^ ]+)/, m))
print "User:", m[1], "Action:", m[2] }' audit.log
3
split — break strings into arrays
AWK
# split(string, array, separator) → returns number of elements
# Array is 1-indexed
# ── Parse timestamp components ────────────────────────────
awk '{
# $1 = "2026-05-01T10:14:02"
n = split($1, dt, "T") # dt[1]="2026-05-01" dt[2]="10:14:02"
split(dt[1], d, "-") # d[1]=2026 d[2]=05 d[3]=01
split(dt[2], t, ":") # t[1]=10 t[2]=14 t[3]=02
printf "Year=%s Month=%s Day=%s Hour=%s\n", d[1], d[2], d[3], t[1]
}' events.log
# ── Parse path components ─────────────────────────────────
awk '{
n = split($3, parts, "/") # split URL path
print "Endpoint:", parts[2], "Resource:", parts[3]
}' access.log
# /api/users → parts[1]="" parts[2]="api" parts[3]="users"
# ── Parse comma-separated tags ────────────────────────────
awk -F'\t' '{
n = split($4, tags, ",") # field 4 = "web,prod,nginx"
for (i=1; i<=n; i++) {
tag = tags[i]
gsub(/^[[:space:]]+|[[:space:]]+$/, "", tag) # trim
tag_count[tag]++
}
}
END {
for (t in tag_count) printf "%-20s %d\n", t, tag_count[t]
}' servers.tsv
# ── Split with regex separator ────────────────────────────
awk '{ n = split($0, parts, /[,;|]+/)
for (i=1; i<=n; i++) print parts[i] }' # split on any delimiter
4
sprintf — format strings precisely
AWK
# sprintf works like printf but returns the string
# ── Build formatted output strings ───────────────────────
awk '{
# Format bytes as human readable
if ($5 > 1073741824) size = sprintf("%.1fG", $5/1073741824)
else if ($5 > 1048576) size = sprintf("%.1fM", $5/1048576)
else if ($5 > 1024) size = sprintf("%.1fK", $5/1024)
else size = sprintf("%dB", $5)
printf "%-20s %8s\n", $1, size
}' transfer.log
# ── Pad numbers for sorting ───────────────────────────────
awk '{ printf "%s_%04d\n", $1, $2 }' # host_0042
# ── Build output filenames ────────────────────────────────
awk 'NR==1 || NR%1000==1 {
filename = sprintf("/tmp/chunk_%05d.txt", int(NR/1000))
if (current != filename) { if (current != "") close(current); current = filename }  # guard avoids close("") on the first record
}
{ print > current }' bigfile.txt
# ── Build JSON output ─────────────────────────────────────
awk 'BEGIN { print "[" }
NR>1 { if (NR>2) print ","   # NR>1 skips a header line (drop it if metrics.log has none); comma before all but the first object
printf "{\"host\":\"%s\",\"cpu\":%s,\"mem\":%s}", $1, $2, $3
}
END { print "\n]" }' metrics.log
vriddh@prod-01:~/scripts$ echo "2026-05-01T10:14:02 ERROR user=vriddh action=login ip=10.0.1.5" | gawk '{match($0,/user=([^ ]+) action=([^ ]+)/,m); print "User:",m[1],"Action:",m[2]}'
User: vriddh Action: login
vriddh@prod-01:~/scripts$ awk '{gsub(/^[[:space:]]+|[[:space:]]+$/,"",$0); print}' messy.txt | head -3
prod-web-01
prod-db-01
prod-cache-01
█
✔ AWK string rules — Use
gsub() for global replace, sub() for first match only. Use & in replacement to refer to the matched text. Use match() + substr($0, RSTART, RLENGTH) to extract patterns. Use gawk's match(str, re, arr) for capture groups — it's the cleanest way to extract named substrings. Use sprintf() when you need to build a formatted string and store it in a variable rather than print it immediately.