Shell Scripting · Real-World Project · Advanced · Logs · AWK · May 2026

Shell Scripting Real-World Projects: Log Analysis and Reporting Pipeline

Build a complete Nginx access log analysis pipeline — traffic summary, status code breakdown, top endpoints and clients, error analysis, response time percentiles, and hourly traffic heatmaps.

Log files are the most information-dense artifact a server produces. A well-designed log analysis pipeline extracts traffic patterns, error rates, slow endpoints, top clients, and geographic distributions in seconds — without loading a single GB into Elasticsearch.

BASH
#!/usr/bin/env bash
# log_analysis.sh — Complete Nginx access log report
#
# Usage: log_analysis.sh [LOGFILE] [YYYY-MM-DD]
#   LOGFILE     defaults to /var/log/nginx/access.log
#   YYYY-MM-DD  defaults to yesterday (GNU and BSD date both handled)
#
# Assumes the combined log format: $1=client IP, $4=[$time_local,
# $7=request path, $9=status, $10=bytes sent.

set -euo pipefail

LOG="${1:-/var/log/nginx/access.log}"
DATE="${2:-$(date -d yesterday +%Y-%m-%d 2>/dev/null || date -v-1d +%Y-%m-%d)}"

# Nginx $time_local is "dd/Mon/YYYY:HH:MM:SS" — an ISO YYYY-MM-DD pattern
# would never match it, so convert the date before filtering (GNU syntax
# first, then BSD; fall back to the raw date for custom log formats).
NGINX_DATE="$(date -d "${DATE}" +%d/%b/%Y 2>/dev/null \
  || date -j -f %Y-%m-%d "${DATE}" +%d/%b/%Y 2>/dev/null \
  || echo "${DATE}")"

# Private, unpredictable temp file; the trap removes it on EVERY exit path,
# including the early "no entries" exit below.
SLICE="$(mktemp)"
trap 'rm -f -- "${SLICE}"' EXIT

# Filter to the requested day only; fall back to the full log when no line
# carries the date (e.g. a log format without $time_local).
grep -F "[${NGINX_DATE}" -- "${LOG}" > "${SLICE}" 2>/dev/null || \
  cp -- "${LOG}" "${SLICE}"

TOTAL=$(wc -l < "${SLICE}")
[[ "${TOTAL}" -eq 0 ]] && { echo "No log entries for ${DATE}"; exit 0; }

printf "\n  %-60s\n" "ACCESS LOG REPORT — ${DATE}"
printf "  %-60s\n\n" "$(printf '─%.0s' {1..60})"

# NOTE: the top-N pipelines below use 'NR<=10' in the final awk instead of
# 'head -10' — under pipefail, head exiting early can kill the upstream
# sort with SIGPIPE (status 141) and abort the whole report.

# ── Traffic summary ───────────────────────────────────────
printf "  TRAFFIC SUMMARY\n"
printf "  %-30s %s\n" "Total requests:" "${TOTAL}"
printf "  %-30s %s\n" "Unique IPs:" "$(awk '{print $1}' "${SLICE}" | sort -u | wc -l)"
printf "  %-30s %s\n" "Total bandwidth:" \
  "$(awk '{s+=$10}END{printf "%.1f MB",s/1048576}' "${SLICE}")"

# HTTP status breakdown: 5xx flagged as server errors, 4xx as client errors.
printf "\n  STATUS CODES\n"
awk '{print $9}' "${SLICE}" | sort | uniq -c | sort -rn | \
  awk 'NR<=10{printf "  %-10s %6d  %s\n",$2,$1,($2>=500?"✘ ERROR":($2>=400?"⚠ CLIENT":"✔"))}'

# ── Top endpoints (query strings stripped before counting) ─
printf "\n  TOP 10 ENDPOINTS\n"
awk '{print $7}' "${SLICE}" | \
  sed 's/\?.*$//' | sort | uniq -c | sort -rn | \
  awk 'NR<=10{printf "  %6d  %s\n",$1,$2}'

# ── Top IPs ───────────────────────────────────────────────
printf "\n  TOP 10 CLIENTS\n"
awk '{print $1}' "${SLICE}" | sort | uniq -c | sort -rn | \
  awk 'NR<=10{printf "  %6d  %s\n",$1,$2}'

# ── Error analysis ────────────────────────────────────────
printf "\n  TOP ERRORS (4xx/5xx)\n"
awk '$9>=400{print $9,$7}' "${SLICE}" | \
  sed 's/\?.*$//' | sort | uniq -c | sort -rn | \
  awk 'NR<=10{printf "  %6d  %s %s\n",$1,$2,$3}'

# ── Response time percentiles ─────────────────────────────
# Assumes $request_time (seconds) is the LAST field when the log format
# includes it — TODO confirm against the server's log_format directive.
printf "\n  RESPONSE TIME (ms)\n"
awk 'NF>11{print $NF*1000}' "${SLICE}" | sort -n | \
  awk '{a[n++]=$1} END{
    if (n == 0) { print "  (no response-time field in log format)"; exit }
    printf "  P50: %.0fms  P95: %.0fms  P99: %.0fms  Max: %.0fms\n",
    a[int(n*0.50)], a[int(n*0.95)], a[int(n*0.99)], a[n-1]
  }'

# ── Hourly traffic heatmap ─────────────────────────────────
printf "\n  HOURLY TRAFFIC\n"
awk '{match($4,/:[0-9]+:/); h=substr($4,RSTART+1,2); count[h]++}
END{
  max=0; for(h in count) if(count[h]>max) max=count[h]
  if (max == 0) max = 1   # guard the division below on malformed input
  for(h=0;h<24;h++) {
    key=sprintf("%02d",h)
    bar=count[key]+0
    filled=int(bar/max*30)
    printf "  %02d  |",h
    for(i=0;i<filled;i++) printf "#"
    for(i=filled;i<30;i++) printf " "
    printf "| %d\n",bar
  }
}' "${SLICE}"

# Slice removal is handled by the EXIT trap.
BASH
#!/usr/bin/env bash
# error_trend.sh — Detect rising error rates in application logs

LOG_DIR="/var/log/myapp"
WINDOW_MINUTES=5
THRESHOLD=50   # errors per window to alert

# Count errors in last N minutes
count_recent_errors() {
  local pattern="${1:-ERROR}"
  find "${LOG_DIR}" -name "*.log" -newer /tmp/.error_check_last 2>/dev/null \
    -exec grep -c "${pattern}" {} + 2>/dev/null | \
    awk '{s+=$1}END{print s+0}'
}

# Checkpoint the window start so count_recent_errors only sees files
# modified from this point on.
touch /tmp/.error_check_last

# Background watcher: report early if the threshold is hit before the
# window closes.
tail -f "${LOG_DIR}"/app.log 2>/dev/null | \
  awk -v limit="${THRESHOLD}" '
    /ERROR/{errors++}
    errors >= limit {
      print "THRESHOLD: "errors" errors in window"
      exit
    }
  ' &
TAIL_PID=$!

# Let the window elapse. $((… * 60)) is portable; 'sleep 5m' relies on the
# GNU-only suffix and fails on BSD/busybox sleep.
sleep $(( WINDOW_MINUTES * 60 ))

# Stop and reap the watcher (awk dies now; the orphaned tail exits on
# SIGPIPE at its next write).
kill "${TAIL_PID}" 2>/dev/null
wait "${TAIL_PID}" 2>/dev/null

# Final count over files touched during the window, then advance the
# checkpoint for the next run.
ERRORS=$(count_recent_errors "ERROR")
CRITS=$(count_recent_errors "CRITICAL")
touch /tmp/.error_check_last

printf "  Last %d min: %d ERRORs, %d CRITICALs\n" \
  "${WINDOW_MINUTES}" "${ERRORS}" "${CRITS}"

if (( ERRORS > THRESHOLD )); then
  echo "ALERT: ${ERRORS} errors exceed threshold ${THRESHOLD}" | \
    mail -s "Error spike on $(hostname)" ops@example.com
fi

# Explicit success: the old '(( )) && mail' tail made the script exit 1
# (a cron "failure") on every quiet window.
exit 0
✔ Log analysis rules — Always filter by date first to avoid processing irrelevant historical data, and remember that Nginx's $time_local uses dd/Mon/YYYY, not ISO dates. Use awk '{print $9}' for Nginx status codes, $7 for URLs, $1 for IPs — the combined log format is consistent. Strip query strings with sed 's/\?.*$//' before counting endpoints. Create slice files with mktemp and remove them in an EXIT trap so they are cleaned up on every exit path. Run the full analysis in cron daily and ship the output by email.