When an incident happens, you need answers in seconds, not minutes. An incident response toolkit is a curated set of diagnostic commands you run immediately — capturing system state, active connections, recent errors, and resource consumers — all from a single script that documents the investigation as it runs.
1. Incident snapshot collector
BASH
#!/usr/bin/env bash
# incident_snapshot.sh — Capture full system state during an incident
#
# Usage: incident_snapshot.sh [INCIDENT_ID]
#   INCIDENT_ID  Optional label for this incident; defaults to
#                INC-<YYYYmmdd-HHMMSS> taken from the current time.
# All output is written below /var/incidents/<INCIDENT_ID>/.
set -euo pipefail
INCIDENT_ID="${1:-INC-$(date +%Y%m%d-%H%M%S)}"
SNAP_DIR="/var/incidents/${INCIDENT_ID}"
mkdir -p "${SNAP_DIR}"
# log MESSAGE... — print a time-stamped line and append it to NOTES.txt so the
# investigation documents itself as it runs.
log() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" | tee -a "${SNAP_DIR}/NOTES.txt"
}
log "=== INCIDENT SNAPSHOT: ${INCIDENT_ID} ==="
# USER is not guaranteed to be set (cron, systemd units, containers); with
# `set -u` a bare ${USER} would abort the script before anything is captured.
log "Host: $(hostname) User: ${USER:-$(id -un)} Time: $(date)"
# ── System state ──────────────────────────────────────────
log "Capturing system state..."
# capture OUTFILE CMD [ARGS...]
# Run CMD and store its stdout in ${SNAP_DIR}/OUTFILE. The snapshot is
# best-effort: under `set -e` a single missing or failing tool would otherwise
# abort the whole collection, so failures are logged to NOTES.txt and skipped.
capture() {
  local outfile=$1
  shift
  "$@" > "${SNAP_DIR}/${outfile}" \
    || log "WARN: '$*' failed (exit $?); continuing"
}
capture timestamp.txt date
capture uptime.txt uptime
capture memory.txt free -h
capture top.txt top -bn1
capture processes_cpu.txt ps aux --sort=-%cpu
capture processes_mem.txt ps aux --sort=-%mem
capture disk_usage.txt df -h
# iostat is frequently not installed; keep its own error text off the terminal.
capture iostat.txt iostat -x 1 5 2>/dev/null
capture vmstat.txt vmstat 1 5
capture loadavg.txt cat /proc/loadavg
# dmesg can be refused for unprivileged users; with pipefail that failure would
# propagate through the pipeline, so it is handled explicitly here.
{ dmesg | tail -n 100; } > "${SNAP_DIR}/dmesg.txt" \
  || log "WARN: dmesg capture failed; continuing"
# ── Network ───────────────────────────────────────────────
log "Capturing network state..."
# Each capture is best-effort: a failing tool is noted and the run continues.
ss -tunaep > "${SNAP_DIR}/connections.txt" \
  || log "WARN: 'ss -tunaep' failed; continuing"
ss -s > "${SNAP_DIR}/socket_summary.txt" \
  || log "WARN: 'ss -s' failed; continuing"
# Prefer netstat for the routing table, fall back to iproute2.
netstat -rn 2>/dev/null > "${SNAP_DIR}/routes.txt" \
  || ip route > "${SNAP_DIR}/routes.txt" \
  || log "WARN: could not capture routing table; continuing"
ss -nt state established > "${SNAP_DIR}/established.txt" \
  || log "WARN: 'ss -nt state established' failed; continuing"
# Count connections by remote IP
# With a single `state` filter (and only -t) ss drops the State and Netid
# columns, leaving: Recv-Q Send-Q Local:Port Peer:Port — the peer is field 4.
# The port is removed by stripping the last ":…" so bracketed IPv6 peers such
# as [2001:db8::1]:443 are kept intact instead of being cut at the first colon.
ss -nt state established \
  | awk 'NR>1 { peer = $4; sub(/:[^:]*$/, "", peer); c[peer]++ }
         END  { for (k in c) print c[k], k }' \
  | sort -rn | head -n 20 > "${SNAP_DIR}/top_clients.txt" \
  || log "WARN: could not compute top clients; continuing"
# ── Application logs ──────────────────────────────────────
log "Capturing recent logs..."
# Copy the last 1000 lines of each "source:destination" pair; a missing or
# unreadable log is tolerated so the rest of the snapshot still runs.
for pair in \
  "/var/log/nginx/error.log:nginx_errors.txt" \
  "/var/log/myapp/app.log:app_log.txt"; do
  tail -n 1000 "${pair%%:*}" > "${SNAP_DIR}/${pair#*:}" 2>/dev/null || :
done
# Last 50 errors
{
  grep -h "ERROR\|CRITICAL\|FATAL" /var/log/myapp/*.log 2>/dev/null \
    | tail -n 50 > "${SNAP_DIR}/recent_errors.txt"
} || :
# ── Database ──────────────────────────────────────────────
log "Capturing database state..."
# Shared client invocation: credentials come from the defaults file, and
# -BNs requests batch, header-less, silent output.
mysql_cmd=(mysql --defaults-file=/etc/myapp/mysql.conf -BNs)
# Current sessions, longest-running first. Failures are ignored on purpose:
# the database may be the thing that is down.
"${mysql_cmd[@]}" -e \
"SELECT id,user,host,db,command,time,state,LEFT(info,200) FROM information_schema.PROCESSLIST
ORDER BY time DESC" 2>/dev/null > "${SNAP_DIR}/mysql_processlist.txt" || :
# Connected-thread counter (read again later for the quick stats).
"${mysql_cmd[@]}" -e "SHOW GLOBAL STATUS LIKE 'Threads_connected'" \
  2>/dev/null > "${SNAP_DIR}/mysql_threads.txt" || :
# Full Redis INFO dump, equally best-effort.
redis-cli INFO all 2>/dev/null > "${SNAP_DIR}/redis_info.txt" || :
# ── Service status ────────────────────────────────────────
log "Capturing service status..."
# Append each unit's status followed by a "---" separator. systemctl exits
# non-zero for stopped or unknown units, which must not abort the snapshot.
for svc in nginx php8.3-fpm mysql redis-server; do
  {
    systemctl status "${svc}" --no-pager 2>/dev/null || :
    echo "---"
  } >> "${SNAP_DIR}/service_status.txt"
done
# ── Quick stats ───────────────────────────────────────────
# Logged BEFORE packaging so the NOTES.txt inside the tarball contains them.
# The threads file can exist but be empty when the mysql query failed, so the
# N/A fallback is applied to the extracted value, not to a read error.
db_conns="$(awk 'NR==1{print $2}' "${SNAP_DIR}/mysql_threads.txt" 2>/dev/null || true)"
log ""
log "Quick stats:"
log " Load: $(awk '{print $1,$2,$3}' /proc/loadavg)"
log " Memory: $(free | awk '/Mem:/{printf "%.0f%%", $3/$2*100}') used"
log " Top proc: $(ps aux --sort=-%cpu | awk 'NR==2{print $11" ("$3"%CPU)"}')"
log " DB connections: ${db_conns:-N/A}"
# ── Package the snapshot ──────────────────────────────────
log "Packaging snapshot..."
tar -czf "/var/incidents/${INCIDENT_ID}.tar.gz" -C "${SNAP_DIR}" .
# The archive is already written at this point, so the two lines below end up
# only in the on-disk NOTES.txt and on the terminal.
log "Snapshot saved: /var/incidents/${INCIDENT_ID}.tar.gz"
log " Size: $(du -sh "/var/incidents/${INCIDENT_ID}.tar.gz" | cut -f1)"
2. Interactive incident investigation
BASH
#!/usr/bin/env bash
# incident_investigate.sh — Interactive investigation during incidents
# ── Find the process consuming most CPU ───────────────────
#######################################
# Show the top CPU consumers and optionally profile one with strace.
# Arguments: $1 - seconds to keep strace attached (optional, default 10)
# Inputs:    reads a PID (or anything else to skip) from stdin
# Outputs:   process list, prompt and the strace syscall summary on stdout
# Returns:   0 when skipped; otherwise the status of the strace pipeline
#######################################
top_cpu() {
  local secs="${1:-10}"
  local pid=""
  ps aux --sort=-%cpu | head -n 6
  echo ""
  echo "strace on PID? (enter PID or q to skip):"
  # EOF on stdin is treated like "skip".
  read -r pid || return 0
  if [[ "${pid}" =~ ^[0-9]+$ ]]; then
    # strace -c prints its summary only when it exits, so an unbounded attach
    # would show nothing until interrupted. timeout ends it after ${secs}
    # seconds, at which point the summary is emitted.
    sudo timeout "${secs}" strace -p "${pid}" -c -f 2>&1 | head -n 30
  fi
}
# ── Find what's using disk ────────────────────────────────
#######################################
# Show the largest top-level directories and large, recently written files.
# Arguments: none
# Outputs:   du ranking, then up to 10 files >10M modified in the last 10 min
#######################################
top_disk() {
  du -sh /* 2>/dev/null | sort -rh | head -n 20
  echo ""
  echo " Growing files (written in last 10 min):"
  # -mmin -10 matches the label directly. The previous marker-file comparison
  # found nothing on the first run (no marker yet), measured "since the last
  # call" rather than 10 minutes, and relied on a predictable file in /tmp.
  find /var /tmp /opt -type f -mmin -10 -size +10M \
    -exec ls -lh {} + 2>/dev/null | sort -k5 -rh | head -n 10
}
# ── Check who is connected ────────────────────────────────
#######################################
# List logged-in sessions and the 20 most recent auth log events.
# Arguments: none
# Outputs:   `who` output, then matching lines from /var/log/auth.log
#######################################
active_connections() {
  printf '%s\n' "Active SSH sessions:"
  who
  printf '%s\n' "" "Recent auth events:"
  grep -E "(Accepted|Failed|Invalid)" /var/log/auth.log \
    | tail -n 20
}
# ── Database hot spots ────────────────────────────────────
#######################################
# List non-sleeping MySQL sessions that have been running for over 5 seconds,
# longest first.
# Arguments: none
# Outputs:   mysql's result table on stdout
# Returns:   exit status of the mysql client
#######################################
db_hotspots() {
  local -r defaults_file=/etc/myapp/mysql.conf
  local -r sql="
SELECT id,user,time,LEFT(info,100) AS query
FROM information_schema.PROCESSLIST
WHERE command != 'Sleep' AND time > 5
ORDER BY time DESC"
  mysql --defaults-file="${defaults_file}" -e "${sql}"
}
# ── Live error rate ───────────────────────────────────────
#######################################
# Stream ERROR/CRITICAL/FATAL lines from an application log until interrupted.
# Arguments: $1 - log file to follow (optional,
#                 default /var/log/myapp/app.log)
# Outputs:   a banner line, then matching log lines as they are written
#######################################
live_errors() {
  local logfile="${1:-/var/log/myapp/app.log}"
  echo "Tailing error log (Ctrl+C to stop):"
  # -F follows the file by name and retries, so the stream survives a log
  # rotation; plain -f would stay attached to the rotated-away file.
  tail -F -- "${logfile}" | grep --color=auto -E "ERROR|CRITICAL|FATAL"
}
# Print the menu of investigation helpers defined above, plus a ready-to-copy
# snapshot command carrying a fresh timestamp-based incident ID.
printf '%s\n' \
  "Incident Investigation Commands:" \
  " top_cpu — CPU hotspot analysis" \
  " top_disk — Disk usage investigation" \
  " active_connections — Who is connected" \
  " db_hotspots — Slow database queries" \
  " live_errors — Tail error log" \
  "" \
  "Or run: ./incident_snapshot.sh INC-$(date +%Y%m%d-%H%M%S)"
✔ Incident response rules — Capture state first, investigate second — system state changes fast during an incident. Store everything to files with timestamps so post-incident review is possible. Run the snapshot script at the start of every incident, not after you've fixed it. Keep the incident ID consistent across all files, logs, and Slack messages. Package the snapshot as a tarball for sharing with team members and for archiving.