#!/bin/bash
# Fix obviously erroneous CPU metrics in Graphite+collectd Whisper files.
#
# For every host directory under the collectd Whisper tree, each
# cpu-*/cpu-*.wsp file is dumped with whisper-dump and scanned for values
# above a per-host threshold (110 times the number of cpu-* directories).
# Each abnormal point is paired with the next normal value found in the dump.
#
# Nothing is modified by this script: it only PRINTS the whisper-update
# commands that would apply the fixes (file paths are relative to the host
# directory).
#
# Exit codes:
#   50  cannot enter the collectd Whisper directory
#   49  cannot enter a host directory
#   40  cannot return from a host directory

# Perl filter fed with the output of whisper-dump.
# Inputs come from the environment rather than being spliced into the
# program text, so unusual file names cannot alter the Perl code:
#   CPU_THRESHOLD  values strictly above this are considered abnormal
#   WSP_FILE       file name to print in the generated command
# NOTE: the program is a single-quoted shell string; keep it free of
# single quotes.
fix_program='
  # Skip non-data lines (only "<index>: <timestamp>, <value>" is kept):
  next unless m#^\d+: (\d+), ([0-9.]+)$#;
  ($timestamp, $value) = ($1, $2);
  # Spot timestamps with abnormal values. Several abnormal points may
  # follow each other, so remember all of them, not just the last one:
  if ($value > $ENV{CPU_THRESHOLD}) {
    push(@pending, $timestamp);
    next;
  }
  # Pick up the next value to replace the abnormal one(s):
  if (@pending) {
    push(@fixes, map { sprintf(q[%d:%f], $_, $value) } @pending);
    @pending = ();
  }
  END {
    if (@fixes) {
      printf(qq[ whisper-update %s %s\n], $ENV{WSP_FILE}, join(q[ ], @fixes));
    }
  }'

cd /var/lib/graphite/whisper/collectd/ || exit 50

for host in *; do
  # Ignore stray regular files, and the literal "*" left by an empty directory.
  [[ -d "${host}" ]] || continue

  echo "Treating host ${host}:"
  cd -- "${host}" || exit 49

  # One cpu-* directory per CPU; an explicit start path keeps find portable.
  cpu_count=$(find . -type d -name 'cpu-*' | wc -l)
  cpu_threshold=$((110 * cpu_count))
  echo " ${host} has ${cpu_count} CPUs, considering a threshold of ${cpu_threshold}"

  for file in cpu-*/cpu-*.wsp; do
    # Without nullglob an unmatched pattern is returned verbatim: skip it.
    [[ -e "${file}" ]] || continue

    echo " Treating ${file}:"
    whisper-dump "${file}" \
      | CPU_THRESHOLD="${cpu_threshold}" WSP_FILE="${file}" \
        perl -lne "${fix_program}"
  done

  cd - > /dev/null || exit 40
done