1 | 1 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,35 @@ |
#!/bin/bash
# Fix obviously erroneous CPU metrics in Graphite+collectd Whisper files.
#
# For every host directory below the collectd Whisper root, each
# cpu-*/cpu-*.wsp file is dumped with whisper-dump and scanned for values
# above a per-host threshold. Every abnormal point is paired with the first
# sane value that follows it in the dump, and the matching whisper-update
# command line is PRINTED (this script does not modify any Whisper file).
#
# Exit codes: 50 = Whisper root missing, 49 = cannot enter a host entry,
#             40 = cannot return to the Whisper root.
cd /var/lib/graphite/whisper/collectd/ || exit 50

# A glob that matches nothing must expand to nothing; otherwise the literal
# pattern "cpu-*/cpu-*.wsp" would be handed to whisper-dump for hosts that
# have no CPU metrics.
shopt -s nullglob

for host in *; do
    echo "Treating host ${host}:"
    cd "${host}" || exit 49
    # One cpu-N directory per CPU; the explicit "." keeps find portable.
    cpu_count=$(find . -type d -name 'cpu-*' | wc -l)
    # 110 per CPU -- presumably percent plus a 10% margin (TODO confirm).
    ((cpu_threshold=110*cpu_count))
    echo " ${host} has ${cpu_count} CPUs, considering a threshold of ${cpu_threshold}"
    for file in cpu-*/cpu-*.wsp; do
        echo " Treating ${file}:"
        # The file name and threshold travel through the environment instead
        # of being spliced into the Perl source, so unusual characters in a
        # file name can neither break nor alter the Perl program.
        # NOTE: the Perl code sits inside bash single quotes, so it must not
        # contain any single quote character (comments included).
        whisper-dump "${file}" \
            | WSP_FILE="${file}" CPU_THRESHOLD="${cpu_threshold}" perl -lnE '
            # State is kept in package variables on purpose: with -n this
            # code is the body of an implicit read loop, so "my" variables
            # would be reset on every input line.

            # Skip non-data lines. Data lines look like "index: timestamp, value".
            # The value column may be space-padded and very large or very small
            # values may use exponent notation (assumed from the whisper-dump
            # output format -- confirm against the installed version).
            next unless m{^\d+:\s+(\d+),\s+(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*$};
            ($timestamp, $value) = ($1, $2);

            # Slots with a zero timestamp are treated as empty: their value
            # must not be used as a replacement.
            next unless $timestamp;

            # Queue every timestamp carrying an abnormal value, so that a run
            # of consecutive bad points is fixed entirely, not just its last one.
            if ($value > $ENV{CPU_THRESHOLD}) {
                push(@pending, $timestamp);
                next;
            }

            # First sane value after the bad point(s): reuse it for each queued
            # timestamp. splice() with no offset empties the queue.
            push(@fixes, map { sprintf(q[%d:%f], $_, $value) } splice(@pending));

            END {
                # Print (not run) the command that would apply the fixes.
                if (@fixes) {
                    printf(qq[ whisper-update %s %s\n], $ENV{WSP_FILE}, join(q[ ], @fixes));
                }
            }'
    done
    cd - > /dev/null || exit 40
done