#!/bin/bash
# Is your statsd machine maxing out its CPU, unable to pull UDP packets out of the buffer
# at a fast enough rate (see `netstat -su`)? Timer metrics are by far the most CPU-intensive,
# and tuning the sampling of those is key to keeping CPU load under control.
# This tool (to be run on your graphite server) shows, for each of your timing metric keys, how many packets
# it accepted in a given interval such as 1 hour. Using this information you can make informed decisions
# about which keys to sample and how much.
# Note that in some bad cases you might see no effect after increasing your amount of sampling. The explanation is
# that you were initially sending so many packets that only a fraction were being processed and shown in these counts,
# so even after sampling more, statsd still can't process them all and your count stays in the same range.

# --- configuration: edit these for your environment ---
# Base URL of the graphite web app (no trailing slash). The placeholder must be
# quoted: unquoted, the `<` and `>` are parsed as redirections and the script
# fails with a syntax error before doing anything.
graphite_url='http://<your graphite url>'
# Root directory of the whisper database files.
whisper_dir=/var/lib/carbon/whisper
# Location of the timer metrics, relative to whisper_dir.
timers_subdir=stats/timers

# you may want to adjust this function according to the characteristics of your environment
# I wish whisper-fetch.py supported the same function API as the http endpoint does, then I could avoid http here.
#
# Fetches summarize(<metric>, "1hour", "sum") for the last hour from the
# graphite render endpoint as CSV and prints the last field of the last line.
# Globals:   graphite_url (read)
# Arguments: $1 - metric name, e.g. stats.timers.foo.count
# Outputs:   the value to stdout; a number ending with .0, or whitespace if
#            values were None
function get_indicative_count () {
  # local, so calling this function does not clobber the caller's variables
  local metric=$1
  local url="${graphite_url}/render/?from=-1h&target=summarize(${metric},%221hour%22,%22sum%22)&format=csv"
  # keep only the last CSV row, then strip everything up to the last comma
  wget -q "$url" -O - | tail -n 1 | sed 's#.*,##'
}

# Lists every timer "count" metric that has a whisper file, one dotted metric
# name per line (e.g. <whisper_dir>/stats/timers/web/login/count.wsp is printed
# as stats.timers.web.login.count).
# Globals:   whisper_dir (read), timers_subdir (read)
# Outputs:   metric names to stdout, in the order find reports the files
function list_timer_count_files () {
  local path metric
  find "$whisper_dir/$timers_subdir" -name 'count.wsp' |
    while IFS= read -r path; do
      # Plain string operations instead of splicing the directories into a
      # sed program, where a '#' or a regex metacharacter in the path would
      # break or distort the substitution.
      metric=${path#"$whisper_dir"/}   # drop the whisper root
      metric=${metric%.wsp}            # drop the file extension
      printf '%s\n' "${metric//\//.}"  # path separators -> metric dots
    done
}

# Prints one "<metric> <count>" line for every metric reported by
# list_timer_count_files, with the count taken from get_indicative_count.
# Outputs:   "<metric> <count>" lines to stdout
function list_timer_counts () {
  local metric
  # Read names line by line rather than word-splitting a command
  # substitution, so a name is never split on spaces or glob-expanded against
  # the current directory. The list arrives on fd 3 so commands in the loop
  # body cannot consume it through stdin.
  while IFS= read -r metric <&3; do
    [[ -n "$metric" ]] || continue
    printf '%s %s\n' "$metric" "$(get_indicative_count "$metric")"
  done 3< <(list_timer_count_files)
}
# Report: keep only the "<metric> <count>" lines whose value contains ".0",
# then order them numerically by the count column.
list_timer_counts \
  | grep 'count .*\.0' \
  | sort -k2 -n