# is your statsd machine maxing out cpu? ... unable to pull udp packets out of the buffer
# at a fast enough rate? (see `netstat -su`) timer metrics are by far the most cpu intensive,
# and tuning the sampling of those is key to keeping cpu load under control.
# this tool (to be run on your graphite server) shows, for all your timing metric keys, how many packets
# it accepted in a given interval such as 1 hour. using this information you can make informed decisions as
# to which keys to sample and by how much.
# note that in some bad cases you might see no effect after increasing your amount of sampling; the explanation is
# that you were first sending so many packets — of which only a fraction were being processed and shown in these counts —
# that even after sampling more, statsd still can't process them all and your count stays in the same range.
# --- configuration -----------------------------------------------------------
# Base URL of the Graphite web app; edit before use.  Quoted because the
# placeholder's < and > would otherwise be parsed as shell redirections.
graphite_url='http://<your graphite url>'
# Root of the whisper database on this (the graphite) server.
whisper_dir=/var/lib/carbon/whisper
# Subdirectory (relative to $whisper_dir) holding statsd timer metrics.
timers_subdir=stats/timers

# you may want to adjust this function according to the characteristics of your environment
# I wish whisper-fetch.py supported the same function API as the http endpoint does, then I could avoid http here.
#
# Fetch the summed value of metric $1 over the last hour via the Graphite
# render API (CSV format).
# Arguments: $1 - dotted metric name
# Outputs:   a number ending in ".0" on stdout, or whitespace if all
#            datapoints in the window were None
function get_indicative_count () {
  local metric=$1
  local url="$graphite_url/render/?from=-1h&target=summarize($metric,%221hour%22,%22sum%22)&format=csv"
  # last CSV row holds the summarized bucket; strip everything up to the final comma
  wget -q "$url" -O - | tail -n 1 | sed 's#.*,##'
}

# List every timer "count" series stored on disk, converted from a whisper
# filesystem path to the dotted metric name Graphite expects.
# Globals:  whisper_dir, timers_subdir (read)
# Outputs:  one metric name per line, e.g. stats.timers.api.get.count
function list_timer_count_files () {
  local root="$whisper_dir/$timers_subdir"
  # strip the storage prefix and the .wsp extension, then map path separators to dots
  find "$root" -name 'count.wsp' | sed -e "s#$whisper_dir/\($timers_subdir/.*/count\).wsp#\1#" -e 's#/#.#g'
}

# Print "metric-name accepted-count" for every timer count series found on
# disk.  Reads metric names line-by-line (instead of word-splitting a command
# substitution) so the loop is robust and streams as results arrive.
function list_timer_counts () {
  local metric
  while IFS= read -r metric; do
    echo "$metric $(get_indicative_count "$metric")"
  done < <(list_timer_count_files)
}

# Main report: one line per timer key with the number of packets statsd
# accepted over the last hour, sorted ascending on that count so the heaviest
# keys end up at the bottom.  The grep keeps only rows that actually had data
# (real values end in ".0"; empty/None rows are dropped).
list_timer_counts | grep 'count .*\.0' | sort -n -k2