diff --git a/roles/collectd/templates/buddyinfo.py.j2 b/roles/collectd/templates/buddyinfo.py.j2 new file mode 100644 index 0000000000000000000000000000000000000000..a994812761b43a4f0304e93c8502e435b8b56c28 --- /dev/null +++ b/roles/collectd/templates/buddyinfo.py.j2 @@ -0,0 +1,183 @@ +#!/usr/bin/python + +########################################################################## +# Copyright (c) 2015, Salesforce.com, Inc. +# All rights reserved. +# +# Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# +# Neither the name of Salesforce.com nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +########################################################################## + +""" +**buddyinfo.py** + +Linux uses buddy allocator for memory management. Pages +are allocated in each NUMA node and zones within each +node. Within each zones, pages are allocated as +contiguous groups of 1, 2, 3, 4, and so on order +pages where 1 means 4K pages. Number of free pages in +each bucket is exposed through /proc/buddyinfo +When this number goes below a threshold in any bucket, +kswapd (slowpath for finding free pages) kicks in. It +then scans for free pages in all order levels until +all of them reach above min limit. This process can take +long time and may cause issues for GC latencies. + +Typical contents of /proc/buddyinfo: + +- Node 0, zone Normal 1490 4026 12224 8508 4493 1929 849 301 101 45 5257 +- Node 1, zone DMA 1 1 1 1 1 0 1 0 1 1 3 +- Node 1, zone DMA32 15 3 2 5 8 7 4 4 7 8 681 +- Node 1, zone Normal 6061 13681 20887 15188 9097 4546 1948 731 273 125 3976 + +Here are the fields interpretation in each row: +1. NUMA node (such as 0 or 1) +2. Zone name (Normal, DMA32, DMA, etc.) +3. Col. 3 to end: page order or buckets on contiguous memory sizes: 4K, 8K, 16K, 32K, 64K, 128K, 256K, 512K, 1024K, and 2048K + +""" + +import collectd +import platform +import os +import socket +import time +import re + +os_name = platform.system() + +BUDDY_FNAME = '/proc/buddyinfo' +METRIC_PLUGIN = 'buddyinfo' +METRIC_TYPE = 'gauge' + +buddy_fields = ['numa_node', + 'zone_name', + 'bucket_free_pages' + ] +buddy_metrics = ['bucket_free_pages_per_sec', + 'total_free_pages_per_sec', + 'pct_fragment_per_sec' + ] + +white_list = [] +node_list = [] +zone_list = [] + +stats_cache = {} +stats_current = {} + +re_buddyinfo=re.compile(r'^\s*Node\s+(?P<node>\d+)' + r',\s+zone\s+(?P<zone>\S+)\s+(?P<pages>.*)$') + + +def init_stats_cache(): + global white_list + + if os.path.exists(BUDDY_FNAME): + num_buckets = 0 + with open(BUDDY_FNAME) as f: + for line in f: + match = re_buddyinfo.search(line) + if not match: + collectd.error('buddyinfo: unknown line pattern: %s' % (line)) + continue; + node = match.group('node') + zone = match.group('zone') + free_pages = match.group('pages').strip().split() + num_buckets = len(free_pages) + if node not in node_list: + node_list.append(node) + if zone not in zone_list: + zone_list.append(zone) + stats_cache[(node, zone, 'val')] = free_pages + stats_cache[(node, zone, 'ts')] = time.time() + f.close() + for i in range(0, num_buckets): + white_list.append('free_pages_' + str(4*2**i) + 'K') + collectd.info('buddyinfo: node_list : %s' % (node_list)) + collectd.info('buddyinfo: zone_list : %s' % (zone_list)) + collectd.info('buddyinfo: white_list: %s' % (white_list)) + else: + collectd.info('buddyinfo: init_stats_cache: path: %s does not exist' + % (BUDDY_FNAME)) + +def collect_buddyinfo(): + if os.path.exists(BUDDY_FNAME): + with open(BUDDY_FNAME) as f: + for line in f: + match = re_buddyinfo.search(line) + if not match: + continue; + node = match.group('node') + zone = match.group('zone') + free_pages = match.group('pages').strip().split() + stats_current[(node, zone, 'val')] = free_pages + stats_current[(node, zone, 'ts')] = time.time() + key_val = dict(zip(white_list, free_pages)) + metric = collectd.Values() + metric.plugin = METRIC_PLUGIN + metric.plugin_instance = node + metric.type = METRIC_TYPE + for k in range(0, len(white_list)): + metric.type_instance = 'node_'+ node + '_zone_' + zone + '.' + metric.type_instance += white_list[k] + metric.values = [free_pages[k]] + metric.dispatch() + f.close() + else: + collectd.error('buddyinfo: procfs path: %s does not exist' + % (BUDDY_FNAME)) + +def swap_current_cache(): + stats_cache = stats_current.copy() + +def configer(ObjConfiguration): + collectd.info('buddyinfo plugin: configuring host' ) + +def initer(): + collectd.info('buddyinfo initer: white list: %s' % (white_list)) + init_stats_cache() + collectd.info('buddyinfo init: stats_cache: %s' % (stats_cache)) + +def reader(input_data=None): + collect_buddyinfo() + swap_current_cache() + +def writer(metric, data=None): + for i in metric.values: + collectd.debug('%s (%s): %f' % (metric.plugin, metric.type, i)) + +def shutdown(): + collectd.info('buddyinfo plugin shutting down') + +#== Callbacks ==# +if (os_name == 'Linux'): + collectd.register_config(configer) + collectd.register_init(initer) + collectd.register_read(reader) + collectd.register_write(writer) + collectd.register_shutdown(shutdown) +else: + collectd.warning('buddyinfo plugin currently works for Linux only')