buddyinfo.py.j2

#!/usr/bin/python

##########################################################################
# Copyright (c) 2015, Salesforce.com, Inc.
# All rights reserved.
#
# Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#
# Neither the name of Salesforce.com nor the names of its
# contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
##########################################################################

"""
**buddyinfo.py**

Linux uses the buddy allocator for memory management. Pages
are allocated in each NUMA node and in zones within each
node. Within each zone, pages are allocated as
contiguous groups of 1, 2, 4, 8, and so on pages
(doubling at each order), where 1 means a single 4K page. The number
of free pages in each bucket is exposed through /proc/buddyinfo.
When this number goes below a threshold in any bucket,
kswapd (slowpath for finding free pages) kicks in. It
then scans for free pages in all order levels until
all of them are above the min limit. This process can take
a long time and may cause issues for GC latencies.

Typical contents of /proc/buddyinfo:

- Node 0, zone   Normal   1490   4026  12224   8508   4493   1929    849    301    101     45   5257
- Node 1, zone      DMA      1      1      1      1      1      0      1      0      1      1      3
- Node 1, zone    DMA32     15      3      2      5      8      7      4      4      7      8    681
- Node 1, zone   Normal   6061  13681  20887  15188   9097   4546   1948    731    273    125   3976

Here is how to interpret the fields in each row:
1. NUMA node (such as 0 or 1)
2. Zone name (Normal, DMA32, DMA, etc.)
3. Col. 3 to end: free-page counts per page order (bucket) of contiguous memory size: 4K, 8K, 16K, 32K, 64K, 128K, 256K, 512K, 1024K, 2048K, and 4096K

"""

import collectd
import platform
import os
import socket
import time
import re

# Platform name; the callbacks at the bottom of the file are only
# registered when this is 'Linux'.
os_name = platform.system()

# procfs file that is parsed, and the identity used for dispatched metrics.
BUDDY_FNAME = '/proc/buddyinfo'
METRIC_PLUGIN = 'buddyinfo'
METRIC_TYPE = 'gauge'

# Names of the parsed fields and of derived metrics.
buddy_fields = [
    'numa_node',
    'zone_name',
    'bucket_free_pages',
]
buddy_metrics = [
    'bucket_free_pages_per_sec',
    'total_free_pages_per_sec',
    'pct_fragment_per_sec',
]

# Filled in by init_stats_cache(): bucket metric names, and the NUMA nodes
# and zones seen in BUDDY_FNAME.
white_list = []
node_list = []
zone_list = []

# Samples keyed by (node, zone, 'val') and (node, zone, 'ts').
stats_cache = {}
stats_current = {}

# One /proc/buddyinfo row: "Node <n>, zone <name> <count> <count> ...".
re_buddyinfo = re.compile(
    r'^\s*Node\s+(?P<node>\d+),'
    r'\s+zone\s+(?P<zone>\S+)'
    r'\s+(?P<pages>.*)$'
)


def init_stats_cache():
    """Prime the module-level caches from a first read of BUDDY_FNAME.

    For every row matching re_buddyinfo this records the NUMA node in
    node_list, the zone in zone_list, and stores the raw count fields and
    a timestamp in stats_cache under (node, zone, 'val') and
    (node, zone, 'ts').  white_list is then rebuilt with one
    'free_pages_<size>K' name per bucket, sized from the last matching row.

    Rows that do not match are reported through collectd.error and
    skipped.  If BUDDY_FNAME does not exist, only an info message is
    logged and nothing is modified.
    """
    if not os.path.exists(BUDDY_FNAME):
        collectd.info('buddyinfo: init_stats_cache: path: %s does not exist'
                      % (BUDDY_FNAME))
        return

    num_buckets = 0
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(BUDDY_FNAME) as f:
        for line in f:
            match = re_buddyinfo.search(line)
            if not match:
                collectd.error('buddyinfo: unknown line pattern: %s' % (line))
                continue
            node = match.group('node')
            zone = match.group('zone')
            free_pages = match.group('pages').split()
            num_buckets = len(free_pages)
            if node not in node_list:
                node_list.append(node)
            if zone not in zone_list:
                zone_list.append(zone)
            stats_cache[(node, zone, 'val')] = free_pages
            stats_cache[(node, zone, 'ts')] = time.time()

    # Replace the contents in place instead of appending, so calling this
    # function again does not duplicate bucket names.  Bucket i is named
    # after 4 * 2**i: 4K, 8K, 16K, ...
    white_list[:] = ['free_pages_' + str(4 * 2 ** i) + 'K'
                     for i in range(num_buckets)]
    collectd.info('buddyinfo: node_list : %s' % (node_list))
    collectd.info('buddyinfo: zone_list : %s' % (zone_list))
    collectd.info('buddyinfo: white_list: %s' % (white_list))

def collect_buddyinfo():
    """Read BUDDY_FNAME and dispatch one gauge per node/zone/bucket.

    For every row matching re_buddyinfo, the raw count fields and a
    timestamp are stored in stats_current under (node, zone, 'val') and
    (node, zone, 'ts').  Each count is then dispatched as a collectd value
    with plugin METRIC_PLUGIN, plugin_instance set to the node, type
    METRIC_TYPE, and type_instance
    'node_<node>_zone_<zone>.<bucket name from white_list>'.

    Rows that do not match are skipped silently.  If BUDDY_FNAME does not
    exist, an error is logged and nothing is dispatched.
    """
    if not os.path.exists(BUDDY_FNAME):
        collectd.error('buddyinfo: procfs path: %s does not exist'
                       % (BUDDY_FNAME))
        return

    # The with-block closes the file on exit; no explicit close() is needed.
    with open(BUDDY_FNAME) as f:
        for line in f:
            match = re_buddyinfo.search(line)
            if not match:
                continue
            node = match.group('node')
            zone = match.group('zone')
            free_pages = match.group('pages').split()
            stats_current[(node, zone, 'val')] = free_pages
            stats_current[(node, zone, 'ts')] = time.time()

            metric = collectd.Values()
            metric.plugin = METRIC_PLUGIN
            metric.plugin_instance = node
            metric.type = METRIC_TYPE
            prefix = 'node_' + node + '_zone_' + zone + '.'
            # zip() stops at the shorter sequence, so a row with fewer
            # fields than white_list no longer raises IndexError.
            for bucket_name, count in zip(white_list, free_pages):
                metric.type_instance = prefix + bucket_name
                # collectd documents `values` as a sequence of numbers, so
                # convert the text field explicitly.
                metric.values = [int(count)]
                metric.dispatch()

def swap_current_cache():
    """Make stats_cache a shallow copy of the latest sample in stats_current.

    The module-level dict is updated in place.  A plain
    ``stats_cache = ...`` assignment inside this function would only bind
    a function-local name and leave the module-level cache unchanged.
    """
    stats_cache.clear()
    stats_cache.update(stats_current)

def configer(ObjConfiguration):
    """Config callback: log that the plugin is being configured.

    ObjConfiguration is accepted to fit the callback signature; it is not
    inspected.
    """
    collectd.info('buddyinfo plugin: configuring host')

def initer():
    """Init callback: seed the caches and log their state around the call."""
    before_msg = 'buddyinfo initer: white list: %s' % (white_list,)
    collectd.info(before_msg)

    init_stats_cache()

    after_msg = 'buddyinfo init: stats_cache: %s' % (stats_cache,)
    collectd.info(after_msg)

def reader(input_data=None):
    """Read callback: dispatch a fresh sample, then refresh the cache.

    input_data is accepted to fit the callback signature; it is not used.
    """
    # Order matters: collect first so the swap sees the newest sample.
    collect_buddyinfo()
    swap_current_cache()

def writer(metric, data=None):
    """Write callback: emit a debug log line for every value in `metric`.

    data is accepted to fit the callback signature; it is not used.
    """
    for value in metric.values:
        message = '%s (%s): %f' % (metric.plugin, metric.type, value)
        collectd.debug(message)

def shutdown():
    """Shutdown callback: log that the plugin is stopping."""
    collectd.info('buddyinfo plugin shutting down')

#== Callbacks ==#
# The plugin parses a procfs file, so the callbacks are only registered when
# platform.system() reported 'Linux'; otherwise just warn.
if os_name != 'Linux':
    collectd.warning('buddyinfo plugin currently works for Linux only')
else:
    collectd.register_config(configer)
    collectd.register_init(initer)
    collectd.register_read(reader)
    collectd.register_write(writer)
    collectd.register_shutdown(shutdown)