diff --git a/roles/collectd/tasks/main.yml b/roles/collectd/tasks/main.yml
index 78537218634b2c573464e8963e3830c4abafbc93..b951071e3d1bc86b7a9e57ffcd2a68a0ff643d4e 100644
--- a/roles/collectd/tasks/main.yml
+++ b/roles/collectd/tasks/main.yml
@@ -8,9 +8,35 @@
   template: src=collectd.conf.j2 dest=/etc/collectd.d/collectd.conf mode=0600 owner=root group=root
   become: true
   become_user: root
+  register: configchange
+
+- name: create directory for python collectd components
+  file: path=/etc/collectd.python state=directory owner=root group=root mode=0755
+  become: true
+  become_user: root
+
+- name: install buddyinfo python script
+  template: src=buddyinfo.py.j2 dest=/etc/collectd.python/buddyinfo.py mode=0700 owner=root group=root
+  become: true
+  become_user: root
+  register: buddyinfo_script
+
+- name: install cuda_collectd python script
+  template: src=cuda_collectd.py.j2 dest=/etc/collectd.python/cuda_collectd.py mode=0700 owner=root group=root
+  become: true
+  become_user: root
+  when: cudamonitor is defined
+  register: cuda_script
 
 - name: start collectd service
   service: name=collectd state=started enabled=true
   become: true
   become_user: root
 
+# Restart when the config or a plugin script changed; a skipped task still registers changed=false.
+- name: restart collectd service
+  service: name=collectd state=restarted enabled=true
+  become: true
+  become_user: root
+  when: configchange.changed or buddyinfo_script.changed or cuda_script.changed
+
diff --git a/roles/collectd/templates/collectd.conf.j2 b/roles/collectd/templates/collectd.conf.j2
index 31d5b0fbacf897f44869291e7691827fa596252d..aee0dd817297efa0eff4935223975e2635390675 100644
--- a/roles/collectd/templates/collectd.conf.j2
+++ b/roles/collectd/templates/collectd.conf.j2
@@ -159,7 +159,7 @@ LoadPlugin memory
 #LoadPlugin powerdns
 LoadPlugin processes
 #LoadPlugin protocols
-#LoadPlugin python
+LoadPlugin python
 #LoadPlugin redis
 #LoadPlugin rrdcached
 #LoadPlugin rrdtool
@@ -903,8 +903,17 @@ LoadPlugin users
 #	IgnoreSelected false
 #</Plugin>
 
-#<Plugin python>
-#	ModulePath "/path/to/your/python/modules"
+<Plugin python>
+  ModulePath "/etc/collectd.python/"
+  Import "buddyinfo"
+  <Module buddyinfo>
+  </Module>
+  {% if cudamonitor is defined %}
+  Import "cuda_collectd"
+  <Module cuda_collectd>
+  </Module>
+  {% endif %}
+</Plugin>
 #	LogTraces true
 #	Interactive true
 #	Import "spam"
@@ -1050,22 +1059,95 @@ LoadPlugin users
 #</Plugin>
 
 <Plugin table>
-  <Table "/proc/slabinfo">
-    Instance "slabinfo"
-    Separator " "
-    <Result>
-      Type gauge
-      InstancePrefix "active_objs"
-      InstancesFrom 0
-      ValuesFrom 1
-    </Result>
-    <Result>
-      Type gauge
-      InstancePrefix "objperslab"
-      InstancesFrom 0
-      ValuesFrom 4
-    </Result>
-  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "alloc_sleep_millisecs"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/defrag">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "defrag"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "full_scans"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "max_ptes_none"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "pages_collapsed"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "pages_to_scan"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs">
+    Instance "khugepaged"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "scan_sleep_millisecs"
+      ValuesFrom 0
+    </Result>
+  </Table>
+  <Table "/proc/vmstat">
+    Instance "vmstat"
+    Separator " "
+    <Result>
+      Type gauge
+      InstancePrefix "vmstat"
+      InstancesFrom 0
+      ValuesFrom 1
+    </Result>
+  </Table>
+#  <Table "/proc/slabinfo">
+#    Instance "slabinfo"
+#    Separator " "
+#    <Result>
+#      Type gauge
+#      InstancePrefix "active_objs"
+#      InstancesFrom 0
+#      ValuesFrom 1
+#    </Result>
+#    <Result>
+#      Type gauge
+#      InstancePrefix "objperslab"
+#      InstancesFrom 0
+#      ValuesFrom 4
+#    </Result>
+#  </Table>
 </Plugin>
 
 #<Plugin tail>
diff --git a/roles/collectd/templates/cuda_collectd.py.j2 b/roles/collectd/templates/cuda_collectd.py.j2
new file mode 100644
index 0000000000000000000000000000000000000000..780d21a9eb6b48da4f4c4154f542d0baaae109da
--- /dev/null
+++ b/roles/collectd/templates/cuda_collectd.py.j2
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+"""collectd read plugin: per-GPU fan speed, temperature and memory use.
+
+Metrics are parsed from the XML printed by ``nvidia-smi -q -x``. This file is
+rendered by Ansible's template module, so keep Jinja2 delimiters out of it.
+"""
+
+import collectd
+import subprocess
+import xml.etree.ElementTree as ET
+
+# nvidia-smi prints memory as "<number> <unit>". Binary units are converted
+# exactly; any other unit keeps the historical factor of 1e6.
+MEMORY_UNIT_BYTES = {'KiB': 1024.0, 'MiB': 1024.0 ** 2, 'GiB': 1024.0 ** 3}
+DEFAULT_MEMORY_UNIT_BYTES = 1e6
+
+# The element name differs between nvidia-smi versions; try both.
+MEMORY_USED_PATHS = ('memory_usage/used', 'fb_memory_usage/used')
+
+
+def _tokens(element):
+    """Return the whitespace-separated tokens of an element's text."""
+    if element is None or not element.text:
+        return []
+    return element.text.split()
+
+
+def _leading_number(element):
+    """Return the first token of the element's text as a float, or None."""
+    tokens = _tokens(element)
+    if not tokens:
+        return None
+    try:
+        return float(tokens[0])
+    except ValueError:
+        # nvidia-smi prints "N/A" for sensors the GPU does not have.
+        return None
+
+
+def _memory_used_bytes(gpu):
+    """Return the GPU's used memory in bytes, or None if it is not reported."""
+    for path in MEMORY_USED_PATHS:
+        element = gpu.find(path)
+        amount = _leading_number(element)
+        if amount is None:
+            continue
+        tokens = _tokens(element)
+        unit = tokens[1] if len(tokens) > 1 else ''
+        return amount * MEMORY_UNIT_BYTES.get(unit, DEFAULT_MEMORY_UNIT_BYTES)
+    return None
+
+
+def _dispatch(vl, metric_type, value, type_instance=None):
+    """Dispatch one gauge value; skip missing values, log dispatch failures."""
+    if value is None:
+        return
+    kwargs = {'type': metric_type, 'values': [value]}
+    if type_instance is not None:
+        kwargs['type_instance'] = type_instance
+    try:
+        vl.dispatch(**kwargs)
+    except Exception as exc:
+        # Best effort per metric: one bad value must not drop the others.
+        collectd.warning('cuda_collectd: cannot dispatch %s for %s: %s'
+                         % (metric_type, vl.plugin_instance, exc))
+
+
+def read(data=None):
+    """collectd read callback: dispatch metrics for every GPU nvidia-smi lists.
+
+    Failures to run nvidia-smi or to parse its output are raised to collectd.
+    """
+    out = subprocess.check_output(['nvidia-smi', '-q', '-x'])
+    root = ET.fromstring(out)
+
+    for gpu in root.iter('gpu'):
+        vl = collectd.Values(type='gauge')
+        vl.plugin = 'cuda'
+        vl.plugin_instance = 'cuda-%s' % gpu.attrib['id']
+
+        _dispatch(vl, 'fanspeed', _leading_number(gpu.find('fan_speed')))
+        _dispatch(vl, 'temperature',
+                  _leading_number(gpu.find('temperature/gpu_temp')))
+        _dispatch(vl, 'memory', _memory_used_bytes(gpu), type_instance='used')
+
+
+collectd.register_read(read)
+