Skip to content
Snippets Groups Projects
Commit 71b1c79d authored by Chris Hines's avatar Chris Hines
Browse files

collectd monitoring changes ... monitor additional things in proc, don't...

collectd monitoring changes: monitor additional things in /proc, don't monitor slabinfo as it's noisy, monitor GPUs
parent bf7146ab
No related branches found
No related tags found
1 merge request!134collectd monitoring changes ... monitor additional things in proc, don't monitor…
......@@ -8,9 +8,32 @@
template: src=collectd.conf.j2 dest=/etc/collectd.d/collectd.conf mode=0600 owner=root group=root
become: true
become_user: root
register: configchange
# Directory that holds the Python modules loaded by collectd's python plugin.
- name: create directory for python collectd components
  file:
    path: /etc/collectd.python
    state: directory
    owner: root
    group: root
    mode: "755"
  become: true
  become_user: root
# Render the buddyinfo collectd module into the python module directory.
- name: install buddyinfo python script
  template:
    src: buddyinfo.py.j2
    dest: /etc/collectd.python/buddyinfo.py
    mode: "0700"
    owner: root
    group: root
  become: true
  become_user: root
# Render the GPU collectd module; only done when `cudamonitor` is defined.
- name: install cuda_collectd python script
  template:
    src: cuda_collectd.py.j2
    dest: /etc/collectd.python/cuda_collectd.py
    mode: "0700"
    owner: root
    group: root
  become: true
  become_user: root
  when: cudamonitor is defined
# Make sure collectd is running now and enabled at boot.
- name: start collectd service
  service:
    name: collectd
    state: started
    enabled: true
  become: true
  become_user: root
# Restart collectd only when the rendered collectd.conf changed
# (`configchange` is registered by the template task above).
# NOTE(review): changes to the python scripts in /etc/collectd.python do not
# register a result here, so they will not trigger a restart -- confirm whether
# that is intended.
- name: restart collectd service
  service: name=collectd state=restarted enabled=true
  become: true
  become_user: root
  # Read the registered result's `changed` key directly: using the `changed`
  # test as a filter (`configchange | changed`) was deprecated in Ansible 2.5
  # and removed in 2.9.
  when: configchange.changed
......@@ -159,7 +159,7 @@ LoadPlugin memory
#LoadPlugin powerdns
LoadPlugin processes
#LoadPlugin protocols
#LoadPlugin python
LoadPlugin python
#LoadPlugin redis
#LoadPlugin rrdcached
#LoadPlugin rrdtool
......@@ -903,8 +903,17 @@ LoadPlugin users
# IgnoreSelected false
#</Plugin>
#<Plugin python>
# ModulePath "/path/to/your/python/modules"
# Python plugin: modules are looked up in /etc/collectd.python/.
<Plugin python>
ModulePath "/etc/collectd.python/"
# buddyinfo is always imported; its Module block carries no options.
Import "buddyinfo"
<Module buddyinfo>
</Module>
# cuda_collectd is imported only when the template variable `cudamonitor` is defined.
{% if cudamonitor is defined %}
Import "cuda_collectd"
<Module cuda_collectd>
</Module>
{% endif %}
</Plugin>
# LogTraces true
# Interactive true
# Import "spam"
......@@ -1050,22 +1059,95 @@ LoadPlugin users
#</Plugin>
<Plugin table>
<Table "/proc/slabinfo">
Instance "slabinfo"
Separator " "
<Result>
Type gauge
InstancePrefix "active_objs"
InstancesFrom 0
ValuesFrom 1
</Result>
<Result>
Type gauge
InstancePrefix "objperslab"
InstancesFrom 0
ValuesFrom 4
</Result>
</Table>
# khugepaged sysfs files: each file below is read as its own table, and
# column 0 is dispatched as a gauge under the "khugepaged" instance with the
# file name as the instance prefix.
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "alloc_sleep_millisecs"
ValuesFrom 0
</Result>
</Table>
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/defrag">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "defrag"
ValuesFrom 0
</Result>
</Table>
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "full_scans"
ValuesFrom 0
</Result>
</Table>
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "max_ptes_none"
ValuesFrom 0
</Result>
</Table>
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "pages_collapsed"
ValuesFrom 0
</Result>
</Table>
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "pages_to_scan"
ValuesFrom 0
</Result>
</Table>
<Table "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs">
Instance "khugepaged"
Separator " "
<Result>
Type gauge
InstancePrefix "scan_sleep_millisecs"
ValuesFrom 0
</Result>
</Table>
# /proc/vmstat: one gauge per line, with column 0 supplying the instance name
# and column 1 the value.
<Table "/proc/vmstat">
Instance "vmstat"
Separator " "
<Result>
Type gauge
InstancePrefix "vmstat"
InstancesFrom 0
ValuesFrom 1
</Result>
</Table>
# /proc/slabinfo collection is disabled because it is too noisy.
# <Table "/proc/slabinfo">
# Instance "slabinfo"
# Separator " "
# <Result>
# Type gauge
# InstancePrefix "active_objs"
# InstancesFrom 0
# ValuesFrom 1
# </Result>
# <Result>
# Type gauge
# InstancePrefix "objperslab"
# InstancesFrom 0
# ValuesFrom 4
# </Result>
# </Table>
</Plugin>
#<Plugin tail>
......
#!/usr/bin/env python
import collectd
import subprocess
import xml.etree.ElementTree as ET
# Bytes per unit suffix found after a memory figure. nvidia-smi is expected to
# report memory in MiB (TODO confirm for the deployed driver version).
_CUDA_UNIT_BYTES = {
    'B': 1.0,
    'KB': 1e3, 'MB': 1e6, 'GB': 1e9,
    'KiB': 1024.0, 'MiB': 1024.0 ** 2, 'GiB': 1024.0 ** 3,
}

# Multiplier used when the unit suffix is missing or unrecognised; this is the
# factor the plugin has always applied.
_CUDA_DEFAULT_MEMORY_SCALE = 1e6


def _cuda_reading(gpu, path):
    """Return (value, unit) parsed from the element at `path`, or None.

    None is returned when the element is absent, empty, or its first token is
    not a number (for example "N/A"), so the caller can skip that metric.
    """
    node = gpu.find(path)
    if node is None or not node.text:
        return None
    tokens = node.text.split()
    if not tokens:
        return None
    try:
        value = float(tokens[0])
    except ValueError:
        return None
    unit = tokens[1] if len(tokens) > 1 else ''
    return value, unit


def _cuda_dispatch(vl, **kwargs):
    """Dispatch one value list; log and continue if collectd rejects it."""
    try:
        vl.dispatch(**kwargs)
    except Exception as err:
        # Best effort: one rejected metric must not stop the others.
        collectd.warning('cuda_collectd: dispatch of %r failed: %s'
                         % (kwargs.get('type'), err))


def read(data=None):
    """collectd read callback: publish per-GPU metrics from `nvidia-smi -q -x`.

    For every <gpu> element in the XML output this dispatches, under plugin
    'cuda' and plugin instance 'cuda-<gpu id>':
      * fanspeed    -- first number in <fan_speed>
      * temperature -- first number in <temperature>/<gpu_temp>
      * memory/used -- <memory_usage>/<used> converted to bytes

    A metric whose element is missing or non-numeric is skipped silently.

    `data` is accepted for the collectd callback signature and is not used.

    Errors from running nvidia-smi or parsing its output are not caught here
    and propagate to the caller.
    """
    vl = collectd.Values(type='gauge')
    vl.plugin = 'cuda'

    out = subprocess.check_output(['nvidia-smi', '-q', '-x'])
    root = ET.fromstring(out)

    for gpu in root.iter('gpu'):
        vl.plugin_instance = 'cuda-%s' % (gpu.attrib['id'])

        fan = _cuda_reading(gpu, 'fan_speed')
        if fan is not None:
            _cuda_dispatch(vl, type='fanspeed', values=[fan[0]])

        temp = _cuda_reading(gpu, 'temperature/gpu_temp')
        if temp is not None:
            _cuda_dispatch(vl, type='temperature', values=[temp[0]])

        used = _cuda_reading(gpu, 'memory_usage/used')
        if used is not None:
            amount, unit = used
            scale = _CUDA_UNIT_BYTES.get(unit, _CUDA_DEFAULT_MEMORY_SCALE)
            _cuda_dispatch(vl, type='memory', type_instance='used',
                           values=[scale * amount])
# Register read() as a collectd read callback.
collectd.register_read(read)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment