Skip to content
Snippets Groups Projects
Commit 521a2013 authored by Kerri Wait's avatar Kerri Wait
Browse files

Update telegraf role to intelligently populate hw_counters plugin config

parent 6949098d
No related branches found
No related tags found
9 merge requests: !399 Capture extra NFS stats, !393 Hotfix: monitor NFS GETATTR stats via telegraf, !392 Temporarily disable inputs.filecount in telegraf, !389 Update telegraf config to ignore more ethX interfaces in ethtool plugin, !388 Fix the telegraf config for mlx hw counters to get rid of errors in logs, !387 Telegraf 1.15 nvidia_smi fix, !386 Telegraf 1.15 nvidia_smi plugin, !380 Telegraf monitoring for Karaage (hpc.erc.monash.edu.au), !374 Fix hw_counters for telegraf and enable ethtool telegraf plugin
This commit is part of merge request !380. Comments created here will be created in the context of that merge request.
......@@ -6,10 +6,12 @@ import os
def render_data(data):
    """Return *data* serialized as a JSON string."""
    return json.dumps(data)


# Sysfs directory that contains one sub-directory per InfiniBand device.
path_prefix = '/sys/class/infiniband'
# Relative location of the counter files below each device (port 1 only).
path_suffix = 'ports/1/hw_counters'


def collect_hw_counters(prefix=path_prefix, suffix=path_suffix):
    """Map every device under *prefix* to the sorted names of its counters.

    Args:
        prefix: Directory holding one sub-directory per device.
        suffix: Path, relative to a device directory, of the directory
            whose entries are the counter names.

    Returns:
        A dict ``{device_name: [counter_name, ...]}``. Devices whose
        counter directory cannot be listed are left out; if *prefix*
        itself cannot be listed the dict is empty.
    """
    counters = {}
    try:
        devices = os.listdir(prefix)
    except OSError:
        # Nothing to enumerate on this host: emit an empty (but valid)
        # JSON object instead of failing with a traceback.
        return counters
    for device_name in sorted(devices):
        path = os.path.join(prefix, device_name, suffix)
        try:
            # Sorted so that the emitted JSON is stable between runs.
            counters[device_name] = sorted(os.listdir(path))
        except OSError:
            # This device does not expose the counter directory; skip it
            # rather than aborting the whole collection.
            continue
    return counters


hw_counters = collect_hw_counters()
print(render_data(hw_counters))
\ No newline at end of file
......@@ -22,6 +22,22 @@
become: true
become_user: root
# Directory that receives the custom (local) fact file installed by this role.
- name: Create custom fact directory
  file:
    path: /etc/ansible/facts.d
    state: directory
    # Explicit, quoted mode so the result does not depend on the remote umask.
    mode: '0755'
  become: true
# Install the hw_counters fact script; it is copied with the executable bit set.
- name: Insert custom fact file
  copy:
    src: files/hw_counters.fact
    dest: /etc/ansible/facts.d/hw_counters.fact
    # Quoted so the mode is always passed as an octal string.
    mode: '0755'
  become: true
# Gather facts again so the fact file installed above is available to later tasks.
- name: Re-run setup to use custom facts
  setup: ~
- name: Make a directory for extra files
file:
state: directory
......@@ -47,7 +63,7 @@
dest: '/opt/telegraf/bin/telegraf_slurmstats.py'
become: true
become_user: root
#
- name: Install Telegraf config
template:
src: telegraf.conf.j2
......@@ -61,4 +77,3 @@
become_user: root
tags:
- configuration
......@@ -172,116 +172,26 @@
{% endif %}
# Read mlx hardware counters
# One [[inputs.multifile]] instance is rendered per device listed in the
# hw_counters local fact, with one file entry per counter name of that device.
{# Render only when the fact is a device -> counter-list mapping (it may be something else, e.g. an error string, if the fact could not be loaded). #}
{% if 'hw_counters' in ansible_local and ansible_local['hw_counters'] is mapping %}
{% for interface in ansible_local['hw_counters'] | sort %}
[[inputs.multifile]]
  name_override = 'infiniband'
  base_dir = '/sys/class/infiniband'
  interval = '60s'
{# Tags are declared as a plain table, not an array of tables. #}
  [inputs.multifile.tags]
    device = '{{ interface }}'
    port = '1'
    type = 'hw_counters'
{% for counter in ansible_local['hw_counters'][interface] | sort %}
  [[inputs.multifile.file]]
    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
    conversion = 'int'
{% endfor %}
{% endfor %}
{% endif %}
###############################################################################
# SERVICE INPUTS #
###############################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment