Skip to content
Snippets Groups Projects
Commit dfec522d authored by Kerri Wait's avatar Kerri Wait Committed by Andreas Hamacher
Browse files

Fix hw_counters for telegraf and enable ethtool telegraf plugin

parent f3782110
No related branches found
No related tags found
No related merge requests found
#!/bin/env python
import json
import os
def render_data(data):
    """Return *data* serialized as a JSON string."""
    return json.dumps(data)


def collect_hw_counters(prefix, suffix):
    """Map each device directory under *prefix* to its counter file names.

    For every entry in *prefix*, the names found in ``<prefix>/<entry>/<suffix>``
    are listed.  A missing or unreadable *prefix* yields an empty mapping, and
    a device without a readable counter directory is left out, so the script
    still prints valid JSON on hosts that expose no such counters.
    """
    try:
        device_names = os.listdir(prefix)
    except OSError:
        # No device tree at all (e.g. no InfiniBand hardware on this host).
        return {}

    counters = {}
    # os.listdir returns names in arbitrary order; sort so the emitted fact
    # is identical from one run to the next.
    for device_name in sorted(device_names):
        path = os.path.join(prefix, device_name, suffix)
        try:
            counters[device_name] = sorted(os.listdir(path))
        except OSError:
            # This device has no counter directory for the port; skip it
            # instead of aborting the whole fact.
            continue
    return counters


path_prefix = '/sys/class/infiniband'
path_suffix = 'ports/1/hw_counters'
hw_counters = collect_hw_counters(path_prefix, path_suffix)
print(render_data(hw_counters))
\ No newline at end of file
......@@ -22,6 +22,22 @@
become: true
become_user: root
# Custom ("local") facts are installed under /etc/ansible/facts.d; make sure
# the directory exists before copying the fact file into it.
- name: Create custom fact directory
file:
path: /etc/ansible/facts.d
state: directory
become: true
# Install the hw_counters fact script. Mode 0755 makes it executable.
# NOTE(review): presumably so Ansible runs it and parses its JSON output
# rather than reading it as a static file - confirm against the Ansible docs.
- name: Insert custom fact file
copy:
src: files/hw_counters.fact
dest: /etc/ansible/facts.d/hw_counters.fact
mode: 0755
become: true
# Re-gather only the ansible_local facts so the fact installed above is
# available to later tasks (the Telegraf template reads
# ansible_local['hw_counters']).
- name: reload ansible_local
setup: filter=ansible_local
- name: Make a directory for extra files
file:
state: directory
......@@ -47,7 +63,7 @@
dest: '/opt/telegraf/bin/telegraf_slurmstats.py'
become: true
become_user: root
#
- name: Install Telegraf config
template:
src: telegraf.conf.j2
......@@ -61,4 +77,3 @@
become_user: root
tags:
- configuration
......@@ -119,6 +119,15 @@
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true
# Returns ethtool statistics for given interfaces
[[inputs.ethtool]]
interval = "60s"
# List of interfaces to pull metrics for
# interface_include = ["mlx0", "p1p1"]
# List of interfaces to ignore when pulling metrics.
interface_exclude = ["eth0", "eth1", "lo", "virbr0", "virbr0-nic"]
# Read metrics about the number of files in /home
[[inputs.filecount]]
directories = ["/home"]
......@@ -126,6 +135,12 @@
regular_only = false
interval = "60s"
# Collect statistics about the Telegraf agent itself
[[inputs.internal]]
## If true, collect telegraf memory stats.
# collect_memstats = true
interval = "60s"
# Read metrics about memory usage
[[inputs.mem]]
# no configuration
......@@ -166,116 +181,26 @@
{% endif %}
# Read Mellanox (mlx) InfiniBand hardware counters from /sys/class/infiniband
{% if 'hw_counters' in ansible_local %}
{% for interface in ansible_local['hw_counters'] %}
[[inputs.multifile]]
name_override = "infiniband"
base_dir = "/sys/class/infiniband"
interval = "60s"
name_override = 'infiniband'
base_dir = '/sys/class/infiniband'
interval = '60s'
[[inputs.multifile.tags]]
device="mlx5_0"
port="1"
type="hw_counters"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/duplicate_request"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/lifespan"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/out_of_buffer"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_cqe_error"
conversion = "int"
device = '{{ interface }}'
port = '1'
type = 'hw_counters'
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_cqe_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_local_length_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_dct_connect"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_read_requests"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_write_requests"
conversion = "int"
{% for counter in ansible_local['hw_counters'][interface] | sort %}
[[inputs.multifile.file]]
file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
conversion = 'int'
{% endfor %}
{% endfor %}
{% endif %}
###############################################################################
# SERVICE INPUTS #
###############################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment