diff --git a/roles/telegraf/files/hw_counters.fact b/roles/telegraf/files/hw_counters.fact
new file mode 100644
index 0000000000000000000000000000000000000000..4011a8f7ad5a6fbfab334e116c7b2ffe134cff4d
--- /dev/null
+++ b/roles/telegraf/files/hw_counters.fact
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""Ansible local fact: InfiniBand hardware counters available per device.
+
+Prints a JSON object that maps each device under /sys/class/infiniband to
+the sorted names found in its ports/1/hw_counters directory. Hosts without
+InfiniBand print an empty object, and devices without that directory are
+skipped, so the fact is always a dictionary.
+"""
+
+import json
+import os
+
+PATH_PREFIX = '/sys/class/infiniband'
+PATH_SUFFIX = 'ports/1/hw_counters'
+
+
+def render_data(data):
+    """Return data serialised as a JSON string."""
+    return json.dumps(data)
+
+
+def list_dir(path):
+    """Return the sorted entries of path, or None if it cannot be listed."""
+    try:
+        return sorted(os.listdir(path))
+    except OSError:
+        # A missing or unreadable sysfs directory is expected on hosts or
+        # devices without hardware counters; it must not abort the script.
+        return None
+
+
+def collect_hw_counters(path_prefix=PATH_PREFIX, path_suffix=PATH_SUFFIX):
+    """Return a dict mapping device name to its list of counter names."""
+    hw_counters = {}
+    for device_name in list_dir(path_prefix) or []:
+        path = os.path.join(path_prefix, device_name, path_suffix)
+        counters = list_dir(path)
+        if counters is not None:
+            hw_counters[device_name] = counters
+    return hw_counters
+
+
+if __name__ == '__main__':
+    print(render_data(collect_hw_counters()))
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index 317c616ece475b95e056ac0d8fd715363af23665..234ab230f8cd61708f12362d952ce94e36a3c70c 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -22,6 +22,23 @@
   become: true
   become_user: root
 
+- name: Create custom fact directory
+  file:
+    path: /etc/ansible/facts.d
+    state: directory
+  become: true
+
+- name: Insert custom fact file
+  copy:
+    src: files/hw_counters.fact
+    dest: /etc/ansible/facts.d/hw_counters.fact
+    # Executable, so Ansible runs the script and parses its JSON output.
+    mode: '0755'
+  become: true
+
+- name: reload ansible_local
+  setup: filter=ansible_local
+
 - name: Make a directory for extra files
   file:
     state: directory
@@ -61,4 +78,3 @@
   become_user: root
   tags:
     - configuration
-
diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2
index d40b70bcaf20deecaaa26e05aaeeea5e32323dad..aa6233e9f8a038e92f2d0baf0a5967a5c85df5c5 100644
--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -119,6 +119,15 @@
   # Uncomment the following line if you do not need disk serial numbers.
   # skip_serial_number = true
 
+# Returns ethtool statistics for given interfaces
+[[inputs.ethtool]]
+  interval = "60s"
+  # List of interfaces to pull metrics for
+  # interface_include = ["mlx0", "p1p1"]
+
+  # List of interfaces to ignore when pulling metrics.
+  interface_exclude = ["eth0", "eth1", "lo", "virbr0", "virbr0-nic"]
+
 # Read metrics about the number of files in /home
 [[inputs.filecount]]
   directories = ["/home"]
@@ -126,6 +135,12 @@
   regular_only = false
   interval = "60s"
 
+# Collect statistics about itself
+[[inputs.internal]]
+  ## If true, collect telegraf memory stats.
+  # collect_memstats = true
+  interval = "60s"
+
 # Read metrics about memory usage
 [[inputs.mem]]
   # no configuration
@@ -166,116 +181,27 @@
 {% endif %}
 
 # Read mlx hardware counters
+{# The fact is an error string, not a dict, when the fact script failed. #}
+{% if ansible_local.get('hw_counters') is mapping %}
+{% for interface in ansible_local['hw_counters'] | sort %}
 [[inputs.multifile]]
-  name_override = "infiniband"
-  base_dir = "/sys/class/infiniband"
-  interval = "60s"
+  name_override = 'infiniband'
+  base_dir = '/sys/class/infiniband'
+  interval = '60s'
 
   [[inputs.multifile.tags]]
-    device="mlx5_0"
-    port="1"
-    type="hw_counters"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/duplicate_request"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/lifespan"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/out_of_buffer"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_cqe_error"
-    conversion = "int"
+    device = '{{ interface }}'
+    port = '1'
+    type = 'hw_counters'
 
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_cqe_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_local_length_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_dct_connect"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_read_requests"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_write_requests"
-    conversion = "int"
+  {% for counter in ansible_local['hw_counters'][interface] | sort %}
+[[inputs.multifile.file]]
+    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
+    conversion = 'int'
+  {% endfor %}
+{% endfor %}
+{% endif %}
 
 ###############################################################################
 #                            SERVICE INPUTS                                   #
 ###############################################################################