# Patch summary (preamble text ahead of the first "diff --git" header).
#
# Purpose: stop hard-coding the InfiniBand device 'mlx5_0' in the Telegraf
# role and instead discover devices and their hw_counters on each host.
#
# roles/telegraf/files/hw_counters.fact
#   - Drops the fixed device_name = 'mlx5_0'.
#   - Iterates over os.listdir('/sys/class/infiniband') and, per device, stores
#     os.listdir('<prefix>/<device>/ports/1/hw_counters') under that device name.
#   - Still prints the resulting dict as JSON via render_data().
#   NOTE(review): os.listdir raises OSError when the directory does not exist,
#   so the script would exit with a traceback on a host without
#   /sys/class/infiniband, or for a device without ports/1/hw_counters --
#   confirm whether such hosts are in scope for this role.
#   NOTE(review): port 1 is still hard-coded in path_suffix.
#
# roles/telegraf/tasks/main.yml
#   - Adds three tasks: create /etc/ansible/facts.d, copy hw_counters.fact
#     there with mode 0755, then re-run `setup` so the new fact is available.
#   - Replaces one blank separator line with a lone '#', and removes a
#     trailing blank line at the end of the file.
#   NOTE(review): `mode: 0755` is unquoted; confirm the project's convention
#   for file modes (quoted strings are the commonly recommended form).
#
# roles/telegraf/templates/telegraf.conf.j2
#   - Removes the hand-written [[inputs.multifile.file]] entries for mlx5_0.
#   - Wraps the [[inputs.multifile]] section in
#     {% if 'hw_counters' in ansible_local %} and emits one section per
#     interface in ansible_local['hw_counters'], with one file entry per
#     counter name (sorted).
#   - Switches the emitted values from double-quoted to single-quoted strings.
#   NOTE(review): the guard only checks that the key is present; presumably the
#   value is always the dict printed by the fact script -- verify what
#   ansible_local['hw_counters'] contains when the fact script fails.
#
# NOTE(review): in this copy the lines of each hunk appear joined by single
# spaces; the diff text below is left exactly as found.
diff --git a/roles/telegraf/files/hw_counters.fact b/roles/telegraf/files/hw_counters.fact index 4abbd809187e035e72f456cc4657c7f09ada1231..2172d26ad666d97f82be46d1fa3b38650c763228 100644 --- a/roles/telegraf/files/hw_counters.fact +++ b/roles/telegraf/files/hw_counters.fact @@ -6,10 +6,12 @@ import os def render_data(data): return json.dumps(data) -device_name = 'mlx5_0' hw_counters = {} -path = '/sys/class/infiniband/{}/ports/1/hw_counters/'.format(device_name) +path_prefix = '/sys/class/infiniband' +path_suffix = 'ports/1/hw_counters' -hw_counters[device_name] = os.listdir(path) +for device_name in os.listdir(path_prefix): + path = os.path.join(path_prefix, device_name, path_suffix) + hw_counters[device_name] = os.listdir(path) print(render_data(hw_counters)) \ No newline at end of file diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml index 317c616ece475b95e056ac0d8fd715363af23665..7a5a032215c9b256b49c880a7377c8a7381cd58e 100644 --- a/roles/telegraf/tasks/main.yml +++ b/roles/telegraf/tasks/main.yml @@ -22,6 +22,22 @@ become: true become_user: root +- name: Create custom fact directory + file: + path: /etc/ansible/facts.d + state: directory + become: true + +- name: Insert custom fact file + copy: + src: files/hw_counters.fact + dest: /etc/ansible/facts.d/hw_counters.fact + mode: 0755 + become: true + +- name: Re-run setup to use custom facts + setup: ~ + - name: Make a directory for extra files file: state: directory @@ -47,7 +63,7 @@ dest: '/opt/telegraf/bin/telegraf_slurmstats.py' become: true become_user: root - +# - name: Install Telegraf config template: src: telegraf.conf.j2 @@ -61,4 +77,3 @@ become_user: root tags: - configuration - diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2 index 59d87c9c95cda9cffde5ddf7eb55587257b61b59..0605499d0312df74faa9309cc43adc172fb93f43 100644 --- a/roles/telegraf/templates/telegraf.conf.j2 +++ b/roles/telegraf/templates/telegraf.conf.j2 @@ -172,116 
+172,26 @@ {% endif %} # Read mlx hardware counters +{% if 'hw_counters' in ansible_local %} +{% for interface in ansible_local['hw_counters'] %} [[inputs.multifile]] - name_override = "infiniband" - base_dir = "/sys/class/infiniband" - interval = "60s" + name_override = 'infiniband' + base_dir = '/sys/class/infiniband' + interval = '60s' [[inputs.multifile.tags]] - device="mlx5_0" - port="1" - type="hw_counters" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/duplicate_request" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/lifespan" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/np_cnp_sent" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/out_of_buffer" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/out_of_sequence" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/packet_seq_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_cqe_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors" - conversion = "int" + device = '{{ interface }}' + port = '1' + type = 'hw_counters' - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_cqe_error" - conversion = "int" - - 
[[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_local_length_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_dct_connect" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_read_requests" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_write_requests" - conversion = "int" + {% for counter in ansible_local['hw_counters'][interface] | sort %} +[[inputs.multifile.file]] + file = '{{ interface }}/ports/1/hw_counters/{{ counter }}' + conversion = 'int' + {% endfor %} +{% endfor %} +{% endif %} ############################################################################### # SERVICE INPUTS # ###############################################################################