From 521a20139190276dad4d609bbd4143e46d3daf4d Mon Sep 17 00:00:00 2001 From: Kerri Wait <kerri.wait@monash.edu> Date: Wed, 16 Sep 2020 18:45:04 +1000 Subject: [PATCH] Update telegraf role to intelligently populate hw_counters plugin config --- roles/telegraf/files/hw_counters.fact | 8 +- roles/telegraf/tasks/main.yml | 19 +++- roles/telegraf/templates/telegraf.conf.j2 | 120 +++------------------- 3 files changed, 37 insertions(+), 110 deletions(-) diff --git a/roles/telegraf/files/hw_counters.fact b/roles/telegraf/files/hw_counters.fact index 4abbd809..2172d26a 100644 --- a/roles/telegraf/files/hw_counters.fact +++ b/roles/telegraf/files/hw_counters.fact @@ -6,10 +6,12 @@ import os def render_data(data): return json.dumps(data) -device_name = 'mlx5_0' hw_counters = {} -path = '/sys/class/infiniband/{}/ports/1/hw_counters/'.format(device_name) +path_prefix = '/sys/class/infiniband' +path_suffix = 'ports/1/hw_counters' -hw_counters[device_name] = os.listdir(path) +for device_name in os.listdir(path_prefix): + path = os.path.join(path_prefix, device_name, path_suffix) + hw_counters[device_name] = os.listdir(path) print(render_data(hw_counters)) \ No newline at end of file diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml index 317c616e..7a5a0322 100644 --- a/roles/telegraf/tasks/main.yml +++ b/roles/telegraf/tasks/main.yml @@ -22,6 +22,22 @@ become: true become_user: root +- name: Create custom fact directory + file: + path: /etc/ansible/facts.d + state: directory + become: true + +- name: Insert custom fact file + copy: + src: files/hw_counters.fact + dest: /etc/ansible/facts.d/hw_counters.fact + mode: 0755 + become: true + +- name: Re-run setup to use custom facts + setup: ~ + - name: Make a directory for extra files file: state: directory @@ -47,7 +63,7 @@ dest: '/opt/telegraf/bin/telegraf_slurmstats.py' become: true become_user: root - +# - name: Install Telegraf config template: src: telegraf.conf.j2 @@ -61,4 +77,3 @@ become_user: root tags: - configuration - diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2 index 59d87c9c..0605499d 100644 --- a/roles/telegraf/templates/telegraf.conf.j2 +++ b/roles/telegraf/templates/telegraf.conf.j2 @@ -172,116 +172,26 @@ {% endif %} # Read mlx hardware counters +{% if 'hw_counters' in ansible_local %} +{% for interface in ansible_local['hw_counters'] %} [[inputs.multifile]] - name_override = "infiniband" - base_dir = "/sys/class/infiniband" - interval = "60s" + name_override = 'infiniband' + base_dir = '/sys/class/infiniband' + interval = '60s' [[inputs.multifile.tags]] - device="mlx5_0" - port="1" - type="hw_counters" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/duplicate_request" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/lifespan" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/np_cnp_sent" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/out_of_buffer" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/out_of_sequence" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/packet_seq_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_cqe_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors" - conversion = "int" + device = '{{ interface }}' + port = '1' + type = 'hw_counters' - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_cqe_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_local_length_error" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_dct_connect" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_read_requests" - conversion = "int" - - [[inputs.multifile.file]] - file = "mlx5_0/ports/1/hw_counters/rx_write_requests" - conversion = "int" + {% for counter in ansible_local['hw_counters'][interface] | sort %} +[[inputs.multifile.file]] + file = '{{ interface }}/ports/1/hw_counters/{{ counter }}' + conversion = 'int' + {% endfor %} +{% endfor %} +{% endif %} ############################################################################### # SERVICE INPUTS # ############################################################################### -- GitLab