From 521a20139190276dad4d609bbd4143e46d3daf4d Mon Sep 17 00:00:00 2001
From: Kerri Wait <kerri.wait@monash.edu>
Date: Wed, 16 Sep 2020 18:45:04 +1000
Subject: [PATCH] Update telegraf role to intelligently populate hw_counters
 plugin config

---
 roles/telegraf/files/hw_counters.fact     |   8 +-
 roles/telegraf/tasks/main.yml             |  19 +++-
 roles/telegraf/templates/telegraf.conf.j2 | 120 +++-------------------
 3 files changed, 37 insertions(+), 110 deletions(-)

diff --git a/roles/telegraf/files/hw_counters.fact b/roles/telegraf/files/hw_counters.fact
index 4abbd809..2172d26a 100644
--- a/roles/telegraf/files/hw_counters.fact
+++ b/roles/telegraf/files/hw_counters.fact
@@ -6,10 +6,17 @@ import os
 def render_data(data):
     return json.dumps(data)
 
-device_name = 'mlx5_0'
 hw_counters = {}
-path = '/sys/class/infiniband/{}/ports/1/hw_counters/'.format(device_name)
+path_prefix = '/sys/class/infiniband'
+path_suffix = 'ports/1/hw_counters'
 
-hw_counters[device_name] = os.listdir(path)
+# Hosts without InfiniBand hardware have no /sys/class/infiniband; emit an
+# empty fact there instead of crashing, so the output is always valid JSON.
+if os.path.isdir(path_prefix):
+    for device_name in sorted(os.listdir(path_prefix)):
+        path = os.path.join(path_prefix, device_name, path_suffix)
+        # Skip devices that do not expose hw_counters on port 1.
+        if os.path.isdir(path):
+            hw_counters[device_name] = sorted(os.listdir(path))
 
 print(render_data(hw_counters))
\ No newline at end of file
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index 317c616e..7a5a0322 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -22,6 +22,22 @@
   become: true
   become_user: root
 
+- name: Create custom fact directory
+  file:
+    path: /etc/ansible/facts.d  # directory the fact file below is installed into
+    state: directory
+  become: true
+
+- name: Insert custom fact file
+  copy:
+    src: files/hw_counters.fact
+    dest: /etc/ansible/facts.d/hw_counters.fact
+    mode: 0755  # executable: hw_counters.fact is a script that prints JSON
+  become: true
+
+- name: Re-run setup to use custom facts
+  setup: ~  # refresh facts so the template can read ansible_local['hw_counters']
+
 - name: Make a directory for extra files
   file:
     state: directory
 - name: Make a directory for extra files
   file:
     state: directory
@@ -47,7 +63,7 @@
     dest: '/opt/telegraf/bin/telegraf_slurmstats.py'
   become: true
   become_user: root
-
+# Render the Telegraf configuration from the telegraf.conf.j2 template.
 - name: Install Telegraf config
   template:
     src: telegraf.conf.j2
@@ -61,4 +77,3 @@
   become_user: root
   tags:
     - configuration
-
diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2
index 59d87c9c..0605499d 100644
--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -172,116 +172,27 @@
 {% endif %}
 
 # Read mlx hardware counters
+{# Render only when the hw_counters fact is a device -> counters mapping; a failed fact script yields a string. #}
+{% if 'hw_counters' in ansible_local and ansible_local['hw_counters'] is mapping %}
+{% for interface in ansible_local['hw_counters'] | sort %}
 [[inputs.multifile]]
-  name_override = "infiniband"
-  base_dir = "/sys/class/infiniband"
-  interval = "60s"
+  name_override = 'infiniband'
+  base_dir = '/sys/class/infiniband'
+  interval = '60s'
 
   [[inputs.multifile.tags]]
-    device="mlx5_0"
-    port="1"
-    type="hw_counters"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/duplicate_request"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/lifespan"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/out_of_buffer"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_cqe_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors"
-    conversion = "int"
+    device = '{{ interface }}'
+    port = '1'
+    type = 'hw_counters'
 
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_cqe_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_local_length_error"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-    file = "mlx5_0/ports/1/hw_counters/rx_dct_connect"
-    conversion = "int"
-
-  [[inputs.multifile.file]]
-  file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
-  conversion = "int"
-
-  [[inputs.multifile.file]]
-  file = "mlx5_0/ports/1/hw_counters/rx_read_requests"
-  conversion = "int"
-
-  [[inputs.multifile.file]]
-  file = "mlx5_0/ports/1/hw_counters/rx_write_requests"
-  conversion = "int"
+  {% for counter in ansible_local['hw_counters'][interface] | sort %}
+[[inputs.multifile.file]]
+    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
+    conversion = 'int'
 
+  {% endfor %}
+{% endfor %}
+{% endif %}
 ###############################################################################
 #                              SERVICE INPUTS                                 #
 ###############################################################################
-- 
GitLab