From 845381800c3522cf1b702eb2459d5a7d4253b432 Mon Sep 17 00:00:00 2001
From: Kerri Wait <kerri.wait@monash.edu>
Date: Wed, 7 Oct 2020 14:55:01 +1100
Subject: [PATCH] Fix deployment of telegraf for monitoring mlx hw counters

---
 roles/telegraf/tasks/main.yml                 | 17 +++++++++++++-
 .../templates/inputs.multifile_mlx.conf.j2    | 22 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 roles/telegraf/templates/inputs.multifile_mlx.conf.j2

diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index 9226fa68..5ca8af1f 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -71,6 +71,20 @@
   tags:
     - configuration
 
+- name: Install multifile plugin for mlx hw_counters
+  template:  # renders the role template below to the dest path on the target host
+    src: inputs.multifile_mlx.conf.j2
+    dest: /etc/telegraf/telegraf.d/inputs.multifile_mlx.conf
+    owner: telegraf
+    group: telegraf
+    mode: '640'  # quoted so Ansible converts it as octal: owner rw, group r, others none
+  notify:
+    - "restart telegraf"  # handler is defined outside this hunk; notified only when the file changes
+  become: true
+  become_user: root
+  tags:
+    - configuration
+
 - name: Install nvidia-smi plugin
   template:
     src: inputs.nvidia_smi.conf.j2
@@ -84,4 +98,5 @@
   become_user: root
   tags:
     - configuration
-    - gpu
\ No newline at end of file
+    - gpu
+  when: "'VisNodes' in group_names"
\ No newline at end of file
diff --git a/roles/telegraf/templates/inputs.multifile_mlx.conf.j2 b/roles/telegraf/templates/inputs.multifile_mlx.conf.j2
new file mode 100644
index 00000000..c6adbfd9
--- /dev/null
+++ b/roles/telegraf/templates/inputs.multifile_mlx.conf.j2
@@ -0,0 +1,22 @@
+# Read mlx hardware counters (hwcounterlist maps each infiniband device name to its hw_counters file names)
+{% if hwcounterlist is defined and hwcounterlist %}
+{% for interface in hwcounterlist %}
+[[inputs.multifile]]
+  name_override = 'infiniband'
+  base_dir = '/sys/class/infiniband'
+  interval = '60s'
+{# Per-plugin tags are a plain TOML table, so the header takes single brackets, not double. #}
+  [inputs.multifile.tags]
+    device = '{{ interface }}'
+    port = '1'
+    type = 'hw_counters'
+{# One file entry per counter; the sort filter keeps the rendered order deterministic. #}
+  {% for counter in hwcounterlist[interface] | sort %}
+[[inputs.multifile.file]]
+    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
+    conversion = 'int'
+
+  {% endfor %}
+{% endfor %}
+{% endif %}
+
-- 
GitLab