From 845381800c3522cf1b702eb2459d5a7d4253b432 Mon Sep 17 00:00:00 2001
From: Kerri Wait <kerri.wait@monash.edu>
Date: Wed, 7 Oct 2020 14:55:01 +1100
Subject: [PATCH] Fix deployment of telegraf for monitoring mlx hw counters

Add a task that templates inputs.multifile_mlx.conf into
/etc/telegraf/telegraf.d and notifies "restart telegraf". The template
emits one [[inputs.multifile]] section per interface key found in
hwcounterlist, with one [[inputs.multifile.file]] entry per counter
listed for that interface (sorted), read from
<interface>/ports/1/hw_counters/ below /sys/class/infiniband.

The template renders only its header comment when hwcounterlist is
undefined or empty, so hosts without that variable do not fail the
play. Per-input tags use the single-bracket table form
([inputs.multifile.tags]) that the Telegraf configuration
documentation describes.

Also run the nvidia-smi plugin task only on hosts in the VisNodes
group, and terminate main.yml with a newline.

---
 roles/telegraf/tasks/main.yml                 | 17 +++++++++++++-
 .../templates/inputs.multifile_mlx.conf.j2    | 22 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 roles/telegraf/templates/inputs.multifile_mlx.conf.j2

diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index 9226fa68..5ca8af1f 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -71,6 +71,20 @@
   tags:
     - configuration
 
+- name: Install multifile plugin for mlx hw_counters
+  template:
+    src: inputs.multifile_mlx.conf.j2
+    dest: /etc/telegraf/telegraf.d/inputs.multifile_mlx.conf
+    owner: telegraf
+    group: telegraf
+    mode: '640'
+  notify:
+    - "restart telegraf"
+  become: true
+  become_user: root
+  tags:
+    - configuration
+
 - name: Install nvidia-smi plugin
   template:
     src: inputs.nvidia_smi.conf.j2
@@ -84,4 +98,5 @@
   become_user: root
   tags:
     - configuration
-    - gpu
\ No newline at end of file
+    - gpu
+  when: "'VisNodes' in group_names"
diff --git a/roles/telegraf/templates/inputs.multifile_mlx.conf.j2 b/roles/telegraf/templates/inputs.multifile_mlx.conf.j2
new file mode 100644
index 00000000..c6adbfd9
--- /dev/null
+++ b/roles/telegraf/templates/inputs.multifile_mlx.conf.j2
@@ -0,0 +1,22 @@
+# Read mlx hardware counters (one multifile input per interface key in hwcounterlist)
+{% if hwcounterlist is defined and hwcounterlist %}
+{% for interface in hwcounterlist %}
+[[inputs.multifile]]
+  name_override = 'infiniband'
+  base_dir = '/sys/class/infiniband'
+  interval = '60s'
+
+  [inputs.multifile.tags]
+    device = '{{ interface }}'
+    port = '1'
+    type = 'hw_counters'
+
+  {% for counter in hwcounterlist[interface] | sort %}
+[[inputs.multifile.file]]
+    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
+    conversion = 'int'
+
+  {% endfor %}
+{% endfor %}
+{% endif %}
+
-- 
GitLab