From be060aeabdcdc6f5e2e1b739d14993f83bb31cf6 Mon Sep 17 00:00:00 2001
From: Kerri Wait <kerri.wait@monash.edu>
Date: Tue, 6 Oct 2020 10:49:38 +1100
Subject: [PATCH] Add nvidia_smi plugin to telegraf config

Install the nvidia_smi input as a telegraf.d drop-in rendered by the
telegraf role, and disable the mlx hw_counters multifile inputs in
telegraf.conf.j2.

---
Reviewer notes (text between the '---' line and the first diff is not
part of the commit message):

* The nvidia_smi template now lives in roles/telegraf/templates/. The task
  uses the `template` action with a relative `src`, which is resolved
  against the role's templates/ directory, not files/.
* The hw_counters block is disabled with a Jinja comment ({# ... #})
  instead of '#' prefixes. With '#' prefixes Jinja still evaluated the
  if/for tags and rendered commented-out lines for every counter.
* Both new file ends now terminate with a newline.
* This copy was rebuilt from a whitespace-collapsed patch: indentation
  and blank context lines are reconstructed and the post-image blob ids
  on the `index` lines were not recomputed. Regenerate with
  `git format-patch` from a real tree before merging.
* TODO(review): the new task runs on every host that applies this role;
  confirm hosts without nvidia-smi tolerate the input, or gate the task
  with a `when:` condition.

 .../templates/inputs.nvidia_smi.conf.j2       |  7 +++++++
 roles/telegraf/tasks/main.yml                 | 15 +++++++++++++++
 roles/telegraf/templates/telegraf.conf.j2     |  4 ++--
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 roles/telegraf/templates/inputs.nvidia_smi.conf.j2

diff --git a/roles/telegraf/templates/inputs.nvidia_smi.conf.j2 b/roles/telegraf/templates/inputs.nvidia_smi.conf.j2
new file mode 100644
index 00000000..79768fe7
--- /dev/null
+++ b/roles/telegraf/templates/inputs.nvidia_smi.conf.j2
@@ -0,0 +1,7 @@
+# Pulls statistics from nvidia GPUs attached to the host
+[[inputs.nvidia_smi]]
+## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
+# bin_path = "/usr/bin/nvidia-smi"
+
+## Optional: timeout for GPU polling
+# timeout = "5s"
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index f1950914..9226fa68 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -70,3 +70,18 @@
   become_user: root
   tags:
     - configuration
+
+- name: Install nvidia-smi plugin
+  template:
+    src: inputs.nvidia_smi.conf.j2
+    dest: /etc/telegraf/telegraf.d/inputs.nvidia_smi.conf
+    owner: telegraf
+    group: telegraf
+    mode: '640'
+  notify:
+    - "restart telegraf"
+  become: true
+  become_user: root
+  tags:
+    - configuration
+    - gpu
diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2
index ae81c112..73338ecc 100644
--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -180,27 +180,27 @@
 influxdb_database="slurm"
 {% endif %}
 
-# Read mlx hardware counters
+{# Disabled: read mlx hardware counters
 {% if 'hw_counters' in ansible_local %}
 {% for interface in ansible_local['hw_counters'] %}
 [[inputs.multifile]]
   name_override = 'infiniband'
   base_dir = '/sys/class/infiniband'
   interval = '60s'
 
   [[inputs.multifile.tags]]
     device = '{{ interface }}'
     port = '1'
     type = 'hw_counters'
 
   {% for counter in ansible_local['hw_counters'][interface] | sort %}
 [[inputs.multifile.file]]
   file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
   conversion = 'int'
 
   {% endfor %}
 {% endfor %}
-{% endif %}
+{% endif %} #}
 
 {% if 'Karaage' in group_names %}
 
-- 
GitLab