diff --git a/roles/telegraf/templates/inputs.nvidia_smi.conf.j2 b/roles/telegraf/templates/inputs.nvidia_smi.conf.j2
new file mode 100644
index 0000000000000000000000000000000000000000..79768fe73a5ec4ffa43f3c2446ceba949d1f9df8
--- /dev/null
+++ b/roles/telegraf/templates/inputs.nvidia_smi.conf.j2
@@ -0,0 +1,7 @@
+# Pulls statistics from NVIDIA GPUs attached to the host
+[[inputs.nvidia_smi]]
+## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
+# bin_path = "/usr/bin/nvidia-smi"
+
+## Optional: timeout for GPU polling
+# timeout = "5s"
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index f19509146bb5caa75bcf05ea4f2942ad76cfab38..9226fa6871d1461340bc519635bdfa11cde60c4e 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -70,3 +70,22 @@
   become_user: root
   tags:
     - configuration
+
+# Install the nvidia_smi input as a telegraf.d drop-in. The source is rendered
+# by the template module, so it lives under roles/telegraf/templates/.
+# NOTE(review): nothing here limits the task to GPU hosts; consider a `when:`
+# guard if this role also runs on hosts without nvidia-smi -- confirm.
+- name: Install nvidia-smi plugin
+  template:
+    src: inputs.nvidia_smi.conf.j2
+    dest: /etc/telegraf/telegraf.d/inputs.nvidia_smi.conf
+    owner: telegraf
+    group: telegraf
+    mode: '0640'
+  notify:
+    - "restart telegraf"
+  become: true
+  become_user: root
+  tags:
+    - configuration
+    - gpu
diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2
index ae81c1128da42c257246c9b0d50041559a66fa0b..73338ecc4cd8a87fdeee03eb8ce17c3928f4a741 100644
--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -180,27 +180,32 @@
 influxdb_database="slurm"
 {% endif %}
 
-# Read mlx hardware counters
+# Read mlx hardware counters (disabled)
+{#
+  Disabled with a Jinja comment rather than by prefixing every line with a
+  hash: Jinja still evaluates block tags on hash-prefixed lines, so the loops
+  would keep running and emit commented-out stanzas into the rendered file.
 {% if 'hw_counters' in ansible_local %}
 {% for interface in ansible_local['hw_counters'] %}
 [[inputs.multifile]]
   name_override = 'infiniband'
   base_dir = '/sys/class/infiniband'
   interval = '60s'
 
   [[inputs.multifile.tags]]
     device = '{{ interface }}'
     port = '1'
     type = 'hw_counters'
 
   {% for counter in ansible_local['hw_counters'][interface] | sort %}
 [[inputs.multifile.file]]
   file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
   conversion = 'int'
 
   {% endfor %}
 {% endfor %}
 {% endif %}
+#}
 
 
 {% if 'Karaage' in group_names %}