Add an nvidia_smi Telegraf input drop-in, install it from the telegraf
role, and remove the inline InfiniBand hw_counters and Apache inputs
from telegraf.conf.j2.

The drop-in is created under the role's templates/ directory because the
task installs it with the template module and a bare src name.

NOTE(review): whitespace in this patch was reconstructed from a flattened
copy. Confirm that context lines and removed lines match the target files
before applying, or apply with --ignore-whitespace.

diff --git a/roles/telegraf/templates/inputs.nvidia_smi.conf.j2 b/roles/telegraf/templates/inputs.nvidia_smi.conf.j2
new file mode 100644
--- /dev/null
+++ b/roles/telegraf/templates/inputs.nvidia_smi.conf.j2
@@ -0,0 +1,7 @@
+# Pulls statistics from nvidia GPUs attached to the host
+[[inputs.nvidia_smi]]
+  ## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
+  # bin_path = "/usr/bin/nvidia-smi"
+
+  ## Optional: timeout for GPU polling
+  # timeout = "5s"
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -70,3 +70,20 @@
   become_user: root
   tags:
     - configuration
+
+# Drops the nvidia_smi input into telegraf.d and restarts telegraf on change.
+# NOTE(review): no `when:` guard, so non-GPU hosts get it too - confirm intent.
+- name: Install nvidia-smi plugin
+  template:
+    src: inputs.nvidia_smi.conf.j2
+    dest: /etc/telegraf/telegraf.d/inputs.nvidia_smi.conf
+    owner: telegraf
+    group: telegraf
+    mode: '0640'
+  notify:
+    - "restart telegraf"
+  become: true
+  become_user: root
+  tags:
+    - configuration
+    - gpu
diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2
index ae81c1128da42c257246c9b0d50041559a66fa0b..78a8d5202d4b21931795289eb7c858808217e1aa 100644
--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -180,51 +180,6 @@ influxdb_database="slurm"
 {% endif %}
 
 
-# Read mlx hardware counters
-{% if 'hw_counters' in ansible_local %}
-{% for interface in ansible_local['hw_counters'] %}
-[[inputs.multifile]]
-  name_override = 'infiniband'
-  base_dir = '/sys/class/infiniband'
-  interval = '60s'
-
-  [[inputs.multifile.tags]]
-    device = '{{ interface }}'
-    port = '1'
-    type = 'hw_counters'
-
-  {% for counter in ansible_local['hw_counters'][interface] | sort %}
-[[inputs.multifile.file]]
-  file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
-  conversion = 'int'
-
-  {% endfor %}
-{% endfor %}
-{% endif %}
-
-
-{% if 'Karaage' in group_names %}
-[[inputs.apache]]
-  interval = "60s"
-  # An array of URLs to gather from, must be directed at the machine
-  # readable version of the mod_status page including the auto query string.
-  # Default is "http://localhost/server-status?auto".
-  urls = ["http://localhost/server-status?auto"]
-
-  # Credentials for basic HTTP authentication.
-  # username = "myuser"
-  # password = "mypassword"
-
-  # Maximum time to receive response.
-  response_timeout = "5s"
-
-  # Optional TLS Config
-  # tls_ca = "/etc/telegraf/ca.pem"
-  # tls_cert = "/etc/telegraf/cert.pem"
-  # tls_key = "/etc/telegraf/key.pem"
-  # Use TLS but skip chain & host verification
-  # insecure_skip_verify = false
-{% endif %}
 ###############################################################################
 #                            SERVICE INPUTS                                   #
 ###############################################################################