diff --git a/plays/mgmtnodes.yml b/plays/mgmtnodes.yml
index c890a5456b5306f1478070e3f329fc57adc51340..5d4241194324fe13739e074b4ee749c969935dfb 100644
--- a/plays/mgmtnodes.yml
+++ b/plays/mgmtnodes.yml
@@ -38,6 +38,7 @@
     - { role: slurm-common, tags: [ slurm, slurm-common ] }
     - { role: slurm_config, tags: [ slurm, slurm-config ] }
     - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, tags: [ slurm-start ] }
+    - { role: telegraf, tags: [ monitoring ] }
 #    - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] }
 #    - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] }
 
diff --git a/roles/telegraf/files/telegraf_slurmstats.py b/roles/telegraf/files/telegraf_slurmstats.py
new file mode 100644
index 0000000000000000000000000000000000000000..962a2080abbeb1a5532045bce4beb5a51f0033b7
--- /dev/null
+++ b/roles/telegraf/files/telegraf_slurmstats.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python
+"""Telegraf exec plugin: emit Slurm backfill statistics in influx line protocol.
+
+Runs ``sdiag`` and prints one record such as ``slurmstats backfill=3``.
+"""
+
+import subprocess
+import sys
+
+# Label of the sdiag counter this plugin reports.
+BACKFILL_LABEL = "Total backfilled jobs (since last stats cycle start):"
+
+
+class SlurmStats:
+    """Fields reported under the ``slurmstats`` measurement."""
+
+    def __init__(self):
+        self._values = {'backfill': 1}
+
+    def values(self):
+        """Return the fields as a comma-separated ``key=value`` string."""
+        return ",".join("{}={}".format(key, value) for key, value in self._values.items())
+
+
+def print_stats(stats):
+    """Print *stats* as a single influx line-protocol record."""
+    print("slurmstats {}".format(stats.values()))
+
+
+def parse_sdiag(output):
+    """Return SlurmStats parsed from sdiag text, or None if the counter is missing."""
+    for line in output.splitlines():
+        if BACKFILL_LABEL in line:
+            try:
+                backfilled = int(line.split(':')[1])
+            except ValueError:
+                return None
+            stats = SlurmStats()
+            stats._values = {'backfill': backfilled}
+            return stats
+    return None
+
+
+def get_stats():
+    """Run ``sdiag`` and return the parsed SlurmStats, or None if it is unavailable."""
+    try:
+        p = subprocess.Popen(['sdiag'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except OSError:
+        # sdiag is not installed or not on PATH.
+        return None
+    stdout, _ = p.communicate()
+    return parse_sdiag(stdout.decode())
+
+
+def main():
+    """Print the current statistics; return a shell exit status."""
+    stats = get_stats()
+    if stats is None:
+        # Fail loudly but without a traceback so Telegraf logs a clear error.
+        sys.stderr.write("telegraf_slurmstats: no backfill count available from sdiag\n")
+        return 1
+    print_stats(stats)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml
index f6382e35dcd3345c58d50dc935e904fc58ac990e..13701898cd1ae4c091aa148bc8928d091834b0b6 100644
--- a/roles/telegraf/tasks/main.yml
+++ b/roles/telegraf/tasks/main.yml
@@ -40,6 +40,15 @@
   become: true
   become_user: root
 
+- name: copy slurmstats plugin
+  copy:
+    mode: 'u=rwx,g=rx,o=rx'
+    src: telegraf_slurmstats.py
+    dest: '/opt/telegraf/bin/telegraf_slurmstats.py'
+  become: true
+  become_user: root
+
+
 
 - name: Install Telegraf config
   template:
diff --git a/roles/telegraf/templates/telegraf.conf.j2 b/roles/telegraf/templates/telegraf.conf.j2
index 3c27c4b4db028d14d70221f85ce64288f21678a4..52c2fae9c6f193000249ccd1bdcc64574b72d103 100644
--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -67,6 +67,17 @@
   # user_agent = "telegraf"
   # Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
   # udp_payload = 512
+  [outputs.influxdb.tagdrop]
+    influxdb_database = ["*"]
+
+[[outputs.influxdb]]
+  urls = ["{{ influxdb_server }}"] # required
+  database = "slurm" # required
+  precision = "s"
+  username = "{{ influxdb_user }}"
+  password = "{{ influxdb_password }}"
+  [outputs.influxdb.tagpass]
+    influxdb_database = ["slurm"]
 
 
 ###############################################################################
@@ -123,6 +134,19 @@
   timeout="4s"
   interval="300s"
 
+# Both Slurm ManagementNodes will log sdiag stats, but no Compute or Login nodes will
+{% if 'ManagementNodes' in group_names %}
+[[inputs.exec]]
+  commands = [
+    "/opt/telegraf/bin/telegraf_slurmstats.py"
+  ]
+  data_format = "influx"
+  timeout="4s"
+  interval="60s"
+  [inputs.exec.tags]
+    influxdb_database="slurm"
+{% endif %}
+
 ###############################################################################
 #                            SERVICE INPUTS                                   #