Skip to content
Snippets Groups Projects
Commit 1bedc922 authored by Andreas Hamacher's avatar Andreas Hamacher
Browse files

Merge branch 'add_slurmstats' into 'master'

log the information on sdiag backfill stats to influx

See merge request hpc-team/ansible_cluster_in_a_box!291
parents a6b355ea ab38bcf0
No related branches found
No related tags found
1 merge request!291log the information on sdiag backfil stats to influx
......@@ -38,6 +38,7 @@
- { role: slurm-common, tags: [ slurm, slurm-common ] }
- { role: slurm_config, tags: [ slurm, slurm-config ] }
- { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, tags: [ slurm-start ] }
- { role: telegraf, tags: [ monitoring ] }
# - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] }
# - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] }
#!/usr/bin/python
class SlurmStats:
    """Container for Slurm statistics rendered as ``key=value`` pairs.

    The readings live in the ``_values`` dict. A fresh instance starts with
    the placeholder ``{'backfill': 1}``; ``get_stats`` replaces the dict with
    the number parsed from ``sdiag``.
    """

    def __init__(self):
        self._values = {'backfill': 1}

    def values(self):
        """Return all readings as one comma-separated ``key=value`` string."""
        pairs = []
        for name, reading in self._values.items():
            pairs.append("{}={}".format(name, reading))
        return ",".join(pairs)
def print_stats(stats):
    """Write one ``slurmstats <fields>`` line to standard output.

    ``stats`` must provide a ``values()`` method; whatever it returns is
    placed after the ``slurmstats`` measurement name.
    """
    line = "slurmstats {}".format(stats.values())
    print(line)
def get_stats():
    """Run ``sdiag`` and return its backfilled-jobs counter as a SlurmStats.

    The output of ``sdiag`` is scanned for the line containing
    "Total backfilled jobs (since last stats cycle start):" and the integer
    after the colon is stored under the ``backfill`` key.

    Returns:
        SlurmStats: instance whose ``_values`` is ``{'backfill': <int>}``.

    Raises:
        OSError: if the ``sdiag`` executable cannot be started.
        ValueError: if the text after the colon is not an integer.
        RuntimeError: if the expected line is absent from the output, e.g.
            because ``sdiag`` itself failed and printed nothing on stdout.
    """
    import subprocess

    marker = "Total backfilled jobs (since last stats cycle start):"
    proc = subprocess.Popen(['sdiag'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    backfilled = None
    for line in stdout.decode().splitlines():
        if marker in line:
            # The marker has a single colon at its end, so the counter is
            # the second ':'-separated field; int() tolerates the padding.
            backfilled = int(line.split(':')[1])

    if backfilled is None:
        # Without this guard a missing line surfaced as an unbound-variable
        # error; report what sdiag said on stderr instead.
        raise RuntimeError(
            "sdiag output did not contain {!r}; stderr: {}".format(
                marker, stderr.decode("utf-8", "replace").strip()
            )
        )

    stats = SlurmStats()
    stats._values = {'backfill': backfilled}
    return stats
# Script entry: read the backfill counter from sdiag once and print it as a
# single "slurmstats ..." line on stdout.
slurmstats = get_stats()
print_stats(slurmstats)
......@@ -40,6 +40,15 @@
become: true
become_user: root
- name: copy slurmstats plugin
copy:
mode: 'u=rwx,g=rx,o=rx'
src: telegraf_slurmstats.py
dest: '/opt/telegraf/bin/telegraf_slurmstats.py'
become: true
become_user: root
- name: Install Telegraf config
template:
......
......@@ -67,6 +67,17 @@
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
[outputs.influxdb.tagdrop]
influxdb_database = ["*"]
[[outputs.influxdb]]
urls = ["{{ influxdb_server }}"] # required
database = "slurm" # required
precision = "s"
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
[outputs.influxdb.tagpass]
influxdb_database = ["slurm"]
###############################################################################
......@@ -123,6 +134,19 @@
timeout="4s"
interval="300s"
# Both Slurm ManagementNodes will log sdiag stats, but no Compute or Login nodes will
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
commands = [
"/opt/telegraf/bin/telegraf_slurmstats.py"
]
data_format = "influx"
timeout="4s"
interval="60s"
[inputs.exec.tags]
influxdb_database="slurm"
{% endif %}
###############################################################################
# SERVICE INPUTS #
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment