# Telegraf configuration
#
# Telegraf is entirely plugin driven: all metrics are gathered from the
# declared inputs and sent to the declared outputs. A plugin is active only
# if it is declared in this file; to deactivate one, comment out its name and
# all of its variables.
#
# Run 'telegraf -config telegraf.conf -test' to see which metrics a config
# file would generate.
#
# NOTE: this file is a template. Placeholder expressions in the values below
# are substituted when the file is rendered.

# Global tags, given in key = "value" format.
[tags]
hostgroup = "{{ hostgroup | default('undefined') }}"
cluster = "{{ clustername | default('undefined') }}"
computenodeclass = "{{ computenodeclass | default('undefined') }}"
# dc = "us-east-1"  # will tag all metrics with dc=us-east-1
# rack = "1a"

# Settings for the telegraf agent itself.
[agent]
# Default data collection interval for all plugins.
interval = "10s"

# Round collection times to 'interval': with interval = "10s", always
# collect on :00, :10, :20, etc.
round_interval = true

# Default data flushing interval for all outputs. Do not set this below
# 'interval'. The longest gap between flushes is
# flush_interval + flush_jitter.
flush_interval = "60s"

# Jitter the flush interval by a random amount, primarily to avoid large
# write spikes when many telegraf instances are running. For example, a
# jitter of 5s on a 10s flush interval means flushes happen every 10-15s.
flush_jitter = "5s"

# Run telegraf in debug mode.
debug = false

# Override the default hostname; if empty, os.Hostname() is used.
hostname = ""

# -----------------------------------------------------------------------------
# OUTPUTS
# -----------------------------------------------------------------------------

# InfluxDB server that receives the general metrics.
[[outputs.influxdb]]
# Full HTTP or UDP endpoint URL of the InfluxDB instance. Multiple urls may
# be listed, but they are assumed to belong to the same cluster, so only ONE
# of them is written to each interval.
# urls = ["udp://localhost:8089"]  # UDP endpoint example
urls = ["{{ influxdb_server }}"]  # required

# Target database for metrics (telegraf will create it if it does not exist).
database = "telegraf"  # required

# Precision of writes; valid values are n, u, ms, s, m, and h.
# Note: second precision greatly helps InfluxDB compression.
precision = "s"

# Timeout for the connection with InfluxDB, formatted as a string.
# If not provided, it defaults to 0 (no timeout).
# timeout = "5s"

# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"

# User agent for HTTP POSTs (can be useful for log differentiation).
# user_agent = "telegraf"

# UDP payload size; defaults to the InfluxDB UDP client default (512 bytes).
# udp_payload = 512

  # Keep metrics that carry an influxdb_database tag (any value) out of this
  # output; the "slurm" output declared next picks up the ones tagged for it.
  [outputs.influxdb.tagdrop]
  influxdb_database = ["*"]

# Second InfluxDB output, writing to the dedicated "slurm" database.
[[outputs.influxdb]]
urls = ["{{ influxdb_server }}"]  # required
database = "slurm"  # required
precision = "s"
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"

  # Only metrics tagged influxdb_database = "slurm" are written here.
  [outputs.influxdb.tagpass]
  influxdb_database = ["slurm"]

# -----------------------------------------------------------------------------
# INPUTS
# -----------------------------------------------------------------------------

# CPU usage metrics.
[[inputs.cpu]]
# Whether to report per-cpu stats.
percpu = true
# Whether to report total system cpu stats.
totalcpu = true
# Comment out this line if you want the raw CPU time metrics.
drop = ["time_*"]

# Disk usage metrics, by mount point.
[[inputs.disk]]
# By default, telegraf gathers stats for all mountpoints.
# Setting mount_points restricts the stats to the listed mountpoints.
# mount_points=["/"]

# Disk IO metrics, by device.
[[inputs.diskio]]
# By default, telegraf gathers stats for all devices, including disk
# partitions. Setting devices restricts the stats to the listed devices.
# devices = ["sda", "sdb"]
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true

# Memory usage metrics.
[[inputs.mem]]
# no configuration

# Swap memory usage metrics.
[[inputs.swap]]
# no configuration

# System load and uptime metrics.
[[inputs.system]]
# no configuration

[[inputs.net]]
# no configuration

[[inputs.netstat]]
# no configuration

# Runs the mountstats script every 300s; its output is parsed as influx
# line protocol.
[[inputs.exec]]
commands = [
    "/opt/telegraf/bin/telegraf_mountstats.py",
]
data_format = "influx"
timeout = "4s"
interval = "300s"

# Both Slurm ManagementNodes log sdiag stats; Compute and Login nodes do not.
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
commands = [
    "/opt/telegraf/bin/telegraf_slurmstats.py",
]
data_format = "influx"
timeout = "4s"
interval = "900s"

  # Tag these metrics so the outputs can route them to the "slurm" database.
  [inputs.exec.tags]
  influxdb_database = "slurm"
{% endif %}

# -----------------------------------------------------------------------------
# SERVICE INPUTS
# -----------------------------------------------------------------------------