# telegraf.conf.j2

# Telegraf configuration

# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.

# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.

# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.

# Global tags, written as key = "value". Each value below is filled in from an
# Ansible variable and falls back to "undefined" when that variable is not set.
[tags]
  cluster = "{{ clustername | default('undefined') }}"
  computenodeclass = "{{ computenodeclass | default('undefined') }}"
  hostgroup = "{{ hostgroup | default('undefined') }}"
  # Examples of further static tags:
  # dc = "us-east-1" # will tag all metrics with dc=us-east-1
  # rack = "1a"

# Agent-wide settings
[agent]
  # Collection interval used by every plugin that does not set its own.
  interval = "10s"
  # Align collection to multiples of 'interval': with interval = "10s",
  # metrics are always collected on :00, :10, :20, etc.
  round_interval = true

  # How often outputs are flushed. Do not set this below 'interval'.
  # The longest gap between flushes is flush_interval + flush_jitter.
  flush_interval = "60s"
  # Random amount added to each flush interval, mainly to avoid large write
  # spikes when many telegraf instances are running. With the values set
  # here, flushes happen every 60-65s.
  flush_jitter = "5s"

  # Maximum number of unwritten metrics kept per output. A larger value rides
  # out longer output downtime without dropping metrics, at the cost of a
  # higher maximum memory usage.
  metric_buffer_limit = 15000

  # Debug mode switch
  debug = false
  # Hostname override; when empty, os.Hostname() is used.
  hostname = ""

###############################################################################
#                                  OUTPUTS                                    #
###############################################################################

# Default InfluxDB output: receives the general host metrics.
[[outputs.influxdb]]
  # Full HTTP or UDP endpoint URL of the InfluxDB instance (required).
  # Several urls may be listed, but they are assumed to belong to the same
  # cluster, so only ONE of them is written to per interval.
  # urls = ["udp://localhost:8089"] # UDP endpoint example
  urls = ["{{ influxdb_server }}"]
  # Target database (required); telegraf creates it if it does not exist.
  database = "telegraf"
  # Credentials, supplied by Ansible variables.
  username = "{{ influxdb_user }}"
  password = "{{ influxdb_password }}"

  # Write precision; valid values are n, u, ms, s, m, and h.
  # Second precision greatly helps InfluxDB compression.
  precision = "s"

  # Optional settings, left at their defaults:
  # Connection timeout for the InfluxDB connection, given as a duration string.
  # timeout = "5s"
  # User agent for HTTP POSTs (can be useful for log differentiation).
  # user_agent = "telegraf"
  # UDP payload size; defaults to the InfluxDB UDP client default (512 bytes).
  # udp_payload = 512

  # Metrics that carry an influxdb_database tag (any value) are kept out of
  # this output.
  [outputs.influxdb.tagdrop]
    influxdb_database = ["*"]

# Second InfluxDB output: same server and credentials as the output above,
# but writing to the "slurm" database.
[[outputs.influxdb]]
  urls = ["{{ influxdb_server }}"] # required
  database = "slurm" # required
  username = "{{ influxdb_user }}"
  password = "{{ influxdb_password }}"
  precision = "s"

  # Only metrics tagged influxdb_database = "slurm" are written here.
  [outputs.influxdb.tagpass]
    influxdb_database = ["slurm"]


###############################################################################
#                                  INPUTS                                     #
###############################################################################

# CPU usage metrics
[[inputs.cpu]]
  # Report stats for each individual cpu.
  percpu = true
  # Also report the system-wide cpu total.
  totalcpu = true
  # Filters out fields matching "time_*"; comment this line out to get the
  # raw CPU time metrics.
  # NOTE(review): 'drop' is the legacy name of the field-drop filter - confirm
  # the deployed Telegraf version still accepts it.
  drop = ["time_*"]

# Read metrics about disk usage by mount point
[[inputs.disk]]
{% if 'ComputeNodes' in group_names or 'LoginNodes' in group_names %}
  # Compute and login nodes sample disk usage once a minute. The key is
  # emitted from a single conditional so that a host belonging to both groups
  # does not render 'interval' twice, which would be a duplicate key and
  # therefore invalid TOML.
  interval = "60s"
{% endif %}
{% if 'ComputeNodes' in group_names %}
  # Filesystem types skipped on compute nodes.
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
{% endif %}
  # By default, telegraf gathers stats for all mountpoints.
  # Setting mount_points will restrict the stats to the specified mountpoints.
  # mount_points=["/"]

# Disk IO metrics, per device
[[inputs.diskio]]
  # With no options set, stats are gathered for all devices, including
  # disk partitions.
  # List devices here to restrict the stats to them:
  # devices = ["sda", "sdb"]
  # Uncomment if disk serial numbers are not needed:
  # skip_serial_number = true

# ethtool statistics for network interfaces, collected once a minute
[[inputs.ethtool]]
  interval = "60s"

  # Interfaces that are skipped when pulling metrics.
  interface_exclude = ["eth0", "eth1", "lo", "virbr0", "virbr0-nic"]

  # Alternatively, list the interfaces to pull metrics for:
  # interface_include = ["mlx0", "p1p1"]

# Count the entries in /home once a minute
[[inputs.filecount]]
  interval = "60s"
  directories = ["/home"]
  # Do not descend into subdirectories.
  recursive = false
  # Do not restrict the count to regular files.
  regular_only = false

# Telegraf's statistics about itself, collected once a minute
[[inputs.internal]]
  interval = "60s"
  # If true, collect telegraf memory stats.
  # collect_memstats = true

# The following inputs run with their default settings.

# Memory usage
[[inputs.mem]]
  # no configuration

# Swap memory usage
[[inputs.swap]]
  # no configuration

# System load & uptime
[[inputs.system]]
  # no configuration

# Network interface usage
[[inputs.net]]
  # no configuration

# TCP connection state and UDP socket counts
[[inputs.netstat]]
  # no configuration

# Run the mountstats helper script every 300s; its output is parsed as
# InfluxDB line protocol and the run is limited to 4s.
[[inputs.exec]]
  interval = "300s"
  timeout = "4s"
  data_format = "influx"
  commands = [
    "/opt/telegraf/bin/telegraf_mountstats.py",
  ]

# sdiag stats are logged by both Slurm ManagementNodes, and by no Compute or
# Login nodes.
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
  interval = "900s"
  timeout = "4s"
  data_format = "influx"
  commands = [
    "/opt/telegraf/bin/telegraf_slurmstats.py",
  ]

  # This tag is what the outputs' tagpass / tagdrop filters on
  # influxdb_database match against.
  [inputs.exec.tags]
    influxdb_database = "slurm"
{% endif %}

# Read mlx hardware counters.
# One multifile input is rendered per interface listed in the local Ansible
# fact 'hw_counters'; every counter of that interface becomes one file entry,
# read relative to base_dir and converted to an integer.
{% if 'hw_counters' in ansible_local %}
{% for interface in ansible_local['hw_counters'] %}
[[inputs.multifile]]
  name_override = 'infiniband'
  base_dir = '/sys/class/infiniband'
  interval = '60s'

  # Plugin tags must be a plain sub-table (single brackets), the same form as
  # [inputs.exec.tags] elsewhere in this file. Declaring them as an array of
  # tables ([[...]]) does not match what Telegraf expects for 'tags'.
  [inputs.multifile.tags]
    device = '{{ interface }}'
    port = '1'
    type = 'hw_counters'

{% for counter in ansible_local['hw_counters'][interface] | sort %}
  [[inputs.multifile.file]]
    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
    conversion = 'int'

{% endfor %}
{% endfor %}
{% endif %}
###############################################################################
#                              SERVICE INPUTS                                 #
###############################################################################