# telegraf.conf.j2 (Jinja2 template)
# Telegraf configuration

# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.

# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.

# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.

# Global tags, in key="value" format, are applied to all metrics.
# Each value comes from a template variable and falls back to "undefined"
# when that variable is not set.
[tags]
  hostgroup = "{{ hostgroup | default('undefined') }}"
  cluster = "{{ clustername | default('undefined') }}"
  computenodeclass = "{{ computenodeclass | default('undefined') }}"
  # dc = "us-east-1" # will tag all metrics with dc=us-east-1
  # rack = "1a"

# Configuration for telegraf agent
[agent]
  # Default data collection interval for all plugins
  interval = "10s"
  # Rounds collection interval to 'interval'
  # ie, if interval="10s" then always collect on :00, :10, :20, etc.
  round_interval = true

  # Default data flushing interval for all outputs. You should not set this below
  # interval. Maximum flush_interval will be flush_interval + flush_jitter
  flush_interval = "60s"
  # Jitter the flush interval by a random amount. This is primarily to avoid
  # large write spikes for users running a large number of telegraf instances.
  # With the values set here (60s interval, 5s jitter) flushes happen every 60-65s.
  flush_jitter = "5s"

  # Run telegraf in debug mode
  debug = false
  # Override default hostname, if empty use os.Hostname()
  hostname = ""

  # Maximum number of unwritten metrics per output. Increasing this value
  # allows for longer periods of output downtime without dropping metrics at the
  # cost of higher maximum memory usage.
  metric_buffer_limit = 15000

###############################################################################
#                                  OUTPUTS                                    #
###############################################################################

# Primary InfluxDB output: writes to the "telegraf" database and skips any
# metric that carries an influxdb_database tag.
[[outputs.influxdb]]
  # HTTP or UDP endpoint URL(s) of the InfluxDB instance (required). When
  # several urls are listed they are assumed to be part of the same cluster,
  # so only ONE of them is written to each interval.
  # urls = ["udp://localhost:8089"] # UDP endpoint example
  urls = ["{{ influxdb_server }}"]
  # Target database for metrics (required); telegraf creates it if it does
  # not exist.
  database = "telegraf"
  # Credentials are supplied by the template variables.
  username = "{{ influxdb_user }}"
  password = "{{ influxdb_password }}"
  # Precision of writes; valid values are n, u, ms, s, m, and h.
  # Second precision greatly helps InfluxDB compression.
  precision = "s"

  # Optional settings, currently left unset:
  # Connection timeout for the connection with InfluxDB, formatted as a
  # string. If not provided, will default to 0 (no timeout).
  # timeout = "5s"
  # User agent for HTTP POSTs (can be useful for log differentiation).
  # user_agent = "telegraf"
  # UDP payload size; defaults to the InfluxDB UDP client default (512 bytes).
  # udp_payload = 512

  # Discard metrics that have an influxdb_database tag, whatever its value.
  [outputs.influxdb.tagdrop]
    influxdb_database = ["*"]

# Secondary InfluxDB output: writes to the "slurm" database and accepts only
# metrics tagged influxdb_database = "slurm".
[[outputs.influxdb]]
  urls = ["{{ influxdb_server }}"]
  database = "slurm"
  username = "{{ influxdb_user }}"
  password = "{{ influxdb_password }}"
  precision = "s"

  [outputs.influxdb.tagpass]
    influxdb_database = ["slurm"]


###############################################################################
#                                  INPUTS                                     #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
  # Whether to report per-cpu stats or not
  percpu = true
  # Whether to report total system cpu stats or not
  totalcpu = true
  # Drop the raw CPU time fields (time_*). Comment this line if you want them.
  # `fielddrop` replaces the deprecated `drop` spelling of this filter.
  fielddrop = ["time_*"]

# Read metrics about disk usage by mount point
[[inputs.disk]]
{% if 'ComputeNodes' in group_names %}
  # On compute nodes, ignore mount points with these filesystem types.
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
{% endif %}
{# NOTE(review): this LoginNodes branch had no closing endif, which makes the
   template fail to render. It is closed here around the comments it guards;
   confirm whether login-node-specific settings were meant to live inside it. #}
{% if 'LoginNodes' in group_names %}
  # By default, telegraf gathers stats for all mountpoints.
  # Setting mount_points will restrict the stats to the specified mountpoints.
  # mount_points=["/"]
{% endif %}

# Read metrics about disk IO by device
[[inputs.diskio]]
  # No options are set, so the plugin runs with its defaults.
  # By default, telegraf will gather stats for all devices including
  # disk partitions.
  # Setting devices will restrict the stats to the specified devices.
  # devices = ["sda", "sdb"]
  # Uncomment the following line if you do not need disk serial numbers.
  # skip_serial_number = true

# Count the entries directly inside /home once a minute.
[[inputs.filecount]]
  # Collected less often than the agent-wide 10s interval.
  interval = "60s"
  directories = ["/home"]
  # Do not descend into subdirectories.
  recursive = false
  # Do not restrict the count to regular files.
  regular_only = false

# Read metrics about memory usage
[[inputs.mem]]
  # no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
  # no configuration

# Read metrics about system load & uptime
[[inputs.system]]
  # no configuration

# Read metrics about network interface usage (plugin defaults)
[[inputs.net]]
  # no configuration

# Read network connection statistics (plugin defaults)
[[inputs.netstat]]
  # no configuration

# Run the mountstats helper script every 300s. The script has 4s to finish
# and its output is parsed using the "influx" data format.
[[inputs.exec]]
  interval = "300s"
  timeout = "4s"
  commands = ["/opt/telegraf/bin/telegraf_mountstats.py"]
  data_format = "influx"

# Both Slurm ManagementNodes will log sdiag stats, but no Compute or Login nodes will
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
  timeout = "4s"
  commands = ["/opt/telegraf/bin/telegraf_slurmstats.py"]
  data_format = "influx"

  # This tag routes the metrics to the output whose tagpass matches "slurm".
  [inputs.exec.tags]
    influxdb_database = "slurm"
{% endif %}

# Read mlx hardware counters
# Each file below is read relative to base_dir every 60s and converted to an
# integer; the resulting measurement is named "infiniband".
[[inputs.multifile]]
  name_override = "infiniband"
  base_dir = "/sys/class/infiniband"
  interval = "60s"

  # Plugin tags must be a plain table ([...]), not an array of tables ([[...]]).
  [inputs.multifile.tags]
    device = "mlx5_0"
    port = "1"
    type = "hw_counters"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/duplicate_request"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/lifespan"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/out_of_buffer"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/req_cqe_error"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/resp_cqe_error"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/resp_local_length_error"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rx_dct_connect"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rx_read_requests"
    conversion = "int"

  [[inputs.multifile.file]]
    file = "mlx5_0/ports/1/hw_counters/rx_write_requests"
    conversion = "int"

###############################################################################
#                              SERVICE INPUTS                                 #
###############################################################################