# Telegraf configuration
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
# Global tags can be specified here in key="value" format.
[tags]
# Example static tags (disabled):
# dc = "us-east-1" # would tag all metrics with dc=us-east-1
# rack = "1a"
# Each templated tag below falls back to the literal 'undefined' when its
# template variable is not set.
# NOTE(review): current Telegraf documentation names this table [global_tags];
# confirm the deployed version still accepts [tags].
cluster = "{{ clustername | default('undefined') }}"
computenodeclass = "{{ computenodeclass | default('undefined') }}"
hostgroup = "{{ hostgroup | default('undefined') }}"
# Settings for the telegraf agent itself
[agent]
# Collection interval used by every plugin that does not set its own.
interval = "10s"
# Align collection times to multiples of 'interval'
# (with interval = "10s": collect at :00, :10, :20, ...).
round_interval = true
# How often all outputs are flushed. Do not set this below 'interval'.
# The longest gap between flushes is flush_interval + flush_jitter.
flush_interval = "60s"
# Random extra delay added to each flush, so that a large number of telegraf
# instances do not all write at once. With the values set here, flushes
# happen every 60-65s.
flush_jitter = "5s"
# Debug mode is switched off.
debug = false
# An empty value means the name from os.Hostname() is used.
hostname = ""
## Upper bound on unwritten metrics kept per output. A larger value tolerates
## longer output downtime without dropping metrics, at the cost of a higher
## maximum memory usage.
metric_buffer_limit = 15000
###############################################################################
# OUTPUTS #
###############################################################################
# Default InfluxDB output. Metrics carrying an 'influxdb_database' tag are
# excluded from it by the tagdrop filter at the end of this block.
[[outputs.influxdb]]
# Full HTTP or UDP endpoint URL(s) of the InfluxDB instance. Several urls may
# be listed, but they are assumed to belong to the same cluster: only ONE of
# them is written to each interval.
# UDP endpoint example: urls = ["udp://localhost:8089"]
urls = ["{{ influxdb_server }}"] # required
# Database that receives the metrics (telegraf creates it if it does not exist).
database = "telegraf" # required
# Write precision; valid values are n, u, ms, s, m, and h.
# Second precision greatly helps InfluxDB compression.
precision = "s"
# Connection timeout towards InfluxDB, formatted as a string.
# When not provided it defaults to 0 (no timeout).
# timeout = "5s"
# Credentials are filled in from template variables.
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
# User agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# UDP payload size; defaults to the InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
# Drop every metric that has an influxdb_database tag, whatever its value.
[outputs.influxdb.tagdrop]
influxdb_database = ["*"]
# Second InfluxDB output, writing to the "slurm" database. The tagpass filter
# below admits only metrics tagged influxdb_database = "slurm".
[[outputs.influxdb]]
database = "slurm" # required
urls = ["{{ influxdb_server }}"] # required
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
precision = "s"
[outputs.influxdb.tagpass]
influxdb_database = ["slurm"]
###############################################################################
# INPUTS #
###############################################################################
# Read metrics about cpu usage
[[inputs.cpu]]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Exclude the raw CPU time fields (time_*); comment this line out if you
# want them. 'fielddrop' is the supported name of the field filter that
# was formerly spelled 'drop' (deprecated).
fielddrop = ["time_*"]
# Read metrics about disk usage by mount point
[[inputs.disk]]
# Compute and login nodes collect disk usage every 60s instead of the agent
# default. 'interval' is emitted under one combined condition so that a host
# belonging to both groups does not end up with a duplicate key.
{% if 'ComputeNodes' in group_names or 'LoginNodes' in group_names %}
interval = "60s"
{% endif %}
{% if 'ComputeNodes' in group_names %}
# Filesystem types skipped on compute nodes.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
{% endif %}
# By default, telegraf gathers stats for all mountpoints.
# Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points=["/"]
# Read metrics about disk IO by device
[[inputs.diskio]]
# No options are set for this input; the commented lines below are examples.
# By default, telegraf will gather stats for all devices including
# disk partitions.
# Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb"]
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true
# Collect ethtool statistics from network interfaces, once every 60s
[[inputs.ethtool]]
# Interfaces that are left out when pulling metrics.
interface_exclude = [
"eth0",
"eth1",
"lo",
"virbr0",
"virbr0-nic"
]
# To pull metrics from named interfaces only:
# interface_include = ["mlx0", "p1p1"]
interval = "60s"
# Count the files found in /home
[[inputs.filecount]]
# Both switches are turned off.
# NOTE(review): presumably this means subdirectories are not descended into and
# non-regular entries are counted too; confirm against the plugin docs.
regular_only = false
recursive = false
directories = ["/home"]
# Telegraf's statistics about itself, collected every 60s
[[inputs.internal]]
interval = "60s"
## Set to true to also collect telegraf memory stats.
# collect_memstats = true
# The five inputs below are enabled with no options set.
# Read metrics about memory usage
[[inputs.mem]]
# no configuration
# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration
# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
# Network metrics from the 'net' input plugin
[[inputs.net]]
# no configuration
# Network metrics from the 'netstat' input plugin
[[inputs.netstat]]
# no configuration
# Run the mountstats helper script every 300s, allowing it 4s to finish.
# Its output is parsed using the "influx" data format.
[[inputs.exec]]
commands = [
"/opt/telegraf/bin/telegraf_mountstats.py"
]
data_format = "influx"
timeout = "4s"
interval = "300s"
# sdiag stats are logged by both Slurm ManagementNodes only; Compute and
# Login nodes do not get this input.
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
data_format = "influx"
timeout = "4s"
commands = ["/opt/telegraf/bin/telegraf_slurmstats.py"]
# This tag is what the tagpass/tagdrop filters on the influxdb outputs match on.
[inputs.exec.tags]
influxdb_database = "slurm"
{% endif %}
# Read mlx hardware counters
# One multifile input is generated per interface listed under 'hw_counters'
# in the host's local facts, with one file entry per counter (sorted by name).
# Port 1 is hard-coded in both the 'port' tag and the file path.
# The per-plugin tags are declared as a plain table (single brackets), the
# same form as inputs.exec.tags; an array of tables is not read as tags.
{% if 'hw_counters' in ansible_local %}
{% for interface in ansible_local['hw_counters'] %}
[[inputs.multifile]]
name_override = 'infiniband'
base_dir = '/sys/class/infiniband'
interval = '60s'
[inputs.multifile.tags]
device = '{{ interface }}'
port = '1'
type = 'hw_counters'
{% for counter in ansible_local['hw_counters'][interface] | sort %}
[[inputs.multifile.file]]
file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
conversion = 'int'
{% endfor %}
{% endfor %}
{% endif %}
###############################################################################
# SERVICE INPUTS #
###############################################################################