Newer
Older
# Telegraf configuration
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
# Global tags, written as key = "value", are declared in this table.
# Each value below is filled in by the template and falls back to
# 'undefined' when the corresponding variable is not set.
[tags]
cluster = "{{ clustername | default('undefined') }}"
computenodeclass = "{{ computenodeclass | default('undefined') }}"
hostgroup = "{{ hostgroup | default('undefined') }}"
# Further examples (disabled):
# dc = "us-east-1" # would tag all metrics with dc=us-east-1
# rack = "1a"
# Agent-wide settings for telegraf
[agent]
# Collection interval used by every plugin that does not set its own.
interval = "10s"
# Align collection times to multiples of 'interval',
# e.g. with interval = "10s" collection happens at :00, :10, :20, ...
round_interval = true

# How often all outputs are flushed. Keep this at or above 'interval'.
# The longest gap between flushes is flush_interval + flush_jitter.
flush_interval = "60s"
# Random delay added to each flush, mainly so that a large number of telegraf
# instances do not all write at the same moment.
# With the values set here, flushes happen every 60-65s.
flush_jitter = "5s"
# Upper bound on unwritten metrics held per output. A higher value tolerates
# longer output downtime without dropping metrics, at the cost of a higher
# maximum memory usage.
metric_buffer_limit = 15000

# Debug mode switch
debug = false
# Hostname override; when left empty, os.Hostname() is used.
hostname = ""
###############################################################################
# OUTPUTS #
###############################################################################
# InfluxDB output for the general "telegraf" database
[[outputs.influxdb]]
# Full HTTP or UDP endpoint URL of the InfluxDB instance (required).
# Several urls may be listed, but they are assumed to belong to the same
# cluster: only ONE of them is written to per interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
urls = ["{{ influxdb_server }}"]
# Target database (required); telegraf creates it if it does not exist.
database = "telegraf"
# Credentials, supplied by the template.
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
# Write precision; valid values are n, u, ms, s, m, and h.
# Second precision greatly helps InfluxDB compression.
precision = "s"
# Timeout for the connection with InfluxDB, given as a string.
# When not provided it defaults to 0 (no timeout).
# timeout = "5s"
# User agent for HTTP POSTs (can help tell writers apart in logs)
# user_agent = "telegraf"
# UDP payload size; defaults to the InfluxDB UDP client default (512 bytes)
# udp_payload = 512

# Keep metrics that carry an influxdb_database tag (any value) out of this
# output; the second influxdb output below selects on the same tag.
[outputs.influxdb.tagdrop]
influxdb_database = ["*"]
# InfluxDB output for the "slurm" database (same server and credentials
# as the output above)
[[outputs.influxdb]]
urls = ["{{ influxdb_server }}"]
database = "slurm"
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
precision = "s"

# Only metrics tagged influxdb_database="slurm" are written to this output.
[outputs.influxdb.tagpass]
influxdb_database = ["slurm"]
###############################################################################
# INPUTS #
###############################################################################
# Read metrics about cpu usage
[[inputs.cpu]]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Drop the raw CPU time fields (names matching time_*).
# Comment this line if you want the raw CPU time metrics.
# 'fielddrop' replaces the legacy 'drop' option name, which is deprecated.
fielddrop = ["time_*"]
# Read metrics about disk usage by mount point
[[inputs.disk]]
{% if 'ComputeNodes' in group_names or 'LoginNodes' in group_names %}
# Compute and login nodes collect disk usage every 60s instead of using the
# agent-wide interval. The key is emitted once even if a host is in both
# groups, because a repeated key would make the rendered TOML invalid.
interval = "60s"
{% endif %}
{% if 'ComputeNodes' in group_names %}
# Filesystem types left out of the disk metrics on compute nodes.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
{% endif %}
# By default, telegraf gathers stats for all mountpoints.
# Setting mount_points will restrict the stats to the specified mountpoints.
# mount_points=["/"]
# Per-device disk IO metrics
[[inputs.diskio]]
# With no options set, stats are gathered for every device,
# disk partitions included.
# List devices here to restrict the stats to those devices only:
# devices = ["sda", "sdb"]
# Enable the next line when disk serial numbers are not needed:
# skip_serial_number = true
# Count the entries directly under /home (no descent into subdirectories,
# and not limited to regular files)
[[inputs.filecount]]
directories = ["/home"]
regular_only = false
recursive = false
# The plugins below are enabled with their default settings.

# Memory usage
[[inputs.mem]]

# Swap memory usage
[[inputs.swap]]

# System load & uptime
[[inputs.system]]

# net plugin, default settings
[[inputs.net]]

# netstat plugin, default settings
[[inputs.netstat]]
Chris Hines
committed
# Run the mountstats helper script every 300s; its output is parsed as
# InfluxDB line protocol and the run is given a 4s timeout.
[[inputs.exec]]
commands = ["/opt/telegraf/bin/telegraf_mountstats.py"]
interval = "300s"
timeout = "4s"
data_format = "influx"
# sdiag stats are logged by both Slurm ManagementNodes only; Compute and
# Login nodes never get this input.
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
commands = ["/opt/telegraf/bin/telegraf_slurmstats.py"]
timeout = "4s"
data_format = "influx"

# This tag is what the influxdb outputs' tagpass/tagdrop rules select on.
[inputs.exec.tags]
influxdb_database = "slurm"
{% endif %}
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# Read mlx hardware counters
# Each [[inputs.multifile.file]] entry names one counter file, relative to
# base_dir, whose content is converted to an integer. All of them are
# reported under the measurement name "infiniband", collected every 60s.
[[inputs.multifile]]
name_override = "infiniband"
base_dir = "/sys/class/infiniband"
interval = "60s"

# Plugin tags are a single table (same form as [inputs.exec.tags] elsewhere
# in this file), not an array of tables, so the header uses single brackets.
[inputs.multifile.tags]
device = "mlx5_0"
port = "1"
type = "hw_counters"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/duplicate_request"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/lifespan"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/out_of_buffer"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_cqe_error"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_cqe_error"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_local_length_error"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_dct_connect"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_read_requests"
conversion = "int"

[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_write_requests"
conversion = "int"
Chris Hines
committed
###############################################################################
# SERVICE INPUTS #
###############################################################################