Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
HPCasCode
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
hpc-team
HPCasCode
Merge requests
!374
Fix hw_counters for telegraf and enable ethtool telegraf plugin
Code
Review changes
Check out branch
Download
Patches
Plain diff
Merged
Fix hw_counters for telegraf and enable ethtool telegraf plugin
telegraf-1.15
into
master
Overview
0
Commits
11
Pipelines
3
Changes
1
Merged
Kerri Wait
requested to merge
telegraf-1.15
into
master
4 years ago
Overview
0
Commits
11
Pipelines
3
Changes
1
Expand
0
0
Merge request reports
Viewing commit
a7715204
Prev
Next
Show latest version
1 file
+
6
−
0
Inline
Compare changes
Side-by-side
Inline
Show whitespace changes
Show one file at a time
a7715204
Monitor telegraf using telegraf
· a7715204
Kerri Wait
authored
4 years ago
roles/telegraf/templates/telegraf.conf.j2
+
30
−
105
Options
@@ -119,6 +119,15 @@
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true
# Returns ethtool statistics for given interfaces
[[inputs.ethtool]]
interval = "60s"
# List of interfaces to pull metrics for
# interface_include = ["mlx0", "p1p1"]
# List of interfaces to ignore when pulling metrics.
interface_exclude = ["eth0", "eth1", "lo", "virbr0", "virbr0-nic"]
# Read metrics about the number of files in /home
[[inputs.filecount]]
directories = ["/home"]
@@ -126,6 +135,12 @@
regular_only = false
interval = "60s"
# Collect statistics about itself
[[inputs.internal]]
## If true, collect telegraf memory stats.
# collect_memstats = true
interval = "60s"
# Read metrics about memory usage
[[inputs.mem]]
# no configuration
@@ -166,116 +181,26 @@
{% endif %}
# Read mlx hardware counters
{% if 'hw_counters' in ansible_local %}
{% for interface in ansible_local['hw_counters'] %}
[[inputs.multifile]]
name_override = "infiniband"
base_dir = "/sys/class/infiniband"
interval = "60s"
name_override = 'infiniband'
base_dir = '/sys/class/infiniband'
interval = '60s'
[[inputs.multifile.tags]]
device="mlx5_0"
port="1"
type="hw_counters"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/duplicate_request"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/implied_nak_seq_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/lifespan"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/out_of_buffer"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_cqe_error"
conversion = "int"
device = '{{ interface }}'
port = '1'
type = 'hw_counters'
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_cqe_flush_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_remote_access_errors"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/req_remote_invalid_request"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_cqe_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_cqe_flush_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_local_length_error"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/resp_remote_access_errors"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rnr_nak_retry_err"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_atomic_requests"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_dct_connect"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_read_requests"
conversion = "int"
[[inputs.multifile.file]]
file = "mlx5_0/ports/1/hw_counters/rx_write_requests"
conversion = "int"
{% for counter in ansible_local['hw_counters'][interface] | sort %}
[[inputs.multifile.file]]
file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
conversion = 'int'
{% endfor %}
{% endfor %}
{% endif %}
###############################################################################
# SERVICE INPUTS #
###############################################################################
Loading