Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • hpc-team/HPCasCode
  • chines/ansible_cluster_in_a_box
2 results
Show changes
Showing
with 720 additions and 0 deletions
#!/usr/bin/python
"""Telegraf exec plugin reporting Slurm backfill statistics.

Runs ``sdiag`` and prints a single line of the form
``slurmstats backfill=<n>`` on stdout.
"""

# Location of the sdiag binary queried by get_stats().
SDIAG_PATH = '/opt/slurm-latest/bin/sdiag'
# Label of the sdiag output line that carries the backfill counter.
BACKFILL_LABEL = "Total backfilled jobs (since last stats cycle start):"


class SlurmStats:
    """Container for the key/value pairs emitted as metric fields."""

    def __init__(self):
        self._values = {'backfill': 1}

    def values(self):
        """Return the stored fields as a comma-separated ``key=value`` string."""
        return ",".join(
            "{}={}".format(key, value) for key, value in self._values.items()
        )


def print_stats(stats):
    """Print ``stats`` as one ``slurmstats <fields>`` line."""
    print("slurmstats {}".format(stats.values()))


def get_stats():
    """Run sdiag and return a SlurmStats holding the backfill counter.

    Returns None when the sdiag output has no parsable backfill line, so the
    caller can tell "no data" apart from a real reading. Errors raised while
    launching sdiag (for example a missing binary) propagate unchanged.
    """
    import subprocess
    p = subprocess.Popen([SDIAG_PATH],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, _stderr) = p.communicate()
    for line in stdout.decode().splitlines():
        if BACKFILL_LABEL not in line:
            continue
        # The counter is whatever follows the first colon on the line.
        raw = line.split(':', 1)[1]
        try:
            count = int(raw)
        except ValueError:
            # Label present but the value is not an integer; keep scanning.
            continue
        stats = SlurmStats()
        stats._values = {'backfill': count}
        return stats
    return None


if __name__ == "__main__":
    import sys
    slurmstats = get_stats()
    if slurmstats is None:
        # Fail loudly instead of crashing inside print_stats on None.
        sys.stderr.write("telegraf_slurmstats: no backfill line in sdiag output\n")
        sys.exit(1)
    print_stats(slurmstats)
#! /usr/bin/awk -f
# For every input record, print one "softnet" measurement line.
# The record number (zero-based) is used as the cpu tag, and the first three
# whitespace-separated columns are parsed as hexadecimal and emitted as
# integer fields (note the trailing "i" on each value).
{
    cpu_index    = NR - 1
    processed    = strtonum("0x" $1)
    dropped      = strtonum("0x" $2)
    time_squeeze = strtonum("0x" $3)
    printf("softnet,cpu=%d sd_processed=%di,sd_dropped=%di,sd_time_squeeze=%di\n",
           cpu_index, processed, dropped, time_squeeze)
}
\ No newline at end of file
# Handler: bounce the telegraf service; notified by the config template tasks.
- name: restart telegraf
  become: true
  service:
    name: telegraf
    state: restarted
# RedHat family: yum is pointed directly at the RPM URL.
- name: Install Telegraf from URL [RHEL/CentOS]
  become: true
  become_user: root
  when: ansible_os_family == "RedHat"
  yum:
    name: "{{ telegraf_install_rpm_url }}"
    state: latest

# Debian family: fetch the .deb to /tmp first, then install the local file.
- name: Download Telegraf package via URL [Debian/Ubuntu]
  become: true
  become_user: root
  when: ansible_os_family == "Debian"
  get_url:
    url: "{{ telegraf_install_deb_url }}"
    dest: /tmp/telegraf-ansible-download.deb

- name: Install Telegraf package
  become: true
  become_user: root
  when: ansible_os_family == "Debian"
  apt:
    deb: /tmp/telegraf-ansible-download.deb
    state: present
# Presence of ofed_info is used as the marker for a Mellanox card.
# check_mode is disabled so the probe also runs during --check.
- name: "register if a mellanox card is present"
  stat:
    path: "/usr/bin/ofed_info"
  register: mellanoxcard
  check_mode: false

# Only executed when the marker file exists; its stdout is parsed as JSON below.
- name: Get list of hardware counters for interfaces
  script: files/hw_counters.py
  register: hwcounters
  when: mellanoxcard.stat.exists
  check_mode: false

# Always define hwcounterlist. `stat` does not fail when the path is missing,
# so the previous guard (`not mellanoxcard.failed`) was always true and this
# task dereferenced hwcounters.stdout even when the script task was skipped.
# Hosts without the marker file now get an empty mapping instead.
- name: Set hardware counters in facts
  set_fact:
    hwcounterlist: "{{ (hwcounters.stdout | from_json) if mellanoxcard.stat.exists else {} }}"
# Directory holding the helper scripts that telegraf executes.
- name: Make a directory for extra files
  become: true
  become_user: root
  file:
    path: '/opt/telegraf/bin'
    state: directory
    owner: 'telegraf'
    group: 'telegraf'
    mode: 'u=rwx,g=rx,o=rx'

- name: copy mountstats plugin
  become: true
  become_user: root
  copy:
    src: telegraf_mountstats.py
    dest: '/opt/telegraf/bin/telegraf_mountstats.py'
    mode: 'u=rwx,g=rx,o=rx'

# The slurmstats plugin is only installed where the sdiag binary exists.
- name: check if slurm is on the system
  stat:
    path: '/opt/slurm-latest/bin/sdiag'
  register: checkslurmexists

- name: copy slurmstats plugin
  become: true
  become_user: root
  when: checkslurmexists.stat.exists
  copy:
    src: telegraf_slurmstats.py
    dest: '/opt/telegraf/bin/telegraf_slurmstats.py'
    mode: 'u=rwx,g=rx,o=rx'

# Installed without the execute bit, unlike the Python plugins above.
- name: copy softnet_stat script
  become: true
  become_user: root
  copy:
    src: telegraf_softnet_stats.awk
    dest: '/opt/telegraf/bin/telegraf_softnet_stats.awk'
    mode: 'u=rw,g=r,o=r'
# Main agent configuration; a change triggers the "restart telegraf" handler.
- name: Install Telegraf config
  become: true
  become_user: root
  tags:
    - configuration
  template:
    src: telegraf.conf.j2
    dest: /etc/telegraf/telegraf.conf
    owner: telegraf
    group: telegraf
    mode: "0640"
  notify:
    - "restart telegraf"

# Drop-in configuration for the ethtool input.
- name: Install ethtool plugin for mlx interfaces
  become: true
  become_user: root
  tags:
    - configuration
  template:
    src: inputs.ethtool.conf.j2
    dest: /etc/telegraf/telegraf.d/inputs.ethtool.conf
    owner: telegraf
    group: telegraf
    mode: "0640"
  notify:
    - "restart telegraf"
# Commented out by KW 13th Oct 2020 - will restore shortly
#- name: Install filecount plugin for /home counts
# template:
# src: inputs.filecount.conf.j2
# dest: /etc/telegraf/telegraf.d/inputs.filecount.conf
# owner: telegraf
# group: telegraf
# mode: '640'
# notify:
# - "restart telegraf"
# become: true
# become_user: root
# tags:
# - configuration
# Render the per-interface hw_counters configuration.
# Guard on the stat result itself: `stat` does not fail for a missing path, so
# the previous `not mellanoxcard.failed` condition was always true and the
# template was rendered even on hosts without the Mellanox marker file.
- name: Install multifile plugin for mlx hw_counters
  template:
    src: inputs.multifile_mlx.conf.j2
    dest: /etc/telegraf/telegraf.d/inputs.multifile_mlx.conf
    owner: telegraf
    group: telegraf
    mode: '640'
  notify:
    - "restart telegraf"
  become: true
  become_user: root
  tags:
    - configuration
  when: mellanoxcard.stat.exists
# Only hosts in the VisNodes or DGXRHELNodes inventory groups get this drop-in.
- name: Install nvidia-smi plugin
  become: true
  become_user: root
  when: "'VisNodes' in group_names or 'DGXRHELNodes' in group_names"
  tags:
    - configuration
    - gpu
  template:
    src: inputs.nvidia_smi.conf.j2
    dest: /etc/telegraf/telegraf.d/inputs.nvidia_smi.conf
    owner: telegraf
    group: telegraf
    mode: "0640"
  notify:
    - "restart telegraf"
# Returns ethtool statistics for given interfaces
[[inputs.ethtool]]
interval = "60s"
# List of interfaces to pull metrics for
# interface_include = ["mlx0", "p1p1"]
# List of interfaces to ignore when pulling metrics.
interface_exclude = ["eth0", "eth00", "eth1", "eth01", "eth2", "lo", "virbr0", "virbr0-nic"]
# Read metrics about the number of files in /home
[[inputs.filecount]]
directories = ["/home"]
recursive = false
regular_only = false
interval = "60s"
\ No newline at end of file
# Read metrics about the number of files in /home
[[inputs.filecount]]
directories = ["/home"]
recursive = false
regular_only = false
interval = "60s"
\ No newline at end of file
# Read mlx hardware counters
# One [[inputs.multifile]] instance is rendered per interface in
# hwcounterlist, with one [[inputs.multifile.file]] entry per counter name.
# Nothing is rendered when hwcounterlist is undefined or empty.
{% if hwcounterlist is defined and hwcounterlist %}
{% for interface in hwcounterlist %}
[[inputs.multifile]]
  name_override = 'infiniband'
  base_dir = '/sys/class/infiniband'
  interval = '60s'
  # Per-plugin tags are a single table, so this uses [ ] rather than [[ ]].
  [inputs.multifile.tags]
    device = '{{ interface }}'
    port = '1'
    type = 'hw_counters'
{% for counter in hwcounterlist[interface] | sort %}
  [[inputs.multifile.file]]
    file = '{{ interface }}/ports/1/hw_counters/{{ counter }}'
    conversion = 'int'
{% endfor %}
{% endfor %}
{% endif %}
# Pulls statistics from nvidia GPUs attached to the host
[[inputs.nvidia_smi]]
## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/usr/bin/nvidia-smi"
## Optional: timeout for GPU polling
# timeout = "5s"
\ No newline at end of file
# Telegraf configuration
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
# Global tags can be specified here in key="value" format.
[tags]
hostgroup = "{{ hostgroup | default('undefined') }}"
cluster = "{{ clustername | default('undefined') }}"
computenodeclass = "{{ computenodeclass | default('undefined') }}"
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"
# Configuration for telegraf agent
[agent]
# Default data collection interval for all plugins
interval = "10s"
# Rounds collection interval to 'interval'
# ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
# Default data flushing interval for all outputs. You should not set this below
# interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "60s"
# Jitter the flush interval by a random amount. This is primarily to avoid
# large write spikes for users running a large number of telegraf instances.
# ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "5s"
# Run telegraf in debug mode
debug = false
# Override default hostname, if empty use os.Hostname()
hostname = ""
## Maximum number of unwritten metrics per output. Increasing this value
## allows for longer periods of output downtime without dropping metrics at the
## cost of higher maximum memory usage.
metric_buffer_limit = 15000
###############################################################################
# OUTPUTS #
###############################################################################
# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
# The full HTTP or UDP endpoint URL for your InfluxDB instance.
# Multiple urls can be specified but it is assumed that they are part of the same
# cluster, this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
urls = ["{{ influxdb_server }}"] # required
# The target database for metrics (telegraf will create it if not exists)
database = "telegraf" # required
# Precision of writes, valid values are n, u, ms, s, m, and h
# note: using second precision greatly helps InfluxDB compression
precision = "s"
insecure_skip_verify = true
# Connection timeout (for the connection with InfluxDB), formatted as a string.
# If not provided, will default to 0 (no timeout)
# timeout = "5s"
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
[outputs.influxdb.tagdrop]
influxdb_database = ["*"]
[[outputs.influxdb]]
urls = ["{{ influxdb_server }}"] # required
database = "slurm" # required
precision = "s"
insecure_skip_verify = true
username = "{{ influxdb_user }}"
password = "{{ influxdb_password }}"
[outputs.influxdb.tagpass]
influxdb_database = ["slurm"]
###############################################################################
# INPUTS #
###############################################################################
# Read metrics about cpu usage
[[inputs.cpu]]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
drop = ["time_*"]
# Read metrics about disk usage by mount point
[[inputs.disk]]
{% if 'ComputeNodes' in group_names %}
interval = "60s"
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
{% endif %}
{% if 'LoginNodes' in group_names %}
interval = "60s"
{% endif %}
# By default, Telegraf gathers stats for all mountpoints.
# Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points=["/"]
# Read metrics about disk IO by device
[[inputs.diskio]]
# By default, telegraf will gather stats for all devices including
# disk partitions.
# Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb"]
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true
# Read ethtool for one interface only
[[inputs.ethtool]]
interface_include = ["mlx0"]
fieldpass = ["rx_discards_phy", "tx_discards_phy"]
interval="60s"
# Collect statistics about itself
[[inputs.internal]]
## If true, collect telegraf memory stats.
# collect_memstats = true
interval = "60s"
# Read metrics about memory usage
[[inputs.mem]]
# no configuration
# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration
# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
[[inputs.net]]
# no configuration
[[inputs.netstat]]
# no configuration
# Mount statistics, produced by a helper script that prints influx-format lines.
[[inputs.exec]]
  commands = [
    "/opt/telegraf/bin/telegraf_mountstats.py"
  ]
  data_format = "influx"
  timeout = "4s"
  interval = "300s"

# Per-CPU softnet counters. The awk program is passed with -f; the previous
# "-v -f" form made awk treat "-f" as the (malformed) argument of -v.
# NOTE(review): the script calls strtonum(), which is a gawk extension —
# confirm that "awk" resolves to gawk on every target host.
[[inputs.exec]]
  commands = [
    "awk -f /opt/telegraf/bin/telegraf_softnet_stats.awk /proc/net/softnet_stat"
  ]
  data_format = "influx"
  timeout = "4s"
  interval = "600s"
# Both Slurm ManagementNodes will log sdiag stats, but no Compute or Login nodes will
{% if 'ManagementNodes' in group_names %}
[[inputs.exec]]
commands = [
"/opt/telegraf/bin/telegraf_slurmstats.py"
]
data_format = "influx"
timeout="4s"
interval="900s"
[inputs.exec.tags]
influxdb_database="slurm"
{% endif %}
###############################################################################
# SERVICE INPUTS #
###############################################################################
# Package URLs consumed by the Telegraf install tasks (RPM for RedHat-family
# hosts, .deb for Debian-family hosts).
telegraf_install_rpm_url: "https://dl.influxdata.com/telegraf/releases/telegraf-1.20.4-1.x86_64.rpm"
telegraf_install_deb_url: "https://dl.influxdata.com/telegraf/releases/telegraf_1.20.4-1_amd64.deb"
---
# https://docs.tenable.com/nessus/Content/InstallNessusAgentLinux.htm
# Download URLs handed to apt (Ubuntu) and yum (Rhel) by the install tasks.
packageurl_Ubuntu: "https://www.tenable.com/downloads/api/v1/public/pages/nessus-agents/downloads/17242/download?i_agree_to_tenable_license_agreement=true"
packageurl_Rhel: "https://www.tenable.com/downloads/api/v1/public/pages/nessus-agents/downloads/17235/download?i_agree_to_tenable_license_agreement=true"
# NessusAgent-10.1.1-ubuntu1110_amd64.deb
package_version: "10.1.1"
# Left empty here; real values are expected to be supplied elsewhere.
tenableSecretKey: ""
tenableLinkingKey: ""
agentgroup: ""
\ No newline at end of file
---
# Install the agent package for the host's OS family and remember whether the
# package changed, so the service is restarted only after an install/upgrade.
- name: Install nessus agent debian
  become: true
  when: ansible_os_family == 'Debian'
  apt:
    deb: "{{ packageurl_Ubuntu }}"
  register: apt_

- name: Install nessus agent rhel
  become: true
  when: ansible_os_family == "RedHat"
  yum:
    name: "{{ packageurl_Rhel }}"
    state: present
  register: yum_

# "restarted" when the package task reported a change, otherwise "started".
- name: start nessus agent ubuntu
  become: true
  when: ansible_os_family == 'Debian'
  service:
    name: nessusagent
    state: "{{ 'restarted' if apt_.changed else 'started' }}"
    enabled: true

- name: start nessus agent rhel
  become: true
  when: ansible_os_family == "RedHat"
  service:
    name: nessusagent
    state: "{{ 'restarted' if yum_.changed else 'started' }}"
    enabled: true
# Read-only status probe. It never reports changed/failed, and it also runs in
# check mode so the condition on the link task below can always be evaluated.
- name: get nessus agent linking status
  command: /opt/nessus_agent/sbin/nessuscli agent status
  register: agent_linking_status
  become: true
  changed_when: false
  failed_when: false
  check_mode: false

- name: show nessus agent linking status
  debug:
    var: agent_linking_status

# The command line carries the linking key, so task output is suppressed with
# no_log. argv form passes each option as one argument without shell quoting.
- name: link nessus agent
  command:
    argv:
      - /opt/nessus_agent/sbin/nessuscli
      - agent
      - link
      - "--key={{ tenableLinkingKey }}"
      - "--groups={{ agentgroup }}"
      - --host=cloud.tenable.com
      - --port=443
      - "--name={{ ansible_hostname }}"
  become: true
  no_log: true
  when: '"Linked to: cloud.tenable.com:443" not in (agent_linking_status.stdout | default(""))'
  register: linking
# unlink via /opt/nessus_agent/sbin/nessuscli agent --help

# Print only the command's stdout: the full registered result includes the
# command line, and therefore the linking key.
- name: show nessus agent link output
  debug:
    msg: "{{ linking.stdout | default('link step did not run') }}"
---
# Record the running kernel release; read-only, so it also runs in check mode.
- name: get kernel version
  shell: uname -r
  register: uname_r_output
  changed_when: false
  check_mode: false

# Everything in this block runs only when the running kernel differs from
# KERNEL_VERSION (see the `when` at the end of the block).
- name: remove conflicting packages if kernel is going to changed
  block:
    - name: Populate service facts
      service_facts:

    # The three lustre steps are skipped when no lustre-client service is known.
    - name: stop lustre
      become: true
      when: services["lustre-client.service"] is defined
      service:
        name: lustre-client
        state: stopped

    - name: count lustre mounts
      become: true
      when: services["lustre-client.service"] is defined
      shell:
        cmd: mount -t lustre | wc -l
      register: count_of_lustre_mounts
      check_mode: false

    # Abort unless every lustre filesystem has been unmounted.
    - assert:
        that:
          - "count_of_lustre_mounts.stdout == '0'"
        msg: "Number of Lustre mounts is: {{ count_of_lustre_mounts.stdout }}"
      when: services["lustre-client.service"] is defined

    - name: remove conflicting packages if kernel is going to changed
      become: true
      package:
        name:
          - kmod-lustre-client
          - kernel-devel
          - lustre-client-modules-dkms
          - lustre-client-utils
        state: absent

    - name: remove mellanox packages
      become: true
      package:
        name:
          - mlnx-ofa_kernel
          - mlnx-ofa_kernel-devel
          - mlnx-ofa_kernel-modules
          - libibverbs
          - libgpod
          - usbmuxd
          - libmlx5
          - libmlx4
          - nvidia-kmod
        state: absent
  when: uname_r_output.stdout != KERNEL_VERSION
- name: install aptitude
  become: true
  become_user: root
  when: ansible_os_family=="Debian"
  apt:
    name: aptitude
    state: present

#- name: apt-get upgrade
#  apt: upgrade=safe
#  become: true
#  when: ansible_os_family=="Debian"

# Packages removed from RedHat-family hosts before the upgrade.
- name: yum remove
  become: true
  become_user: root
  when: ansible_os_family=="RedHat"
  yum:
    state: absent
    name:
      - ipa-client-common
      - kmod-kvdo  # found on some older monarch nodes
      - "iwl*firmware"  # intel wireless Lan
# Full package upgrade with kernel, Mellanox, Lustre and release packages held
# back. Hosts in ManagementNodes or SQLNodes additionally hold back glusterfs*.
# The two tasks previously shared one name, which made them indistinguishable
# in play output and with --start-at-task; the exclude lists are written as
# YAML lists and the repeated kmod-lustre-client* entry has been dropped.
- name: yum upgrade (ManagementNodes and SQLNodes)
  yum:
    name: '*'
    state: latest
    update_cache: true
    exclude:
      - "kernel*"
      - "mlnx-ofa_kernel*"
      - "kmod-lustre-client*"
      - "kmod-mlnx-ofa_kernel*"
      - "lustre-client*"
      - "centos-release*"
      - "glusterfs*"
      - redhat-release-server
  become: true
  become_user: root
  when: (( inventory_hostname in groups.ManagementNodes ) or ( inventory_hostname in groups.SQLNodes )) and ansible_os_family=="RedHat"

- name: yum upgrade (all other nodes)
  yum:
    name: '*'
    state: latest
    update_cache: true
    exclude:
      - "kernel*"
      - "mlnx-ofa_kernel*"
      - "kmod-lustre-client*"
      - "kmod-mlnx-ofa_kernel*"
      - "lustre-client*"
      - "centos-release*"
      - redhat-release-server
  become: true
  become_user: root
  when: ( inventory_hostname not in groups.ManagementNodes ) and ( inventory_hostname not in groups.SQLNodes ) and ansible_os_family=="RedHat"
# Reported as changed unless the tool prints its "nothing left" message.
- name: Clear yum pending transaction
  become: true
  become_user: root
  when: ansible_os_family=="RedHat"
  command: yum-complete-transaction --cleanup-only
  register: yumtransactioncleanup
  changed_when: '"No unfinished transactions left." not in yumtransactioncleanup.stdout'

# Pin the release package; downgrades are permitted to reach the pinned version.
- name: install centos-release
  become: true
  when:
    - ansible_os_family=="RedHat"
    - '"DGX" not in ansible_product_name'
  yum:
    name:
      - "centos-release-{{ CENTOS_VERSION }}"
    state: present
    allow_downgrade: true

- name: install redhat-release-server
  become: true
  become_user: root
  when:
    - '"DGX" in ansible_product_name'
    - '"RedHat" in ansible_distribution'
  yum:
    name:
      - "redhat-release-server-{{ RHEL_VERSION }}"
    state: present
    allow_downgrade: true

# Kernel and companion packages, all at KERNEL_VERSION.
- name: install kernel-devel
  become: true
  when: ansible_os_family=="RedHat"
  yum:
    name:
      - "kernel-devel-{{ KERNEL_VERSION }}"
      - "kernel-{{ KERNEL_VERSION }}"
      - "kernel-headers-{{ KERNEL_VERSION }}"
      - "kernel-tools-{{ KERNEL_VERSION }}"
      - "kernel-tools-libs-{{ KERNEL_VERSION }}"
    state: present
    allow_downgrade: true
# Highest installed kernel-devel version-release, as reported by rpm.
# `sort -V` orders version strings numerically; a plain lexical sort would
# rank e.g. 3.10.0-957 after 3.10.0-1160. The former `args: warn: False` was
# dropped because the `warn` parameter no longer exists in current
# ansible-core releases and is rejected as an unsupported parameter.
- name: get kernel-devel version
  shell: rpm -q kernel-devel | cut -f 3,4 -d "-" | sort -V | tail -n 1
  register: rpm_q_output
  when: ansible_os_family=="RedHat"
  check_mode: false
  changed_when: false

# Debian family: version suffixes of the installed linux-image-<n> packages.
- name: get kernel-devel version
  shell: dpkg -l linux-image* | grep "^ii" | grep "linux-image-[0-9]" | sed 's/\ \ */ /g' | cut -f 2 -d " " | cut -f 3-5 -d "-"
  register: dpkg_l_output
  when: ansible_os_family=="Debian"
  check_mode: false
  changed_when: false
# Compare the running kernel release with the installed kernel package
# version and reboot when the running release is not among them.
- name: get kernel version
  shell: uname -r
  register: uname_r_output
  changed_when: false
  check_mode: false

- name: default dont reboot
  set_fact:
    reboot_now: false

- name: debug1
  debug:
    var: rpm_q_output

- name: debug2
  debug:
    var: uname_r_output

- name: set reboot when kernel has changed
  when: ansible_os_family=="RedHat" and uname_r_output.stdout not in rpm_q_output.stdout
  set_fact:
    reboot_now: true

- name: set reboot when kernel has changed
  when: ansible_os_family=="Debian" and uname_r_output.stdout not in dpkg_l_output.stdout
  set_fact:
    reboot_now: true

- name: debug3
  debug:
    var: reboot_now

- name: restart machine
  become: true
  when: reboot_now
  reboot:
"""Report, for every role named in roles.txt, whether any file in the current
directory mentions it. Output is one ``<role>,<True|False>`` line per role."""
import os

# File listing the role names; excluded from the search because it contains
# every role and would make each lookup succeed.
ROLES_FILE = 'roles.txt'


def used(role):
    """Return True if a regular file in the current directory contains ``role``.

    ``role`` is stripped of surrounding whitespace (lines read from
    roles.txt carry a trailing newline); an empty name is never "used".
    The match is a literal substring test. The previous implementation passed
    the string "grep <role> *" to subprocess.call without a shell, which tried
    to execute a program of that literal name and always raised.
    """
    needle = role.strip().encode()
    if not needle:
        return False
    for entry in os.listdir('.'):
        if entry == ROLES_FILE or not os.path.isfile(entry):
            continue
        try:
            with open(entry, 'rb') as fh:
                if needle in fh.read():
                    return True
        except (IOError, OSError):
            # Unreadable files cannot prove usage; keep looking.
            continue
    return False


if __name__ == "__main__":
    with open(ROLES_FILE) as f:
        roles = [line.strip() for line in f if line.strip()]
    for r in roles:
        print("{},{}".format(r, used(r)))
---
# Load the per-OS/architecture vars file that defines sshd_name.
- include_vars: "{{ ansible_os_family }}_{{ ansible_architecture }}.yml"

# Handler: restart the SSH daemon under whatever service name sshd_name holds.
- name: restart ssh
  become: true
  service:
    name: "{{ sshd_name }}"
    state: restarted
---
- include_vars: "{{ ansible_os_family }}_{{ ansible_architecture }}.yml"

# Install the user CA public key and have sshd trust it.
# Both steps ignore errors, so a failure here does not stop the play.
- name: copy ca cert
  become: true
  ignore_errors: true
  copy:
    src: user_ssh_ca.pub
    dest: /etc/ssh/server_ca.pub
    owner: root
    group: root
    mode: "0644"

- name: edit sshd_config
  become: true
  ignore_errors: true
  lineinfile:
    dest: /etc/ssh/sshd_config
    line: TrustedUserCAKeys /etc/ssh/server_ca.pub
    state: present
  notify: restart ssh
# Service name used by the "restart ssh" handler.
# NOTE(review): presumably the Debian-family vars file — confirm.
sshd_name: "ssh"
# Service name used by the "restart ssh" handler.
# NOTE(review): presumably the RedHat-family vars file — confirm.
sshd_name: "sshd"