Skip to content
Snippets Groups Projects
Commit 257c9b58 authored by Kerri Wait's avatar Kerri Wait
Browse files

Merge remote-tracking branch 'origin/master' into telegraf-1.15

parents 1a2e8924 c57eac21
No related branches found
No related tags found
10 merge requests!399Capture extra NFS stats,!393Hotfix: monitor NFS GETATTR stats via telegraf,!392Temporarily disable inputs.filecount in telegraf,!389Update telegraf config to ignore more ethX interfaces in ethtool plugin,!388Fix the telegraf config for mlx hw counters to get rid of errors in logs,!387Telegraf 1.15 nvidia_smi fix,!386Telegraf 1.15 nvidia_smi plugin,!380Telegraf monitoring for Karaage (hpc.erc.monash.edu.au),!374Fix hw_counters for telegraf and enable ethtool telegraf plugin,!371Telegraf 1.15
This commit is part of merge request !371. Comments created here will be created in the context of that merge request.
This role permanently turns off a network interface. This is needed for bare-metal
machines, which may have a management interface (e.g. e1p1) that needs to
be disabled for security reasons. We use `ip link set <Name> down` to disable the interface.
To survive a reboot, this role installs a systemd service file and enables it so that the interface is turned off again on every boot.
Usage
- {role: disable_interface, interface_name : "eth5" }
- {role: disable_interface }
If `interface_name` is not defined, it defaults to "e1p1".
---
# This role adds a systemd service file and enables it.
# It disables the {{ interface_name }} interface (Management port) on Baremetal nodes.

# Fall back to the default management port name when the caller did not pass
# one; the template below and the unit it renders both read interface_name.
- name: Default interface_name to e1p1 when not supplied
  set_fact:
    interface_name: "e1p1"
  when: interface_name is undefined

- name: Create service file for turning off interface name
  template:
    src: disable_interface.service.j2
    dest: /etc/systemd/system/disable_interface.service
    mode: "u=rw,g=r,o=r"
  become: true
  become_user: root

# daemon_reload makes systemd re-read its unit files, so the unit written by
# the previous task is known to systemd before it is enabled and started.
- name: enable and start device_off service
  systemd:
    name: disable_interface.service
    state: started
    enabled: true
    daemon_reload: true
  become: true
  become_user: root
# Oneshot unit rendered by the disable_interface role: it runs a single
# `ip link set ... down` command for the configured interface.
[Unit]
Description=Turn off {{ interface_name }} interface (management port)
# Pull in network-online.target and order this unit after the network targets
# and openibd.service, so the interface is taken down after they have started.
Wants=network-online.target
After=network.target network-online.target openibd.service

[Service]
Type=oneshot
# Keep the unit reported as active after the one-off command has exited.
RemainAfterExit=yes
ExecStart=/sbin/ip link set {{ interface_name }} down
#'ip link show {{ interface_name }} ' is either UP or DOWN

[Install]
# Space-separated list; equivalent to one WantedBy= line per target.
WantedBy=multi-user.target final.target
......@@ -16,15 +16,18 @@
when: ansible_os_family == 'RedHat'
- name: install lua RHEL7
yum: name={{ item }} state=present update_cache=yes enablerepo="Monash_University_EPEL7_EPEL_7_-_x86_64"
with_items:
- lua
- lua-filesystem
- lua-posix
- tcl
- rsync
- gcc
- lua-devel
yum:
state: present
update_cache: yes
enablerepo: "Monash_University_EPEL7_EPEL_7_-_x86_64"
name:
- lua
- lua-filesystem
- lua-posix
- tcl
- rsync
- gcc
- lua-devel
when:
- '"DGX" in ansible_product_name'
- '"RedHat" in ansible_distribution'
......@@ -61,4 +64,4 @@
args:
creates: "{{ soft_dir }}/lmod/{{ lmod_version }}"
become: true
when: ansible_os_family == 'RedHat'
\ No newline at end of file
when: ansible_os_family == 'RedHat'
---
# make sure firewalld is not installed
- name: make sure firewalld is not installed
yum: name={{ item }} state=absent
yum:
name:
- firewalld
- firewall-config
state: absent
become: true
become_user: root
with_items:
- firewalld
- firewall-config
# make sure iptables is installed
- name: make sure iptables-services is installed
......
- name: yum install cgroup
yum: name={{ item }} state=present
with_items:
- libcgroup
yum: name=libcgroup state=present
become: True
when: ansible_os_family == "RedHat"
......
......@@ -99,13 +99,13 @@
# Read metrics about disk usage by mount point
[[inputs.disk]]
{% if 'ComputeNodes' in group_names %}
{% if 'ComputeNodes' in group_names %}
interval = "60s"
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4"]
{% endif %}
{% if 'LoginNodes' in group_names %}
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
{% endif %}
{% if 'LoginNodes' in group_names %}
interval = "60s"
{% endif %}
{% endif %}
# By default, telegraf gather stats for all mountpoints.
# Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points=["/"]
......
......@@ -57,6 +57,7 @@
name:
- ipa-client-common
- kmod-kvdo # found on some older monarch nodes
- iwl*firmware # intel wireless Lan
state: absent
become: true
become_user: root
......
#!/usr/bin/python3
# The purpose of this script is to enrich every VM of an ansible inventory file
# in json format with the available hypervisor mapping found in /projects/pMOSP/hypervisor/
#
# Inputs (read from the current working directory):
#   m3-latest        - '|'-separated hypervisor mapping table
#                      (coming from /projects/pMOSP/hypervisor/m3-latest;
#                      for monarch see /projects/pMOSP/hypervisor/monarch-vm-hw-mapping-2020)
#   m3inventory.json - inventory created via `m3inventory > m3inventory.json`
# Output (stdout): a bash script that echoes the enriched inventory as JSON.
# VMs listed in the mapping but missing from the inventory are reported on stderr.
import json
import shlex
import socket
import sys

# Read the whole mapping table and close the file straight away.
with open('m3-latest', 'r') as hypervisormapping:
    mapping = hypervisormapping.readlines()
# Drop the first three lines and the last line -- presumably the table
# header and closing border; TODO confirm against the mapping file format.
mapping = mapping[3:-1]

with open('m3inventory.json') as json_file:
    inv = json.load(json_file)

hostvars = inv['_meta']['hostvars']
for row in mapping:
    # Column 2 holds the VM name and column 4 the hypervisor name.
    fields = row.split('|')
    vm = fields[2].strip()
    hyp = fields[4].strip()
    if vm not in hostvars:
        sys.stderr.write("Not found in inventory: {}\n".format(vm))
        continue
    # Resolve the hypervisor's "-1g" hostname to an IP address.
    hostvars[vm]['hypervisor_ip'] = socket.gethostbyname(hyp + '-1g.erc.monash.edu')

# shlex.quote wraps the JSON in single quotes exactly as before, but also
# escapes any single quote inside the JSON so the generated script stays valid.
print("#!/bin/bash\necho " + shlex.quote(json.dumps(inv, indent=4, sort_keys=True)))
......@@ -101,6 +101,63 @@ def load_data(artifactfile="artifacts.zip", nodeclass="compute_ansible_check.log
return df
def bokeh_plot(df,title):
    """Draw a host-by-task heat map of ansible check results and save it.

    Args:
        df: data passed to bokeh as the glyph source. It should include
            columns for task, taskid, host, change and changestr. The value
            of ``change`` sets the colour; ``changestr`` is shown in the
            tool tip. Index level 0 supplies the y-axis categories and index
            level 1 the x-axis categories.
        title: title shown above the plot.

    Returns:
        The bokeh figure, after it has been written out with ``save``.
        NOTE(review): ``save(p)`` is called without a filename, so the output
        file is presumably configured by the caller beforehand -- confirm.
    """
    from bokeh.models import LinearColorMapper
    from bokeh.plotting import figure, save

    # this is an abbreviated colormap from a bokeh example;
    # `change` values between 0 and 4 are mapped linearly onto these colours
    colors = ['#084594', '#2171b5', '#4292c6', "#dfccce", "#550b1d"]
    mapper = LinearColorMapper(palette=colors, low=0, high=4)

    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    dataxrange = list(df.index.get_level_values(1).unique())
    datayrange = list(df.index.get_level_values(0).unique())

    p = figure(title=title,
               x_range=dataxrange, y_range=datayrange,
               x_axis_location="above",
               sizing_mode='stretch_width',
               tools=TOOLS, toolbar_location='below',
               tooltips=[('host', '@host'), ('task', '@task'), ('changed', '@changestr'),('taskid','@taskid') ])

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
    p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
    p.xaxis.major_label_text_color = None  # turn off x-axis tick labels leaving space
    p.xaxis.major_label_text_font_size = '0pt'  # turn off x-axis tick labels
    p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
    p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
    p.yaxis.major_label_text_color = None  # turn off y-axis tick labels leaving space
    p.yaxis.major_label_text_font_size = '0pt'  # turn off y-axis tick labels

    # One unit-sized rectangle per row, coloured by its `change` value.
    p.rect(x="taskid", y="host", width=1, height=1,
           source=df,
           fill_color={'field': 'change', 'transform': mapper},
           line_color=None)

    save(p)
    return p
import logging
from slack_logger import SlackHandler, SlackFormatter
slack_hook = os.environ['SLACK_HOOK']
......@@ -117,25 +174,82 @@ sh.setLevel(logging.DEBUG)
logger.addHandler(sh)
df = load_data(nodeclass="compute_ansible_check.log")
if (len(sys.argv)>1 and sys.argv[1]=='outputChangedNodeList'):
import bokeh.io
from datetime import datetime
strBokehfile="output.html"
#datetime.today().strftime('%Y%m%d')+'.html'
bokeh.io.output_file(strBokehfile)
#from bokeh.io import curdoc
from bokeh.models import Div
from bokeh.layouts import layout, column
import logging
import sys
import os
if (len(sys.argv)>1 and 'outputChangedNodeList' in sys.argv):
print(yaml.dump(list(df[df.change == 2].host.unique())))
sys.exit(0)
# Summarise the ansible check results for every node class.
# Each entry: (check log passed to load_data, plot title,
#              prefix for the renamed plot file, label used in the log message)
node_classes = [
    ("compute_ansible_check.log", "Compute Nodes", "comp_", "Compute"),
    ("login_ansible_check.log", "login Nodes", "login_", "Login"),
    ("mgmt_ansible_check.log", "mgmt_ Nodes", "mgmt_", "Management"),
    ("dgx_ansible_check.log", "dgx_ Nodes", "dgx_", "DGX"),
]

for nodeclass, plot_title, file_prefix, label in node_classes:
    df = load_data(nodeclass=nodeclass)
    if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
        # bokeh_plot saves to strBokehfile; rename it so the next node
        # class does not overwrite this plot.
        cmpplot = bokeh_plot(df, plot_title)
        cmd="mv "+strBokehfile+" "+file_prefix+strBokehfile
        os.system(cmd)
        #cmd='swift upload ansiblechecker '+file_prefix+strBokehfile
        #os.system(cmd)
    # A host counts as changed/failed if any of its rows has change == 2 / 4.
    nodes = len(df.host.unique())
    changed = len(df[df.change == 2].host.unique())
    failed = len(df[df.change == 4].host.unique())
    logger.info("{} {} nodes, {} had at least one change {} had at least one failed task".format(nodes,label,changed,failed))

logger.info("this is defined in .gitlab-ci.yml in ansible_check and the trigger is configured in https://gitlab.erc.monash.edu.au/hpc-team/clusterbuild/pipeline_schedules ")
#logger.info("https://swift.rc.nectar.org.au/v1/AUTH_e86c925319094fb2b8cc1bf2373c69dc/ansiblechecker/"+strBokehfile)
# Link to this CI job's artifacts (named so the builtin `str` is not shadowed).
artifacts_url="https://gitlab.erc.monash.edu.au/hpc-team/clusterbuild/-/jobs/"+os.environ['CI_JOB_ID']+"/artifacts/browse"
logger.info(artifacts_url)
#if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
#cmpplot = bokeh_plot(df, "Compute Nodes")
#cmd='swift upload ansiblechecker '+strBokehfile
#os.system(cmd)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment