Merge remote-tracking branch 'origin/master' into telegraf-1.15

257c9b58 · Kerri Wait · 1a2e8924 · c57eac21 · 257c9b58 · 257c9b58
Commit 257c9b58 authored 4 years ago by Kerri Wait
--- a/roles/disable_interface/README.md
+++ b/roles/disable_interface/README.md
+This role permanently turns off a network interface. This is needed for bare-metal
+machines, which may have a management interface (e.g. e1p1) that needs to
+be disabled for security reasons. We use `ip link set <Name> down` to disable the interface.
+
+To make this survive a reboot, the role installs a systemd service file and enables it so that it runs at every boot.
+
+Usage
+ - {role: disable_interface, interface_name : "eth5" }
+ - {role: disable_interface  }
+
+If {{ interface_name }} is not defined, it defaults to "e1p1".
--- a/roles/disable_interface/tasks/main.yml
+++ b/roles/disable_interface/tasks/main.yml
+---
+
+# This role adds a systemd service file and enables it.
+# It disables the {{ interface_name }} interface (management port) on bare-metal nodes.
+- set_fact: interface_name="e1p1"
+  when: interface_name is undefined
+
+- name: Create service file for turning off interace name
+  template: src=disable_interface.service.j2 dest=/etc/systemd/system/disable_interface.service mode="u=rw,g=r,o=r"
+  become: true
+  become_user: root
+
+- name: enable and start device_off service
+  service: name=disable_interface.service state=started enabled=yes
+  become: true
+  become_user: root
--- a/roles/disable_interface/templates/disable_interface.service.j2
+++ b/roles/disable_interface/templates/disable_interface.service.j2
+[Unit]
+Description=Turn off {{ interface_name }} interface (management port)
+After=network.target network-online.target openibd.service
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/sbin/ip link set {{ interface_name }}  down
+#'ip link show {{ interface_name }} ' is either UP or DOWN
+
+[Install]
+WantedBy=multi-user.target
+WantedBy=final.target
+
--- a/roles/lmod/tasks/main.yml
+++ b/roles/lmod/tasks/main.yml
@@ -16,15 +16,18 @@
  when: ansible_os_family == 'RedHat'

 - name: install lua RHEL7
-  yum: name={{ item }} state=present update_cache=yes enablerepo="Monash_University_EPEL7_EPEL_7_-_x86_64"
-  with_items:
-    - lua
-    - lua-filesystem
-    - lua-posix
-    - tcl
-    - rsync
-    - gcc
-    - lua-devel
+  yum:
+    state: present
+    update_cache: yes
+    enablerepo: "Monash_University_EPEL7_EPEL_7_-_x86_64"
+    name:
+      - lua
+      - lua-filesystem
+      - lua-posix
+      - tcl
+      - rsync
+      - gcc
+      - lua-devel
  when:
   - '"DGX" in ansible_product_name'
   - '"RedHat" in ansible_distribution'
@@ -61,4 +64,4 @@
  args:
    creates: "{{ soft_dir }}/lmod/{{ lmod_version }}"
  become: true
-  when: ansible_os_family == 'RedHat'
\ No newline at end of file
+  when: ansible_os_family == 'RedHat'
--- a/roles/nat_server/tasks/main.yml
+++ b/roles/nat_server/tasks/main.yml
 ---
 # make sure firewalld is not installed
 - name: make sure firewalld is not installed
-  yum: name={{ item }} state=absent
+  yum:
+    name:
+      - firewalld
+      - firewall-config
+    state: absent
  become: true
  become_user: root
-  with_items:
-  - firewalld
-  - firewall-config

 # make sure iptables is installed
 - name: make sure iptables-services is installed

--- a/roles/slurm-common/tasks/installCgroup.yml
+++ b/roles/slurm-common/tasks/installCgroup.yml
 - name: yum install cgroup
-  yum: name={{ item }} state=present
-  with_items:
-    - libcgroup
+  yum: name=libcgroup state=present
  become: True
  when: ansible_os_family == "RedHat"


--- a/roles/telegraf/templates/telegraf.conf.j2
+++ b/roles/telegraf/templates/telegraf.conf.j2
@@ -99,13 +99,13 @@

 # Read metrics about disk usage by mount point
 [[inputs.disk]]
-  {% if 'ComputeNodes' in group_names %}
+{% if 'ComputeNodes' in group_names %}
  interval = "60s"
-  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4"]
-  {% endif %}
-  {% if 'LoginNodes' in group_names %}
+  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs", "nfsv4", "nfs4"]
+{% endif %}
+{% if 'LoginNodes' in group_names %}
  interval = "60s"
-  {% endif %}
+{% endif %}
  # By default, telegraf gather stats for all mountpoints.
  # Setting mountpoints will restrict the stats to the specified mountpoints.
  # mount_points=["/"]

--- a/roles/upgrade/tasks/main.yml
+++ b/roles/upgrade/tasks/main.yml
@@ -57,6 +57,7 @@
    name:
      - ipa-client-common
      - kmod-kvdo # found on some older monarch nodes
+      - iwl*firmware # intel wireless Lan
    state: absent
  become: true
  become_user: root

--- a/scripts/addHypervisorsToInventory.py
+++ b/scripts/addHypervisorsToInventory.py
+#!/usr/bin/python3
+
+# The purpose of this script is to enrich every VM of an ansible inventory file
+# in json format with the available hypervisor mapping found in /projects/pMOSP/hypervisor/
+#
+# Input : 'm3-latest' (hypervisor mapping) and 'm3inventory.json', both read from the
+#         current working directory.
+# Output: a bash script on stdout that echoes the enriched inventory as JSON; VMs that
+#         are in the mapping but not in the inventory are reported on stderr and skipped.
+
+import json
+import shlex
+import socket
+import sys
+
+# 'm3-latest' is coming from /projects/pMOSP/hypervisor/m3-latest
+# for monarch see /projects/pMOSP/hypervisor/monarch-vm-hw-mapping-2020
+with open('m3-latest', 'r') as hypervisormapping:
+    # Only the rows between the first three lines and the last line are parsed.
+    # NOTE(review): presumably those are the header/border lines of a '|'-delimited table - confirm.
+    mapping = hypervisormapping.readlines()[3:-1]
+
+with open('m3inventory.json') as json_file:  # this file was created via m3inventory > m3inventory.json
+    inv = json.load(json_file)
+
+hostvars = inv['_meta']['hostvars']
+for row in mapping:
+    columns = row.split('|')
+    vm = columns[2].strip()
+    hyp = columns[4].strip()
+    if vm not in hostvars:
+        sys.stderr.write("Not found in inventory: {}\n".format(vm))
+        continue
+    # The hypervisor is resolved through its '-1g' host name; a failed lookup still aborts the run.
+    hostvars[vm]['hypervisor_ip'] = socket.gethostbyname(hyp + '-1g.erc.monash.edu')
+
+# shlex.quote single-quotes the JSON and escapes embedded single quotes, so the generated
+# script stays valid even when an inventory value contains a "'".
+print("#!/bin/bash\necho " + shlex.quote(json.dumps(inv, indent=4, sort_keys=True)))
--- a/scripts/check_summary.py
+++ b/scripts/check_summary.py
@@ -101,6 +101,63 @@ def load_data(artifactfile="artifacts.zip", nodeclass="compute_ansible_check.log
    return df


+def bokeh_plot(df,title):
+    # Draw a heat map (one coloured cell per taskid/host pair) titled `title`, save it with bokeh's save() and return the figure.
+    # df must provide the columns task, host, change, changestr and taskid; the axis ranges are taken from index levels 1 (x) and 0 (y).
+    # (the value of change sets the colour but the value of changestr is shown in the tool tip)
+    from bokeh.io import output_file, show
+    from bokeh.layouts import column
+    from bokeh.plotting import figure
+    from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar, Div
+    from bokeh.plotting import figure, save
+    from math import pi
+
+
+    # this is an abbreviated colormap from a bokeh example
+    colors = ['#084594', '#2171b5', '#4292c6', "#dfccce", "#550b1d"]
+    #colors = [ "#e2e2e2", "#dfccce", "#550b1d"]
+    mapper = LinearColorMapper(palette=colors, low=0, high=4)  # 'change' values 0..4 are mapped linearly onto the five colours
+    #colors = [ "#e2e2e2", "#dfccce", "#550b1d"]
+    #mapper = LinearColorMapper(palette=colors, low=0, high=2)
+
+    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
+
+
+    dataxrange = list(df.index.get_level_values(1).unique())  # x categories: unique values of index level 1
+    datayrange = list(df.index.get_level_values(0).unique())  # y categories: unique values of index level 0
+    p = figure(title=title,
+               x_range=dataxrange, y_range=datayrange,
+               x_axis_location="above", 
+               sizing_mode='stretch_width',
+               tools=TOOLS, toolbar_location='below',
+               tooltips=[('host', '@host'), ('task', '@task'), ('changed', '@changestr'),('taskid','@taskid') ])
+
+    p.grid.grid_line_color = None
+    p.axis.axis_line_color = None
+
+    p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
+    p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
+    p.xaxis.major_label_text_color = None  # turn off x-axis tick labels leaving space
+    p.xaxis.major_label_text_font_size = '0pt'  # turn off x-axis tick labels
+
+    p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
+    p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
+    p.yaxis.major_label_text_color = None  # turn off y-axis tick labels leaving space
+    p.yaxis.major_label_text_font_size = '0pt'  # turn off y-axis tick labels
+    #p.axis.major_tick_line_color = None
+    #p.axis.major_label_text_font_size = "5pt"
+    #p.axis.major_label_standoff = 0
+    #p.xaxis.major_label_orientation = pi / 3
+
+    p.rect(x="taskid", y="host", width=1, height=1,
+           source=df,
+           fill_color={'field': 'change', 'transform': mapper},
+           line_color=None)
+    save(p)  # no filename is passed: relies on the module-level bokeh.io.output_file(...) call
+
+    return p
+
+
 import logging
 from slack_logger import SlackHandler, SlackFormatter
 slack_hook = os.environ['SLACK_HOOK']
@@ -117,25 +174,82 @@ sh.setLevel(logging.DEBUG)

 logger.addHandler(sh)

-df = load_data(nodeclass="compute_ansible_check.log")

-if (len(sys.argv)>1 and sys.argv[1]=='outputChangedNodeList'):
+import bokeh.io
+from datetime import datetime
+strBokehfile="output.html"  
+#datetime.today().strftime('%Y%m%d')+'.html'
+bokeh.io.output_file(strBokehfile)
+#from bokeh.io import curdoc
+from bokeh.models import Div
+from bokeh.layouts import layout, column
+import logging
+import sys
+import os
+
+
+
+
+df = load_data(nodeclass="compute_ansible_check.log")
+# Load before branching: the outputChangedNodeList branch below reads df (NameError otherwise).
+if (len(sys.argv)>1 and 'outputChangedNodeList' in sys.argv):
     print(yaml.dump(list(df[df.change == 2].host.unique())))
     sys.exit(0)
-    
+
+
+if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
+    cmpplot = bokeh_plot(df, "Compute Nodes")
+    cmd="mv "+strBokehfile+" comp_"+strBokehfile
+    os.system(cmd)
+    #cmd='swift upload ansiblechecker comp_'+strBokehfile
+    #os.system(cmd)
+
 nodes = len(df.host.unique())
 changed = len(df[df.change == 2].host.unique())
 failed = len(df[df.change == 4].host.unique())
 logger.info("{} Compute nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))

 df = load_data(nodeclass="login_ansible_check.log")
+if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
+    cmpplot = bokeh_plot(df, "login Nodes")
+    cmd="mv "+strBokehfile+" login_"+strBokehfile
+    os.system(cmd)
+    #cmd='swift upload ansiblechecker login_'+strBokehfile
+    #os.system(cmd)
 nodes = len(df.host.unique())
 changed = len(df[df.change == 2].host.unique())
 failed = len(df[df.change == 4].host.unique())
 logger.info("{} Login nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))

 df = load_data(nodeclass="mgmt_ansible_check.log")
+if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
+    cmpplot = bokeh_plot(df, "mgmt_ Nodes")
+    cmd="mv "+strBokehfile+" mgmt_"+strBokehfile
+    os.system(cmd)
+    #cmd='swift upload ansiblechecker mgmt_'+strBokehfile
+    #os.system(cmd)
 nodes = len(df.host.unique())
 changed = len(df[df.change == 2].host.unique())
 failed = len(df[df.change == 4].host.unique())
 logger.info("{} Management nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))
+
+df = load_data(nodeclass="dgx_ansible_check.log")
+if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
+    cmpplot = bokeh_plot(df, "dgx_ Nodes")
+    cmd="mv "+strBokehfile+" dgx_"+strBokehfile
+    os.system(cmd)
+    #cmd='swift upload ansiblechecker dgx_'+strBokehfile
+    #os.system(cmd)
+nodes = len(df.host.unique())
+changed = len(df[df.change == 2].host.unique())
+failed = len(df[df.change == 4].host.unique())
+logger.info("{} DGX nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))
+
+logger.info("this is defined in .gitlab-ci.yml in ansible_check and the trigger is configured in https://gitlab.erc.monash.edu.au/hpc-team/clusterbuild/pipeline_schedules ")
+#logger.info("https://swift.rc.nectar.org.au/v1/AUTH_e86c925319094fb2b8cc1bf2373c69dc/ansiblechecker/"+strBokehfile)
+str="https://gitlab.erc.monash.edu.au/hpc-team/clusterbuild/-/jobs/"+os.environ['CI_JOB_ID']+"/artifacts/browse"
+logger.info(str)
+#if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
+    #cmpplot = bokeh_plot(df, "Compute Nodes")
+    #cmd='swift upload ansiblechecker '+strBokehfile
+    #os.system(cmd)
\ No newline at end of file