Commit 1ceea7ce authored by Chris Hines

Merge branch 'master' of gitlab.erc.monash.edu.au:hpc-team/HPCasCode into mlaas-ci

parents d99f67f3 55a3c9e5
......@@ -68,7 +68,7 @@
become: true
tags: [never,sqlverify]
- hosts: 'LoginNodes:!perfsonar01:!GlobusNodes'
- hosts: 'LoginNodes:!perfsonar01'
gather_facts: false
tasks:
- name: set nologin
......@@ -87,7 +87,8 @@
- name: terminate user ssh processes
block:
- { name: kill shells, shell: 'ps aux | grep -i bash | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root }
- { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root }
- { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp|sftp-server|sshd" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root }
- { name: kill vscode, shell: 'pgrep -f vscode | xargs -I{} kill -09 {}', become: true, become_user: root, ignore_errors: true }
become: true
tags: [never,terminateusersshscprsync]
......
......@@ -16,9 +16,16 @@
with_items:
- monashhpc_base.repo
- monashhpc_others.repo
- epel.repo
- monashhpc_epel.repo
register: addingrepos
- name: install yum-utils package
package:
name: yum-utils
state: present
become: true
when: ansible_os_family == 'RedHat'
- name: get enabled repos
#shell: yum repolist | grep -v "repo id" | grep -v "Loaded plugins" | head -n -1 | cut -f 1 -d '/' | sed -s 's/\!//'
#shell: yum repolist all | grep enabled | cut -f 1 -d '/' | sed -s 's/\!//'
......
......@@ -5,21 +5,25 @@ name=MonashHPC base repository mirrored to control the update process
baseurl=https://{{ reposervername }}/{{ repopath }}/$releasever/os/$basearch/
enabled=1
sslverify=false
gpgcheck=0
[monashhpc_updates]
name=MonashHPC base repository mirrored to control the update process
baseurl=https://{{ reposervername }}/{{ repopath }}/$releasever/updates/$basearch/
enabled=1
sslverify=false
gpgcheck=0
[monashhpc_extras]
name=MonashHPC base repository mirrored to control the update process
baseurl=https://{{ reposervername }}/{{ repopath }}/$releasever/extras/$basearch/
enabled=1
sslverify=false
gpgcheck=0
[monashhpc_centosplus]
name=MonashHPC base repository mirrored to control the update process
baseurl=https://{{ reposervername }}/{{ repopath }}/$releasever/centosplus/$basearch/
enabled=1
sslverify=false
gpgcheck=0
# Place this file in your /etc/yum.repos.d/ directory
[epel]
[monashhpc_epel]
name=Extra Packages for Enterprise Linux 7 - $basearch
baseurl=https://{{ reposervername }}/epel/$releasever/$basearch/
enabled=0
......
#!/usr/bin/python3 -E
from jinja2 import Template, Environment, FileSystemLoader
import itertools
import subprocess
import datetime
import os
import sys
import time
import socket
from subprocess import call
import re
import json
def grab_card_ids():
# This method runs nvidia-smi to grab the card ids, then returns a list
if not os.path.isfile("/bin/nvidia-smi"):
print("nvidia-smi binary not found!")
exit(1)
cmd = ["/bin/nvidia-smi", "--query-gpu=pci.bus_id","--format=csv,noheader"]
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
cards = []
for line in p.stdout.readlines():
line = line.decode().rstrip().split(":")[2]
pcibus_num = int(re.sub('[.:]', '', line).rstrip("0"),16)
card = "PCI:0:{}:0".format(str(pcibus_num))
cards.append(card)
return cards
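# Illustrative example (values assumed, not taken from a real host): nvidia-smi
# reports a bus id such as "00000000:00:06.0"; split(":")[2] yields "06.0",
# re.sub strips the "." and rstrip drops the trailing zero to give "06", and
# int("06", 16) == 6, so grab_card_ids() returns "PCI:0:6:0" for that card.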
def grab_card_boardname():
if not os.path.isfile("/bin/nvidia-smi"):
print("nvidia-smi binary not found!")
exit(1)
cmd = ["/bin/nvidia-smi", "--query-gpu=name","--format=csv,noheader"]
cards = []
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in p.stdout.readlines():
line = line.decode().rstrip()
cards.append(line)
return cards
def write_xorg_conf(cards):
num_of_cards = len(cards) + 1
boardname = (grab_card_boardname())[0]
gpus = []
file_loader = FileSystemLoader('/opt/generate-xorg/template')
env = Environment(loader=file_loader)
template = env.get_template('xorg.conf.j2')
for i in range(1, num_of_cards):
monitors = []
screens = []
res = list(itertools.combinations(cards,i))
for j in range(i):
monitors.append("Monitor"+str(j))
screens.append("Screen"+str(j))
for card in res:
filename = "/etc/X11/xorg.conf." + str(i) + str(res.index(card))
template.stream({'boardname':boardname,'monitors':monitors,'screens':screens,'devices':card}).dump(filename)
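# For a hypothetical two-GPU node (assumed ids PCI:0:6:0 and PCI:0:7:0) the
# loops above write /etc/X11/xorg.conf.10 and /etc/X11/xorg.conf.11 (the
# single-GPU layouts) plus /etc/X11/xorg.conf.20 (the layout using both GPUs).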
if __name__ == "__main__":
cards = grab_card_ids()
write_xorg_conf(cards)
---
- name: install dependencies
yum:
name: python-jinja2
state: present
become: true
when: 'ansible_os_family=="RedHat" and "python" in discovered_interpreter_python'
- name: install dependencies
yum:
name: python36-jinja2
enablerepo: epel
enablerepo: monashhpc_epel
state: present
become: true
when: ansible_os_family=="RedHat"
when: 'ansible_os_family=="RedHat" and "python3" in discovered_interpreter_python'
- name: install dependencies
yum:
name: python3-jinja2
state: present
become: true
when: ansible_os_family=="Debian"
- name: create /opt/generate-xorg and template dirs
become: yes
......@@ -17,7 +31,7 @@
- name: copy nvidia-xconf-gen.py
become: yes
copy:
src: files/nvidia-xconf-gen.py
src: ../../scripts/nvidia-xconf-gen.py
dest: /opt/generate-xorg/nvidia-xconf-gen.py
owner: root
mode: '0755'
......
---
- name: disable selinux
selinux: state=disabled
become: True
become_user: root
- block:
- name: disable selinux
selinux: state=disabled
become: true
register: selinuxvar
- name: reboot if needed
reboot:
when: selinuxvar is defined and selinuxvar.reboot_required
become: true
when: ansible_os_family=="RedHat"
%{{ sudo_group }} ALL=(ALL) ALL
{% if nopasswd_user is defined %}
{{ nopasswd_user }} ALL=(ALL) NOPASSWD:ALL
{% endif %}
......@@ -53,7 +53,7 @@
name: "{{ extra_packages_epel }}"
update_cache: yes
state: present
enablerepo: epel # exclude={{ excludes|join(',') }}
enablerepo: monashhpc_epel # exclude={{ excludes|join(',') }}
become: true
become_user: root
when:
......
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 375.66 (buildmeister@swio-display-x86-rhel47-06) Mon May 1 15:45:32 PDT 2017
Section "DRI"
Mode 0666
EndSection
Section "ServerLayout"
#InputDevice "Keyboard0" "CoreKeyboard"
#InputDevice "Mouse0" "CorePointer"
Identifier "Layout0"
{% for screen in screens %}
{% if screens.index(screen) == 0 %}
Screen 0 "Screen{{screens.index(screen)}}"
{% else %}
Screen {{screens.index(screen)}} "Screen{{screens.index(screen)}}" RightOf "Screen{{screens.index(screen)-1}}"
{% endif %}
{% endfor %}
#InputDevice "Keyboard0" "CoreKeyboard"
#InputDevice "Mouse0" "CorePointer"
EndSection
Section "Files"
FontPath "/usr/share/fonts/default/Type1"
EndSection
Section "InputDevice"
# generated from default
Identifier "Mouse0"
Driver "mouse"
Option "Protocol" "auto"
Option "Device" "/dev/input/mice"
Option "Emulate3Buttons" "no"
Option "ZAxisMapping" "4 5"
EndSection
Section "InputDevice"
# generated from default
Identifier "Keyboard0"
Driver "kbd"
EndSection
{% for monitor in monitors %}
Section "Monitor"
Identifier "{{monitor}}"
VendorName "Unknown"
ModelName "Unknown"
HorizSync 28.0 - 33.0
VertRefresh 43.0 - 72.0
Option "DPMS"
EndSection
{% endfor %}
{% for device in devices %}
Section "Device"
Identifier "Device{{devices.index(device)}}"
Driver "nvidia"
VendorName "NVIDIA Corporation"
boardname "{{boardname}}"
BusID "{{device}}"
EndSection
{% endfor %}
{% for screen in screens %}
Section "Screen"
Identifier "Screen{{screens.index(screen)}}"
Device "Device{{screens.index(screen)}}"
Monitor "Monitor{{screens.index(screen)}}"
DefaultDepth 24
Option "ProbeAllGpus" "false"
{% if boardname == 'GRID K1' %}
Option "UseDisplayDevice" "None"
{% endif %}
SubSection "Display"
Virtual 1920 1200
Depth 24
EndSubSection
EndSection
{% endfor -%}
---
- name: create /opt/generate-xorg and template dirs
become: yes
file:
path: /opt/generate-xorg/template
state: directory
mode: '0755'
- name: copy nvidia-xconf-gen.py
become: yes
copy:
src: nvidia-xconf-gen.py
dest: /opt/generate-xorg/nvidia-xconf-gen.py
owner: root
mode: '0755'
- name: copy xorg.conf.j2 template
become: yes
copy:
src: xorg.conf.j2
dest: /opt/generate-xorg/template/xorg.conf.j2
owner: root
mode: '0644'
- name: Creates ansible-generate-xorg file under /etc/cron.d
become: yes
cron:
name: cron job to generate xorg after reboot
special_time: reboot
user: root
job: "/opt/generate-xorg/nvidia-xconf-gen.py"
cron_file: ansible-generate-xorg
#!/usr/bin/env python
from jinja2 import Template
import itertools
import subprocess
import datetime
import os
import sys
import time
import socket
from subprocess import call
import re
import json
def check_nvidia_smi():
try:
smi = subprocess.check_output(["which","nvidia-smi"])
except subprocess.CalledProcessError:
print("nvidia-smi binary not found!")
exit(1)
def grab_card_ids():
# This method runs nvidia-smi to grab the card ids, then returns a list
check_nvidia_smi()
cmd = ["nvidia-smi", "--query-gpu=pci.bus_id","--format=csv,noheader"]
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
cards = []
for line in p.stdout.readlines():
stripped_line = line.rstrip().split(":")[2]
#check for different format of pcibus_id. This happens on baremetals
# i.e. 00000000:06:00.0 not 00000000:00:06.0
pcibus_id = re.sub('[.:]', '', stripped_line).rstrip("0")
if not pcibus_id: # empty string, try the other way
stripped_line = line.rstrip().split(":")[1]
pcibus_id = re.sub('[.:]', '', stripped_line).rstrip("0")
if not pcibus_id:
print("Error in grab_card_ids: we can not parse the line {}".format(line))
print("Command that generated it is: {}".format(cmd))
system.exit(1)
pcibus_num = int(pcibus_id,16)
card = "PCI:0:{}:0".format(str(pcibus_num))
cards.append(card)
return cards
def grab_card_boardname():
check_nvidia_smi()
cmd = ["nvidia-smi", "--query-gpu=name","--format=csv,noheader"]
cards = []
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in p.stdout.readlines():
line = line.rstrip()
cards.append(line)
return cards
def write_xorg_conf(cards):
num_of_cards = len(cards) + 1
boardname = (grab_card_boardname())[0]
gpus = []
for i in range(1, num_of_cards):
monitors = []
screens = []
res = list(itertools.combinations(cards,i))
for j in range(i):
monitors.append("Monitor"+str(j))
screens.append("Screen"+str(j))
for card in res:
filename = "/etc/X11/xorg.conf." + str(i) + str(res.index(card))
gpus.append({'filename':filename,'boardname':boardname,'monitors':monitors,'screens':screens,'devices':card})
print(json.dumps(gpus))
if __name__ == "__main__":
cards = grab_card_ids()
write_xorg_conf(cards)
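# The JSON printed above is parsed by the "run nvidia-xconf-gen" task below via
# "nvidiacards.stdout | from_json". Illustrative single-GPU output (assumed
# values): [{"filename": "/etc/X11/xorg.conf.10", "boardname": "GRID K1",
# "monitors": ["Monitor0"], "screens": ["Screen0"], "devices": ["PCI:0:6:0"]}]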
......@@ -206,30 +206,6 @@
args:
creates: /etc/X11/xorg.conf
#- name: Template xorg.conf for nodes with one GPU
# template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf
# become: true
# become_user: root
# when: template_xorgconf is defined and template_xorgcon
- name: run nvidia-xconf-gen
script: scripts/nvidia-xconf-gen.py
register: nvidiacards
check_mode: no
changed_when: False
- name: set env for nvidia_card_lists
set_fact:
nvidiacardslist: "{{ nvidiacards.stdout | from_json }}"
- name: generate nvidia-xorg-conf
become: true
template:
src: xorg.conf.j2
dest: "{{ item['filename'] }}"
with_items: "{{ nvidiacardslist }}"
- name: re-start the persistence daemon
service: name=nvidia-persistenced state=started
become: true
......
......@@ -55,8 +55,21 @@
become_user: root
notify: restart sssd
- name: Setting the size of /var/lib/sssd disk for ComputeNodes
set_fact:
ssd_size: "40M"
when: not ( (( inventory_hostname in groups.LoginNodes ) or ( inventory_hostname in groups.BackupNodes )))
- name: Setting the size of /var/lib/sssd disk for LoginNodes or Backup Nodes
set_fact:
ssd_size: "80M"
when: (( inventory_hostname in groups.LoginNodes ) or ( inventory_hostname in groups.BackupNodes ))
- name: Print size of /var/lib/sssd disk
debug: msg="Size of /var/lib/sssd disk is {{ ssd_size }}"
- name: "Make the cache a tmpfs"
mount: name=/var/lib/sss/db/ src=tmpfs fstype=tmpfs opts='size=40m' state=mounted
mount: name=/var/lib/sss/db/ src=tmpfs fstype=tmpfs opts='size={{ ssd_size }}' state=mounted
become: true
become_user: root
......
---
- name: Create a directory for the symlink scripts
ansible.builtin.file:
path: /opt/symlinker/
state: directory
mode: '0755'
become: true
- name: Template project symlink script to /opt/symlinker/symlinker-{{ lustre_storage_type }}.sh
ansible.builtin.template:
src: symlinker.sh.j2
dest: /opt/symlinker/symlinker-{{ lustre_storage_type }}.sh
owner: root
group: root
mode: '0700'
vars:
lustre_storage_type: "{{ item }}"
with_items:
"{{ lustre_storage_types }}"
become: true
- name: Ensure a cron job that updates the lustre symlinks exists and runs every 15 minutes
ansible.builtin.cron:
name: "Update lustre symlinks {{ item }} (node local)"
minute: "*/15"
job: "/opt/symlinker/symlinker-{{ item }}.sh"
user: root
with_items:
"{{ lustre_storage_types }}"
become: true
\ No newline at end of file
#!/bin/bash
originallfs={{ lustre_mount }}
symlinkdest={{ local_directory_path }}
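# Illustrative example (assumed values): with lustre_mount=/mnt/lustre/projects
# and local_directory_path=/projects, an existing directory
# /mnt/lustre/projects/abc123 gets a symlink created at /projects/abc123.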
# Check that original lustre filesystem is mounted, exit if it isn't
if [ ! -d $originallfs ]; then
echo "Check that the filesystem $originalfs is mounted, exiting...";
exit 1
fi
# Check that the symlink destination exists, has the correct permissions etc
if [ ! -d $symlinkdest ]; then
echo "Creating $symlinkdest as it does not exist yet";
mkdir -p $symlinkdest;
chown root:root $symlinkdest;
chmod 0755 $symlinkdest;
fi
# Iterate over directories inside the original fs, create symlinks if they do _not_ exist
for sourcepath in `find "$originallfs" -maxdepth 1 -mindepth 1 -type d`; do
foldername=`basename "$sourcepath"`;
linkpath="$symlinkdest/$foldername";
if [ ! -L "$linkpath" ]; then
ln -sT "$sourcepath" "$linkpath";
fi
done
---
- name: rocemode is 1 or 2
assert:
that:
- rocemode is defined
- rocemode == "1" or rocemode == "2"
fail_msg: "expecting parameter rocemode with values 1 or 2"
success_msg: "continuing with rocev{{ rocemode }}.yml"
- include_tasks: "rocev{{ rocemode }}.yml"
\ No newline at end of file
---
- name: stat if etc systemd system roce_mode.service exists
stat:
path: /etc/systemd/system/roce_mode.service
register: statrocemode
# todo handle multiple devices found
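# ibstat lists each HCA on a header line like: CA 'mlx5_0'
# the grep/awk/sed pipeline below reduces that to the bare device name, e.g. mlx5_0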
- name: query ibstat for devicename
shell: /usr/sbin/ibstat | grep mlx5 | awk '{print $2}' | sed -r "s#'##g"
register: qibdevicenames
check_mode: no
#when: not statrocemode.stat.exists
- name: print qibdevicenames
debug:
var: qibdevicenames
- name: set ibdevicename
set_fact:
ibdevicename: "{{ qibdevicenames.stdout }}"
- name: template roce_mode.service file
template: dest=/etc/systemd/system/roce_mode.service src=roce_mode.service.j2 owner=root group=root mode=0644
become: true
become_user: root
register: service_file
- name: Reload systemd
shell: systemctl daemon-reload
become: true
become_user: root
when: service_file.changed
register: reload_service
- name: enable roce_mode setting
service: name=roce_mode enabled=yes
become: true
- name: enable roce_mode setting
service: name=roce_mode state=started enabled=yes
become: true
when: start_roce_service is undefined or start_roce_service
---
- name: stat if etc systemd system roce_mode.service exists
stat:
path: /etc/systemd/system/roce_mode.service
register: statrocemode
# todo handle multiple devices found
- name: query ibstat for devicename
shell: /usr/sbin/ibstat | grep mlx5 | awk '{print $2}' | sed -r "s#'##g"
register: qibdevicenames
check_mode: no
changed_when: false
#when: not statrocemode.stat.exists
- name: print qibdevicenames
debug:
var: qibdevicenames
- name: template pfc-ecn script
become: true
template: