#%PAM-1.0
auth sufficient pam_rootok.so
auth sufficient pam_permit.so
account required pam_permit.so
session optional pam_keyinit.so force revoke
# GPU Update Role
paulmc 21/06/2016
This role was initially created to deploy a new driver to CentOS 6 MASSIVE.
This should also work on a new system.
For an update, a separate playbook can be created to deploy a new driver on an existing system.
Note that the particular driver install needs to be in the files dir
e.g.
gpudriver_352.93.yml
# This is a cut-down playbook for deploying a new driver to MASSIVE m1 and m2
- hosts: all
  strategy: free
  roles:
    - { role: gpu_update, nvidia_version: 352.93 }
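To roll a driver out, a cut-down playbook like the one above is run against the drained GPU nodes. A rough invocation sketch (the inventory path and host limit below are placeholders, not files in this repo):

ansible-playbook -i inventories/m1_m2 gpudriver_352.93.yml --limit 'gpu*'   # inventory and limit pattern are examples only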
---
## Check for jobs and stop if the node is online or running jobs
- name: Check node is offline and no jobs are running
  shell: /usr/local/slurm/latest/bin/scontrol show node $HOSTNAME --oneline
  register: node_status_result
  always_run: yes
- debug: var=node_status_result.stdout_lines
- set_fact:
    slurm_state_down_star_drain: "State=DOWN*+DRAIN "
    slurm_state_down_drain: "State=DOWN+DRAIN "
    slurm_state_down_star: "State=DOWN* "
    slurm_state_idle_drain: "State=IDLE+DRAIN "
    slurm_state_rsrv_drain: "State=RESERVED+DRAIN "
- name: Fail if jobs are running
  fail: msg="The node is not in IDLE+DRAIN, DOWN*, DOWN*+DRAIN, DOWN+DRAIN or RESERVED+DRAIN, so we will not continue!"
  when: (slurm_state_down_star_drain not in node_status_result.stdout)
        and (slurm_state_down_drain not in node_status_result.stdout)
        and (slurm_state_down_star not in node_status_result.stdout)
        and (slurm_state_idle_drain not in node_status_result.stdout)
        and (slurm_state_rsrv_drain not in node_status_result.stdout)
# when: (node_status_result.stdout.find('State=DOWN\*\+DRAIN') != -1)
#   or (node_status_result.stdout.find('State=DOWN\* ') != -1)
#   or (node_status_result.stdout.find('State=IDLE\+DRAIN') != -1)
## Check hardware and stop if we are running on the wrong node
- name: Check for GPU hardware before attempting to install driver
  shell: lspci | grep "NVIDIA" | grep "3D controller"
  always_run: yes
  register: lspci_result
- name: Show what GPUs lspci has found
  debug: var=lspci_result.stdout_lines
- name: Check and fail on no GPU
  fail: msg="There is no GPU and you are trying to install a driver!?"
  when: lspci_result.rc != 0
- name: Set cuda init script
  template: dest=/etc/init.d/cuda src=cuda mode="u=rwx,g=rx,o=rx"
  become: true
## Install packages
- name: install deps
  yum: name={{ item }} state=present
  become: true
  with_items:
    - gcc
    - perl
    - wget
    - pciutils
    - kernel-headers
    - kernel-devel
    - xterm
    - libX11-common
    - libX11-devel
    - libX11
    - xorg-x11-server-common
    - xorg-x11-util-macros
    - xorg-x11-server-utils
    - xorg-x11-font-utils
    - xorg-x11-server-Xorg
    - xorg-x11-glamor
    - xorg-x11-xinit
    - xorg-x11-utils
    - xorg-x11-xauth
    - xorg-x11-proto-devel
    - xorg-x11-xkb-utils
## Disable nouveau (only required once on build)
# MASSIVE M1 and M2 originally used this method...
- name: Add nouveau to blacklist (MASSIVE CentOS 6 only)
  lineinfile:
    dest: /etc/modprobe.d/blacklist.conf
    line: "blacklist nouveau"
    state: present
  become: true
  when: ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6
# M3: but this is the preferred method (which is what the installer does)
- name: Template nvidia-installer-disable-nouveau.conf
  template: dest=/etc/modprobe.d/nvidia-installer-disable-nouveau.conf src=nvidia-installer-disable-nouveau.conf.j2
  become: true
  when: not (ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6)
- name: Check if nouveau module is loaded
  shell: cat /proc/modules
  always_run: yes
  register: modules_result
- name: Restart host to remove nouveau module
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  become: true
  ignore_errors: true
  when: modules_result.stdout.find('nouveau') != -1
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: modules_result.stdout.find('nouveau') != -1
# Removed as this is related to old ways of controlling access to driver files
# - name: Template nvidia.conf
#   template: dest=/etc/modprobe.d/nvidia.conf src=nvidia.conf.j2
#   become: true
## Install NVIDIA driver
- name: Check nvidia driver version
  shell: nvidia-smi
  register: nvidia_result
  always_run: yes
- debug: var=nvidia_result.stdout_lines
- set_fact:
    upgrading_driver: false
- name: Set upgrading_driver flag
  set_fact:
    upgrading_driver: true
  when: nvidia_result.stdout.find("{{ nvidia_version }}") == -1
- debug: var=upgrading_driver
- name: Unload nvidia module
  shell: modprobe -r nvidia
  ignore_errors: true
  when: upgrading_driver
# when: '"{{ nvidia_version }}" not in nvidia_result.stdout'
- name: Check nvidia module is not loaded
  shell: cat /proc/modules
  register: nvidia_modules_result
  always_run: yes
- name: Restart host to unload nvidia module
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  become: true
  ignore_errors: true
  when: upgrading_driver and (nvidia_modules_result.stdout.find('nvidia') != -1)
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: upgrading_driver and (nvidia_modules_result.stdout.find('nvidia') != -1)
- name: Copy nvidia installer to /tmp
  copy:
    src: "NVIDIA-Linux-x86_64-{{ nvidia_version }}.run"
    dest: "/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run"
    mode: "755"
  # shell: cp -f /usr/local/src/CUDA/driver/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
  # become: true
  when: upgrading_driver
- name: Install nvidia driver
  shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent
  become: true
  when: upgrading_driver
  # when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
## Configure stuff for using the GPU
- name: Configure xorg.conf with nvidia-xconfig so xorg.conf matches gpu number
  shell: /usr/bin/nvidia-xconfig -a --use-display-device=none
  become: true
- name: Comment out auth required so xserver can start from slurm job
  lineinfile:
    dest: /etc/pam.d/xserver
    regexp: '^auth\s+required\s+pam_console.so'
    line: '#auth required pam_console.so'
    backrefs: yes
    # state: present
  become: true
- name: set persistence mode
  lineinfile:
    dest: /etc/rc.d/rc.local
    line: "nvidia-smi --persistence-mode=1"
    state: present
  become: true
- name: Restart host to enable new driver
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  become: true
  ignore_errors: true
  when: upgrading_driver
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: upgrading_driver
# We had one error where the device was not created correctly; this is a check for that
- stat: path=/dev/nvidiactl
  register: nvidiactl
- fail: msg="Something is up, see RT ticket 9477"
  when: not nvidiactl.stat.exists
- name: Check nvidia driver version (takes a while after reboot)
  command: nvidia-smi
  register: nvidia_result
  until: nvidia_result.stdout.find("NVIDIA-SMI has failed") == -1
  retries: 5
  delay: 5
- name: Check GPU correct version
  debug: msg="Correct Driver Version {{ nvidia_version }}"
  when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
- name: Start Slurm
  service: name=slurm state=started
  when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
#!/bin/bash
#
# Startup/shutdown script for nVidia CUDA
#
# chkconfig: 345 80 20
# description: Startup/shutdown script for nVidia CUDA
# Source function library.
. /etc/init.d/functions
DRIVER=nvidia
RETVAL=0
# Create /dev nodes for nvidia devices
function createnodes() {
    # Count the number of NVIDIA controllers found.
    N3D=`/sbin/lspci | grep -i NVIDIA | grep "3D controller" | wc -l`
    NVGA=`/sbin/lspci | grep -i NVIDIA | grep "VGA compatible controller" | wc -l`
    N=`expr $N3D + $NVGA - 1`
    for i in `seq 0 $N`; do
        mknod -m 666 /dev/nvidia$i c 195 $i
        RETVAL=$?
        [ "$RETVAL" = 0 ] || exit $RETVAL
    done
    mknod -m 666 /dev/nvidiactl c 195 255
    RETVAL=$?
    [ "$RETVAL" = 0 ] || exit $RETVAL
}

# Remove /dev nodes for nvidia devices
function removenodes() {
    rm -f /dev/nvidia*
}
# Start daemon
function start() {
    echo -n $"Loading $DRIVER kernel module: "
    depmod -a
    modprobe $DRIVER && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL

    echo -n $"Initializing CUDA /dev entries: "
    createnodes && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}

# Stop daemon
function stop() {
    echo -n $"Unloading $DRIVER kernel module: "
    rmmod -f $DRIVER && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL

    echo -n $"Removing CUDA /dev entries: "
    removenodes && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}
# See how we were called
case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        start
        ;;
    *)
        echo $"Usage: $0 {start|stop|restart}"
        RETVAL=1
esac

exit $RETVAL
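The playbook templates this script to /etc/init.d/cuda; on a CentOS 6 style init system it would be registered and started along these lines (illustrative commands, not tasks from this role):

chkconfig --add cuda    # registers the script using the "chkconfig: 345 80 20" header above
chkconfig cuda on
service cuda start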
# paulmc - no longer needed; this used to be used to disable nouveau at boot on older OSes
# grub.conf generated by anaconda
#
# Note that you do not have to rerun grub after making changes to this file
# NOTICE: You do not have a /boot partition. This means that
# all kernel and initrd paths are relative to /, eg.
# root (hd0,0)
# kernel /boot/vmlinuz-version ro root=/dev/vda1
# initrd /boot/initrd-[generic-]version.img
#boot=/dev/vda
default=0
timeout=5
splashimage=(hd0,0)/boot/grub/splash.xpm.gz
hiddenmenu
title CentOS (2.6.32-504.el6.x86_64)
        root (hd0,0)
        kernel /boot/vmlinuz-2.6.32-504.el6.x86_64 ro root=/dev/vda1 rd_NO_LUKS KEYBOARDTYPE=pc KEYTABLE=us LANG=en_US.UTF-8 rd_NO_MD SYSFONT=latarcyrheb-sun16 crashkernel=auto elevator=noop biosdevname=0 console=ttyS0 rdblacklist=nouveau nouveau.modeset=0 rd_NO_LVM rd_NO_DM rhgb quiet
        initrd /boot/initramfs-2.6.32-504.el6.x86_64.img
# generated by nvidia-installer
blacklist nouveau
options nouveau modeset=0
# paulmc no longer needed - this was for setting user permissions to use driver
options nvidia NVreg_DeviceFileMode=0666
# paulmc - no longer needed, we use nvidia-xconfig to generate this based on the GPUs it finds
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 340.58 (buildmeister@swio-display-x86-rhel47-09) Fri Oct 31 17:40:05 PDT 2014
Section "DRI"
Mode 0660
Group "vglusers"
EndSection
Section "ServerLayout"
Identifier "Layout0"
Screen 0 "Screen0"
InputDevice "Keyboard0" "CoreKeyboard"
InputDevice "Mouse0" "CorePointer"
EndSection
Section "Files"
FontPath "/usr/share/fonts/default/Type1"
EndSection
Section "InputDevice"
# generated from default
Identifier "Mouse0"
Driver "mouse"
Option "Protocol" "auto"
Option "Device" "/dev/input/mice"
Option "Emulate3Buttons" "no"
Option "ZAxisMapping" "4 5"
EndSection
Section "InputDevice"
# generated from data in "/etc/sysconfig/keyboard"
Identifier "Keyboard0"
Driver "kbd"
Option "XkbLayout" "us"
Option "XkbModel" "pc105"
EndSection
Section "Monitor"
Identifier "Monitor0"
VendorName "Unknown"
ModelName "Unknown"
HorizSync 28.0 - 33.0
VertRefresh 43.0 - 72.0
Option "DPMS"
EndSection
Section "Device"
Identifier "Device0"
Driver "nvidia"
VendorName "NVIDIA Corporation"
BusID "PCI:00:06:0"
EndSection
Section "Screen"
Identifier "Screen0"
Device "Device0"
Monitor "Monitor0"
DefaultDepth 24
SubSection "Display"
Depth 24
EndSubSection
EndSection
# paulmc - no longer needed, we use inline changes instead
#%PAM-1.0
auth sufficient pam_rootok.so
auth sufficient pam_permit.so
account required pam_permit.so
session optional pam_keyinit.so force revoke
---
- name: ensure hpcid_ca is in the authorized_keys file
  authorized_key: user={{ ansible_user }} key="cert-authority ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCfHlWGrnpirvqvUTySnoQK6ze5oIXz7cYIT+XCBeBCahlK05O38g0erBGrNWFozZwbIXnysVCibaUJqtH0JrYqmcr2NnYA0PoiTeranvaJI7pQsga1gBxfK/D4UItw5yI6V7w9efMT0zpIP8WEubQz6GFtkyiNVgFCHj3+VhLs3RslvYzb35SFcLXEDsGVQM5NdWBUgRaNRqpTPvuMcxTyPvy32wW72kwaYRQioDJFcE2WJ240M2oSsx+dhTWvI8sW1sEUI1qIDfyBPsOgsLofuSpt4ZNgJqBUTp/hW85wVpNzud6A4YJWHpZXSDMtUMYE9QL+x2fw/b26yck9ZPE/ hines@tun"
---
- name: install system dependencies
  yum: name={{ item }} state=present
  with_items:
    - openssl-devel
    - openldap-devel
    - python-pip
    - git
    - python-virtualenv
  become: true
  become_user: root
  when: ansible_os_family == "RedHat"
- name: install system dependencies
  apt: name={{ item }} state=present
  with_items:
    - libssl-dev
    - libldap2-dev
  become: true
  become_user: root
  when: ansible_os_family == "Debian"
- name: create install dir
  file: name={{ item }} state=directory owner={{ ansible_user }}
  with_items:
    - "/usr/local/hpcsystem"
    - "/usr/local/hpcsystem_config"
    - "/usr/local/virtualenvs/mercpytools"
  become: true
  become_user: root
- name: upgrade pip
  pip:
    virtualenv: "/usr/local/virtualenvs/mercpytools"
    name: "pip"
    extra_args: "--upgrade"
- name: install mercpytools
  pip:
    virtualenv: "/usr/local/virtualenvs/mercpytools"
    name: "git+https://gitlab.erc.monash.edu.au/hpc-team/mercpytools.git#egg=mercpytools"
    extra_args: "--upgrade"
- name: install hpcsystem
  git:
    repo: git@gitlab.erc.monash.edu.au:hpc-team/hpcsystem.git
    dest: /usr/local/hpcsystem
    accept_hostkey: True
- name: install hpcsystem_config
  git:
    repo: git@gitlab.erc.monash.edu.au:hpc-team/m3_hpcsystem_config.git
    dest: /usr/local/hpcsystem_config
    accept_hostkey: True
- name: cron job to check quotas
  cron:
    name: "Naggy quota cron job"
    value: '/usr/local/hpcsystem/naggy_quota.sh'
    hour: 16
    minute: 23
  become: true
  become_user: root
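A quick way to sanity-check the result of the tasks above, as a sketch only (these are ad hoc commands run as root on the target host, not part of the role):

/usr/local/virtualenvs/mercpytools/bin/pip show mercpytools   # confirm the virtualenv install
ls /usr/local/hpcsystem /usr/local/hpcsystem_config           # confirm both checkouts exist
crontab -l | grep naggy_quota                                 # confirm the quota cron entry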
---
- name: Pre installation
  shell: "{{ preInstallation }}"
  become: true
  ignore_errors: true
  when: ansible_distribution == 'CentOS' and preInstallation is defined
- name: Add new repo file
  shell: "{{ importRepo.command }} {{ importRepo.destination }}"
  become: true
  run_once: true
  args:
    creates: "{{ importRepo.destination }}"
  when: ansible_distribution == 'CentOS' and importRepo is defined
- name: Install yum packages
  yum: name={{ item }} state=present
  with_items: yumPackageList
  become: true
  when: ansible_distribution == 'CentOS' and yumPackageList is defined
- name: Install yum group packages
  shell: yum --setopt=protected_multilib=false -y groupinstall "{{ item }}"
  with_items: yumGroupPackageList
  become: true
  when: ansible_distribution == 'CentOS' and yumGroupPackageList is defined
- name: Post installation
  shell: "{{ postInstallation }}"
  become: true
  when: ansible_distribution == 'CentOS' and postInstallation is defined
- name: conditional shell copy command
  shell: "{{ cliCopy.run }}"
  become: true
  run_once: true
  args:
    creates: "{{ cliCopy.check }}"
  when: ansible_distribution == 'CentOS' and cliAction is defined
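The tasks above are driven entirely by variables (preInstallation, importRepo, yumPackageList, yumGroupPackageList, postInstallation, cliCopy/cliAction) that are expected to come from group_vars or the calling playbook. For a quick test they can also be passed on the command line, e.g. (the playbook name and package choices below are made up for illustration):

ansible-playbook -i inventory extra_packages.yml -e '{"yumPackageList": ["strace", "environment-modules"], "postInstallation": "yum clean all"}'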
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBIbQXH8ZHnl7Ht5YMuGqZ80k+nKnds+58y9VcedVeXDobsF7t6wCRe5GDov8XxYxxWbjz0H7xhx6PQYiVsn6GL0= ubuntu@sshauthz-2
---
- include_vars: "{{ ansible_os_family }}_{{ ansible_architecture }}.yml"
- name: restart ssh
  service: name={{ sshd_name }} state=restarted
  become: true
---
- include_vars: "{{ ansible_os_family }}_{{ ansible_architecture }}.yml"
- name: copy ca cert
  copy: src=server_ca.pub dest=/etc/ssh/server_ca.pub owner=root group=root mode=644
  become: true
- name: edit sshd_config
  lineinfile:
    dest: /etc/ssh/sshd_config
    line: TrustedUserCAKeys /etc/ssh/server_ca.pub
    state: present
  become: true
  notify: restart ssh
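With TrustedUserCAKeys pointing at server_ca.pub, sshd accepts any user certificate signed by that CA. A certificate would be issued on the CA host with something like the following (the private key file name, identity, principal and validity period are only examples):

ssh-keygen -s server_ca -I jsmith -n jsmith -V +52w /tmp/jsmith_id_rsa.pub   # key names and principal are illustrative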
sshd_name: "ssh"
sshd_name: "sshd"
#!/usr/bin/python
# Build an LDAP URI string from an Ansible-style JSON dump of inventory groups:
# every host in the "ldap" group is turned into an ldaps:// URL.
import sys
import json

filename = sys.argv[1]          # path to the JSON file describing the groups
ansible_hostname = sys.argv[2]  # hostname of the node being configured (currently unused)
domain = sys.argv[3]            # domain appended to each ldap host to build its FQDN

f = open(filename, 'r')
s = f.read()
d = json.loads(s)
f.close()

# Collect every host mentioned in any group
hosts = {}
for group in d['groups'].keys():
    for h in d['groups'][group]:
        if hosts.has_key(h):
            pass
        else:
            hosts[h] = {}

# Concatenate an ldaps:// URL for each host in the "ldap" group,
# falling back to a bare ldaps:/// URI if the group is missing.
url = ""
try:
    for host in d['groups']['ldap']:
        fqdn = "%s.%s" % (host, domain)
        url = url + "ldaps://%s" % fqdn
except:
    url = "ldaps:///"
print url
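The script takes the path to the JSON groups dump, the host name and the domain, and prints the ldaps:// URI(s) built from the "ldap" group. A hypothetical invocation (the script and JSON file names are not defined in this diff):

python make_ldap_uri.py groups.json node01 example.org   # file names here are hypothetical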
---
- name: restart apache
  service: name=apache2 state=restarted
  become: true
- name: restart postfix
  service: name=postfix state=restarted
  become: true
---
dependencies:
  - { role: easy-rsa-certificate, x509_csr_args: "", x509_sign_args: "--server", x509_cacert_file: "/etc/ssl/certs/ca.crt", x509_key_file: "/etc/ssl/private/server.key", x509_cert_file: "/etc/ssl/certs/server.crt", x509_common_name: "{{ ansible_fqdn }}" }