diff --git a/roles/gpu_update/README.md b/roles/gpu_update/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c5f4041c563a2cab18b5d0274e2f5477b6f80188
--- /dev/null
+++ b/roles/gpu_update/README.md
@@ -0,0 +1,19 @@
+# GPU Update Role
+paulmc 21/06/2016
+
+This role was initially created to deploy a new driver to CentOS 6 MASSIVE.
+
+This should also work on a new system.
+
+For an update, a separate playbook can be created to deploy a new driver on an existing system.
+
+Note that the particular driver installer needs to be in the files dir
+
+e.g.
+gpudriver_352.93.yml
+# This is a cut down playbook for deploying a new driver to MASSIVE m1 and m2
+- hosts: all
+  strategy: free
+  roles:
+  - { role: gpu_update , nvidia_version: "352.93" }
+
diff --git a/roles/gpu_update/files/NVIDIA-Linux-x86_64-352.93.run.REMOVED.git-id b/roles/gpu_update/files/NVIDIA-Linux-x86_64-352.93.run.REMOVED.git-id
new file mode 100644
index 0000000000000000000000000000000000000000..38a7bca176fcdd29040fe72acc76d2c44c093cab
--- /dev/null
+++ b/roles/gpu_update/files/NVIDIA-Linux-x86_64-352.93.run.REMOVED.git-id
@@ -0,0 +1 @@
+48758c1a73f2a27c14f351a99923c3aa6e4c0cdf
\ No newline at end of file
diff --git a/roles/gpu_update/tasks/main.yml b/roles/gpu_update/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..72635811db1b655a6bfedb90227fbbe1961baf23
--- /dev/null
+++ b/roles/gpu_update/tasks/main.yml
@@ -0,0 +1,220 @@
+---
+## Check for jobs and stop if the node is online or running jobs
+- name: Check node is offline and no jobs are running
+  shell: /usr/local/slurm/latest/bin/scontrol show node $HOSTNAME --oneline
+  register: node_status_result
+  always_run: true
+
+- debug:
+    var: node_status_result.stdout_lines
+
+# Substrings of the scontrol output that mark a node as safe to work on.
+# The trailing space keeps e.g. "State=DOWN* " from matching a longer state.
+- set_fact:
+    slurm_state_down_star_drain: "State=DOWN*+DRAIN "
+    slurm_state_down_drain: "State=DOWN+DRAIN "
+    slurm_state_down_star: "State=DOWN* "
+    slurm_state_idle_drain: "State=IDLE+DRAIN "
+    slurm_state_rsrv_drain: "State=RESERVED+DRAIN "
+
+# Abort unless the node reports one of the accepted states listed above.
+- name: Fail if jobs are running
+  fail:
+    msg: "The node is not in IDLE+DRAIN, DOWN*, DOWN+DRAIN, DOWN*+DRAIN or RESERVED+DRAIN; we will not continue!"
+  when: (slurm_state_down_star_drain not in node_status_result.stdout)
+    and (slurm_state_down_drain not in node_status_result.stdout)
+    and (slurm_state_down_star not in node_status_result.stdout)
+    and (slurm_state_idle_drain not in node_status_result.stdout)
+    and (slurm_state_rsrv_drain not in node_status_result.stdout)
+#  when: (node_status_result.stdout.find('State=DOWN\*\+DRAIN') != -1)
+#    or (node_status_result.stdout.find('State=DOWN\* ') != -1)
+#    or (node_status_result.stdout.find('State=IDLE\+DRAIN') != -1)
+
+
+## Check Hardware and Stop if we are running on a wrong node
+# grep exits non-zero when nothing matches, which would fail this task and
+# stop the host before the explicit "no GPU" message below could be shown.
+# The error is ignored here and the return code is judged by the fail task.
+- name: Check for GPU hardware before attempting to install driver
+  shell: lspci | grep "NVIDIA" | grep "3D controller"
+  always_run: true
+  register: lspci_result
+  ignore_errors: true
+
+- name: Show what GPUs lspci has found
+  debug:
+    var: lspci_result.stdout_lines
+
+- name: Check and fail on no GPU
+  fail:
+    msg: "There is no GPU and you are trying to install a driver!?"
+  when: lspci_result.rc != 0
+
+# Installs the role's "cuda" template as an executable init script.
+- name: Set cuda init script
+  template:
+    dest: /etc/init.d/cuda
+    src: cuda
+    mode: "u=rwx,g=rx,o=rx"
+  sudo: true
+
+## Install packages
+- name: install deps
+  yum:
+    name: "{{ item }}"
+    state: installed
+  sudo: true
+  with_items:
+    - gcc
+    - perl
+    - wget
+    - pciutils
+    - kernel-headers
+    - kernel-devel
+    - xterm
+    - libX11-common
+    - libX11-devel
+    - libX11
+    - xorg-x11-server-common
+    - xorg-x11-util-macros
+    - xorg-x11-server-utils
+    - xorg-x11-font-utils
+    - xorg-x11-server-Xorg
+    - xorg-x11-glamor
+    - xorg-x11-xinit
+    - xorg-x11-utils
+    - xorg-x11-xauth
+    - xorg-x11-proto-devel
+    - xorg-x11-xkb-utils
+
+## Disable Nouveau (only required once on build)
+# MASSIVE M1 and M2 originally used this method...
+# Blacklist nouveau through the shared blacklist.conf on CentOS 6 hosts.
+- name: Add nouveau to blacklist (MASSIVE Centos 6 only)
+  lineinfile:
+    dest: /etc/modprobe.d/blacklist.conf
+    line: "blacklist nouveau"
+    state: present
+  sudo: true
+  when: ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6
+# M3 But this is the preferred method (which is what the installer does)
+# The condition is the exact complement of the CentOS 6 task above, so every
+# host gets exactly one of the two blacklist methods.
+- name: Template nvidia-installer-disable-nouveau.conf
+  template:
+    dest: /etc/modprobe.d/nvidia-installer-disable-nouveau.conf
+    src: nvidia-installer-disable-nouveau.conf.j2
+  sudo: true
+  when: not (ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6)
+
+- name: Check if nouveau module is loaded
+  shell: cat /proc/modules
+  always_run: true
+  register: modules_result
+
+# The shutdown is backgrounded after a short sleep so the task can return
+# before the connection drops; errors from the dying connection are ignored.
+- name: Restart host to remove nouveau module
+  shell: "sleep 2 && shutdown -r now &"
+  async: 1
+  poll: 1
+  sudo: true
+  ignore_errors: true
+  when: "'nouveau' in modules_result.stdout"
+
+- name: Wait for host to reboot
+  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
+  when: "'nouveau' in modules_result.stdout"
+
+# Removed as this is related to old ways of controlling access to driver files
+# - name: Template nvidia.conf
+#   template: dest=/etc/modprobe.d/nvidia.conf src=nvidia.conf.j2
+#   sudo: true
+
+## Install NVIDIA driver
+# nvidia-smi fails when no driver is installed yet. That must not stop the
+# play: an empty/failed result simply means the driver has to be installed.
+- name: Check nvidia driver version
+  shell: nvidia-smi
+  register: nvidia_result
+  always_run: true
+  ignore_errors: true
+
+- debug:
+    var: nvidia_result.stdout_lines
+
+# upgrading_driver is true when the requested version string does not appear
+# in the nvidia-smi output captured above.
+- set_fact:
+    upgrading_driver: false
+- name: Set upgrading_driver flag
+  set_fact:
+    upgrading_driver: true
+  when: (nvidia_version | string) not in nvidia_result.stdout
+
+- debug:
+    var: upgrading_driver
+
+# Best effort: if the module cannot be unloaded the host is rebooted below.
+- name: Unload nvidia module
+  shell: modprobe -r nvidia
+  sudo: true
+  ignore_errors: true
+  when: upgrading_driver
+  # when: '"{{ nvidia_version }}" not in nvidia_result.stdout'
+
+- name: Check nvidia module is not loaded
+  shell: cat /proc/modules
+  register: nvidia_modules_result
+  always_run: true
+
+- name: Restart host to unload nvidia module
+  shell: "sleep 2 && shutdown -r now &"
+  async: 1
+  poll: 1
+  sudo: true
+  ignore_errors: true
+  when: upgrading_driver and ('nvidia' in nvidia_modules_result.stdout)
+
+- name: Wait for host to reboot
+  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
+  when: upgrading_driver and ('nvidia' in nvidia_modules_result.stdout)
+
+# The installer is shipped in the role's files directory, named after
+# nvidia_version.
+- name: Copy nvidia installer to /tmp
+  copy:
+    src: "NVIDIA-Linux-x86_64-{{ nvidia_version }}.run"
+    dest: "/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run"
+    mode: "0755"
+  # shell: cp -f /usr/local/src/CUDA/driver/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
+  # sudo: true
+  when: upgrading_driver
+
+- name: Install nvidia driver
+  shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent
+  sudo: true
+  when: upgrading_driver
+  # when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
+
+## Configure stuff for using the GPU
+- name: Configure xorg.conf with nvidia-xconfig so xorg.conf matches gpu number
+  shell: /usr/bin/nvidia-xconfig -a --use-display-device=none
+  sudo: true
+
+# backrefs means the line is only rewritten when the regexp matches; nothing
+# is appended to the file otherwise.
+- name: Comment out auth required so xserver can start from slurm job
+  lineinfile:
+    dest: /etc/pam.d/xserver
+    regexp: '^auth\s+required\s+pam_console.so'
+    line: '#auth required pam_console.so'
+    backrefs: true
+    # state: present
+  sudo: true
+
+- name: set persistence mode
+  lineinfile:
+    dest: /etc/rc.d/rc.local
+    line: "nvidia-smi --persistence-mode=1"
+    state: present
+  sudo: true
+
+- name: Restart host to enable new driver
+  shell: "sleep 2 && shutdown -r now &"
+  async: 1
+  poll: 1
+  sudo: true
+  ignore_errors: true
+  when: upgrading_driver
+
+- name: Wait for host to reboot
+  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
+  when: upgrading_driver
+
+# We had one error where the device was not created correctly, this is a check for that
+- stat:
path=/dev/nvidiactl
+  register: nvidiactl
+
+- fail:
+    msg: "Something is up see RT ticket 9477"
+  when: not nvidiactl.stat.exists
+
+
+# Retried because nvidia-smi can still report a failure shortly after boot.
+- name: Check nvidia driver version (takes a while after reboot)
+  command: nvidia-smi
+  register: nvidia_result
+  until: nvidia_result.stdout.find("NVIDIA-SMI has failed") == -1
+  retries: 5
+  delay: 5
+
+- name: Check GPU correct version
+  debug:
+    msg: "Correct Driver Version {{ nvidia_version }}"
+  when: (nvidia_version | string) in nvidia_result.stdout
+
+# Slurm is only started once nvidia-smi reports the requested driver version.
+- name: Start Slurm
+  service:
+    name: slurm
+    state: started
+  sudo: true
+  when: (nvidia_version | string) in nvidia_result.stdout
diff --git a/roles/gpu_update/templates/cuda b/roles/gpu_update/templates/cuda
new file mode 100755
index 0000000000000000000000000000000000000000..6e30c45c65b77a969b06cf35357fc0a919dfb798
--- /dev/null
+++ b/roles/gpu_update/templates/cuda
@@ -0,0 +1,84 @@
+#!/bin/bash
+#
+# Startup/shutdown script for nVidia CUDA
+#
+# chkconfig: 345 80 20
+# description: Startup/shutdown script for nVidia CUDA
+
+# Source function library.
+. /etc/init.d/functions
+
+DRIVER=nvidia
+RETVAL=0
+
+# Create /dev nodes for nvidia devices
+# One /dev/nvidiaN node is made per NVIDIA "3D controller" or
+# "VGA compatible controller" reported by lspci, plus /dev/nvidiactl.
+# Exits the script with mknod's status if a node cannot be created.
+function createnodes() {
+    # Count the number of NVIDIA controllers found.
+    N3D=$(/sbin/lspci | grep -i NVIDIA | grep -c "3D controller")
+    NVGA=$(/sbin/lspci | grep -i NVIDIA | grep -c "VGA compatible controller")
+
+    # Highest device minor; -1 when nothing was found, in which case seq
+    # prints nothing and the loop body never runs.
+    N=$((N3D + NVGA - 1))
+    for i in $(seq 0 $N); do
+        # Leave an existing node alone so start/restart can be re-run;
+        # mknod would otherwise fail on a node that is already present.
+        [ -e /dev/nvidia$i ] || mknod -m 666 /dev/nvidia$i c 195 $i
+        RETVAL=$?
+        [ "$RETVAL" = 0 ] || exit $RETVAL
+    done
+
+    [ -e /dev/nvidiactl ] || mknod -m 666 /dev/nvidiactl c 195 255
+    RETVAL=$?
+    [ "$RETVAL" = 0 ] || exit $RETVAL
+}
+
+# Remove /dev nodes for nvidia devices
+function removenodes() {
+    rm -f /dev/nvidia*
+}
+
+# Start daemon
+# Loads the kernel module, then creates the device nodes; exits with the
+# failing step's status if either step fails.
+function start() {
+    echo -n $"Loading $DRIVER kernel module: "
+    depmod -a
+    modprobe $DRIVER && success || failure
+    RETVAL=$?
+    echo
+    [ "$RETVAL" = 0 ] || exit $RETVAL
+
+    echo -n $"Initializing CUDA /dev entries: "
+    createnodes && success || failure
+    RETVAL=$?
+    echo
+    [ "$RETVAL" = 0 ] || exit $RETVAL
+}
+
+# Stop daemon
+# Unloads the kernel module, then removes the device nodes; exits with the
+# failing step's status if either step fails.
+function stop() {
+    echo -n $"Unloading $DRIVER kernel module: "
+    rmmod -f $DRIVER && success || failure
+    RETVAL=$?
+    echo
+    [ "$RETVAL" = 0 ] || exit $RETVAL
+
+    echo -n $"Removing CUDA /dev entries: "
+    removenodes && success || failure
+    RETVAL=$?
+    echo
+    [ "$RETVAL" = 0 ] || exit $RETVAL
+}
+
+# See how we were called
+case "$1" in
+    start)
+        start
+        ;;
+    stop)
+        stop
+        ;;
+    restart)
+        stop
+        start
+        ;;
+    *)
+        echo $"Usage: $0 {start|stop|restart}"
+        RETVAL=1
+        ;;
+esac
+exit $RETVAL
diff --git a/roles/gpu_update/templates/grub.conf.j2 b/roles/gpu_update/templates/grub.conf.j2
new file mode 100644
index 0000000000000000000000000000000000000000..48027fa9dfef17eed5d21641319bc28b0398d385
--- /dev/null
+++ b/roles/gpu_update/templates/grub.conf.j2
@@ -0,0 +1,18 @@
+# paulmc - no longer needed, this used to be used to set nouveau on boot on old OS's
+# grub.conf generated by anaconda
+#
+# Note that you do not have to rerun grub after making changes to this file
+# NOTICE: You do not have a /boot partition. This means that
+# all kernel and initrd paths are relative to /, eg.
+# root (hd0,0) +# kernel /boot/vmlinuz-version ro root=/dev/vda1 +# initrd /boot/initrd-[generic-]version.img +#boot=/dev/vda +default=0 +timeout=5 +splashimage=(hd0,0)/boot/grub/splash.xpm.gz +hiddenmenu +title CentOS (2.6.32-504.el6.x86_64) + root (hd0,0) + kernel /boot/vmlinuz-2.6.32-504.el6.x86_64 ro root=/dev/vda1 rd_NO_LUKS KEYBOARDTYPE=pc KEYTABLE=us LANG=en_US.UTF-8 rd_NO_MD SYSFONT=latarcyrheb-sun16 crashkernel=auto elevator=noop biosdevname=0 console=ttyS0 rdblacklist=nouveau nouveau.modeset=0 rd_NO_LVM rd_NO_DM rhgb quiet + initrd /boot/initramfs-2.6.32-504.el6.x86_64.img diff --git a/roles/gpu_update/templates/nvidia-installer-disable-nouveau.conf.j2 b/roles/gpu_update/templates/nvidia-installer-disable-nouveau.conf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..acd23c03aa446606de5251f20c72158893401f87 --- /dev/null +++ b/roles/gpu_update/templates/nvidia-installer-disable-nouveau.conf.j2 @@ -0,0 +1,3 @@ +# generated by nvidia-installer +blacklist nouveau +options nouveau modeset=0 diff --git a/roles/gpu_update/templates/nvidia.conf.j2 b/roles/gpu_update/templates/nvidia.conf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..bc53739e7df298bcc920bd139faf45bdb6f9cbc6 --- /dev/null +++ b/roles/gpu_update/templates/nvidia.conf.j2 @@ -0,0 +1,3 @@ +# paulmc no longer needed - this was for setting user permissions to use driver +options nvidia NVreg_DeviceFileMode=0666 + diff --git a/roles/gpu_update/templates/xorg.conf.j2 b/roles/gpu_update/templates/xorg.conf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..389bf97aadab1020efcdaf56f120729552ee905b --- /dev/null +++ b/roles/gpu_update/templates/xorg.conf.j2 @@ -0,0 +1,63 @@ +# paulmc - no longer needed, we use nvidia-xconfig to generate this based on the GPU's it finds +# nvidia-xconfig: X configuration file generated by nvidia-xconfig +# nvidia-xconfig: version 340.58 (buildmeister@swio-display-x86-rhel47-09) Fri Oct 31 17:40:05 PDT 2014 
+Section "DRI" + Mode 0660 + Group "vglusers" +EndSection + +Section "ServerLayout" + Identifier "Layout0" + Screen 0 "Screen0" + InputDevice "Keyboard0" "CoreKeyboard" + InputDevice "Mouse0" "CorePointer" +EndSection + +Section "Files" + FontPath "/usr/share/fonts/default/Type1" +EndSection + +Section "InputDevice" + # generated from default + Identifier "Mouse0" + Driver "mouse" + Option "Protocol" "auto" + Option "Device" "/dev/input/mice" + Option "Emulate3Buttons" "no" + Option "ZAxisMapping" "4 5" +EndSection + +Section "InputDevice" + # generated from data in "/etc/sysconfig/keyboard" + Identifier "Keyboard0" + Driver "kbd" + Option "XkbLayout" "us" + Option "XkbModel" "pc105" +EndSection + +Section "Monitor" + Identifier "Monitor0" + VendorName "Unknown" + ModelName "Unknown" + HorizSync 28.0 - 33.0 + VertRefresh 43.0 - 72.0 + Option "DPMS" +EndSection + +Section "Device" + Identifier "Device0" + Driver "nvidia" + VendorName "NVIDIA Corporation" + BusID "PCI:00:06:0" +EndSection + +Section "Screen" + Identifier "Screen0" + Device "Device0" + Monitor "Monitor0" + DefaultDepth 24 + SubSection "Display" + Depth 24 + EndSubSection +EndSection + diff --git a/roles/gpu_update/templates/xserver.j2 b/roles/gpu_update/templates/xserver.j2 new file mode 100644 index 0000000000000000000000000000000000000000..7bd08b9ab3f4f2b343b702b6610a5b336f6cfa79 --- /dev/null +++ b/roles/gpu_update/templates/xserver.j2 @@ -0,0 +1,6 @@ +# paulmc - no longer needed, we use inline changes instead +#%PAM-1.0 +auth sufficient pam_rootok.so +auth sufficient pam_permit.so +account required pam_permit.so +session optional pam_keyinit.so force revoke