Commit c299a70e authored by Chris Hines

Merge branch 'chris_gpu_update' into 'master'

Chris gpu update

See merge request !60


Former-commit-id: 14f50fae
parents dbd01e57 d9cfdc4b
blacklist nouveau
options nouveau modeset=0
# GPU Update Role
paulmc 21/06/2016
This role was initially created to deploy a new driver to CentOS 6 MASSIVE.
It should also work on a new system.
For an update, a separate playbook can be created to deploy a new driver on an existing system.
Note that the installer for the particular driver version needs to be in the role's files dir.
e.g. a playbook named gpudriver_352.93.yml:
# This is a cut down playbook for deploying a new driver to MASSIVE m1 and m2
- hosts: all
  strategy: free
  roles:
    - { role: gpu_update, nvidia_version: "352.93" }
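With the matching installer (NVIDIA-Linux-x86_64-352.93.run for this example) placed in the role's files dir, the playbook can then be run in the usual way; the host group name here is hypothetical:

ansible-playbook -i inventory gpudriver_352.93.yml --limit gpu-nodes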
---
## Check for jobs and stop if the node is online or running jobs
- name: Check node is offline and no jobs are running
  shell: /usr/local/slurm/latest/bin/scontrol show node $HOSTNAME --oneliner
  register: node_status_result
  always_run: yes
- debug: var=node_status_result.stdout_lines
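# Note: each state string below keeps its trailing space so that, for example,
# "State=DOWN* " does not also match "State=DOWN*+DRAIN".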
- set_fact:
    slurm_state_down_star_drain: "State=DOWN*+DRAIN "
    slurm_state_down_drain: "State=DOWN+DRAIN "
    slurm_state_down_star: "State=DOWN* "
    slurm_state_idle_drain: "State=IDLE+DRAIN "
    slurm_state_rsrv_drain: "State=RESERVED+DRAIN "
- name: Fail if jobs are running
  fail: msg="The node is not in IDLE+DRAIN, DOWN*, DOWN+DRAIN, DOWN*+DRAIN or RESERVED+DRAIN; we will not continue!"
  when: (slurm_state_down_star_drain not in node_status_result.stdout)
        and (slurm_state_down_drain not in node_status_result.stdout)
        and (slurm_state_down_star not in node_status_result.stdout)
        and (slurm_state_idle_drain not in node_status_result.stdout)
        and (slurm_state_rsrv_drain not in node_status_result.stdout)
# when: (node_status_result.stdout.find('State=DOWN\*\+DRAIN') != -1)
#       or (node_status_result.stdout.find('State=DOWN\* ') != -1)
#       or (node_status_result.stdout.find('State=IDLE\+DRAIN') != -1)
## Check Hardware and Stop if we are running on a wrong node
- name: Check for GPU hardware before attempting to install driver
  shell: lspci | grep "NVIDIA" | grep "3D controller"
  always_run: yes
  register: lspci_result
  # grep exits non-zero when no GPU is present; let the next task report it instead of failing here
  ignore_errors: true
- name: Show what GPUs lspci has found
  debug: var=lspci_result.stdout_lines
- name: Check and fail on no GPU
  fail: msg="There is no GPU and you are trying to install a driver!?"
  when: lspci_result.rc != 0
- name: Set cuda init script
  template: dest=/etc/init.d/cuda src=cuda mode="u=rwx,g=rx,o=rx"
  sudo: true
## Install packages
- name: install deps
  yum: name={{ item }} state=installed
  sudo: true
  with_items:
    - gcc
    - perl
    - wget
    - pciutils
    - kernel-headers
    - kernel-devel
    - xterm
    - libX11-common
    - libX11-devel
    - libX11
    - xorg-x11-server-common
    - xorg-x11-util-macros
    - xorg-x11-server-utils
    - xorg-x11-font-utils
    - xorg-x11-server-Xorg
    - xorg-x11-glamor
    - xorg-x11-xinit
    - xorg-x11-utils
    - xorg-x11-xauth
    - xorg-x11-proto-devel
    - xorg-x11-xkb-utils
## Disable Nouveau (only required once on build)
# MASSIVE M1 and M2 originally used this method...
- name: Add nouveau to blacklist (MASSIVE CentOS 6 only)
  lineinfile:
    args:
      dest: /etc/modprobe.d/blacklist.conf
      line: "blacklist nouveau"
      state: present
  sudo: true
  when: ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6
# M3: but this is the preferred method (which is what the installer does)
- name: Template nvidia-installer-disable-nouveau.conf
  template: dest=/etc/modprobe.d/nvidia-installer-disable-nouveau.conf src=nvidia-installer-disable-nouveau.conf.j2
  sudo: true
  when: not (ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6)
- name: Check if nouveau module is loaded
  shell: cat /proc/modules
  always_run: yes
  register: modules_result
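# Kick off the reboot in the background so the task returns before the SSH
# connection drops; ignore_errors covers the dropped connection.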
- name: Restart host to remove nouveau module
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  sudo: true
  ignore_errors: true
  when: modules_result.stdout.find('nouveau') != -1
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: modules_result.stdout.find('nouveau') != -1
# Removed as this is related to old ways of controlling access to driver files
# - name: Template nvidia.conf
#   template: dest=/etc/modprobe.d/nvidia.conf src=nvidia.conf.j2
#   sudo: true
## Install NVIDIA driver
- name: Check nvidia driver version
  shell: nvidia-smi
  register: nvidia_result
  always_run: yes
  # nvidia-smi is absent or fails on a node without a driver yet; don't abort the play here
  ignore_errors: true
- debug: var=nvidia_result.stdout_lines
- set_fact:
    upgrading_driver: false
- name: Set upgrading_driver flag
  set_fact:
    upgrading_driver: true
  when: nvidia_result.stdout.find("{{ nvidia_version }}") == -1
- debug: var=upgrading_driver
- name: Unload nvidia module
  shell: modprobe -r nvidia
  ignore_errors: true
  when: upgrading_driver
# when: '"{{ nvidia_version }}" not in nvidia_result.stdout'
- name: Check nvidia module is not loaded
  shell: cat /proc/modules
  register: nvidia_modules_result
  always_run: yes
- name: Restart host to unload nvidia module
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  sudo: true
  ignore_errors: true
  when: upgrading_driver and (nvidia_modules_result.stdout.find('nvidia') != -1)
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: upgrading_driver and (nvidia_modules_result.stdout.find('nvidia') != -1)
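# A relative src in copy is looked up in this role's files/ directory,
# so the .run installer for the requested version must be placed there.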
- name: Copy nvidia installer to /tmp
  copy:
    src=NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
    dest=/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
    mode=0755
  # shell: cp -f /usr/local/src/CUDA/driver/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
  # sudo: true
  when: upgrading_driver
- name: Install nvidia driver
  shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent
  sudo: true
  when: upgrading_driver
# when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
## Configure stuff for using the GPU
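# -a (--enable-all-gpus) writes an X screen section for every GPU found;
# --use-display-device=none configures those screens headless.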
- name: Configure xorg.conf with nvidia-xconfig so xorg.conf matches gpu number
  shell: /usr/bin/nvidia-xconfig -a --use-display-device=none
  sudo: true
- name: Comment out auth required so xserver can start from slurm job
  lineinfile: dest=/etc/pam.d/xserver
              regexp='^auth\s+required\s+pam_console.so'
              line='#auth required pam_console.so'
              backrefs=yes
  # state=present
  sudo: true
- name: set persistence mode
  lineinfile:
    args:
      dest: /etc/rc.d/rc.local
      line: "nvidia-smi --persistence-mode=1"
      state: present
  sudo: true
- name: Restart host to enable new driver
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  sudo: true
  ignore_errors: true
  when: upgrading_driver
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: upgrading_driver
# We had one error where the device was not created correctly; this is a check for that
- stat: path=/dev/nvidiactl
  register: nvidiactl
- fail: msg="Something is up, see RT ticket 9477"
  when: not nvidiactl.stat.exists
- name: Check nvidia driver version (takes a while after reboot)
  command: nvidia-smi
  register: nvidia_result
  until: nvidia_result.stdout.find("NVIDIA-SMI has failed") == -1
  retries: 5
  delay: 5
- name: Check GPU correct version
  debug: msg="Correct Driver Version {{ nvidia_version }}"
  when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
- name: Start Slurm
  service: name=slurm state=started
  when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
#!/bin/bash
#
# Startup/shutdown script for nVidia CUDA
#
# chkconfig: 345 80 20
# description: Startup/shutdown script for nVidia CUDA
# Source function library.
. /etc/init.d/functions
DRIVER=nvidia
RETVAL=0
# Create /dev nodes for nvidia devices
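# NVIDIA character devices use major number 195; /dev/nvidiactl is minor 255.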
function createnodes() {
    # Count the number of NVIDIA controllers found.
    N3D=`/sbin/lspci | grep -i NVIDIA | grep "3D controller" | wc -l`
    NVGA=`/sbin/lspci | grep -i NVIDIA | grep "VGA compatible controller" | wc -l`
    N=`expr $N3D + $NVGA - 1`
    for i in `seq 0 $N`; do
        mknod -m 666 /dev/nvidia$i c 195 $i
        RETVAL=$?
        [ "$RETVAL" = 0 ] || exit $RETVAL
    done
    mknod -m 666 /dev/nvidiactl c 195 255
    RETVAL=$?
    [ "$RETVAL" = 0 ] || exit $RETVAL
}
# Remove /dev nodes for nvidia devices
function removenodes() {
    rm -f /dev/nvidia*
}
# Start daemon
function start() {
    echo -n $"Loading $DRIVER kernel module: "
    depmod -a
    modprobe $DRIVER && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
    echo -n $"Initializing CUDA /dev entries: "
    createnodes && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}
# Stop daemon
function stop() {
    echo -n $"Unloading $DRIVER kernel module: "
    rmmod -f $DRIVER && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
    echo -n $"Removing CUDA /dev entries: "
    removenodes && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}
# See how we were called
case "$1" in
start)
start
;;
stop)
stop
;;
restart)
stop
start
;;
*)
echo $"Usage: $0 {start|stop|restart}"
RETVAL=1
esac
exit $RETVAL
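The init script can then be registered and started the usual SysV way (a sketch; the service name matches the template destination /etc/init.d/cuda):

chkconfig --add cuda
service cuda start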
# paulmc - no longer needed; this was used to disable nouveau at boot on old OSes
# grub.conf generated by anaconda
#
# Note that you do not have to rerun grub after making changes to this file
# NOTICE: You do not have a /boot partition. This means that
# all kernel and initrd paths are relative to /, eg.
# root (hd0,0)
# kernel /boot/vmlinuz-version ro root=/dev/vda1
# initrd /boot/initrd-[generic-]version.img
#boot=/dev/vda
default=0
timeout=5
splashimage=(hd0,0)/boot/grub/splash.xpm.gz
hiddenmenu
title CentOS (2.6.32-504.el6.x86_64)
	root (hd0,0)
	kernel /boot/vmlinuz-2.6.32-504.el6.x86_64 ro root=/dev/vda1 rd_NO_LUKS KEYBOARDTYPE=pc KEYTABLE=us LANG=en_US.UTF-8 rd_NO_MD SYSFONT=latarcyrheb-sun16 crashkernel=auto elevator=noop biosdevname=0 console=ttyS0 rdblacklist=nouveau nouveau.modeset=0 rd_NO_LVM rd_NO_DM rhgb quiet
	initrd /boot/initramfs-2.6.32-504.el6.x86_64.img
# generated by nvidia-installer
blacklist nouveau
options nouveau modeset=0
# paulmc - no longer needed; this was used to set user permissions on the driver device files
options nvidia NVreg_DeviceFileMode=0666
# paulmc - no longer needed; we use nvidia-xconfig to generate this based on the GPUs it finds
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 340.58 (buildmeister@swio-display-x86-rhel47-09) Fri Oct 31 17:40:05 PDT 2014
Section "DRI"
Mode 0660
Group "vglusers"
EndSection
Section "ServerLayout"
Identifier "Layout0"
Screen 0 "Screen0"
InputDevice "Keyboard0" "CoreKeyboard"
InputDevice "Mouse0" "CorePointer"
EndSection
Section "Files"
FontPath "/usr/share/fonts/default/Type1"
EndSection
Section "InputDevice"
# generated from default
Identifier "Mouse0"
Driver "mouse"
Option "Protocol" "auto"
Option "Device" "/dev/input/mice"
Option "Emulate3Buttons" "no"
Option "ZAxisMapping" "4 5"
EndSection
Section "InputDevice"
# generated from data in "/etc/sysconfig/keyboard"
Identifier "Keyboard0"
Driver "kbd"
Option "XkbLayout" "us"
Option "XkbModel" "pc105"
EndSection
Section "Monitor"
Identifier "Monitor0"
VendorName "Unknown"
ModelName "Unknown"
HorizSync 28.0 - 33.0
VertRefresh 43.0 - 72.0
Option "DPMS"
EndSection
Section "Device"
Identifier "Device0"
Driver "nvidia"
VendorName "NVIDIA Corporation"
BusID "PCI:00:06:0"
EndSection
Section "Screen"
Identifier "Screen0"
Device "Device0"
Monitor "Monitor0"
DefaultDepth 24
SubSection "Display"
Depth 24
EndSubSection
EndSection
# paulmc - no longer needed; we use inline changes instead
#%PAM-1.0
auth sufficient pam_rootok.so
auth sufficient pam_permit.so
account required pam_permit.so
session optional pam_keyinit.so force revoke