#%PAM-1.0
auth sufficient pam_rootok.so
auth sufficient pam_permit.so
account required pam_permit.so
session optional pam_keyinit.so force revoke
# GPU Update Role
paulmc 21/06/2016
This role was initially created to deploy a new driver to CentOS 6 MASSIVE.
This should also work on a new system.
For an update, a separate playbook can be created to deploy a new driver on an existing system.
Note that the particular driver install needs to be in the files dir
e.g.
gpudriver_352.93.yml
# This is a cut-down playbook for deploying a new driver to MASSIVE m1 and m2
- hosts: all
  strategy: free
  roles:
    - { role: gpu_update, nvidia_version: 352.93 }
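To roll a driver out, a cut-down playbook like the one above is run against the drained GPU nodes. A rough invocation sketch (the inventory path and host limit below are placeholders, not files in this repo):

ansible-playbook -i inventories/m1_m2 gpudriver_352.93.yml --limit 'gpu*'   # inventory and limit pattern are examples only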
---
## Check for jobs and stop if the node is online or running jobs
- name: Check node is offline and no jobs are running
  shell: /usr/local/slurm/latest/bin/scontrol show node $HOSTNAME --oneline
  register: node_status_result
  always_run: yes
- debug: var=node_status_result.stdout_lines
- set_fact:
    slurm_state_down_star_drain: "State=DOWN*+DRAIN "
    slurm_state_down_drain: "State=DOWN+DRAIN "
    slurm_state_down_star: "State=DOWN* "
    slurm_state_idle_drain: "State=IDLE+DRAIN "
    slurm_state_rsrv_drain: "State=RESERVED+DRAIN "
- name: Fail if jobs are running
  fail: msg="The node is not in IDLE+DRAIN, DOWN*, DOWN*+DRAIN, DOWN+DRAIN or RESERVED+DRAIN, so we will not continue!"
  when: (slurm_state_down_star_drain not in node_status_result.stdout)
        and (slurm_state_down_drain not in node_status_result.stdout)
        and (slurm_state_down_star not in node_status_result.stdout)
        and (slurm_state_idle_drain not in node_status_result.stdout)
        and (slurm_state_rsrv_drain not in node_status_result.stdout)
# when: (node_status_result.stdout.find('State=DOWN\*\+DRAIN') != -1)
#   or (node_status_result.stdout.find('State=DOWN\* ') != -1)
#   or (node_status_result.stdout.find('State=IDLE\+DRAIN') != -1)
## Check hardware and stop if we are running on the wrong node
- name: Check for GPU hardware before attempting to install driver
  shell: lspci | grep "NVIDIA" | grep "3D controller"
  always_run: yes
  register: lspci_result
- name: Show what GPUs lspci has found
  debug: var=lspci_result.stdout_lines
- name: Check and fail on no GPU
  fail: msg="There is no GPU and you are trying to install a driver!?"
  when: lspci_result.rc != 0
- name: Set cuda init script
  template: dest=/etc/init.d/cuda src=cuda mode="u=rwx,g=rx,o=rx"
  become: true
## Install packages
- name: install deps
  yum: name={{ item }} state=present
  become: true
  with_items:
    - gcc
    - perl
    - wget
    - pciutils
    - kernel-headers
    - kernel-devel
    - xterm
    - libX11-common
    - libX11-devel
    - libX11
    - xorg-x11-server-common
    - xorg-x11-util-macros
    - xorg-x11-server-utils
    - xorg-x11-font-utils
    - xorg-x11-server-Xorg
    - xorg-x11-glamor
    - xorg-x11-xinit
    - xorg-x11-utils
    - xorg-x11-xauth
    - xorg-x11-proto-devel
    - xorg-x11-xkb-utils
## Disable nouveau (only required once on build)
# MASSIVE M1 and M2 originally used this method...
- name: Add nouveau to blacklist (MASSIVE CentOS 6 only)
  lineinfile:
    dest: /etc/modprobe.d/blacklist.conf
    line: "blacklist nouveau"
    state: present
  become: true
  when: ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6
# M3: but this is the preferred method (which is what the installer does)
- name: Template nvidia-installer-disable-nouveau.conf
  template: dest=/etc/modprobe.d/nvidia-installer-disable-nouveau.conf src=nvidia-installer-disable-nouveau.conf.j2
  become: true
  when: not (ansible_os_family == "RedHat" and ansible_lsb.major_release|int == 6)
- name: Check if nouveau module is loaded
  shell: cat /proc/modules
  always_run: yes
  register: modules_result
- name: Restart host to remove nouveau module
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  become: true
  ignore_errors: true
  when: modules_result.stdout.find('nouveau') != -1
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: modules_result.stdout.find('nouveau') != -1
# Removed as this is related to old ways of controlling access to driver files
# - name: Template nvidia.conf
#   template: dest=/etc/modprobe.d/nvidia.conf src=nvidia.conf.j2
#   become: true
## Install NVIDIA driver
- name: Check nvidia driver version
  shell: nvidia-smi
  register: nvidia_result
  always_run: yes
- debug: var=nvidia_result.stdout_lines
- set_fact:
    upgrading_driver: false
- name: Set upgrading_driver flag
  set_fact:
    upgrading_driver: true
  when: nvidia_result.stdout.find("{{ nvidia_version }}") == -1
- debug: var=upgrading_driver
- name: Unload nvidia module
  shell: modprobe -r nvidia
  ignore_errors: true
  when: upgrading_driver
# when: '"{{ nvidia_version }}" not in nvidia_result.stdout'
- name: Check nvidia module is not loaded
  shell: cat /proc/modules
  register: nvidia_modules_result
  always_run: yes
- name: Restart host to unload nvidia module
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  become: true
  ignore_errors: true
  when: upgrading_driver and (nvidia_modules_result.stdout.find('nvidia') != -1)
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: upgrading_driver and (nvidia_modules_result.stdout.find('nvidia') != -1)
- name: Copy nvidia installer to /tmp
  copy:
    src: "NVIDIA-Linux-x86_64-{{ nvidia_version }}.run"
    dest: "/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run"
    mode: "755"
  # shell: cp -f /usr/local/src/CUDA/driver/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
  # become: true
  when: upgrading_driver
- name: Install nvidia driver
  shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent
  become: true
  when: upgrading_driver
  # when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
## Configure stuff for using the GPU
- name: Configure xorg.conf with nvidia-xconfig so xorg.conf matches gpu number
  shell: /usr/bin/nvidia-xconfig -a --use-display-device=none
  become: true
- name: Comment out auth required so xserver can start from slurm job
  lineinfile:
    dest: /etc/pam.d/xserver
    regexp: '^auth\s+required\s+pam_console.so'
    line: '#auth required pam_console.so'
    backrefs: yes
    # state: present
  become: true
- name: set persistence mode
  lineinfile:
    dest: /etc/rc.d/rc.local
    line: "nvidia-smi --persistence-mode=1"
    state: present
  become: true
- name: Restart host to enable new driver
  shell: "sleep 2 && shutdown -r now &"
  async: 1
  poll: 1
  become: true
  ignore_errors: true
  when: upgrading_driver
- name: Wait for host to reboot
  local_action: wait_for host="{{ inventory_hostname }}" search_regex=OpenSSH port=22 delay=60 timeout=900
  when: upgrading_driver
# We had one error where the device was not created correctly; this is a check for that
- stat: path=/dev/nvidiactl
  register: nvidiactl
- fail: msg="Something is up, see RT ticket 9477"
  when: not nvidiactl.stat.exists
- name: Check nvidia driver version (takes a while after reboot)
  command: nvidia-smi
  register: nvidia_result
  until: nvidia_result.stdout.find("NVIDIA-SMI has failed") == -1
  retries: 5
  delay: 5
- name: Check GPU correct version
  debug: msg="Correct Driver Version {{ nvidia_version }}"
  when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
- name: Start Slurm
  service: name=slurm state=started
  when: nvidia_result.stdout.find("{{ nvidia_version }}") != -1
#!/bin/bash
#
# Startup/shutdown script for nVidia CUDA
#
# chkconfig: 345 80 20
# description: Startup/shutdown script for nVidia CUDA
# Source function library.
. /etc/init.d/functions
DRIVER=nvidia
RETVAL=0
# Create /dev nodes for nvidia devices
function createnodes() {
    # Count the number of NVIDIA controllers found.
    N3D=`/sbin/lspci | grep -i NVIDIA | grep "3D controller" | wc -l`
    NVGA=`/sbin/lspci | grep -i NVIDIA | grep "VGA compatible controller" | wc -l`
    N=`expr $N3D + $NVGA - 1`
    for i in `seq 0 $N`; do
        mknod -m 666 /dev/nvidia$i c 195 $i
        RETVAL=$?
        [ "$RETVAL" = 0 ] || exit $RETVAL
    done
    mknod -m 666 /dev/nvidiactl c 195 255
    RETVAL=$?
    [ "$RETVAL" = 0 ] || exit $RETVAL
}

# Remove /dev nodes for nvidia devices
function removenodes() {
    rm -f /dev/nvidia*
}
# Start daemon
function start() {
    echo -n $"Loading $DRIVER kernel module: "
    depmod -a
    modprobe $DRIVER && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL

    echo -n $"Initializing CUDA /dev entries: "
    createnodes && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}

# Stop daemon
function stop() {
    echo -n $"Unloading $DRIVER kernel module: "
    rmmod -f $DRIVER && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL

    echo -n $"Removing CUDA /dev entries: "
    removenodes && success || failure
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}
# See how we were called
case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        start
        ;;
    *)
        echo $"Usage: $0 {start|stop|restart}"
        RETVAL=1
esac

exit $RETVAL
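The playbook templates this script to /etc/init.d/cuda; on a CentOS 6 style init system it would be registered and started along these lines (illustrative commands, not tasks from this role):

chkconfig --add cuda    # registers the script using the "chkconfig: 345 80 20" header above
chkconfig cuda on
service cuda start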
# paulmc - no longer needed; this used to be used to disable nouveau at boot on older OSes
# grub.conf generated by anaconda
#
# Note that you do not have to rerun grub after making changes to this file
# NOTICE: You do not have a /boot partition. This means that
# all kernel and initrd paths are relative to /, eg.
# root (hd0,0)
# kernel /boot/vmlinuz-version ro root=/dev/vda1
# initrd /boot/initrd-[generic-]version.img
#boot=/dev/vda
default=0
timeout=5
splashimage=(hd0,0)/boot/grub/splash.xpm.gz
hiddenmenu
title CentOS (2.6.32-504.el6.x86_64)
        root (hd0,0)
        kernel /boot/vmlinuz-2.6.32-504.el6.x86_64 ro root=/dev/vda1 rd_NO_LUKS KEYBOARDTYPE=pc KEYTABLE=us LANG=en_US.UTF-8 rd_NO_MD SYSFONT=latarcyrheb-sun16 crashkernel=auto elevator=noop biosdevname=0 console=ttyS0 rdblacklist=nouveau nouveau.modeset=0 rd_NO_LVM rd_NO_DM rhgb quiet
        initrd /boot/initramfs-2.6.32-504.el6.x86_64.img
# generated by nvidia-installer
blacklist nouveau
options nouveau modeset=0
# paulmc no longer needed - this was for setting user permissions to use driver
options nvidia NVreg_DeviceFileMode=0666
# paulmc - no longer needed, we use nvidia-xconfig to generate this based on the GPUs it finds
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 340.58 (buildmeister@swio-display-x86-rhel47-09) Fri Oct 31 17:40:05 PDT 2014
Section "DRI"
Mode 0660
Group "vglusers"
EndSection
Section "ServerLayout"
Identifier "Layout0"
Screen 0 "Screen0"
InputDevice "Keyboard0" "CoreKeyboard"
InputDevice "Mouse0" "CorePointer"
EndSection
Section "Files"
FontPath "/usr/share/fonts/default/Type1"
EndSection
Section "InputDevice"
# generated from default
Identifier "Mouse0"
Driver "mouse"
Option "Protocol" "auto"
Option "Device" "/dev/input/mice"
Option "Emulate3Buttons" "no"
Option "ZAxisMapping" "4 5"
EndSection
Section "InputDevice"
# generated from data in "/etc/sysconfig/keyboard"
Identifier "Keyboard0"
Driver "kbd"
Option "XkbLayout" "us"
Option "XkbModel" "pc105"
EndSection
Section "Monitor"
Identifier "Monitor0"
VendorName "Unknown"
ModelName "Unknown"
HorizSync 28.0 - 33.0
VertRefresh 43.0 - 72.0
Option "DPMS"
EndSection
Section "Device"
Identifier "Device0"
Driver "nvidia"
VendorName "NVIDIA Corporation"
BusID "PCI:00:06:0"
EndSection
Section "Screen"
Identifier "Screen0"
Device "Device0"
Monitor "Monitor0"
DefaultDepth 24
SubSection "Display"
Depth 24
EndSubSection
EndSection
# paulmc - no longer needed, we use inline changes instead
#%PAM-1.0
auth sufficient pam_rootok.so
auth sufficient pam_permit.so
account required pam_permit.so
session optional pam_keyinit.so force revoke
---
- name: ensure hpcid_ca is in the authorized_keys file
  authorized_key: user={{ ansible_user }} key="cert-authority ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCfHlWGrnpirvqvUTySnoQK6ze5oIXz7cYIT+XCBeBCahlK05O38g0erBGrNWFozZwbIXnysVCibaUJqtH0JrYqmcr2NnYA0PoiTeranvaJI7pQsga1gBxfK/D4UItw5yI6V7w9efMT0zpIP8WEubQz6GFtkyiNVgFCHj3+VhLs3RslvYzb35SFcLXEDsGVQM5NdWBUgRaNRqpTPvuMcxTyPvy32wW72kwaYRQioDJFcE2WJ240M2oSsx+dhTWvI8sW1sEUI1qIDfyBPsOgsLofuSpt4ZNgJqBUTp/hW85wVpNzud6A4YJWHpZXSDMtUMYE9QL+x2fw/b26yck9ZPE/ hines@tun"
---
- name: install system dependencies
  yum: name={{ item }} state=present
  with_items:
    - openssl-devel
    - openldap-devel
    - python-pip
    - git
    - python-virtualenv
  become: true
  become_user: root
  when: ansible_os_family == "RedHat"
- name: install system dependencies
  apt: name={{ item }} state=present
  with_items:
    - libssl-dev
    - libldap2-dev
  become: true
  become_user: root
  when: ansible_os_family == "Debian"
- name: create install dir
  file: name={{ item }} state=directory owner={{ ansible_user }}
  with_items:
    - "/usr/local/hpcsystem"
    - "/usr/local/hpcsystem_config"
    - "/usr/local/virtualenvs/mercpytools"
  become: true
  become_user: root
- name: upgrade pip
  pip:
    virtualenv: "/usr/local/virtualenvs/mercpytools"
    name: "pip"
    extra_args: "--upgrade"
- name: install mercpytools
  pip:
    virtualenv: "/usr/local/virtualenvs/mercpytools"
    name: "git+https://gitlab.erc.monash.edu.au/hpc-team/mercpytools.git#egg=mercpytools"
    extra_args: "--upgrade"
- name: install hpcsystem
  git:
    repo: git@gitlab.erc.monash.edu.au:hpc-team/hpcsystem.git
    dest: /usr/local/hpcsystem
    accept_hostkey: True
- name: install hpcsystem_config
  git:
    repo: git@gitlab.erc.monash.edu.au:hpc-team/m3_hpcsystem_config.git
    dest: /usr/local/hpcsystem_config
    accept_hostkey: True
- name: cron job to check quotas
  cron:
    name: "Naggy quota cron job"
    value: '/usr/local/hpcsystem/naggy_quota.sh'
    hour: 16
    minute: 23
  become: true
  become_user: root
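A quick way to sanity-check the result of the tasks above, as a sketch only (these are ad hoc commands run as root on the target host, not part of the role):

/usr/local/virtualenvs/mercpytools/bin/pip show mercpytools   # confirm the virtualenv install
ls /usr/local/hpcsystem /usr/local/hpcsystem_config           # confirm both checkouts exist
crontab -l | grep naggy_quota                                 # confirm the quota cron entry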
---
- name: Pre installation
  shell: "{{ preInstallation }}"
  become: true
  ignore_errors: true
  when: ansible_distribution == 'CentOS' and preInstallation is defined
- name: Add new repo file
  shell: "{{ importRepo.command }} {{ importRepo.destination }}"
  become: true
  run_once: true
  args:
    creates: "{{ importRepo.destination }}"
  when: ansible_distribution == 'CentOS' and importRepo is defined
- name: Install yum packages
  yum: name={{ item }} state=present
  with_items: yumPackageList
  become: true
  when: ansible_distribution == 'CentOS' and yumPackageList is defined
- name: Install yum group packages
  shell: yum --setopt=protected_multilib=false -y groupinstall "{{ item }}"
  with_items: yumGroupPackageList
  become: true
  when: ansible_distribution == 'CentOS' and yumGroupPackageList is defined
- name: Post installation
  shell: "{{ postInstallation }}"
  become: true
  when: ansible_distribution == 'CentOS' and postInstallation is defined
- name: conditional shell copy command
  shell: "{{ cliCopy.run }}"
  become: true
  run_once: true
  args:
    creates: "{{ cliCopy.check }}"
  when: ansible_distribution == 'CentOS' and cliAction is defined
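The tasks above are driven entirely by variables (preInstallation, importRepo, yumPackageList, yumGroupPackageList, postInstallation, cliCopy/cliAction) that are expected to come from group_vars or the calling playbook. For a quick test they can also be passed on the command line, e.g. (the playbook name and package choices below are made up for illustration):

ansible-playbook -i inventory extra_packages.yml -e '{"yumPackageList": ["strace", "environment-modules"], "postInstallation": "yum clean all"}'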
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBIbQXH8ZHnl7Ht5YMuGqZ80k+nKnds+58y9VcedVeXDobsF7t6wCRe5GDov8XxYxxWbjz0H7xhx6PQYiVsn6GL0= ubuntu@sshauthz-2
---
- include_vars: "{{ ansible_os_family }}_{{ ansible_architecture }}.yml"
- name: restart ssh
  service: name={{ sshd_name }} state=restarted
  become: true
---
- include_vars: "{{ ansible_os_family }}_{{ ansible_architecture }}.yml"
- name: copy ca cert
  copy: src=server_ca.pub dest=/etc/ssh/server_ca.pub owner=root group=root mode=644
  become: true
- name: edit sshd_config
  lineinfile:
    dest: /etc/ssh/sshd_config
    line: TrustedUserCAKeys /etc/ssh/server_ca.pub
    state: present
  become: true
  notify: restart ssh
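With TrustedUserCAKeys pointing at server_ca.pub, sshd accepts any user certificate signed by that CA. A certificate would be issued on the CA host with something like the following (the private key file name, identity, principal and validity period are only examples):

ssh-keygen -s server_ca -I jsmith -n jsmith -V +52w /tmp/jsmith_id_rsa.pub   # key names and principal are illustrative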
sshd_name: "ssh"
sshd_name: "sshd"
#!/usr/bin/python
# Build an LDAP URI string from an Ansible-style JSON dump of inventory groups:
# every host in the "ldap" group is turned into an ldaps:// URL.
import sys
import json

filename = sys.argv[1]          # path to the JSON file describing the groups
ansible_hostname = sys.argv[2]  # hostname of the node being configured (currently unused)
domain = sys.argv[3]            # domain appended to each ldap host to build its FQDN

f = open(filename, 'r')
s = f.read()
d = json.loads(s)
f.close()

# Collect every host mentioned in any group
hosts = {}
for group in d['groups'].keys():
    for h in d['groups'][group]:
        if hosts.has_key(h):
            pass
        else:
            hosts[h] = {}

# Concatenate an ldaps:// URL for each host in the "ldap" group,
# falling back to a bare ldaps:/// URI if the group is missing.
url = ""
try:
    for host in d['groups']['ldap']:
        fqdn = "%s.%s" % (host, domain)
        url = url + "ldaps://%s" % fqdn
except:
    url = "ldaps:///"
print url
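The script takes the path to the JSON groups dump, the host name and the domain, and prints the ldaps:// URI(s) built from the "ldap" group. A hypothetical invocation (the script and JSON file names are not defined in this diff):

python make_ldap_uri.py groups.json node01 example.org   # file names here are hypothetical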
---
- name: restart apache
  service: name=apache2 state=restarted
  become: true
- name: restart postfix
  service: name=postfix state=restarted
  become: true
---
dependencies:
  - { role: easy-rsa-certificate, x509_csr_args: "", x509_sign_args: "--server", x509_cacert_file: "/etc/ssl/certs/ca.crt", x509_key_file: "/etc/ssl/private/server.key", x509_cert_file: "/etc/ssl/certs/server.crt", x509_common_name: "{{ ansible_fqdn }}" }