From 3f8a4500de0665433244368a32572247ba8a5831 Mon Sep 17 00:00:00 2001
From: "Trung Nguyen (Monash University)" <trung.nguyen1@monash.edu>
Date: Fri, 19 Oct 2018 12:56:39 +1100
Subject: [PATCH] Add improvement to roles

---
 roles/config_repos/tasks/main.yml     |   6 +-
 roles/gpu/tasks/main.yml.bkup         | 206 ++++++++++++++++++++++++++
 roles/mellanox_drivers/tasks/main.yml |  13 +-
 roles/upgrade/tasks/main.yml          |   8 +
 4 files changed, 225 insertions(+), 8 deletions(-)
 create mode 100644 roles/gpu/tasks/main.yml.bkup

diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml
index 9dde0fc4..fe8f56f6 100644
--- a/roles/config_repos/tasks/main.yml
+++ b/roles/config_repos/tasks/main.yml
@@ -43,11 +43,7 @@
   check_mode: no
 
 - name: disable unwanted repos
-  shell: yum-config-manager --disable {{ item }}
-#  yum_repository: 
-#    name: "{{ item }}" 
-#    enabled: False
-#    state: 'absent'
+  shell: yum-config-manager --disable "{{ item }}"
   with_items: "{{repolist.stdout_lines|difference(yumenablerepo)}}"
   become: true
   become_user: root
diff --git a/roles/gpu/tasks/main.yml.bkup b/roles/gpu/tasks/main.yml.bkup
new file mode 100644
index 00000000..0cdf5047
--- /dev/null
+++ b/roles/gpu/tasks/main.yml.bkup
@@ -0,0 +1,206 @@
+---
+- name: install deps  # compiler, kernel headers and Xorg/X11 packages
+  yum: name={{ item }} state=installed
+  become: true  # 'sudo:' was removed in Ansible 2.9; become_user defaults to root
+  with_items:
+    - gcc
+    - perl
+    - wget
+    - pciutils
+    - kernel-headers
+    - kernel-devel
+    - xterm
+    - libX11-common
+    - libX11-devel
+    - libX11
+    - xorg-x11-server-common
+    - xorg-x11-util-macros
+    - xorg-x11-server-utils
+    - xorg-x11-font-utils
+    - xorg-x11-server-Xorg
+    - xorg-x11-glamor
+    - xorg-x11-xinit
+    - xorg-x11-utils
+    - xorg-x11-xauth
+    - xorg-x11-proto-devel
+    - xorg-x11-xkb-utils
+
+- name: install development tools
+  yum: name="@Development Tools" state=installed
+  become: true
+  become_user: root
+
+- name: disable nouveau
+  template: src=blacklist-nouveau.conf.j2 dest=/etc/modprobe.d/blacklist-nouveau.conf
+  become: true
+  become_user: root
+
+- name: template unit for persistenced  # renders the systemd unit file for nvidia-persistenced
+  template: src=nvidia-persistenced.service dest=/etc/systemd/system/nvidia-persistenced.service
+  become: true
+  become_user: root
+
+- name: create the nvidia-persistenced user
+  user: name=nvidia-persistenced state=present system=yes shell=/bin/false
+  become: true
+  become_user: root
+
+
+- name: remove nouveau
+  modprobe: name=nouveau state=absent
+  become: true 
+  become_user: root
+
+- name: get kernel version
+  shell: uname -r
+  register: kernel_version
+  check_mode: no
+
+
+- name: check nvidia driver
+  stat: path="/lib/modules/{{ kernel_version.stdout }}/kernel/drivers/video/nvidia.ko"
+  register: nvidia_driver
+  ignore_errors: true
+
+- name: set default driver version
+  set_fact: 
+    installed_driver_version: '0.0'
+
+- name: check nvidia driver version  # extracts the version string printed by nvidia-smi
+  shell: 'nvidia-smi | grep -Po "Driver Version: \K\S+"'
+  register: installed_driver_version  # NOTE(review): a skipped task still registers, replacing the '0.0' fact set above
+  when: nvidia_driver.stat.exists
+  check_mode: no
+
+- name: debug - installed nvidia driver version
+  debug:
+    msg: "{{ installed_driver_version }} "
+
+- name: set install default
+  set_fact: 
+    install_driver: false
+
+- name: set uninstall default
+  set_fact: 
+    uninstall_driver: false
+
+- name: set install
+  set_fact: 
+    install_driver: true
+  when: not nvidia_driver.stat.exists or not installed_driver_version.stdout_lines[0] == nvidia_version
+
+- name: set uninstall
+  set_fact: 
+    uninstall_driver: true
+  when: nvidia_driver.stat.exists and not installed_driver_version.stdout_lines[0] == nvidia_version
+
+- name: Unload nvidia driver  # '|| true' below makes rmmod failures non-fatal
+  shell: rmmod nvidia_uvm nvidia_drm nvidia_modeset nvidia || true
+  become: true  # replaces the removed 'sudo:' keyword; become_user defaults to root
+  when: install_driver
+
+- name: stop the persistence daemon
+  service: name=nvidia-persistenced state=stopped
+  become: true
+  become_user: root
+  when: uninstall_driver
+
+- name: kill any X processes  # awk, not cut: ps pads short PIDs with leading spaces, so cut's field 1 is empty
+  shell: ps ax | grep "X :0" | grep -v grep | awk '{print $1}' | xargs -I{} kill -9 {}
+  become: true
+  become_user: root
+  when: uninstall_driver
+
+- name: get old nvidia driver
+  get_url: url=http://consistency0/src/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run dest=/tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run
+  become: true
+  become_user: root
+  when: uninstall_driver
+  ignore_errors: true  
+
+- name: uninstall old nvidia driver
+  shell: chmod 755 /tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run; /tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run --uninstall --silent
+  become: true
+  become_user: root
+  when: uninstall_driver
+
+- name: clean up old driver installation file
+  file:
+    state: absent
+    path: /tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run
+  become: true  # become_user defaults to root
+  when: uninstall_driver  # stdout_lines only exists when the version check actually ran
+  ignore_errors: true
+
+- name: get nvidia driver 
+  get_url: url=http://consistency0/src/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run dest=/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
+  become: true
+  become_user: root
+  when: install_driver
+
+#- name: Copy boot file
+#  template: src=grub.conf.j2 dest=/boot/grub/grub.conf 
+#  sudo: true
+#
+#- name: Copy X config file
+#  template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf 
+#  sudo: true
+
+- name: Copy xserver file
+  template: src=xserver.j2 dest=/etc/pam.d/xserver
+  become: true
+  become_user: root
+
+- name: build nvidia driver 
+  shell: chmod 755 /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run; /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent 
+  become: true
+  become_user: root
+  when: install_driver
+
+- name: set the GOM
+  shell: nvidia-smi --gom=0
+  become: true
+  become_user: root
+
+- name: enable persistenced on boot
+  service: name=nvidia-persistenced state=started enabled=yes
+  become: true
+  become_user: root
+
+- name: Configure xorg.conf with nvidia-xconfig so xorg.conf matches gpu number
+  shell: /usr/bin/nvidia-xconfig -a --use-display-device=none --preserve-busid
+  become: true
+  become_user: root
+  args: 
+    creates: /etc/X11/xorg.conf
+
+#- name: Template xorg.conf for nodes with one GPU
+#  template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf
+#  become: true
+#  become_user: root
+#  when: template_xorgconf is defined and template_xorgconf
+
+- name: run nvidia-xconf-gen
+  script: scripts/nvidia-xconf-gen.py
+  register: nvidiacards
+  check_mode: no
+
+- name: set env for nvidia_card_lists
+  set_fact: 
+    nvidiacardslist: "{{ nvidiacards.stdout | from_json }}"
+
+- name: generate nvidia-xorg-conf  # one xorg.conf.j2 render per entry in nvidiacardslist
+  become: true  # replaces the removed 'sudo:' keyword; become_user defaults to root
+  template:
+    src: xorg.conf.j2
+    dest: "{{ item['filename'] }}"
+  with_items: "{{ nvidiacardslist }}"
+
+- name: clean up nvidia driver installation file
+  file:
+    state: absent
+    path: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run
+  become: true
+  become_user: root
+  ignore_errors: true
+
diff --git a/roles/mellanox_drivers/tasks/main.yml b/roles/mellanox_drivers/tasks/main.yml
index 9425296e..146f1a70 100644
--- a/roles/mellanox_drivers/tasks/main.yml
+++ b/roles/mellanox_drivers/tasks/main.yml
@@ -9,29 +9,33 @@
   when: ansible_os_family == "RedHat"
   
 - name: test for existing installation of drivers
-  command: ibv_devinfo
+  shell: '/bin/ibv_devinfo'
   become: true
   become_user: root
   register: drivers_installed
   ignore_errors: true
   check_mode: no
 
+- name: debug - print out installed driver
+  debug: var=drivers_installed
+
 - name: default dont install
   set_fact: 
     install_now: false 
     reboot_now: false
 
 - name: get driver version
-  command: 'ofed_info -l | head -n 1 | cut -f 1 -d " "'
+  shell: '/bin/ofed_info -l | head -n 1 | cut -f 1 -d " "'
   register: driver_version
   ignore_errors: true
   check_mode: no
+  changed_when: False
 
 - name: get desired driver version
   shell: 'echo {{ MELLANOX_DRIVER_SRC }} | cut -f 1,2,3 -d "-"'
   register: desired_driver_version
   check_mode: no
-
+  changed_when: False
 
 - name: set install due to drivers not installed
   set_fact: 
@@ -51,6 +55,9 @@
     reboot_now: true
   when: driver_version | failed or not desired_driver_version.stdout in driver_version.stdout
 
+- name: debug - print out value of install_now
+  debug: var=install_now
+
 - name: copy driver source
   unarchive: copy=no src="http://consistency0/src/{{ MELLANOX_DRIVER_SRC }}.tgz" dest=/tmp 
   become: true
diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml
index a730c502..4f29a072 100644
--- a/roles/upgrade/tasks/main.yml
+++ b/roles/upgrade/tasks/main.yml
@@ -23,6 +23,12 @@
   become_user: root
   when: ansible_os_family=="RedHat" and yumdisablerepo is defined
 
+- name: Clear yum pending transaction
+  command: yum-complete-transaction --cleanup-only
+  become: true
+  become_user: root
+  when: ansible_os_family == 'RedHat'
+
 - name: yum upgrade
   yum: name=* state=latest
   become: true
@@ -46,11 +52,13 @@
   register: rpm_q_output
   when: ansible_os_family=="RedHat"
   check_mode: no
+  changed_when: False
 
 - name: get kernel version
   shell: uname -r
   register: uname_r_output
   check_mode: no
+  changed_when: False  # uname only reads state, so never report a change
 
 - name: default dont reboot
   set_fact: 
-- 
GitLab