From 3f8a4500de0665433244368a32572247ba8a5831 Mon Sep 17 00:00:00 2001 From: "Trung Nguyen (Monash University)" <trung.nguyen1@monash.edu> Date: Fri, 19 Oct 2018 12:56:39 +1100 Subject: [PATCH] Add improvement to roles --- roles/config_repos/tasks/main.yml | 6 +- roles/gpu/tasks/main.yml.bkup | 206 ++++++++++++++++++++++++++ roles/mellanox_drivers/tasks/main.yml | 13 +- roles/upgrade/tasks/main.yml | 8 + 4 files changed, 225 insertions(+), 8 deletions(-) create mode 100644 roles/gpu/tasks/main.yml.bkup diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml index 9dde0fc4..fe8f56f6 100644 --- a/roles/config_repos/tasks/main.yml +++ b/roles/config_repos/tasks/main.yml @@ -43,11 +43,7 @@ check_mode: no - name: disable unwanted repos - shell: yum-config-manager --disable {{ item }} -# yum_repository: -# name: "{{ item }}" -# enabled: False -# state: 'absent' + shell: yum-config-manager --disable "{{ item }}" with_items: "{{repolist.stdout_lines|difference(yumenablerepo)}}" become: true become_user: root diff --git a/roles/gpu/tasks/main.yml.bkup b/roles/gpu/tasks/main.yml.bkup new file mode 100644 index 00000000..0cdf5047 --- /dev/null +++ b/roles/gpu/tasks/main.yml.bkup @@ -0,0 +1,206 @@ +--- +- name: install deps + yum: name={{ item }} state=installed + sudo: true + with_items: + - gcc + - perl + - wget + - pciutils + - kernel-headers + - kernel-devel + - xterm + - libX11-common + - libX11-devel + - libX11 + - xorg-x11-server-common + - xorg-x11-util-macros + - xorg-x11-server-utils + - xorg-x11-font-utils + - xorg-x11-server-Xorg + - xorg-x11-glamor + - xorg-x11-xinit + - xorg-x11-utils + - xorg-x11-xauth + - xorg-x11-proto-devel + - xorg-x11-xkb-utils + +- name: install development tools + yum: name="@Development Tools" state=installed + become: true + become_user: root + +- name: disable nouveau + template: src=blacklist-nouveau.conf.j2 dest=/etc/modprobe.d/blacklist-nouveau.conf + become: true + become_user: root + +- name: template unit for for persistenced + template: src=nvidia-persistenced.service dest=/etc/systemd/system/nvidia-persistenced.service + become: true + become_user: root + +- name: create the nvidia-persistenced user + user: name=nvidia-persistenced state=present system=yes shell=/bin/false + become: true + become_user: root + + +- name: remove nouveau + modprobe: name=nouveau state=absent + become: true + become_user: root + +- name: get kernel version + shell: uname -r + register: kernel_version + check_mode: no + + +- name: check nvidia driver + stat: path="/lib/modules/{{ kernel_version.stdout }}/kernel/drivers/video/nvidia.ko" + register: nvidia_driver + ignore_errors: true + +- name: set default driver version + set_fact: + installed_driver_version: '0.0' + +- name: check nvidia driver version + shell: 'nvidia-smi | grep -Po "Driver Version: \K\S+"' + register: installed_driver_version + when: nvidia_driver.stat.exists + check_mode: no + +- name: debug - installed nvidia driver version + debug: + msg: "{{ installed_driver_version }} " + +- name: set install default + set_fact: + install_driver: false + +- name: set uninstall default + set_fact: + uninstall_driver: false + +- name: set install + set_fact: + install_driver: true + when: not nvidia_driver.stat.exists or not installed_driver_version.stdout_lines[0] == nvidia_version + +- name: set uninstall + set_fact: + uninstall_driver: true + when: nvidia_driver.stat.exists and not installed_driver_version.stdout_lines[0] == nvidia_version + +- name: Unload nvidia driver + shell: rmmod nvidia_uvm nvidia_drm nvidia_modeset nvidia || true + sudo: true + when: install_driver + +- name: stop the persistence daemon + service: name=nvidia-persistenced state=stopped + become: true + become_user: root + when: uninstall_driver + +- name: kill any X processes + shell: ps ax | grep "X :0" | grep -v grep | cut -f 1 -d " " | xargs -I{} kill -9 {} + become: true + become_user: root + when: uninstall_driver + +- name: get old nvidia driver + get_url: url=http://consistency0/src/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run dest=/tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run + become: true + become_user: root + when: uninstall_driver + ignore_errors: true + +- name: uninstall old nvidia driver + shell: chmod 755 /tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run; /tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run --uninstall --silent + become: true + become_user: root + when: uninstall_driver + +- name: clean up old driver installation file + file: + state: absent + path: /tmp/NVIDIA-Linux-x86_64-{{ installed_driver_version.stdout_lines[0] }}.run + become: true + become_user: root + ignore_errors: true + +- name: get nvidia driver + get_url: url=http://consistency0/src/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run dest=/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run + become: true + become_user: root + when: install_driver + +#- name: Copy boot file +# template: src=grub.conf.j2 dest=/boot/grub/grub.conf +# sudo: true +# +#- name: Copy X config file +# template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf +# sudo: true + +- name: Copy xserver file + template: src=xserver.j2 dest=/etc/pam.d/xserver + become: true + become_user: root + +- name: build nvidia driver + shell: chmod 755 /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run; /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent + become: true + become_user: root + when: install_driver + +- name: set the GOM + shell: nvidia-smi --gom=0 + become: true + become_user: root + +- name: enable persistenced on boot + service: name=nvidia-persistenced state=started enabled=yes + become: true + become_user: root + +- name: Configure xorg.conf with nvidia-xconfig so xorg.conf matches gpu number + shell: /usr/bin/nvidia-xconfig -a --use-display-device=none --preserve-busid + become: true + become_user: root + args: + creates: /etc/X11/xorg.conf + +#- name: Template xorg.conf for nodes with one GPU +# template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf +# become: true +# become_user: root +# when: template_xorgconf is defined and template_xorgcon + +- name: run nvidia-xconf-gen + script: scripts/nvidia-xconf-gen.py + register: nvidiacards + check_mode: no + +- name: set env for nvidia_card_lists + set_fact: + nvidiacardslist: "{{ nvidiacards.stdout | from_json }}" + +- name: generate nvidia-xorg-conf + sudo: true + template: + src: xorg.conf.j2 + dest: "{{ item['filename'] }}" + with_items: "{{ nvidiacardslist }}" + +- name: clean up nvidia driver installation file + file: + state: absent + path: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run + become: true + become_user: root + ignore_errors: true + diff --git a/roles/mellanox_drivers/tasks/main.yml b/roles/mellanox_drivers/tasks/main.yml index 9425296e..146f1a70 100644 --- a/roles/mellanox_drivers/tasks/main.yml +++ b/roles/mellanox_drivers/tasks/main.yml @@ -9,29 +9,33 @@ when: ansible_os_family == "RedHat" - name: test for existing installation of drivers - command: ibv_devinfo + shell: '/bin/ibv_devinfo' become: true become_user: root register: drivers_installed ignore_errors: true check_mode: no +- name: debug - print out installed driver + debug: var=drivers_installed + - name: default dont install set_fact: install_now: false reboot_now: false - name: get driver version - command: 'ofed_info -l | head -n 1 | cut -f 1 -d " "' + shell: '/bin/ofed_info -l | head -n 1 | cut -f 1 -d " "' register: driver_version ignore_errors: true check_mode: no + changed_when: False - name: get desired driver version shell: 'echo {{ MELLANOX_DRIVER_SRC }} | cut -f 1,2,3 -d "-"' register: desired_driver_version check_mode: no - + changed_when: False - name: set install due to drivers not installed set_fact: @@ -51,6 +55,9 @@ reboot_now: true when: driver_version | failed or not desired_driver_version.stdout in driver_version.stdout +- name: debug - print out value of install_now + debug: var=install_now + - name: copy driver source unarchive: copy=no src="http://consistency0/src/{{ MELLANOX_DRIVER_SRC }}.tgz" dest=/tmp become: true diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml index a730c502..4f29a072 100644 --- a/roles/upgrade/tasks/main.yml +++ b/roles/upgrade/tasks/main.yml @@ -23,6 +23,12 @@ become_user: root when: ansible_os_family=="RedHat" and yumdisablerepo is defined +- name: Clear yum pending transaction + command: yum-complete-transaction --cleanup-only + become: true + become_user: root + when: ansible_os_family == 'RedHat' + - name: yum upgrade yum: name=* state=latest become: true @@ -46,11 +52,13 @@ register: rpm_q_output when: ansible_os_family=="RedHat" check_mode: no + changed_when: False - name: get kernel version shell: uname -r register: uname_r_output check_mode: no + changed_when: False - name: default dont reboot set_fact: -- GitLab