diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml index fe8f56f6c46f9f755625d3ca32fc0f855d6ce2a8..fd13eb52916874e2c500602c01025ecae8470c1a 100644 --- a/roles/config_repos/tasks/main.yml +++ b/roles/config_repos/tasks/main.yml @@ -41,6 +41,8 @@ shell: yum repolist all | grep enabled | cut -f 1 -d '/' | sed -s 's/\!//' register: repolist check_mode: no + args: + warn: False - name: disable unwanted repos shell: yum-config-manager --disable "{{ item }}" diff --git a/roles/deploy-xorg/templates/xorg.conf.j2 b/roles/deploy-xorg/templates/xorg.conf.j2 index 2fc5f043c03e710be306d3b740cb6f5963216860..7f8b0f82076f6cfa81b34c4dd00a0d460cbe81a6 100644 --- a/roles/deploy-xorg/templates/xorg.conf.j2 +++ b/roles/deploy-xorg/templates/xorg.conf.j2 @@ -66,8 +66,10 @@ Section "Screen" Device "Device{{item.screens.index(screen)}}" Monitor "Monitor{{item.screens.index(screen)}}" DefaultDepth 24 - Option "UseDisplayDevice" "None" Option "ProbeAllGpus" "false" +{% if item.boardname == 'GRID K1' %} + Option "UseDisplayDevice" "None" +{% endif %} SubSection "Display" Virtual 1920 1200 Depth 24 diff --git a/roles/gpu/files/scripts/nvidia-xconf-gen.py b/roles/gpu/files/scripts/nvidia-xconf-gen.py index 7cd9cb551f348d608b583466322b1acc137e9b8d..6993d3339bd57d42fb2860dc3a7ac87a79c9e71b 100755 --- a/roles/gpu/files/scripts/nvidia-xconf-gen.py +++ b/roles/gpu/files/scripts/nvidia-xconf-gen.py @@ -29,8 +29,18 @@ def grab_card_ids(): cards = [] for line in p.stdout.readlines(): - line = line.rstrip().split(":")[2] - pcibus_num = int(re.sub('[.:]', '', line).rstrip("0"),16) + stripped_line = line.rstrip().split(":")[2] + #check for different format of pcibus_id. This happens on baremetals + # i.e. 
00000000:06:00.0 not 00000000:00:06.0 + pcibus_id = re.sub('[.:]', '', stripped_line).rstrip("0") + if not pcibus_id: # empty string, try the other way + stripped_line = line.rstrip().split(":")[1] + pcibus_id = re.sub('[.:]', '', stripped_line).rstrip("0") + if not pcibus_id: + print("Error in grab_card_ids: we can not parse the line {}".format(line)) + print("Command that generated it is: {}".format(cmd)) + raise SystemExit(1) + pcibus_num = int(pcibus_id,16) card = "PCI:0:{}:0".format(str(pcibus_num)) cards.append(card) return cards diff --git a/roles/gpu/templates/xorg.conf.j2 b/roles/gpu/templates/xorg.conf.j2 index 2fc5f043c03e710be306d3b740cb6f5963216860..7f8b0f82076f6cfa81b34c4dd00a0d460cbe81a6 100644 --- a/roles/gpu/templates/xorg.conf.j2 +++ b/roles/gpu/templates/xorg.conf.j2 @@ -66,8 +66,10 @@ Section "Screen" Device "Device{{item.screens.index(screen)}}" Monitor "Monitor{{item.screens.index(screen)}}" DefaultDepth 24 - Option "UseDisplayDevice" "None" Option "ProbeAllGpus" "false" +{% if item.boardname == 'GRID K1' %} + Option "UseDisplayDevice" "None" +{% endif %} SubSection "Display" Virtual 1920 1200 Depth 24 diff --git a/roles/ldapclient/tasks/installOpenLdap.yml b/roles/ldapclient/tasks/installOpenLdap.yml index b11a480d754812d7c4f7139611502dcb1219fcf4..f0db145ecbf8f310695cdcaebb672d10baacf9d9 100644 --- a/roles/ldapclient/tasks/installOpenLdap.yml +++ b/roles/ldapclient/tasks/installOpenLdap.yml @@ -1,14 +1,8 @@ --- - name: "Install open ldap package yum" - yum: name={{ item }} state=present - with_items: - - openldap - - openldap-clients - - sssd - - sssd-common - - sssd-client - - nss - - nss-tools + yum: + name: ['openldap', 'openldap-clients', 'sssd', 'sssd-common', 'sssd-client', 'nss', 'nss-tools'] + state: present sudo: true when: ansible_os_family == 'RedHat' diff --git a/roles/mellanox_drivers/tasks/main.yml b/roles/mellanox_drivers/tasks/main.yml index 146f1a70433c56ad5806f3cf2c39090064ca6b1b..c6c1f4a30fc014c000a433ba4bfe4db629f370af 100644 
--- a/roles/mellanox_drivers/tasks/main.yml +++ b/roles/mellanox_drivers/tasks/main.yml @@ -53,7 +53,7 @@ set_fact: install_now: true reboot_now: true - when: driver_version | failed or not desired_driver_version.stdout in driver_version.stdout + when: driver_version is failed or not desired_driver_version.stdout in driver_version.stdout - name: debug - print out value of install_now debug: var=install_now diff --git a/roles/p100_firmware/README.md b/roles/p100_firmware/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb5b0479846737672576214a275b578870ba50c6 --- /dev/null +++ b/roles/p100_firmware/README.md @@ -0,0 +1,5 @@ +Role to upgrade firmware on P100 nodes + +Usage + - { role: p100_firmware, BINARY_NAME: "P100_PCN204260.bin" , tags: [p100] } + diff --git a/roles/p100_firmware/tasks/main.yml b/roles/p100_firmware/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..44580f195db1bab74e0bf39dabd725c233c5d8c0 --- /dev/null +++ b/roles/p100_firmware/tasks/main.yml @@ -0,0 +1,69 @@ +--- +- name: "stop nvidia persistence daemon" + service: name=nvidia-persistenced state=stopped + become: true + become_user: root + ignore_errors: true +- name: stop collectd + service: name=collectd state=stopped + become: true + become_user: root + ignore_errors: true +- name: stop create-dev-uvm + service: name=create-dev-uvm state=stopped + become: true + become_user: root + ignore_errors: true +- name: remove nvidia_drm from kernel + modprobe: + name: nvidia_drm + state: absent + become: true + become_user: root +- name: remove nvidia_modeset from kernel + modprobe: + name: nvidia_modeset + state: absent + become: true + become_user: root +- name: remove nvidia from kernel + modprobe: + name: nvidia + state: absent + become: true + become_user: root +- name: check for nvidia modules + shell: /usr/sbin/lsmod | grep nvidia + ignore_errors: true +#- meta: end_play +- name: download Firmware + get_url: + url: 
"http://consistency0/src/{{ BINARY_NAME }}" + dest: "/tmp/{{ BINARY_NAME }}" + mode: "0755" +- name: Run the binary command + command: "/tmp/{{ BINARY_NAME }}" + #command: "ls -l /tmp/{{ BINARY_NAME }}" + become: true + become_user: root + register: upgrade_out +- name: stdout of upgrade is + debug: var=upgrade_out.stdout +- name: stderr of upgrade is + debug: var=upgrade_out.stderr +- name: enable persistenced on boot + service: name=nvidia-persistenced state=started enabled=yes + become: true + become_user: root +- name: start collectd + service: name=collectd state=started + become: true + become_user: root + ignore_errors: true +- name: start create-dev-uvm + service : name=create-dev-uvm state=started + become: true + become_user: root + ignore_errors: true +- name: DON'T FORGET TO REBOOT + debug: msg="And I really mean it." diff --git a/roles/pam_slurm/templates/sshd.j2 b/roles/pam_slurm/templates/sshd.j2 index a1218458728bb47fea1d4f73194191a516cb6214..fea4fda0e8db16351917037f681ae82a4795d5df 100644 --- a/roles/pam_slurm/templates/sshd.j2 +++ b/roles/pam_slurm/templates/sshd.j2 @@ -6,7 +6,7 @@ auth include postlogin -auth optional pam_reauthorize.so prepare account required pam_nologin.so account include password-auth -account sufficient pam_slurm.so +account sufficient pam_slurm_adopt.so account required pam_access.so password include password-auth # pam_selinux.so close should be the first session rule diff --git a/roles/rsyslog_client/tasks/main.yml b/roles/rsyslog_client/tasks/main.yml index 2aec4a9c6688a96994edb563b14c15b017cd599e..9b087381192f7818bd9a61467dea29614dab0ac7 100644 --- a/roles/rsyslog_client/tasks/main.yml +++ b/roles/rsyslog_client/tasks/main.yml @@ -22,4 +22,4 @@ service: name=rsyslog state=restarted become: true become_user: root - when: config_changed | changed + when: config_changed is changed diff --git a/roles/slurm-common/tasks/installCgroup.yml b/roles/slurm-common/tasks/installCgroup.yml index 
9b21e1b4d7fba85c0b47e8ce12663faedd8b19f4..c7f4253d3dfcb0540421c27249d7aee0a4920118 100644 --- a/roles/slurm-common/tasks/installCgroup.yml +++ b/roles/slurm-common/tasks/installCgroup.yml @@ -2,7 +2,8 @@ yum: name={{ item }} state=installed with_items: - libcgroup - sudo: true + become: True + become_method: sudo when: ansible_os_family == "RedHat" - name: apt install cgroup @@ -11,14 +12,16 @@ - cgmanager - cgmanager-utils - libcgmanager0 - sudo: true when: ansible_os_family == "Debian" - sudo: true + become: True + become_method: sudo - name: config cgroup.conf file template: dest={{ slurm_dir }}/etc/cgroup.conf src=cgroup.conf.j2 mode=644 - sudo: true + become: True + become_method: sudo - name: config cgroup_allowed_devices.conf file template: dest={{ slurm_dir }}/etc/cgroup_allowed_devices.conf src=cgroup_allowed_devices.conf.j2 mode=644 - sudo: true + become: True + become_method: sudo diff --git a/roles/slurm-common/tasks/installSlurmFromSource.yml b/roles/slurm-common/tasks/installSlurmFromSource.yml index 167994b2dc000568ee739480d28a07679f86685c..8785c3692b1d6e26e6b6a0caed85f92942e185c1 100644 --- a/roles/slurm-common/tasks/installSlurmFromSource.yml +++ b/roles/slurm-common/tasks/installSlurmFromSource.yml @@ -8,20 +8,20 @@ sudo: true when: force_slurm_recompile is defined - - name: unarchive slurm unarchive: - args: src: "http://consistency0/src/slurm-{{ slurm_version }}.tar.bz2" - copy: no dest: /tmp + remote_src: yes creates: "{{ slurm_dir }}/bin/srun" - name: stat srun stat: path="{{ slurm_dir }}/bin/srun" register: stat_srun - +- name: stat pam_slurm_adopt + stat: path="/lib64/security/pam_slurm_adopt.so" + register: stat_pam_slurm_adopt - name: configure slurm command: /tmp/slurm-{{ slurm_version }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam @@ -45,6 +45,19 @@ creates: "{{ slurm_dir }}/bin/srun" when: force_slurm_recompile is defined or not stat_srun.stat.exists +- name: build pmi + command: make + args: + chdir: 
/tmp/slurm-{{ slurm_version }}/contribs/pmi + when: force_slurm_recompile is defined or not stat_srun.stat.exists + +- name: install pmi + shell: make install + become: true + args: + chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi + when: force_slurm_recompile is defined or not stat_srun.stat.exists + - name: build pam_slurm command: make args: @@ -58,6 +71,18 @@ chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam when: force_slurm_recompile is defined or not stat_srun.stat.exists +- name: build pam_slurm_adopt + make: + chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam_slurm_adopt + when: force_slurm_recompile is defined or not stat_pam_slurm_adopt.stat.exists + +- name: install pam_slurm_adopt + make: + chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam_slurm_adopt + target: install + when: force_slurm_recompile is defined or not stat_pam_slurm_adopt.stat.exists + become: true + - name: add slurm log rotate config template: src=slurmlog.j2 dest=/etc/logrotate.d/slurm mode=644 sudo: true diff --git a/roles/slurm-mysql-config/tasks/main.yml b/roles/slurm-mysql-config/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..52f06b184ac0f5487e09b633a97b2db40e712f2a --- /dev/null +++ b/roles/slurm-mysql-config/tasks/main.yml @@ -0,0 +1,4 @@ +- name: "Copy slurm db tuning config" + template: src=slurm.cnf.j2 dest=/etc/my.cnf.d/slurm.cnf + become: true + become_user: root diff --git a/roles/slurm-mysql-config/templates/slurm.cnf.j2 b/roles/slurm-mysql-config/templates/slurm.cnf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..56c0038550e7389916d14c7b350d1aa6c574dc32 --- /dev/null +++ b/roles/slurm-mysql-config/templates/slurm.cnf.j2 @@ -0,0 +1,4 @@ +[mysqld] +innodb_buffer_pool_size=1024M +innodb_log_file_size=256M +innodb_lock_wait_timeout=900 diff --git a/roles/slurm_config/tasks/main.yml b/roles/slurm_config/tasks/main.yml index 8a6768ab999e1b30bade948452f4e5f5f2f9b6f0..feec10209a05172fcf9f887384233a553444b5e5 100644 
--- a/roles/slurm_config/tasks/main.yml +++ b/roles/slurm_config/tasks/main.yml @@ -2,3 +2,7 @@ - name: install slurm.conf copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf sudo: true + +- name: install job_submit.lua + copy: src=files/job_submit.lua dest={{ slurm_dir }}/etc/job_submit.lua + become: true diff --git a/roles/slurmdb-config/readme.md b/roles/slurmdb-config/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..ee21d6bcc8a88049435ff9c4c589e27093a7c29c --- /dev/null +++ b/roles/slurmdb-config/readme.md @@ -0,0 +1,12 @@ +If the database is already up and running, running this role will not work. +To change the log file size without data loss, follow these steps to apply the config. + +1. While mariadb is still running + MySQL> SET GLOBAL innodb_fast_shutdown=0; +2. Stop mariadb + systemctl stop mariadb +3. Run this role to copy the config to /etc/my.cnf.d +4. Go to /var/lib/mysql + mv ib_logfile0 ib_logfile0_orig + mv ib_logfile1 ib_logfile1_orig +5. 
systemctl start mariadb diff --git a/roles/slurmdb-config/tasks/main.yml b/roles/slurmdb-config/tasks/main.yml index a31f5ad72b0a21cc1ebb67d654eea977205b33b1..3e23046fe6d1011f6bb23b4c937089c8724ec053 100644 --- a/roles/slurmdb-config/tasks/main.yml +++ b/roles/slurmdb-config/tasks/main.yml @@ -22,13 +22,23 @@ sudo: true - name: install slurmdb.conf - copy: src=files/slurmdbd.conf dest={{ slurm_dir }}/etc/slurmdbd.conf + copy: + src: files/slurmdbd.conf + dest: "{{ slurm_dir }}/etc/slurmdbd.conf" + owner: slurm + group: slurm + mode: u+rw,g-wx,o-rwx sudo: true when: slurm_dir is defined - name: install slurmdbd.conf - copy: src=slurmdbd.conf dest=/etc/slurm/slurmdbd.conf + copy: + src: slurmdbd.conf + dest: /etc/slurm/slurmdbd.conf + owner: slurm + group: slurm + mode: u+rw,g-wx,o-rwx sudo: true when: slurm_dir is not defined diff --git a/roles/ssh-nopassword-login/handlers/main.yml b/roles/ssh-nopassword-login/handlers/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..df0d3deeca457c10a9805a439cb4a61087cac8d3 --- /dev/null +++ b/roles/ssh-nopassword-login/handlers/main.yml @@ -0,0 +1,9 @@ +- name: "restart sshd" + service: name=sshd state=restarted + become: true + when: ansible_os_family == "RedHat" + +- name: "restart ssh" + service: name=ssh state=restarted + become: true + when: ansible_os_family == "Debian" diff --git a/roles/ssh-nopassword-login/tasks/main.yml b/roles/ssh-nopassword-login/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..f8594e1902a904b5be06ab3575c1ae697532b854 --- /dev/null +++ b/roles/ssh-nopassword-login/tasks/main.yml @@ -0,0 +1,24 @@ +- name: "Disable Challenge Response" + lineinfile: + args: + dest: /etc/ssh/sshd_config + regexp: "ChallengeResponseAuthentication yes" + line: "ChallengeResponseAuthentication no" + backrefs: yes + become: true + notify: + - restart sshd + - restart ssh + +- name: "Disable Password" + lineinfile: + args: + dest: /etc/ssh/sshd_config + regexp: 
"PasswordAuthentication yes" + line: "PasswordAuthentication no" + backrefs: yes + become: true + notify: + - restart sshd + - restart ssh + diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml index 4f29a0726abe5481518a0cabf31ca7f96c111482..0d0a6041a98f6d641e0e4aecd38479e6063f444e 100644 --- a/roles/upgrade/tasks/main.yml +++ b/roles/upgrade/tasks/main.yml @@ -53,6 +53,8 @@ when: ansible_os_family=="RedHat" check_mode: no changed_when: False + args: + warn: False - name: get kernel version shell: uname -r