diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4e20cd92aefd7c1be8795d33794e6bf3efd356fb..354bef662f796db607f7c07b75c4907d7d5026fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -170,14 +170,14 @@ tests: - ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sudo ls" all - echo -e '[defaults]\r\nallow_world_readable_tmpfiles = True' > ansible.cfg - ansible-playbook -i files/inventory.$STACKNAME --key-file ../gc_key.pem ./tests/mockSlurmData.yml - # Need to find a better check for sinfo - #- ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "find /opt/ -name sinfo -type f" ManagementNodes - #- ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "find /opt/ -name squeue -type f" ManagementNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "/opt/slurm-latest/bin/sinfo" ManagementNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "/opt/slurm-latest/bin/squeue" ManagementNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "/opt/slurm-latest/bin/scontrol ping" LoginNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet ntpd" all - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet mariadb" SQLNodes - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet slurmctld" ManagementNodes - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet slurmdbd" ManagementNodes - - + - bash -e ./tests/run_tests.sh all "files/inventory.$STACKNAME" "../gc_key.pem" - bash -e ./tests/run_tests.sh ComputeNodes "files/inventory.$STACKNAME" "../gc_key.pem" - bash -e ./tests/run_tests.sh LoginNodes "files/inventory.$STACKNAME" "../gc_key.pem" diff --git a/CICD/ansible.cfg b/CICD/ansible.cfg new file mode 100644 index 0000000000000000000000000000000000000000..98dfd39a6d72b3dfa28f27e2e564c2fb46af8b45 --- /dev/null +++ b/CICD/ansible.cfg @@ -0,0 +1,3 @@ +[defaults] +remote_tmp = /tmp/.ansible/tmp +host_key_checking = False diff --git a/CICD/plays/allnodes.yml b/CICD/plays/allnodes.yml index beca31d3ab444b2b334d092cfff05d315a8b99be..dd901dc04306d2fdb6083192432b9ab88bc48082 100644 --- a/CICD/plays/allnodes.yml +++ b/CICD/plays/allnodes.yml @@ -19,6 +19,7 @@ - vars/filesystems.yml - vars/slurm.yml - vars/vars.yml + - vars/vars_centos78.yml strategy: free roles: # - { role: disable_selinux, tags: [ disableselinux ] } diff --git a/CICD/plays/mgmtnodes.yml b/CICD/plays/mgmtnodes.yml index 691b6ff058c6b8375ee4d28c7cd1d31516a7f242..838ae35575bb70f73b8dc4597c2849e773beed25 100644 --- a/CICD/plays/mgmtnodes.yml +++ b/CICD/plays/mgmtnodes.yml @@ -10,6 +10,7 @@ - vars/filesystems.yml - vars/slurm.yml - vars/vars.yml + - vars/vars_centos78.yml tasks: # - { name: set hostgroup, set_fact: hostgroup='ManagementNodes' } - { name: set use shared state, set_fact: usesharedstatedir=True } @@ -24,6 +25,7 @@ - vars/filesystems.yml - vars/slurm.yml - vars/vars.yml + - vars/vars_centos78.yml roles: # - { role: ldapclient, tags: [ authentication ] } # - { role: ssh-password-login } @@ -33,7 +35,7 @@ - { role: slurmdb-config, tags: [ slurm, slurmdb-config ] } - { role: slurm-common, tags: [ slurm, slurm-common ] } - { role: slurm_config, tags: [ slurm, slurm-config ] } - - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, slurmd_enabled: False, tags: [ slurm-start ] } + - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, tags: [ slurm-start ] } - { role: telegraf, telegraf_install_rpm_url: 'http://consistency0/src/telegraf-1.12.6-1.x86_64.rpm', tags: [ monitoring, SiteSpecific ] } # - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] } # - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] } diff --git a/CICD/plays/nfssqlnodes.yml b/CICD/plays/nfssqlnodes.yml index 9284f83246fa7e0913a303fd1928b8cea935555e..e1b66e13a48f8a9a4734dc1128aac45ced1431c2 100644 --- a/CICD/plays/nfssqlnodes.yml +++ b/CICD/plays/nfssqlnodes.yml @@ -17,6 +17,7 @@ - include_vars: vars/filesystems.yml - include_vars: vars/slurm.yml - include_vars: vars/vars.yml + - include_vars: vars/vars_centos78.yml - { name: set hostgroup, set_fact: hostgroup='ManagementNodes' } - { name: set use shared state, set_fact: usesharedstatedir=True } tags: [ always ] @@ -41,6 +42,7 @@ - vars/filesystems.yml - vars/slurm.yml - vars/vars.yml + - vars/vars_centos78.yml strategy: free gather_facts: True roles: diff --git a/CICD/plays/testlustre/testlustre.yml b/CICD/plays/testlustre/testlustre.yml index 265539317a19bea87e2faf505fd0c0940a010eff..6011c7aad2dc22b2303d86e95b40ea392e55da32 100644 --- a/CICD/plays/testlustre/testlustre.yml +++ b/CICD/plays/testlustre/testlustre.yml @@ -93,11 +93,15 @@ state: mounted - hosts: 'LoginNodes' # this does not work. we need to config_repos first -# vars_files: -# - vars/vars.yml + tasks: + - include_vars: vars/vars_centos78.yml + +- hosts: 'LoginNodes' roles: - { role: config_repos, tags: [ repos ] } - + - { role: upgrade, tags: [ upgrade ] } # upgrade from centos7.6 base image to centos7.8 + - { role: mellanox_drivers, start_roce_service: false } + - hosts: 'LoginNodes' # this does not work. we need to config_repos first tasks: - name: install rpms diff --git a/CICD/plays/testlustre/vars b/CICD/plays/testlustre/vars index e8d9a6429b3aaab679b98557469104f0f7cc952b..8559d2e08fc22c0c466a5bf80d980a2c6a859437 120000 --- a/CICD/plays/testlustre/vars +++ b/CICD/plays/testlustre/vars @@ -1 +1 @@ -../vars \ No newline at end of file +../../vars \ No newline at end of file diff --git a/CICD/vars/vars.yml b/CICD/vars/vars.yml index 5648bc3bb89ecc816ae625760cb2725a8aef801f..8620f51c950e19534b8e44781775641892464047 100644 --- a/CICD/vars/vars.yml +++ b/CICD/vars/vars.yml @@ -1,25 +1,7 @@ --- -#centos77 -LUSTRE_VERSION: 2.13.0-1.el7 -KERNEL_VERSION: 3.10.0-1062.12.1.el7.x86_64 -repopath: centos-staging -MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-4.7-3.2.9.0-rhel7.7-x86_64{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" - - sudo_group: systems nagios_home: "/var/lib/nagios" -nvidia_version: "367.134" - -yumdisablerepo: - - 'base' - - 'extras' - - 'updates' -yumenablerepo: - - 'monashhpc_base' - - 'monashhpc_updates' - - 'monashhpc_extras' - - 'monashhpc_centosplus' - - 'monashhpc_otherstuff' +nvidia_version: "450.51.06" gpumap: 'K1': 'K1' diff --git a/CICD/vars/vars_centos78.yml b/CICD/vars/vars_centos78.yml new file mode 100644 index 0000000000000000000000000000000000000000..abe6b526be1b18214a5b51744c90bdba085e3aa6 --- /dev/null +++ b/CICD/vars/vars_centos78.yml @@ -0,0 +1,24 @@ +--- + +#centos7.8 +KERNEL_VERSION: 3.10.0-1127.18.2.el7.x86_64 +LUSTRE_VERSION: 2.13.55-1.el7 +MELLANOX_DRIVER_SRC: MLNX_OFED_LINUX-4.9-0.1.7.0-rhel7.8-ext +repopath: centos7.7 +centos_release: centos7.8 + +yumdisablerepo: [] + +yumenablerepo: + - 'base' + - 'updates' + - 'extras' + - 'monashhpc_lustreversions' + - 'monashhpc_base' + - 'monashhpc_updates' + - 'monashhpc_extras' + - 'monashhpc_centosplus' + - 'monashhpc_otherstuff' + - 'monashhpc_k1gpusupport' + + diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml index 7f260588ab68b9fbcc95a76de48a52c125beb8b9..9e258fc0b8357780b0c4acddc0abd72789085fc3 100644 --- a/roles/config_repos/tasks/main.yml +++ b/roles/config_repos/tasks/main.yml @@ -8,28 +8,6 @@ group: root become: True -#- name: remove default repos -# file: -# path: /etc/yum.repos.d/{{ item }} -# state: absent -# become: true -# become_user: root -# with_items: -# - CentOS-Base.repo -# - CentOS-Debuginfo.repo -# - CentOS-fasttrack.repo -# - CentOS-Sources.repo -# - CentOS-Vault.repo -# - foreman.repo -# - puppetlabs.repo -# - rdo-release.repo -# - CentOS-CR.repo -# - CentOS-Media.repo -# - CentOS-OpenStack-kilo.repo -# - epel.repo -# - monashhpc_mellanox.repo -# - epel-testing.repo -# when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == "7" - name: add our repos template: src={{ item }}.j2 dest=/etc/yum.repos.d/{{ item }} @@ -45,33 +23,42 @@ #shell: yum repolist | grep -v "repo id" | grep -v "Loaded plugins" | head -n -1 | cut -f 1 -d '/' | sed -s 's/\!//' shell: yum repolist all | grep enabled | cut -f 1 -d '/' | sed -s 's/\!//' when: ansible_os_family == 'RedHat' - register: repolist + register: repolistenabled check_mode: no changed_when: False args: warn: False -- name: disable unwanted repos - shell: yum-config-manager --disable "{{ item }}" - with_items: "{{ repolist.stdout_lines|difference(yumenablerepo) }}" - become: true - become_user: root - ignore_errors: true +- name: get disabled repos + shell: yum repolist all | grep disabled | cut -f 1 -d '/' | sed -s 's/\!//' when: ansible_os_family == 'RedHat' + register: repolistdisabled + check_mode: no + changed_when: False + args: + warn: False - name: enable wanted repos shell: yum-config-manager --enable "{{ item }}" - with_items: "{{ yumenablerepo }}" + #with_items: "{{ repolistenabled.stdout_lines|difference(yumenablerepo) }}" + with_items: "{{ yumenablerepo | symmetric_difference(repolistenabled.stdout_lines) }}" become: true become_user: root - ignore_errors: true + #ignore_errors: true when: ansible_os_family == 'RedHat' + register: repoenable + + +- name: disable unwanted repos + shell: yum-config-manager --disable "{{ item }}" + with_items: "{{ repolistenabled.stdout_lines|difference(yumenablerepo) }}" + become: true + become_user: root + #ignore_errors: true + when: ansible_os_family == 'RedHat' + register: repodisable -#- name: Enable epel -# command: yum-config-manager --enable epel -# become: true -# when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" # Use mate DE on systems that have moved to gnome3, since there is no gpu acceleration by default on NeCTAR openstack # Trusty (Ubuntu 14.04 LTS) needs repos added. Wheezy (Debian Stable) gets mate from backports, Utopic (Ubuntu 14.10) Jessie (Debian testing) and Sid (Debian unstable) get it by default @@ -89,9 +76,11 @@ apt: update_cache=True become: true when: ansible_os_family=="Debian" + - name: force refresh of the repository cache shell: | + yum clean metadata yum clean all yum updateinfo yum makecache @@ -99,6 +88,6 @@ async: 600 poll: 5 check_mode: no - when: addingrepos.changed and ansible_os_family == 'RedHat' + when: ansible_os_family == 'RedHat' and ( addingrepos.changed or repoenable.changed or repodisable.changed) args: warn: False diff --git a/roles/mellanox_drivers/tasks/main.yml b/roles/mellanox_drivers/tasks/main.yml index 1b7f585811a5b345ed92d249c8e0123c607ecf01..7046800e15610a4aa2b9a94ae96b6a463ae55360 100644 --- a/roles/mellanox_drivers/tasks/main.yml +++ b/roles/mellanox_drivers/tasks/main.yml @@ -147,4 +147,5 @@ - name: enable roce_mode setting service: name=roce_mode state=started enabled=yes - become: true \ No newline at end of file + become: true + when: start_roce_service is undefined or start_roce_service diff --git a/roles/ntp/tasks/main.yml b/roles/ntp/tasks/main.yml index a3d428ae3b94e99799608daf832c93a3470bd2ba..a023263d3880acf037ba17093c08918cfe04cef2 100644 --- a/roles/ntp/tasks/main.yml +++ b/roles/ntp/tasks/main.yml @@ -24,8 +24,14 @@ become_user: root notify: restart ntpd -- name: "enable ntp" +- name: "enable ntpd" service: name=ntpd enabled=yes state=started become: true become_user: root when: ansible_os_family == 'RedHat' + +- name: "enable ntp" + service: name=ntp enabled=yes state=started + become: true + become_user: root + when: ansible_os_family == 'Debian' \ No newline at end of file diff --git a/roles/slurm-start/tasks/main.yml b/roles/slurm-start/tasks/main.yml index c3e4c3e5157b686768f1691f6b6521f2d6ac2773..a81a25a7a0c8ebaf6596ec3a8b9c084259225d54 100644 --- a/roles/slurm-start/tasks/main.yml +++ b/roles/slurm-start/tasks/main.yml @@ -8,7 +8,7 @@ - name: set slurmd_enabled (default enabled) set_fact: slurmd_enabled: True - when: slurmd_enabled is not defined and start_slurmd + when: slurmd_enabled is not defined and start_slurmd is defined and start_slurmd - name: install slurmdbd initt template: src=slurmdbd.initd.j2 dest=/etc/init.d/slurmdbd mode=755 @@ -101,14 +101,14 @@ when: slurm_cluster_count.stdout == '1' and slurmctrl == inventory_hostname - name: start slurmd - service: name=slurmd state=started enabled={{ slurmd_enabled }} + service: name=slurmd enabled={{ slurmd_enabled }} become: true - when: use_systemd is defined and start_slurmd is defined + when: use_systemd is defined and slurmd_enabled is defined - name: restart slurmd service: name=slurmd state=restarted enabled={{ slurmd_enabled }} become: true - when: use_systemd is defined and start_slurmd is defined and slurmd_service_installed.changed + when: use_systemd is defined and start_slurmd is defined and slurmd_service_installed.changed and start_slurmd - name: start slurm service: name=slurm state=restarted enabled={{ slurmd_enabled }}