diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b5fdbc6dc55e7066c685e8badee13c89537c14a6..9bcf53e8adbcbbbdfac6b7a2e76c2809ce44b72c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,6 +13,7 @@ stages: - ansible_create_cluster_stage - push_button_spawn_cluster - tests + - integration_test #https://docs.gitlab.com/ee/ci/triggers/ - clean @@ -73,7 +74,6 @@ ansiblelint: - cd CICD - python3 ansiblelint/run_lint.py --targets master_playbook.yml - build_cluster_cicd: stage: heat allow_failure: false @@ -111,12 +111,14 @@ ansible_create_cluster_stage: - echo "ansible_create_cluster_stage" - bash -x ./CICD/ansible_create_cluster_script.sh - cd CICD - - ansible-playbook -i files/inventory.$STACKNAME --key-file ../gc_key.pem --skip-tags monitoring master_playbook.yml + - ansible-playbook -i files/inventory.$STACKNAME --key-file ../gc_key.pem --skip-tags SiteSpecific master_playbook.yml - sleep 15 - echo uglyuglyfix - ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -b -a "systemctl restart slurmdbd" ManagementNodes - - sleep 60 - cd plays + - ansible-playbook -i files/inventory.$STACKNAME --key-file ../../gc_key.pem --skip-tags monitoring computenodes.yml | tee nochange.log + - echo [ `grep changed= ./nochange.log -c` = `grep changed=0 ./nochange.log -c` ] > bashtest.sh # a crude way to make sure all changed lines are equal to changed=0 + - bash ./bashtest.sh - ansible-playbook -i files/inventory.$STACKNAME --key-file ../../gc_key.pem --skip-tags monitoring --check computenodes.yml @@ -137,6 +139,7 @@ tests: - grep -qv "I could not find any resouces tagged with project_name:" ./files/inventory.$STACKNAME #fail if inventory file is empty - ansible -m ping -i files/inventory.$STACKNAME --key-file ../gc_key.pem all - ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sudo ls" all + - echo -e '[defaults]\r\nallow_world_readable_tmpfiles = True' > ansible.cfg # Need to find a better check for sinfo #- ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "find /opt/ -name sinfo -type f" ManagementNodes #- ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "find /opt/ -name squeue -type f" ManagementNodes @@ -234,3 +237,4 @@ clean: # after_script: # - sleep 20 # artifically wait a bit to make sure it is really dead + diff --git a/CICD/files/.gitignore b/CICD/files/.gitignore index 37e22cdfa443a09339be8c5dc62c492e2914cce0..de782f9443327185343652401b63a61d64ec76ea 100644 --- a/CICD/files/.gitignore +++ b/CICD/files/.gitignore @@ -1,4 +1,6 @@ +nhc.conf ssh_known_hosts -*.conf +slurm.conf +slurmdbd.conf etcHosts inventory.* diff --git a/CICD/heat/gc_HOT.yaml b/CICD/heat/gc_HOT.yaml index 51d079a28962851868ae0b1839e9cbe83e8ab1f9..cf4fcc891d42fd0d985a18c3202b8a411f905714 100644 --- a/CICD/heat/gc_HOT.yaml +++ b/CICD/heat/gc_HOT.yaml @@ -65,6 +65,7 @@ parameters: resources: + SQLNode0: type: "OS::Nova::Server" properties: @@ -121,7 +122,7 @@ resources: MgmtNodesCentos7: type: "OS::Heat::ResourceGroup" properties: - count: 2 + count: 1 resource_def: type: My::Server::MgmtNode properties: @@ -137,7 +138,7 @@ resources: MgmtNodesU: type: "OS::Heat::ResourceGroup" properties: - count: 0 + count: 1 resource_def: type: My::Server::MgmtNode properties: @@ -174,7 +175,7 @@ resources: LoginNodesU: type: "OS::Heat::ResourceGroup" properties: - count: 0 + count: 1 resource_def: type: "OS::Nova::Server" properties: @@ -270,12 +271,33 @@ resources: list_join: [ '-', [ { get_param: "OS::stack_name" }, 'gpudesktopu%index%' ]] security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID }, { get_param: LDAPSecGroupID } ] metadata: - ansible_host_groups: [ DesktopNodes, GPU, ComputeNodes, K1, VisNodes ] + ansible_host_groups: [ DesktopNodes, GPU, ComputeNodes, VisNodes ] ansible_ssh_user: ubuntu project_name: { get_param: project_name } networks: - network: { get_param: NetID } + CentosDesktopNodes: + type: "OS::Heat::ResourceGroup" + properties: + count: 0 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: mon.c10r35.gpu-k2 + image: { get_param: centos_7_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'gpudesktopc%index%' ]] + security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID } ] + metadata: + ansible_host_groups: [ DesktopNodes, GPU, ComputeNodes, K1, VisNodes ] + ansible_ssh_user: ec2-user + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + ComputeNodeRHEL: type: "OS::Heat::ResourceGroup" properties: diff --git a/CICD/heat/gc_secgroups.hot b/CICD/heat/gc_secgroups.hot index 43bb8fc07202746e345e631510ab3b4795b0a1d6..499f05a7ed099447a1ae72e8f5e5d448e54531ac 100644 --- a/CICD/heat/gc_secgroups.hot +++ b/CICD/heat/gc_secgroups.hot @@ -10,13 +10,16 @@ resources: name: "heatslurmsecgroup" rules: [ { protocol: tcp, port_range_min: 12000, - port_range_max: 12999}, + port_range_max: 12999, + remote_mode: "remote_group_id"}, { protocol: tcp, port_range_min: 6817, - port_range_max: 6819}, + port_range_max: 6819, + remote_mode: "remote_group_id"}, { protocol: tcp, port_range_min: 1019, - port_range_max: 1019}] + port_range_max: 1019, + remote_mode: "remote_group_id"}] NFSSecGroup: type: "OS::Neutron::SecurityGroup" properties: diff --git a/CICD/heat/heatcicdwrapper.sh b/CICD/heat/heatcicdwrapper.sh index e6554d664e78d68acbf5bc2ec9160a12541fee00..ff8aa04e3307ebccad82258f487cab01e03e3f7b 100644 --- a/CICD/heat/heatcicdwrapper.sh +++ b/CICD/heat/heatcicdwrapper.sh @@ -77,6 +77,8 @@ case "$1" in echo "I cannot update a stack which does not exist" exit -45 fi + openstack stack check --wait $STACKNAME + sleep 2 openstack stack update --wait --template ./heat/gc_HOT.yaml --parameter "project_name=$STACKNAME" -e ./heat/resource_registry.yaml $STACKNAME ret=$? exit $ret @@ -84,6 +86,8 @@ case "$1" in create_or_update) if check_stack_exists then + openstack stack check --wait $STACKNAME + sleep 2 openstack stack update --wait --template ./heat/gc_HOT.yaml --parameter "project_name=$STACKNAME" -e ./heat/resource_registry.yaml $STACKNAME ret=$? exit $ret diff --git a/CICD/plays/allnodes.yml b/CICD/plays/allnodes.yml index 812ca4e0b6f10b46734702629bb28635695c193a..3245f995d793ddd598ae7d45c3637b976b1198c3 100644 --- a/CICD/plays/allnodes.yml +++ b/CICD/plays/allnodes.yml @@ -46,4 +46,5 @@ - { role: calculateKnownHosts, tags: [ calculateKnownHosts ] } - { role: SSHKnownHosts, tags: [ known_hosts ] } - { role: jasons_ssh_ca, tags: [ ssh_ca ] } + - { role: ntp } - { role: set_timezone } diff --git a/CICD/plays/computenodes.yml b/CICD/plays/computenodes.yml index 4a4d4bf6eecc035e048597ce5307205b4a0e5865..bf8746dd96c669be82780a0f74fc5fdc4ed6bcc5 100644 --- a/CICD/plays/computenodes.yml +++ b/CICD/plays/computenodes.yml @@ -1,5 +1,6 @@ - hosts: 'DesktopNodes,ComputeNodes,LoginNodes,VisNodes' + gather_facts: True vars_files: - vars/passwords.yml - vars/names.yml @@ -8,10 +9,17 @@ - vars/slurm.yml - vars/vars.yml tasks: + - include_vars: vars/passwords.yml + - include_vars: vars/names.yml + - include_vars: vars/ldapConfig.yml + - include_vars: vars/filesystems.yml + - include_vars: vars/slurm.yml + - include_vars: vars/vars.yml - { name: set use shared state, set_fact: usesharedstatedir=False } tags: [ always ] - hosts: 'DesktopNodes,ComputeNodes,LoginNodes' + gather_facts: False vars_files: - vars/passwords.yml - vars/names.yml @@ -24,26 +32,24 @@ - { role: move_homedir, tags: [ authentication, filesystems ] } - { role: nfs-client, nfsMounts: "{{ computeNfsMounts }}", tags: [ filesystems ] } - { role: slurm-common, tags: [ slurm, slurm-common ] } - - { role: lmod, tags: [ other ] } + #- { role: lmod, tags: [ other ] } # actually preffered on ubuntu but mutually exclusive with environment-modules - { role: enable_modules, default_modules: "modulecmd", tags: [ other ] } - { role: postfix, tags: [ mail, other ] } + - { role: set_semaphore_count, tags: [ semaphore ] } - { role: ldapclient, ssl: false, tags: [ ldapclient ] } - { role: pam_sshd, computenodepam: true, tags: [ authentication, pamd ] } - { role: ssh-keepalive, tags: [ ssh ] } - { role: enable_sudo_group, tags: [ authentication ] } - hosts: 'VisNodes' + gather_facts: False vars_files: - - vars/passwords.yml - - vars/names.yml - - vars/ldapConfig.yml - - vars/filesystems.yml - - vars/slurm.yml - vars/vars.yml roles: - { role: gpu, tags: [ gpu ] } - hosts: 'DesktopNodes,ComputeNodes,LoginNodes' + gather_facts: False vars_files: - vars/passwords.yml - vars/names.yml @@ -55,6 +61,7 @@ - { role: slurm_config, tags: [slurm, slurm_config] } - hosts: 'DesktopNodes,ComputeNodes' + gather_facts: False vars_files: - vars/passwords.yml - vars/names.yml @@ -65,4 +72,40 @@ strategy: free roles: - { role: slurm-start, start_slurmd: True, tags: [ slurm, slurmstart ] } - #- { role: mate-de-install, tags: [ mate-de-install ] } # TODO this crashes for everything except cmca \ No newline at end of file + #- { role: mate-de-install, tags: [ mate-de-install ] } # TODO this crashes for everything except cmca + +- hosts: 'K1Nodes' + tasks: + - { name: set nvidia driver version, set_fact: nvidia_version='367.130' } + tags: [ always ] + +- hosts: 'VisNodes' + tasks: + - { name: set cuda monitoring, set_fact: cudamonitor=true } + tags: [ always ] + +- hosts: 'ComputeNodes' + vars_files: + - vars/slurm.yml + roles: + - { role: slurm-common, tags: [ slurm, slurmbuild ] } + - { role: slurm_config, tags: [ slurm_config, slurm ] } + - { role: calculateNhcConfig, tags: [ nhc, slurm ] } + - { role: nhc, tags: [ nhc, slurm ] } + - { role: slurm-start, start_slurmd: True, tags: [ slurm, slurm-start ] } + - { role: vncserver, tags: [ other ] } + - { role: jasons_ssh_ca, tags: [ other ] } + - { role: lmod, tags: [ other ] } + #- { role: extra_packages, tags: [ other, extra_packages ] } # commented because it takes forever! good enough if this gets tested on clusterbuild + - { role: enable_modules, default_modules: "modulecmd", tags: [ other ] } + - { role: postfix, tags: [ mail, other ] } + - { role: set_semaphore_count, tags: [ semaphore ] } + - { role: telegraf, telegraf_install_rpm_url: 'http://consistency0/src/telegraf-1.12.6-1.x86_64.rpm', tags: [ monitoring,SiteSpecific ] } + +- hosts: 'VisNodes' + roles: + - { role: systemd-nvidia-uvm, tags: [ uvm,SiteSpecific ] } + +- hosts: 'VisNodes' + roles: + - { role: deploy-xorg, tags: [ deploy-xorg ] } \ No newline at end of file diff --git a/CICD/plays/mgmtnodes.yml b/CICD/plays/mgmtnodes.yml index c37d10a86fef7555560e5d2586fb4f07e74ecd78..50442355f2c9c1b1114a613c0280e6d8c2249a3c 100644 --- a/CICD/plays/mgmtnodes.yml +++ b/CICD/plays/mgmtnodes.yml @@ -35,7 +35,7 @@ - { role: slurm-common, tags: [ slurm, slurm-common ] } - { role: slurm_config, tags: [ slurm, slurm-config ] } - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, tags: [ slurm-start ] } - - { role: telegraf, tags: [ monitoring ] } + - { role: telegraf, tags: [ monitoring, SiteSpecific ] } # - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] } # - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] } diff --git a/CICD/tests/ComputeNodes/modules.sh b/CICD/tests/ComputeNodes/modules.sh index 4b1c6121afd2949f702bcc55d22ac47e2a2c117f..608719fd3a51988f9abc4c97ddc7d516a38b0884 100755 --- a/CICD/tests/ComputeNodes/modules.sh +++ b/CICD/tests/ComputeNodes/modules.sh @@ -1,6 +1,10 @@ #!/bin/bash +#source /etc/profile.d/modulecmd.sh +#source /etc/profile.d/modules.sh + +#ubuntu is very picky so lets skip it +/bin/grep Ubuntu -q /etc/issue && exit 0 module purge module load gcc/8.1.0 module list -gcc --version | grep 8.1.0 - +gcc --version | grep 8.1.0 \ No newline at end of file diff --git a/CICD/tests/slurm/srunHostname.yml b/CICD/tests/slurm/srunHostname.yml index 6c8626e269c6173b10f34bd676e872291dffab34..6e89fb4f49eb19a2b9b4e24c4961d0e69abcb0cf 100644 --- a/CICD/tests/slurm/srunHostname.yml +++ b/CICD/tests/slurm/srunHostname.yml @@ -9,7 +9,7 @@ state: present become: true - name: Create a parent account - command: ./sacctmgr -i add account parentAccount cluster=m3 Description="Test parent account" Organization="Monash" + command: ./sacctmgr -i add account parentAccount cluster=cicd Description="Test parent account" Organization="Monash" args: chdir: '/opt/slurm-latest/bin' become: true @@ -17,7 +17,7 @@ failed_when: result.rc != 0 and result.stdout != " Nothing new added." - name: Create a project associated with a given parent - command: ./sacctmgr -i add account testProject parent=parentAccount cluster=m3 Organization="Monash" + command: ./sacctmgr -i add account testProject parent=parentAccount cluster=cicd Organization="Monash" args: chdir: '/opt/slurm-latest/bin' become: true @@ -25,7 +25,7 @@ failed_when: result.rc != 0 and result.stdout != " Nothing new added." - name: Create a user and associate them with a project - command: ./sacctmgr -i create user hpctest cluster=m3 account=testProject partition=batch + command: ./sacctmgr -i create user hpctest cluster=cicd account=testProject partition=batch args: chdir: '/opt/slurm-latest/bin' become: true diff --git a/CICD/vars/names.yml b/CICD/vars/names.yml index fa7063762a3477f082cd454fce0101dbcb8a0bbc..f3142ad8e778d6b6426f53c8af66cbd0f6fb2094 100644 --- a/CICD/vars/names.yml +++ b/CICD/vars/names.yml @@ -1,3 +1,3 @@ --- -domain: massive.org.au +domain: cicd.test.au smtp_smarthost: smtp.monash.edu.au diff --git a/CICD/vars/slurm.yml b/CICD/vars/slurm.yml index 699b8bb73b8328f87fb40c23d9872f359976448e..f1fb74eb951d763ce2781464c1a9189779ddb6e1 100644 --- a/CICD/vars/slurm.yml +++ b/CICD/vars/slurm.yml @@ -1,8 +1,8 @@ --- desktopNodeList: - { name : 'DesktopNodes', interface : 'eth0' } -clustername: "m3" -projectname: "m3" +clustername: "cicd" +projectname: "cicd" slurm_version: 19.05.4 munge_version: 0.5.13 nhc_version: 1.4.2 diff --git a/roles/calculateEtcHosts/files/makehosts.py b/roles/calculateEtcHosts/files/makehosts.py index 5f9cd42fe29b90a9b151db3ee4562f21439b3163..a96f049a9cf460aaf456497855709471dbff4cf1 100755 --- a/roles/calculateEtcHosts/files/makehosts.py +++ b/roles/calculateEtcHosts/files/makehosts.py @@ -23,7 +23,7 @@ for group in d['groups'].keys(): else: hosts[h] = ['%s.%s %s'%(name,domain,name)] -for h in hosts.keys(): +for h in sorted(hosts.keys()): if d['hostvars'].has_key(h): for addr in d['hostvars'][h]['ansible_all_ipv4_addresses']: if "172.16.200" in addr: @@ -32,14 +32,14 @@ for h in hosts.keys(): string=string+" %s"%(name) print string -for h in hosts.keys(): +for h in sorted(hosts.keys()): if d['hostvars'].has_key(h): string="%s"%(d['hostvars'][h]['ansible_default_ipv4']['address']) for name in hosts[h]: string=string+" %s"%(name) print string -for h in hosts.keys(): +for h in sorted(hosts.keys()): if d['hostvars'].has_key(h): if d['hostvars'][h].has_key('ansible_tun0'): string="%s"%(d['hostvars'][h]['ansible_tun0']['ipv4']['address']) diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml index 402ac09c2cbebd578ff29f16468c64b30385d497..8bd44d7e636c997a15d78632a9b5504a10eecc70 100644 --- a/roles/config_repos/tasks/main.yml +++ b/roles/config_repos/tasks/main.yml @@ -1,9 +1,9 @@ --- -- name: make sure out repo server is resolvable +- name: make sure our repo server is resolvable lineinfile: - dest: /etc/hosts - line: "{{ reposerverip }} {{ reposervername }}" #this is duplicated in the role calculateEtcHosts + path: /etc/hosts + line: "{{ reposerverip }} {{ reposervername }}" owner: root group: root become: True diff --git a/roles/deploy-xorg/files/scripts/nvidia-xconf-gen.py b/roles/deploy-xorg/files/scripts/nvidia-xconf-gen.py index f26446971f04c022e19897edf5d9d05cefe61da7..337414bd31c6c1e745b007b483565e127a3afb67 100755 --- a/roles/deploy-xorg/files/scripts/nvidia-xconf-gen.py +++ b/roles/deploy-xorg/files/scripts/nvidia-xconf-gen.py @@ -11,14 +11,18 @@ from subprocess import call import re import json -def grab_card_ids(): - # This method runs nvidia-smi to grab the card ids, then returns a list - - if not os.path.isfile("/bin/nvidia-smi"): +def getNvidia_smi_path(): + if os.path.isfile("/bin/nvidia-smi"): + return "/bin/nvidia-smi" + elif os.path.isfile("/usr/bin/nvidia-smi"): + return "/usr/bin/nvidia-smi" + else: print("nvidia-smi binary not found!") - exit(1) + exit(1) - cmd = ["/bin/nvidia-smi", "--query-gpu=pci.bus_id","--format=csv,noheader"] +def grab_card_ids(): + # This method runs nvidia-smi to grab the card ids, then returns a list + cmd = [getNvidia_smi_path(), "--query-gpu=pci.bus_id","--format=csv,noheader"] p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) cards = [] @@ -27,15 +31,11 @@ def grab_card_ids(): line = line.rstrip().split(":")[2] pcibus_num = int(re.sub('[.:]', '', line).rstrip("0"),16) card = "PCI:0:{}:0".format(str(pcibus_num)) - cards.append(card) + cards.append(card) return cards def grab_card_boardname(): - if not os.path.isfile("/bin/nvidia-smi"): - print("nvidia-smi binary not found!") - exit(1) - - cmd = ["/bin/nvidia-smi", "--query-gpu=name","--format=csv,noheader"] + cmd = [getNvidia_smi_path(), "--query-gpu=name","--format=csv,noheader"] cards = [] p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) for line in p.stdout.readlines(): diff --git a/roles/enable_modules/tasks/main.yml b/roles/enable_modules/tasks/main.yml index f9c99893d772987a4c24b584ddc593eb0e183cbf..b1f23c0e42edde14e15ed3ded24273b5dde2e773 100644 --- a/roles/enable_modules/tasks/main.yml +++ b/roles/enable_modules/tasks/main.yml @@ -3,7 +3,7 @@ - name: make sure environment modules are installed package: name: environment-modules - state: installed + state: present become: true - name: template lmod bash @@ -30,6 +30,9 @@ become_user: root when: default_modules == "lmod" +# vars: +# MODULESHOMEvar: '/usr/share/modules' + - name: template modulecmd bash template: src=modulecmd.sh.j2 dest=/etc/profile.d/modulecmd.sh become: true @@ -59,3 +62,14 @@ become: true become_user: root when: default_modules == "modulecmd" + +- name: Create a symbolic link + file: + src: /usr/share/modules + dest: /usr/share/Modules + owner: root + group: root + state: link + mode: u=rwx,g=rx,o=rx + become: true + when: ansible_os_family == 'Debian' and default_modules == 'modulecmd' \ No newline at end of file diff --git a/roles/etcHosts/tasks/main.yml b/roles/etcHosts/tasks/main.yml index 677f6e571e728ac91fe951158295d2d71cfcf2a7..180a871738df379816c388a72ca8ef7968084ed3 100644 --- a/roles/etcHosts/tasks/main.yml +++ b/roles/etcHosts/tasks/main.yml @@ -7,8 +7,6 @@ register: sysctl_hostname check_mode: no changed_when: False - become: true - become_user: root - name: set hostname by sysctl shell: sysctl kernel.hostname="{{ inventory_hostname }}" diff --git a/roles/extra_packages/tasks/main.yml b/roles/extra_packages/tasks/main.yml index 89b03b15efd4a62544bf6c8f7a58ae6ed895efdc..8410de7fdbebb1bb75ea88bb0dd1c75a15e021e7 100644 --- a/roles/extra_packages/tasks/main.yml +++ b/roles/extra_packages/tasks/main.yml @@ -8,12 +8,16 @@ become: true become_user: root when: ansible_os_family == 'RedHat' + changed_when: false + - name: "Clear yum pending transactions" command: yum-complete-transaction --cleanup-only become: true become_user: root + register: yumCompleteTransactioncall when: ansible_os_family == 'RedHat' + changed_when: '"No unfinished transactions left." not in yumCompleteTransactioncall.stdout' - name: "Install extra packages" yum: "name={{ item }} exclude={{ excludes|join(',') }} update_cache=yes state=present" diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 96425d40b43dffc3b055f0c4b3479a6f1ec3bab1..24d20be7d7998faad4cec55d4b9915aa688b9e80 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -1,35 +1,37 @@ --- -- name: install deps - yum: name={{ item }} state=installed - become: true - with_items: - - gcc - - perl - - wget - - pciutils - - kernel-headers - - kernel-devel - - xterm - - libX11-common - - libX11-devel - - libX11 - - libglvnd-devel - - xorg-x11-server-common - - xorg-x11-util-macros - - xorg-x11-server-utils - - xorg-x11-font-utils - - xorg-x11-server-Xorg - - xorg-x11-glamor - - xorg-x11-xinit - - xorg-x11-utils - - xorg-x11-xauth - - xorg-x11-proto-devel - - xorg-x11-xkb-utils +- name: install deps + package: + state: present + name: + - gcc + - perl + - wget + - pciutils + - kernel-headers + - kernel-devel + - xterm + - libX11-common + - libX11-devel + - libX11 + - libglvnd-devel + - xorg-x11-server-common + - xorg-x11-util-macros + - xorg-x11-server-utils + - xorg-x11-font-utils + - xorg-x11-server-Xorg + - xorg-x11-glamor + - xorg-x11-xinit + - xorg-x11-utils + - xorg-x11-xauth + - xorg-x11-proto-devel + - xorg-x11-xkb-utils + - python-jinja2 + become: true when: ansible_os_family == 'RedHat' -- name: install deps - apt: - name: +- name: install deps + apt: + name: - 'gcc' - 'perl' - 'wget' @@ -41,40 +43,14 @@ - 'libglvnd-dev' - 'xserver-xorg' - 'vim' + - 'python-jinja2' + - 'python3-jinja2' state: present update_cache: yes become: true become_user: root when: ansible_distribution == 'Ubuntu' -- name: install deps - yum: name={{ item }} state=installed - become: true - with_items: - - gcc - - perl - - wget - - pciutils - - kernel-headers - - kernel-devel - - xterm - - libX11-common - - libX11-devel - - libX11 - - libglvnd-devel - - xorg-x11-server-common - - xorg-x11-util-macros - - xorg-x11-server-utils - - xorg-x11-font-utils - - xorg-x11-server-Xorg - - xorg-x11-glamor - - xorg-x11-xinit - - xorg-x11-utils - - xorg-x11-xauth - - xorg-x11-proto-devel - - xorg-x11-xkb-utils - when: ansible_os_family == 'RedHat' - - name: install development tools yum: name="@Development Tools" state=installed become: true @@ -100,7 +76,7 @@ - name: remove nouveau modprobe: name=nouveau state=absent - become: true + become: true become_user: root - name: get kernel version @@ -116,7 +92,7 @@ ignore_errors: true - name: set default driver version - set_fact: + set_fact: installed_driver_version: '0.0' - name: check nvidia driver version @@ -127,20 +103,20 @@ changed_when: False - name: set install default - set_fact: + set_fact: install_driver: false - name: set uninstall default - set_fact: + set_fact: uninstall_driver: false - name: set install - set_fact: + set_fact: install_driver: true when: not nvidia_driver.stat.exists or not installed_driver_version.stdout == nvidia_version - name: set uninstall - set_fact: + set_fact: uninstall_driver: true when: nvidia_driver.stat.exists and not installed_driver_version.stdout == nvidia_version @@ -161,18 +137,18 @@ become_user: root when: uninstall_driver -- name: get nvidia driver +- name: get nvidia driver get_url: url=http://consistency0/src/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run dest=/tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run become: true become_user: root when: install_driver #- name: Copy boot file -# template: src=grub.conf.j2 dest=/boot/grub/grub.conf +# template: src=grub.conf.j2 dest=/boot/grub/grub.conf # become: true # #- name: Copy X config file -# template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf +# template: src=xorg.conf.j2 dest=/etc/X11/xorg.conf # become: true - name: Copy xserver file @@ -195,6 +171,8 @@ shell: nvidia-smi --gom=0 become: true become_user: root + register: nvidiagomcall + changed_when: '"cannot be changed" not in nvidiagomcall.stdout' # only tested on a k80 - name: enable persistenced on boot service: name=nvidia-persistenced state=started enabled=yes @@ -205,7 +183,7 @@ shell: /usr/bin/nvidia-xconfig -a --use-display-device=none --preserve-busid become: true become_user: root - args: + args: creates: /etc/X11/xorg.conf #- name: Template xorg.conf for nodes with one GPU @@ -213,15 +191,7 @@ # become: true # become_user: root # when: template_xorgconf is defined and template_xorgcon -- name: install dependencies for nvidia-xconf-gen - apt: - name: - - python-jinja2 - - python3-jinja2 - update_cache: yes - state: present - become: true - become_user: root + - name: run nvidia-xconf-gen script: scripts/nvidia-xconf-gen.py @@ -230,7 +200,7 @@ changed_when: False - name: set env for nvidia_card_lists - set_fact: + set_fact: nvidiacardslist: "{{ nvidiacards.stdout | from_json }}" - name: generate nvidia-xorg-conf diff --git a/roles/lmod/tasks/main.yml b/roles/lmod/tasks/main.yml index e33c2ee5664f527213be4c64f09f882fb9cb3957..1b348eda5459d56ba3318eaba8a2a5a0d9016317 100644 --- a/roles/lmod/tasks/main.yml +++ b/roles/lmod/tasks/main.yml @@ -2,20 +2,21 @@ - include_vars: "{{ ansible_os_family }}.yml" - name: install lua centos - yum: name={{ item }} state=installed update_cache=yes - with_items: - - lua - - lua-filesystem - - lua-posix - - tcl - - rsync - - gcc - - lua-devel + package: + state: present + name: + - lua + - lua-filesystem + - lua-posix + - tcl + - rsync + - gcc + - lua-devel become: true when: ansible_os_family == 'RedHat' - name: install lua RHEL7 - yum: name={{ item }} state=installed update_cache=yes enablerepo="Monash_University_EPEL7_EPEL_7_-_x86_64" + yum: name={{ item }} state=present update_cache=yes enablerepo="Monash_University_EPEL7_EPEL_7_-_x86_64" with_items: - lua - lua-filesystem @@ -30,7 +31,9 @@ become: true - name: install lua debian - apt: name=lmod state=installed + package: + name: lmod + state: present become: true when: ansible_os_family == 'Debian' diff --git a/roles/modulefiles/tasks/main.yml b/roles/modulefiles/tasks/main.yml index 05e8ca7af86ded06c96965e7eb9bdfa43ceb04d9..b24355622587aa83f20ca4029c934cd933622c93 100644 --- a/roles/modulefiles/tasks/main.yml +++ b/roles/modulefiles/tasks/main.yml @@ -14,7 +14,7 @@ args: dest: /usr/share/Modules/init/.modulespath line: /usr/local/Modules/modulefiles - ignore_errors: true + ignore_errors: false become: true when: ansible_os_family == 'RedHat' @@ -24,6 +24,6 @@ args: dest: /usr/share/modules/init/.modulespath line: /usr/local/Modules/modulefiles - ignore_errors: true + ignore_errors: false become: true when: ansible_os_family == 'Debian' diff --git a/roles/nfs-client/handlers/main.yml b/roles/nfs-client/handlers/main.yml index b05f9fbf8929597fb98eca15a29a52f843b32e65..8440d7e651278e9872b5e881535483049554f7db 100644 --- a/roles/nfs-client/handlers/main.yml +++ b/roles/nfs-client/handlers/main.yml @@ -3,7 +3,7 @@ service: name=rpcbind state=restarted become: true -- name: restart idmap +- name: restart idmap service: name=rpcidmapd state=restarted become: true when: ansible_os_family == "RedHat" and ansible_distribution_major_version < 7 diff --git a/roles/nfs-client/tasks/main.yml b/roles/nfs-client/tasks/main.yml index 23ac6d08e10c0927ac3c680bc9a2a349771d01fe..75a2d98e02b3e1482e4cafa91ddd1f3321612a97 100644 --- a/roles/nfs-client/tasks/main.yml +++ b/roles/nfs-client/tasks/main.yml @@ -1,18 +1,20 @@ --- - name: install dependencies - yum: name={{ item }} state=installed + package: + state: present + name: + - libnfsidmap + - nfs-utils + - nfstest.noarch become: true - with_items: - - libnfsidmap - - nfs-utils - - nfstest.noarch when: ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" - name: install dependencies - yum: name={{ item }} state=installed + package: + name: + - nfs-utils-lib + state: present become: true - with_items: - - nfs-utils-lib when: ansible_os_family == "RedHat" and ansible_distribution_major_version < "7" - name: install dependencies nfs-common ubuntu diff --git a/roles/nfs-client/tasks/mountFileSystem.yml b/roles/nfs-client/tasks/mountFileSystem.yml index 41ecd052629ed1de9acd3d1953bde2836a7a2cbe..c36db919646c24bb6877b9540d8fc27723bb3cbf 100644 --- a/roles/nfs-client/tasks/mountFileSystem.yml +++ b/roles/nfs-client/tasks/mountFileSystem.yml @@ -1,9 +1,8 @@ ---- - +--- - name: "Mounting NFS mounts" mount: name={{ item.name }} src="{{ item.ipv4 }}:{{ item.src }}" fstype={{ item.fstype }} opts={{ item.opts }} state=mounted with_items: "{{ nfsMounts }}" - become: true + become: true ignore_errors: false register: firstMount when: nfsMounts is defined diff --git a/roles/nfs-common/handlers/main.yml b/roles/nfs-common/handlers/main.yml index f5c928114ee92484c0bb856b936476fbddfb5324..6cdc4ec5ef9648fb49b04622b2d2a9fd57b9279b 100644 --- a/roles/nfs-common/handlers/main.yml +++ b/roles/nfs-common/handlers/main.yml @@ -1,4 +1,3 @@ ---- -- - name: "Run rpcbind service" +--- +- name: "Run rpcbind service" service: "name=rpcbind state=started enabled=yes" diff --git a/roles/nfs-common/tasks/aptPackages.yml b/roles/nfs-common/tasks/aptPackages.yml index 5324005b6cad3b3f896c97320cb2d129ae0f3552..d366a7b5b4aa1ed550beadca3f872e7bdd6af996 100644 --- a/roles/nfs-common/tasks/aptPackages.yml +++ b/roles/nfs-common/tasks/aptPackages.yml @@ -1,8 +1,8 @@ --- -- - name: "Install nfs-utils" - with_items: - - nfs-common - - nfs-kernel-server - apt: "name={{ item }} state=present" +- name: "Install nfs-utils" + package: + state: present + name: + - nfs-common + - nfs-kernel-server become: true diff --git a/roles/nfs-common/tasks/yumPackages.yml b/roles/nfs-common/tasks/yumPackages.yml index 6a8fd12b59d258f9ad021653350345729ffcb2e2..9fa88e2e0733d2b4bd0ad1f84f292456a9bea875 100644 --- a/roles/nfs-common/tasks/yumPackages.yml +++ b/roles/nfs-common/tasks/yumPackages.yml @@ -1,8 +1,8 @@ --- -- - name: "Install nfs-utils" - with_items: - - bind-utils - - nfs-utils - yum: "name={{ item }} state=present" +- name: "Install nfs-utils" + package: + name: + - bind-utils + - nfs-utils + state: present become: true diff --git a/roles/set_semaphore_count/tasks/main.yml b/roles/set_semaphore_count/tasks/main.yml index 01203f6d9fbbfb0de5d564bb045e10430e9a0a07..1e2321d48254e1944813d6d7e56041d39ae85ac2 100644 --- a/roles/set_semaphore_count/tasks/main.yml +++ b/roles/set_semaphore_count/tasks/main.yml @@ -1,10 +1,12 @@ --- - name: set the value of the Semaphores set_fact: - SEM_COUNT: "500 256000 64 10240" + SEM_COUNT: "500\t256000\t64\t10240" when: SEM_COUNT is not defined + - name: test value debug: msg="Value of semaphores is {{ SEM_COUNT }} " #" + - name: Place comment line in file lineinfile: path: /etc/sysctl.d/88-setSemaphore.conf @@ -16,6 +18,7 @@ mode: "u+rwx,o=rx,g=rx" become: true become_user: root + when: ansible_os_family == 'RedHat' - name: Place comment line in file lineinfile: @@ -24,6 +27,7 @@ state: present become: true become_user: root + when: ansible_os_family == 'RedHat' - name: Place comment line in file lineinfile: @@ -36,9 +40,23 @@ mode: "u+rwx,o=rx,g=rx" become: true become_user: root + when: ansible_os_family == 'RedHat' + +- name: get current value + command: cat /proc/sys/kernel/sem + register: current_sem + changed_when: current_sem.stdout not in "{{ SEM_COUNT }}" + check_mode: no + when: ansible_os_family == 'RedHat' + +#- debug: +# var: current_sem - name: set semaphore count now shell: "/usr/bin/echo {{ SEM_COUNT }} > /proc/sys/kernel/sem" become: true become_user: root + when: + - current_sem.changed + - ansible_os_family == 'RedHat' diff --git a/roles/set_timezone/tasks/main.yml b/roles/set_timezone/tasks/main.yml index 5a89a6482d7414dd557936954afa4617cade1da1..4d5d9f521e972cb7b69b9258e43d55da9ad70a19 100644 --- a/roles/set_timezone/tasks/main.yml +++ b/roles/set_timezone/tasks/main.yml @@ -3,12 +3,15 @@ template: src=ntp.conf.j2 dest=/etc/ntp.conf mode=644 owner=root group=root become: true become_user: root + register: ntpinstall - name: restart ntpd redhat service: name=ntpd state=restarted become: true become_user: root - when: ansible_os_family == "RedHat" + when: + - ansible_os_family == "RedHat" + - ntpinstall.changed - name: ensure ntpd is enabled and started redhat service: name=ntpd state=started enabled=yes @@ -20,7 +23,9 @@ service: name=ntp state=restarted become: true become_user: root - when: ansible_os_family == "Debian" + when: + - ansible_os_family == "Debian" + - ntpinstall.changed - name: ensure ntpd is enabled and started ubuntu service: name=ntp state=started enabled=yes diff --git a/roles/slurm-common/tasks/createSlurmDirectories.yml b/roles/slurm-common/tasks/createSlurmDirectories.yml index 738956823167ca062efe85940774a45c9a547423..ba82cd78ea1dae229ba95a63a2f03a1131e7ec29 100644 --- a/roles/slurm-common/tasks/createSlurmDirectories.yml +++ b/roles/slurm-common/tasks/createSlurmDirectories.yml @@ -42,7 +42,6 @@ - name: create shared state directory file: path={{slurmsharedstatedir }} state=directory owner=slurm group=slurm mode=750 become: true - run_once: true when: usesharedstatedir is defined and usesharedstatedir - name: symlink shared state dir diff --git a/roles/slurm-common/tasks/main.yml b/roles/slurm-common/tasks/main.yml index 99a64ff3d1fd22fff2ba2efed0aaeaf7c0eed961..1e53779ccd9d716cdaa1bce1b6d822c954659748 100644 --- a/roles/slurm-common/tasks/main.yml +++ b/roles/slurm-common/tasks/main.yml @@ -18,42 +18,44 @@ - include: createSlurmDirectories.yml - name: install deps - yum: name={{ item }} state=present - with_items: - - perl - - perl-DBI - - openssl-devel - - gcc - - rpm-build - - wget - - openssl-devel - - readline-devel - - pam-devel - - perl-ExtUtils-MakeMaker - - bzip2-devel - - hwloc - - hwloc-devel - - lua - - lua-devel + package: + state: present + name: + - perl + - perl-DBI + - openssl-devel + - gcc + - rpm-build + - wget + - openssl-devel + - readline-devel + - pam-devel + - perl-ExtUtils-MakeMaker + - bzip2-devel + - hwloc + - hwloc-devel + - lua + - lua-devel become: true when: ansible_os_family == "RedHat" - name: install deps - apt: name={{ item }} state=installed update_cache=yes - become: true - with_items: - - gcc - - wget - - libssl-dev # downgrade needed for bionic see https://github.com/dun/munge/issues/54 - - libpam0g-dev - - libbz2-dev - - make - - perl - - libdbi-perl - - lua5.2 - - liblua5.2-dev - - hwloc - - libhwloc-dev + package: + state: present + name: + - gcc + - wget + - libssl-dev # downgrade needed for bionic see https://github.com/dun/munge/issues/54 + - libpam0g-dev + - libbz2-dev + - make + - perl + - libdbi-perl + - lua5.2 + - liblua5.2-dev + - hwloc + - libhwloc-dev + become: true when: ansible_os_family == "Debian" - include: installMungeFromSource.yml diff --git a/roles/slurm-start/tasks/main.yml b/roles/slurm-start/tasks/main.yml index df0ff262a08d5c63e85f3c0efb4e19082b4be8c2..33d9ca1690932920b538b1e3532f767b35e1323a 100644 --- a/roles/slurm-start/tasks/main.yml +++ b/roles/slurm-start/tasks/main.yml @@ -10,7 +10,7 @@ slurmd_enabled: True when: slurmd_enabled is not defined -- name: install slurmdbd init +- name: install slurmdbd initt template: src=slurmdbd.initd.j2 dest=/etc/init.d/slurmdbd mode=755 become: true when: use_systemd is not defined and start_slurmdbd is defined @@ -56,30 +56,60 @@ become: true when: use_systemd is defined and start_slurmdbd is defined and slurmdbd_service_installed.changed +- name: make sure munge is started + service: name=munge state=started enabled=yes + become: true + when: use_systemd is defined and start_slurmdbd is defined + - name: start munge service: name=munge state=restarted enabled=yes become: true + when: use_systemd is defined and ( slurmdbd_service_installed.changed or slurmctld_service_installed.changed or slurmd_service_installed.changed) - name: start slurmdbd - service: name=slurmdbd state=restarted enabled=no + service: name=slurmdbd state=started enabled={{ start_slurmdbd }} become: true when: start_slurmdbd is defined -- name: "create cluster in slurm db" - shell: "{{slurm_dir}}/bin/sacctmgr -i create cluster {{ clustername }}" +- name: restart slurmdbd + service: name=slurmdbd state=restarted enabled={{ start_slurmdbd }} become: true - ignore_errors: true + when: start_slurmdbd is defined and slurmdbd_service_installed.changed - name: start slurmctl - service: name=slurmctld state=restarted enabled=no + service: name=slurmctld state=started enabled={{ start_slurmctld }} become: true when: use_systemd is defined and start_slurmctld is defined +- name: restart slurmctl + service: name=slurmctld state=restarted enabled={{ start_slurmctld }} + become: true + when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed + +- name: "count clusters in slurm db" + shell: "{{slurm_dir}}/bin/sacctmgr show cluster -p | wc -l" + register: slurm_cluster_count + check_mode: no + changed_when: false + +- debug: + var: slurm_cluster_count + +- name: "create cluster in slurm db" #needs munge to run + shell: "{{slurm_dir}}/bin/sacctmgr -i create cluster {{ clustername }}" + become: true + when: slurm_cluster_count.stdout == '1' + - name: start slurmd - service: name=slurmd state=restarted enabled={{ slurmd_enabled }} + service: name=slurmd state=started enabled={{ slurmd_enabled }} become: true when: use_systemd is defined and start_slurmd is defined +- name: restart slurmd + service: name=slurmd state=restarted enabled={{ slurmd_enabled }} + become: true + when: use_systemd is defined and start_slurmd is defined and slurmd_service_installed.changed + - name: start slurm service: name=slurm state=restarted enabled={{ slurmd_enabled }} become: true diff --git a/roles/slurmdb-config/tasks/main.yml b/roles/slurmdb-config/tasks/main.yml index c189183bab51ca97da66ddbae06aba5c73931bed..f9d489d2133658ab89a3c36759355dfdbfc8d8ef 100644 --- a/roles/slurmdb-config/tasks/main.yml +++ b/roles/slurmdb-config/tasks/main.yml @@ -1,11 +1,12 @@ --- - name: install deps in control node - yum: name={{ item }} state=installed + package: + state: installed + name: + - mysql + - mysql-devel + - MySQL-python become: true - with_items: - - mysql - - mysql-devel - - MySQL-python when: ansible_os_family == "RedHat" - name: install deps in control node @@ -26,17 +27,17 @@ become: true - name: create slurm user # this is duplicated from slurm-common - user: - name: slurm - group: slurm - system: yes + user: + name: slurm + group: slurm + system: yes createhome: no uid: 497 become: true - name: install slurmdb.conf - copy: - src: files/slurmdbd.conf + copy: + src: files/slurmdbd.conf dest: "{{ slurm_dir }}/etc/slurmdbd.conf" owner: slurm group: slurm @@ -46,8 +47,8 @@ - name: install slurmdbd.conf - copy: - src: slurmdbd.conf + copy: + src: slurmdbd.conf dest: /etc/slurm/slurmdbd.conf owner: slurm group: slurm diff --git a/roles/systemd-nvidia-uvm/tasks/main.yml b/roles/systemd-nvidia-uvm/tasks/main.yml index 2caecf4dd2605391e32a792b6fda9c9ca1a64f3c..7d5e4db867d3e27a3547447dbc4f416f1dde8429 100644 --- a/roles/systemd-nvidia-uvm/tasks/main.yml +++ b/roles/systemd-nvidia-uvm/tasks/main.yml @@ -1,4 +1,11 @@ --- +- name: install nvidia-modprobe on ubuntu + package: + name: nvidia-modprobe + state: present + become: true + when: ansible_os_family == 'Debian' + - name: Copy Files become: true become_user: root diff --git a/roles/telegraf/tasks/main.yml b/roles/telegraf/tasks/main.yml index 13701898cd1ae4c091aa148bc8928d091834b0b6..830e211629565344a08b9a560c40f308d01d2318 100644 --- a/roles/telegraf/tasks/main.yml +++ b/roles/telegraf/tasks/main.yml @@ -48,8 +48,6 @@ become: true become_user: root - - - name: Install Telegraf config template: src: telegraf.conf.j2 diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml index 09d633ffbb6e18bbbfe4fb87e6579c77a2efdf8b..c103b9f37f201beadcfeba92b661af61e67020d3 100644 --- a/roles/upgrade/tasks/main.yml +++ b/roles/upgrade/tasks/main.yml @@ -28,6 +28,8 @@ become: true become_user: root when: ansible_os_family == 'RedHat' + register: yumtransactioncleanup + changed_when: "'No unfinished transactions left.' not in yumtransactioncleanup.stdout" - name: yum upgrade yum: name=* state=latest diff --git a/scripts/make_inventory.py b/scripts/make_inventory.py index 48bd21d85e1a7314d0982d062227c33ac2b87783..09451fa855ad853fef151c1fba2fc2a3dc8c45ff 100755 --- a/scripts/make_inventory.py +++ b/scripts/make_inventory.py @@ -56,9 +56,14 @@ def gatherInfo(md_key,md_value,authDict,project_id,inventory): else: inventory['_meta']['hostvars'][hostname]['public_host'] = server.networks[nn][0] if network_name == None: - network_name = list(server.networks.keys())[0] - - inventory['_meta']['hostvars'][hostname]['ansible_host'] = server.networks[network_name][0] + try: + network_name = list(server.networks.keys())[0] + except: + print("An error occured while processing ",server) + try: + inventory['_meta']['hostvars'][hostname]['ansible_host'] = server.networks[network_name][0] + except: + print("An error occured while processing ",server) else: continue @@ -110,14 +115,22 @@ if __name__ == "__main__": enabled_projects = [ x for x in projects if x.enabled ] inventory_list = Parallel(n_jobs=len(projects))(delayed(gatherInfo) (md_key,md_value, authDict, proj.id, inventory) for proj in enabled_projects) + inventory={} for i in inventory_list: merge(i,inventory) + #for k, v in inventory.items(): + # sorted_inventory={k:sorted(v)} + for key in inventory: + if key=='_meta': + pass + else: + inventory[key].sort() if not inventory['_meta']['hostvars']: print("I could not find any resouces tagged with {}: {}".format(md_key,md_value)) else: if static: - print( "#!/bin/bash\necho '"+json.dumps(inventory,indent=4)+"'") + print( "#!/bin/bash\necho '"+json.dumps(inventory,indent=4, sort_keys=True)+"'") else: - print(json.dumps(inventory)) + print(json.dumps(inventory, sort_keys=True))