---
# Maintenance playbook.
#
# Except for the first play (tagged `always`), every task or block below is
# tagged `never`, so it only runs when one of its other tags is requested
# explicitly, e.g.:
#   ansible-playbook <playbook> --tags=prepmgmtshutdown --limit <host>

# Load the shared variable files and set `usesharedstatedir` on the compute
# and DGX nodes. The play-level `always` tag is inherited by every task here.
- hosts: 'ComputeNodes,DGXRHELNodes'
  # NOTE(review): `smart` is documented as a value of the `gathering`
  # configuration setting; confirm the `gather_facts` play keyword treats it
  # the way you expect.
  gather_facts: smart  # not sure if false is clever here
  tags: [always]
  tasks:
    - include_vars: vars/ldapConfig.yml
    - include_vars: vars/filesystems.yml
    - include_vars: vars/slurm.yml
    - include_vars: vars/vars.yml
    - name: set use shared state
      set_fact:
        usesharedstatedir: false

# These are just templates. Note the tag never! Everything with never is only
# executed if called explicitly, aka
#   ansible-playbook --tags=foo,bar   OR   --tags=tag_group
- hosts: 'ComputeNodes,DGXRHELNodes'
  gather_facts: false
  tasks:
    - name: template_shell
      shell: ls
      tags: [never, tag_group, uniquetag_foo]

    - name: template_command
      command: uname
      args:
        chdir: /bin
      tags: [never, tag_group, uniquetag_bar]

# Kill bash processes whose `ps aux` line mentions neither "ec2-user" nor
# "root".
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes'
  gather_facts: false
  tasks:
    - name: kill user bash shells
      # Folded scalar: the line breaks below become single spaces, so the
      # command string is one line at runtime.
      shell: >-
        ps aux | grep -i bash | grep -v "ec2-user" | grep -v "root"
        | sed "s/\ \ */\ /g" | cut -f 2 -d " "
        | xargs -I{} kill -09 {}
      become: true
      become_user: root
      tags: [never, kickshells]

# Management nodes: stop services before a shutdown, start them afterwards.
- hosts: 'ManagementNodes'
  gather_facts: false
  tasks:
    - name: prep a mgmt node for shutdown DO NOT FORGET TO LIMIT gluster needs 2 out of 3 to run
      become: true
      tags: [never, prepmgmtshutdown]
      block:
        # The failover actually works, but it only takes down the primary.
        # So if this were called from the backup, all of slurm would go down.
        # - { name: force a failover shell: /opt/slurm-19.05.4/bin/scontrol takeover }
        - name: stop slurmdbd service
          service:
            name: slurmdbd
            state: stopped

        - name: stop slurmctld service
          service:
            name: slurmctld
            state: stopped

        - name: stop glusterd service
          service:
            name: glusterd
            state: stopped

        - name: stop glusterfsd service
          service:
            name: glusterfsd
            state: stopped

    - name: verify a mgmt node came up well
      become: true
      tags: [never, verifymgmtNode16Aug]
      block:
        # TODO verify vdb is mounted
        - name: start glusterd service
          service:
            name: glusterd
            state: started

        - name: start glusterfsd service
          service:
            name: glusterfsd
            state: started

        - name: start slurmctld service
          service:
            name: slurmctld
            state: started

        - name: start slurmdbd service
          service:
            name: slurmdbd
            state: started

# SQL nodes: stop services before a shutdown, ensure they run after a restart.
- hosts: 'SQLNodes'
  gather_facts: false
  tasks:
    - name: prep a sqlnode node for shutdown
      become: true
      tags: [never, prepsqlshutdown]
      block:
        - name: stop mariadb service
          service:
            name: mariadb
            state: stopped

        - name: stop glusterd service
          service:
            name: glusterd
            state: stopped

        - name: stop glusterfsd service
          service:
            name: glusterfsd
            state: stopped

    - name: verify an sql node after a restart
      become: true
      tags: [never, sqlverify]
      block:
        - name: ensure mariadb service runs
          service:
            name: mariadb
            state: started

        - name: ensure glusterd service runs
          service:
            name: glusterd
            state: started

        - name: ensure glusterfsd service runs
          service:
            name: glusterfsd
            state: started

# Login nodes: checks for the 16 Aug maintenance.
- hosts: 'LoginNodes'
  gather_facts: false
  tasks:
    - name: verify Loginnodes for 16Aug maintenance
      become: true
      tags: [never, verifyLoginNode16Aug]
      block:
        - name: make sure lustre service is stopped
          service:
            name: lustre-client
            enabled: false
            state: stopped

        - name: make sure nologin is still present
          file:
            path: /etc/nologin
            state: file

# Lustre client service: stop + disable, or start + enable, selected by tag.
- hosts: 'LoginNodes,ComputeNodes,DGXRHELNodes'
  gather_facts: false
  tasks:
    - name: stop lustre and disable service
      become: true
      tags: [never, stopdisablelustre16Aug]
      block:
        - name: stop and disable lustre service some nodes will be rebooted and should not come up with a runnign service
          service:
            name: lustre-client
            enabled: false
            state: stopped

    - name: start lustre and enable service
      become: true
      tags: [never, startenablelustre16Aug]
      block:
        - name: start and enable lustre service
          service:
            name: lustre-client
            enabled: true
            state: started

# Kill squashfuse processes, then unmount /home.
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes'
  gather_facts: false
  tasks:
    - name: kill squashfs
      shell: "pkill -f squashfuse"
      become: true
      become_user: root
      tags: [never, umount_home]

    - name: umount /home
      mount:
        path: /home
        state: unmounted
      become: true
      become_user: root
      tags: [never, umount_home]

# Shell snippets kept for reference only (comments, not run by Ansible):
#
#   #!/bin/sh
#   #
#   mount | grep gvfs | while read -r line ;
#   do
#     read -ra line_array <<< $line
#     echo "umount ${line_array[2]}"
#   done
#
# un-stuck yum
#   mv /var/lib/rpm/__db* /tmp/
#   mv /var/lib/rpm/.rpm.lock /tmp/
#   mv /var/lib/rpm/.dbenv.lock /tmp
#   yum clean all