tasks:
- include_vars: vars/ldapConfig.yml
- include_vars: vars/filesystems.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- { name: set use shared state, set_fact: usesharedstatedir=False }
# these are just templates. Note the tag 'never'! Everything tagged 'never' is only executed when called explicitly, i.e. ansible-playbook --tags=foo,bar or --tags=tag_group (example invocations after the play below)
- hosts: 'ComputeNodes,DGXRHELNodes'
gather_facts: false
tasks:
- { name: template_shell, shell: ls, tags: [never,tag_group,uniquetag_foo] }
- { name: template_command, command: uname chdir=/bin, tags: [never,tag_group,uniquetag_bar] }
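# Example invocations, a minimal sketch (the inventory and playbook file names are placeholders, not part of this repo):
#   ansible-playbook -i hosts maintenance.yml --tags=tag_group      # runs template_shell and template_command
#   ansible-playbook -i hosts maintenance.yml --tags=uniquetag_foo  # runs only template_shell
# Without an explicit --tags, tasks tagged 'never' are skipped.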
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes'
gather_facts: false
tasks:
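# The pipeline below lists all processes, keeps the bash entries not owned by
# ec2-user or root, collapses repeated spaces so the PID sits in field 2, and
# sends SIGKILL to each PID via xargs.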
- { name: kill user bash shells, shell: 'ps aux | grep -i bash | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -9 {}', become: true, become_user: root, tags: [never,kickshells]}
- { name: Disable MonARCH Lustre Cron Check, cron: name="Check dmesg for lustre errors" state=absent, become: true, become_user: root, tags: [never, monarch_disable] }
#cron:
# name: "Check dmesg for lustre errors"
# state: absent
# user: root
#become: True
#tags: [never, monarch_disable]
- name: Re-enable MonARCH Lustre Cron Check
cron: name="Check dmesg for lustre errors" minute="*/5" job="/usr/local/sbin/check_lustre_dmesg.sh >> /tmp/check_lustre_output.txt 2>&1"
become: true
become_user: root
tags: [never, monarch_enable]
- hosts: 'ManagementNodes'
gather_facts: false
tasks:
- name: prep a mgmt node for shutdown (DO NOT FORGET TO LIMIT; gluster needs 2 out of 3 nodes to run; see the invocation sketch below)
block:
# The failover actually works, but it only takes down the primary, so if this were called from the backup, all of Slurm would go down.
#- { name: force a failover, shell: /opt/slurm-19.05.4/bin/scontrol takeover }
- { name: stop slurmdbd service, service: name=slurmdbd state=stopped }
- { name: stop slurmctld service, service: name=slurmctld state=stopped }
- { name: stop glusterd service, service: name=glusterd state=stopped }
- { name: stop glusterfsd service, service: name=glusterfsd state=stopped }
become: true
tags: [never,prepmgmtshutdown]
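# A possible invocation, as a sketch (playbook file and host names are placeholders):
#   ansible-playbook -i hosts maintenance.yml --tags=prepmgmtshutdown --limit=mgmt01
# Limiting to one node at a time keeps 2 of the 3 gluster peers up.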
- name: verify a mgmt node came up well
block:
# TODO verify vdb is mounted
- { name: start glusterd service, service: name=glusterd state=started }
- { name: start glusterfsd service, service: name=glusterfsd state=started }
- { name: start slurmctld service, service: name=slurmctld state=started }
- { name: start slurmdbd service, service: name=slurmdbd state=started }
become: true
tags: [never,verifymgmtNode16Aug]
- hosts: 'SQLNodes'
gather_facts: false
tasks:
- name: prep a sqlnode node for shutdown
block:
- { name: stop mariadb service, service: name=mariadb state=stopped }
- { name: stop glusterd service, service: name=glusterd state=stopped }
- { name: stop glusterfsd service, service: name=glusterfsd state=stopped }
become: true
tags: [never,prepsqlshutdown]
- name: verify an sql node after a restart
block:
- { name: ensure mariadb service runs, service: name=mariadb state=started }
- { name: ensure glusterd service runs, service: name=glusterd state=started }
- { name: ensure glusterfsd service runs, service: name=glusterfsd state=started }
become: true
tags: [never,sqlverify]
- hosts: 'LoginNodes'
gather_facts: false
tasks:
- name: set nologin
block:
- { name: populate nologin file, shell: 'echo "MonARCH is down for a scheduled maintenance." > /etc/nologin', become: true, become_user: root }
- { name: set attribute immutable so it will not be deleted, shell: 'chattr +i /etc/nologin', become: true, become_user: root }
become: true
tags: [never,setnologin]
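# Background note: while /etc/nologin exists, sshd/pam_nologin refuse logins for
# non-root users and display the file's contents, so the message above is what
# users see during the maintenance window.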
- name: remove nologin
block:
- { name: unset attribute immutable to allow deletion, shell: 'chattr -i /etc/nologin', become: true, become_user: root }
- { name: remove nologin file, file: path=/etc/nologin state=absent, become: true, become_user: root }
become: true
tags: [never,removenologin]
- name: terminate user ssh processes
block:
- { name: kill shells, shell: 'ps aux | grep -i bash | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -9 {}', become: true, become_user: root }
- { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -9 {}', become: true, become_user: root }
become: true
tags: [never,terminateusersshscprsync]
- hosts: 'LoginNodes,ComputeNodes,DGXRHELNodes'
gather_facts: false
tasks:
- name: stop lustre and disable service
block:
- { name: stop and disable lustre service (some nodes will be rebooted and should not come back up with a running service), service: name=lustre-client enabled=False state=stopped }
become: true
tags: [never,stopdisablelustre16Aug]
- name: start lustre and enable service
block:
- { name: start and enable lustre service, service: name=lustre-client enabled=True state=started }
become: true
tags: [never,startenablelustre16Aug]
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes'
gather_facts: false
tasks:
- { name: disable_lustre_service, service: name=lustre-client enabled=no, tags: [never,disable_lustre_service] }
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes'
gather_facts: false
tasks:
- { name: umount /home, mount: path=/home state=unmounted, become: true, become_user: root, tags: [never,umount_home] }
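# Note: state=unmounted only unmounts the path and leaves /etc/fstab alone;
# the next play uses state=absent, which also removes the fstab entry.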
# this should not really end up in the main branch, but it does not hurt if it does
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes'
gather_facts: false
tasks:
- { name: umount local-legacy, mount: path=/usr/local-legacy state=absent, become: true, become_user: root, tags: [never,umount_locallegacy] }
# leftover shell-script notes: unmount stale gvfs mounts
#mount | grep gvfs | while read -r line ;
#do
# read -ra line_array <<< $line
# echo "umount ${line_array[2]}"
#done
# un-stick yum: clear stale rpm database locks
#mv /var/lib/rpm/__db* /tmp/
#mv /var/lib/rpm/.rpm.lock /tmp/
#mv /var/lib/rpm/.dbenv.lock /tmp
#yum clean all
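# If needed, the un-stick steps above could be run ad hoc, e.g. (a rough sketch;
# the inventory file name is a placeholder):
#   ansible ComputeNodes -i hosts -b -m shell -a 'mv /var/lib/rpm/__db* /var/lib/rpm/.rpm.lock /var/lib/rpm/.dbenv.lock /tmp/; yum clean all'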