diff --git a/CICD/vars/passwords.yml b/CICD/vars/passwords.yml index 4777423499ed7f529e306eb109b60887097b06a5..393458dd312fdb17c0909d1729a681d8816e7c68 100644 --- a/CICD/vars/passwords.yml +++ b/CICD/vars/passwords.yml @@ -1,10 +1,10 @@ --- -mungekey: ySdSOpFMyLihx4tQlR0znm07UlvALxB1 -slurmdb_passwd: ySdSOpFMyLihx4tQlR0znm07UlvALxB2 -sqlrootPasswd: ySdSOpFMyLihx4tQlR0znm07UlvALxB3 +mungekey: EXAMPLEMUNGEKEYwithpaddingfill32 +slurmdb_passwd: EXAMPLESLURMDBPASSWORD +sqlrootPasswd: EXAMPLESQLROOTPASSWORD sudo_group: systems -default_user_password_clear: ySdSOpFMyLihx4tQlR0znm07UlvALxBL -default_user_password: ySdSOpFMyLihx4tQlR0znm07UlvALxBL -ldapManagerDNPassword: redhat -ldapManagerPassword: redhat -ldapBindDNPassword: redhat \ No newline at end of file +default_user_password_clear: EXAMPLEDEFAULTUSERPASSWORDCLEAR +default_user_password: EXAMPLEDEFAULTUSERPASSWORD +ldapManagerDNPassword: EXAMPLELDAPMANAGERDNPASSWORD +ldapManagerPassword: EXAMPLELDAPMANAGERPASSWORD +ldapBindDNPassword: EXAMPLEBINDDNPASSWORD diff --git a/maintenance.yml b/maintenance.yml index 7da9736ab16a6b7c8e161f86b127ddb0760d6861..d31c3e9424ac47cf8428cdff47fd98949994b0c8 100644 --- a/maintenance.yml +++ b/maintenance.yml @@ -1,5 +1,5 @@ - hosts: 'ComputeNodes,DGXRHELNodes' - gather_facts: smart # not sure if false is clever here + gather_facts: false # not sure if false is clever here tasks: - include_vars: vars/ldapConfig.yml - include_vars: vars/filesystems.yml @@ -102,17 +102,16 @@ tasks: - { name: disable_lustre_service, service: name=lustre-client enabled=no, tags: [never,disable_lustre_service] } +- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes' + gather_facts: false + tasks: + - { name: umount /home, mount: path=/home state=unmounted, become: true, become_user: root, tags: [never,umount_home] } -#- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes' -# gather_facts: false -# tasks: -# - name: Mount up device by UUID -# mount: -# path: /home -# src: 
UUID=b3e48f45-f933-4c8e-a700-22a159ec9077 -# fstype: xfs -# opts: noatime -# state: present +#this should not really end up in the main branch but it does not hurt if it will +- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes' + gather_facts: false + tasks: + - { name: umount local-legacy, mount: path=/usr/local-legacy state=absent, become: true, become_user: root, tags: [never,umount_locallegacy] } #!/bin/sh # diff --git a/roles/gluster-monitor/tasks/main.yml b/roles/gluster-monitor/tasks/main.yml index 2f73f3cc9303ccec17b37222f2ad13ff0563f2cc..6641f72314c03b772f366a2067cdddcaf9054bfa 100644 --- a/roles/gluster-monitor/tasks/main.yml +++ b/roles/gluster-monitor/tasks/main.yml @@ -7,8 +7,8 @@ become_user: root - name: template gluster_monitoring template: - src=detect-gluster-problems.sh.j2 - dest=/usr/local/sbin/detect-gluster-problems.sh + src=detect-gluster-problems.py.j2 + dest=/usr/local/sbin/detect-gluster-problems.py mode=755 owner=root group=root @@ -16,6 +16,6 @@ become_user: root - name: gluster_monitoring- install crontab entry #cron: name="Check glust for problems" minute="*/5" job="/usr/local/sbin/detect-gluster-problems.sh >> /tmp/detect-gluster-problems.txt 2>&1" - cron: name="Check gluster for problems" minute="*/5" job="/usr/local/sbin/detect-gluster-problems.sh" + cron: name="Check gluster for problems" minute="*/5" job="/usr/local/sbin/detect-gluster-problems.py" become: true become_user: root diff --git a/roles/gluster-monitor/templates/detect-gluster-problems.py.j2 b/roles/gluster-monitor/templates/detect-gluster-problems.py.j2 new file mode 100644 index 0000000000000000000000000000000000000000..05d2098110f49d19c5ba6138cf2d29f20a6dd48c --- /dev/null +++ b/roles/gluster-monitor/templates/detect-gluster-problems.py.j2 @@ -0,0 +1,78 @@ +#!/bin/env python +# +# +# detect-gluster-problems.py +# Authors simon michnowicz 13 April 2021 + +import os +import sys +import subprocess + +############################## +def error(e): + ''' + we 
have an error. Send an email
+    e=error string, sent as the body of the mail
+    '''
+    HEADER="Error Message from {}".format(os.uname()[1])
+    MAILTO="{{ EMAIL_DEST }}"
+    body = e if isinstance(e, bytes) else e.encode('UTF-8')
+    # No shell is involved: subject and recipient are argv entries and the body goes in on
+    # stdin, so file names quoted from gluster output can never be run as commands.
+    try:
+        mailer = subprocess.Popen(["mail", "-s", HEADER, MAILTO], stdin=subprocess.PIPE)
+        mailer.communicate(body)
+    except OSError as exc:
+        sys.stderr.write("Could not run mail: {}\n".format(exc))
+
+##############################
+def Test1():
+    '''
+    Mail the general heal report if the word "split" appears anywhere in it.
+    Exits with status 1 (after mailing the failure) if the gluster command itself fails.
+    '''
+    COMMAND="sudo gluster volume heal gv info"
+    try:
+        result = subprocess.check_output(COMMAND.split()).decode('UTF-8')
+    except subprocess.CalledProcessError as exc:
+        # Named exc on purpose: binding "error" here would make it a local name and hide error()
+        error("Test1: \nPlease contact mcc-help@monash.edu \n error code {}\n".format(exc.returncode) + exc.output.decode('UTF-8', 'replace'))
+        sys.exit(1)
+    if 'split' in result:
+        # One mail per run carrying the whole report, however many lines matched
+        error(COMMAND+"\n"+result)
+##############################
+def Test2():
+    '''
+    Mail a warning when any brick reports a non-zero split-brain count. Sample output:
+
+    sudo gluster volume heal gv info split-brain
+    Brick 172.16.227.169:/gbrick/brick
+    Status: Connected
+    Number of entries in split-brain: 0
+    '''
+    COMMAND="sudo gluster volume heal gv info split-brain"
+    try:
+        result = subprocess.check_output(COMMAND.split()).decode('UTF-8')
+    except subprocess.CalledProcessError as exc:
+        # Named exc on purpose: binding "error" here would make it a local name and hide error()
+        error("Test2: \nPlease contact mcc-help@monash.edu \n error code {}\n".format(exc.returncode) + exc.output.decode('UTF-8', 'replace'))
+        sys.exit(1)
+    for line in result.splitlines():
+        if 'Number of entries in split-brain' in line:
+            split=line.split(':')
+            if len(split)!=2:
+                error("Logic error in Test2: split is {}".format(split))
+                sys.exit(1)
+            # int() ignores the blank that follows the colon
+            NoOfSplitBrains=int(split[1])
+            if NoOfSplitBrains!=0:
+                error("Number of Split Brains is {}".format(NoOfSplitBrains))
+
+############################## +def main(): + Test1() + Test2() + +if __name__ == "__main__": + main() diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 66397423d847e702d6a667d9ee484eb186dc4946..f5946444d6beef568b872147eb7adba07437963d 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -133,17 +133,17 @@ name: nvidia-persistenced state: stopped become: true - when: uninstall_driver and ( services["nvidia-persistenced.service"].state == "running") + when: uninstall_driver and services["nvidia-persistenced.service"] is defined - name: stop the create-dev-uvm daemon service: name=create-dev-uvm state=stopped become: true - when: uninstall_driver and services["create-dev-uvm.service"].state == "running" + when: uninstall_driver and services["create-dev-uvm.service"] is defined - name: stop the telegraf daemon service: name=telegraf state=stopped become: true - when: uninstall_driver and services["telegraf.service"].state == "running" + when: uninstall_driver and services["telegraf.service"] is defined - name: Unload nvidia driver @@ -234,16 +234,16 @@ service: name=nvidia-persistenced state=started become: true become_user: root - when: uninstall_driver and services["nvidia-persistenced.service"].state == "running" + when: uninstall_driver and services["nvidia-persistenced.service"] is defined and services["nvidia-persistenced.service"].state == "running" - name: re-start the create-dev-uvm daemon service: name=create-dev-uvm state=started become: true become_user: root - when: uninstall_driver and services["create-dev-uvm.service"].state == "running" + when: uninstall_driver and services["create-dev-uvm.service"] is defined and services["create-dev-uvm.service"].state == "running" - name: re-start the telegraf daemon service: name=telegraf state=started become: true become_user: root - when: uninstall_driver and services["telegraf.service"].state == "running" + when: uninstall_driver and services["telegraf.service"] is 
defined and services["telegraf.service"].state == "running" diff --git a/roles/mellanox_drivers/defaults/main.yml b/roles/mellanox_drivers/defaults/main.yml index c02c5fbf3c3a84161b2389cdd26775e64641eaf8..879b64ed8de2fa8b82005d0736b8393322015397 100644 --- a/roles/mellanox_drivers/defaults/main.yml +++ b/roles/mellanox_drivers/defaults/main.yml @@ -2,5 +2,5 @@ #note. do not add '.tgz' to driver src. done in playbook #MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-4.4-1.0.0.0-rhel7.4-x86_64{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" #MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-4.5-1.0.1.0-rhel7.6-x86_64{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" -MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-4.7-3.2.9.0-rhel7.7-x86_64{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" +MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-4.9-2.2.4.0-rhel7.9-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" buildKMOD: False \ No newline at end of file diff --git a/roles/slurm-start/defaults/main.yml b/roles/slurm-start/defaults/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..36027f747b726168463f3a222e0dc32b2fd49f3b --- /dev/null +++ b/roles/slurm-start/defaults/main.yml @@ -0,0 +1,2 @@ +use_glusterfs: true +#EMAIL_DEST I am not providing a default to force users to override. 
\ No newline at end of file
diff --git a/roles/slurm-start/tasks/main.yml b/roles/slurm-start/tasks/main.yml
index 2e41c2237788f98c594dfd581e632038ff09a653..4d0ce6264cf796e4b90c9a6d45f4d1a90ca4ad0c 100644
--- a/roles/slurm-start/tasks/main.yml
+++ b/roles/slurm-start/tasks/main.yml
@@ -8,7 +8,7 @@
     state: stopped
     enabled: false
   become: true
-  when: services["firewalld.service"].state == "running"
+  when: services["firewalld.service"] is defined
 
 - name: set use_systemd
   set_fact:
@@ -43,6 +43,12 @@
   when: use_systemd is defined and start_slurmd is defined
   register: slurmd_service_installed
 
+- name: deploy glusterfsheltest
+  template: dest=/etc/systemd/system/glusterfsheltest.sh src=glusterfsheltest.sh.j2 mode=744
+  become: true
+  when: use_systemd is defined and start_slurmctld is defined
+  register: slurmctld_service_installed
+
 - name: slurmctld.service
   template: dest=/etc/systemd/system/slurmctld.service src=slurmctld.service.j2 mode=644
   become: true
diff --git a/roles/slurm-start/templates/glusterfsheltest.sh.j2 b/roles/slurm-start/templates/glusterfsheltest.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..1a037e1673fab716f2480490708819f09077db26
--- /dev/null
+++ b/roles/slurm-start/templates/glusterfsheltest.sh.j2
@@ -0,0 +1,17 @@
+#!/bin/bash
+# The check passes only when the heal report contains exactly 3 "Number of entries: 0"
+# lines (presumably one per brick - confirm against the volume layout); otherwise it mails and exits non-zero.
+cleanBricks=`gluster volume heal gv info | grep -c "Number of entries: 0"`
+[[ "3" == "$cleanBricks" ]]
+exitCode=$?
+
+HEADER="glusterfs preventing start of slurmctld on `hostname`"
+MAILTO="{{ EMAIL_DEST }}"
+
+echo $exitCode
+if [ $exitCode -ne 0 ]
+then
+    EmailBody="glusterfs preventing start of slurmctld on `hostname` on `date` \ncheck via gluster volume heal gv info"
+    echo -e "$EmailBody" | mail -s "$HEADER" "$MAILTO"
+fi
+exit $exitCode
\ No newline at end of file
diff --git a/roles/slurm-start/templates/slurmctld.service.j2 b/roles/slurm-start/templates/slurmctld.service.j2
index 325468aa45600fe813e87827b256bdb66254ca1d..cc914758a5f91d5d4647eb330cb4e220d8ff25ef 100644
--- 
a/roles/slurm-start/templates/slurmctld.service.j2 +++ b/roles/slurm-start/templates/slurmctld.service.j2 @@ -6,6 +6,7 @@ ConditionPathExists={{ slurm_dir }}/etc/slurm.conf [Service] Type=forking #EnvironmentFile=/etc/default/slurmctld +ExecStartPre=/etc/systemd/system/glusterfsheltest.sh ExecStart={{ slurm_dir }}/sbin/slurmctld $SLURMCTLD_OPTIONS PIDFile={{ slurmpiddir }}/slurmctld.pid [Install]