diff --git a/maintenance.yml b/maintenance.yml
index 7da9736ab16a6b7c8e161f86b127ddb0760d6861..d3b4650bdd038266e0e0f54dd3abadb24b043795 100644
--- a/maintenance.yml
+++ b/maintenance.yml
@@ -1,5 +1,5 @@
 - hosts: 'ComputeNodes,DGXRHELNodes'
-  gather_facts: smart # not sure if false is clever here
+  gather_facts: false
   tasks:
   - include_vars: vars/ldapConfig.yml
   - include_vars: vars/filesystems.yml
@@ -102,17 +102,16 @@
   tasks:
   - { name: disable_lustre_service, service: name=lustre-client enabled=no, tags: [never,disable_lustre_service] }
+- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes'
+  gather_facts: false
+  tasks:
+  - { name: umount /home, mount: path=/home state=unmounted, become: true, become_user: root, tags: [never,umount_home] }
-#- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes'
-#  gather_facts: false
-#  tasks:
-#  - name: Mount up device by UUID
-#    mount:
-#      path: /home
-#      src: UUID=b3e48f45-f933-4c8e-a700-22a159ec9077
-#      fstype: xfs
-#      opts: noatime
-#      state: present
+#this should not really end up in the main branch, but it does not hurt if it does
+- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes'
+  gather_facts: false
+  tasks:
+  - { name: umount local-legacy, mount: path=/usr/local-legacy state=absent, become: true, become_user: root, tags: [never,umount_locallegacy] }
 
 #!/bin/sh
 #
diff --git a/roles/gluster_server/tasks/main.yml b/roles/gluster_server/tasks/main.yml
index d816b32daf6b068a8a2b82a129052dc3dc0e7588..77144e2e9915e3c884c06496fbf958247b88f45e 100644
--- a/roles/gluster_server/tasks/main.yml
+++ b/roles/gluster_server/tasks/main.yml
@@ -23,6 +23,12 @@
   become_user: root
   when: ansible_os_family == 'RedHat'
 
+- name: start glusterfs
+  service: name=glusterfsd enabled=yes state=started
+  become: true
+  become_user: root
+  when: ansible_os_family == 'RedHat'
+
 - name: start daemon
   service: name=glusterfs-server enabled=yes state=started
   become: true
diff --git a/roles/nat_server/tasks/main.yml b/roles/nat_server/tasks/main.yml
index 4ff0097db19f6b449cd1ed28f0914c80f181f8b6..40d466f12a7ee05d4b39e295e28d7cec0027f6ec 100644
--- a/roles/nat_server/tasks/main.yml
+++ b/roles/nat_server/tasks/main.yml
@@ -55,3 +55,38 @@
   become: true
   become_user: root
   when: sys_conf.changed
+
+- name: Install keepalived
+  package:
+    name: keepalived
+    state: present
+  become: true
+  become_user: root
+
+- name: Template keepalived config
+  template: dest=/etc/keepalived/keepalived.conf src=keepalived.conf.j2
+  become: true
+  become_user: root
+
+- name: Template keepalived service file
+  template: dest=/etc/systemd/system/keepalived.service src=keepalived.service.j2
+  become: true
+  become_user: root
+  register: keepalivedconfig
+
+- name: Enable and start keepalived
+  service:
+    name: keepalived
+    state: started
+    enabled: yes
+  become: true
+  become_user: root
+  when: keepalivedconfig.changed
+
+- name: Restart iptables service
+  service:
+    name: iptables
+    state: restarted
+  become: true
+  become_user: root
+  when: keepalivedconfig.changed
diff --git a/roles/nat_server/templates/iptables.j2 b/roles/nat_server/templates/iptables.j2
index c311a4463467e00c82cf77dbe0e93263b3fa6e4a..60a3998c82592f0ee905ee6e83160939de5ba929 100644
--- a/roles/nat_server/templates/iptables.j2
+++ b/roles/nat_server/templates/iptables.j2
@@ -27,6 +27,9 @@ COMMIT
 -A INPUT -p icmp -j ACCEPT
 -A INPUT -i lo -j ACCEPT
 -A INPUT -p tcp -m state --state NEW -m tcp --dport 22 -j ACCEPT
+# Allow VRRP (proto 112) and AH (proto 51) for keepalived NAT failover
+-A INPUT -p 112 -d 224.0.0.18 -j ACCEPT
+-A INPUT -p 51 -d 224.0.0.18 -j ACCEPT
 -A INPUT -j REJECT --reject-with icmp-host-prohibited
 -A FORWARD -i mlx0 -j ACCEPT
 -A FORWARD -s {{ PRIVATE_NETWORK_CIDR }} -j ACCEPT
diff --git a/roles/nat_server/templates/keepalived.conf.j2 b/roles/nat_server/templates/keepalived.conf.j2
new file mode 100644
index 0000000000000000000000000000000000000000..2c51d504248b6c5bd64399a89fc23e5ed818a38a
--- /dev/null
+++ b/roles/nat_server/templates/keepalived.conf.j2
@@ -0,0 +1,30 @@
+! Configuration File for keepalived
+
+global_defs {
+}
+
+vrrp_instance VI_1 {
+  {% if inventory_hostname == NAT_master|string() %}
+  {% set role = 'MASTER' %}
+  {% set priority = '103' %}
+  {% elif inventory_hostname == NAT_backup1|string() %}
+  {% set role = 'BACKUP' %}
+  {% set priority = '101' %}
+  {% elif inventory_hostname == NAT_backup2|string() %}
+  {% set role = 'BACKUP' %}
+  {% set priority = '102' %}
+  {% endif %}
+
+  state {{ role }}
+  interface mlx0
+  virtual_router_id 51
+  priority {{ priority }}
+  advert_int 1
+  authentication {
+    auth_type PASS
+    auth_pass 1111
+  }
+  virtual_ipaddress {
+    {{ NAT_virtualIP }}/{{ NAT_virtualIP_subnet }} dev mlx0
+  }
+}
diff --git a/roles/nat_server/templates/keepalived.service.j2 b/roles/nat_server/templates/keepalived.service.j2
new file mode 100644
index 0000000000000000000000000000000000000000..353658150a699d8a871b4d5500990b0c2b9bba66
--- /dev/null
+++ b/roles/nat_server/templates/keepalived.service.j2
@@ -0,0 +1,15 @@
+# systemd service unit file for Keepalived
+[Unit]
+Description=Keepalived service for High Availability with VRRP
+After=network.target network-online.target
+ConditionFileNotEmpty=/etc/keepalived/keepalived.conf
+
+[Service]
+Type=simple
+ExecStart=/usr/sbin/keepalived --dont-fork
+# Reload keepalived by sending SIGHUP to the main process
+ExecReload=/bin/kill -s HUP $MAINPID
+KillMode=process
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/slurm-start/tasks/main.yml b/roles/slurm-start/tasks/main.yml
index f0bf5e86b76efc5ce44c7180bb21299a01c2f34a..81b7cdaf0440a90f302cd4522a14bbffca3d32e6 100644
--- a/roles/slurm-start/tasks/main.yml
+++ b/roles/slurm-start/tasks/main.yml
@@ -44,7 +44,7 @@
   register: slurmd_service_installed
 
 - name: deploy glusterfsheltest
-  template: dest=/etc/systemd/system/glusterfsheltest.sh src=glusterfsheltest.sh.j2 mode=744
+  template: dest=/etc/systemd/system/glusterfsheltest.sh src=templates/glusterfsheltest.sh.j2 mode=744
   become: true
   when: use_systemd is defined and start_slurmctld is defined and use_glusterfs
   register: slurmctld_service_installed
@@ -98,6 +98,10 @@
   become: true
   when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed
 
+- name: stop slurmctld on servicefile change
+  service: name=slurmctld state=stopped enabled={{ start_slurmctld }}
+  when: use_systemd is defined and start_slurmctld is defined and not start_slurmctld and slurmctld_service_installed.changed
+
 - name: start slurmctld
   service: name=slurmctld state=started
   become: true
@@ -108,14 +112,16 @@
   register: slurm_cluster_count
   check_mode: no
   changed_when: false
+  when: 'slurmctrl == inventory_hostname'
 
 - debug:
     var: slurm_cluster_count
+  when: slurmctrl == inventory_hostname
 
 - name: "create cluster in slurm db" #needs munge to run
   shell: "{{ slurm_dir }}/bin/sacctmgr -i create cluster {{ clustername }}"
   become: true
-  when: slurm_cluster_count.stdout == '1' and slurmctrl == inventory_hostname
+  when: 'slurmctrl == inventory_hostname and slurm_cluster_count.stdout == "1"'
 
 - name: start slurmd
   service: name=slurmd enabled={{ slurmd_enabled }}
diff --git a/roles/slurm-start/templates/glusterfsheltest.sh.j2 b/roles/slurm-start/templates/glusterfsheltest.sh.j2
index 1a037e1673fab716f2480490708819f09077db26..5f68983e274eca88962b0cd65361527b524ad4ed 100644
--- a/roles/slurm-start/templates/glusterfsheltest.sh.j2
+++ b/roles/slurm-start/templates/glusterfsheltest.sh.j2
@@ -1,13 +1,10 @@
 #!/bin/bash
-exitCode=[[ "3" == `gluster volume heal gv info | grep -c "Number of entries: 0"` ]]
-
+healinfo=`gluster volume heal gv info | grep -c "Number of entries: 0"`
 HEADER="glusterfs preventing start of slurmctld on `hostname`"
-MAILTO="{{ EMAIL_DEST }}"
-
-echo $exitCode
-if [ $exitCode -eq 0 ]
+MAILTO={{ admin_email }}
+if [ $healinfo != "4" ]
 then
     EmailBody="glusterfs preventing start of slurmctld on `hostname` on `date` \ncheck via gluster volume heal gv info"
     echo -e "$EmailBody" | mail -s "$HEADER" "$MAILTO"
 fi
-return $exitCode
\ No newline at end of file
+[[ $healinfo == "3" ]]
diff --git a/roles/slurm-start/templates/slurmd.service.j2 b/roles/slurm-start/templates/slurmd.service.j2
index 36c56120c0e7a061f9d58f25ae7226468fb1c7a2..ef58f099aeacaee68e005e7d3d82e9b8b2bb2375 100644
--- a/roles/slurm-start/templates/slurmd.service.j2
+++ b/roles/slurm-start/templates/slurmd.service.j2
@@ -9,6 +9,7 @@
 Type=forking
 KillMode=process
 LimitMEMLOCK=infinity
 #EnvironmentFile=/etc/default/slurmd
+#ExecStart={{ slurm_dir }}/sbin/slurmd $SLURMD_OPTIONS # before slurm20
 ExecStart={{ slurm_dir }}/sbin/slurmd --conf-server {{ slurmctrl }}:6817
 PIDFile={{ slurmpiddir }}/slurmd.pid
diff --git a/roles/slurm_config/tasks/main.yml b/roles/slurm_config/tasks/main.yml
index fa4bd2005f3325a73109a78a2a5768ea384e5ddc..0c5b96e3d4db4954d4a049fd5ea127bd0337840e 100644
--- a/roles/slurm_config/tasks/main.yml
+++ b/roles/slurm_config/tasks/main.yml
@@ -3,6 +3,7 @@
   copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf
   become: true
   become_user: root
+  when: inventory_hostname in groups.ManagementNodes or inventory_hostname in groups.LoginNodes
 
 - name: setup plugin
   template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=755
diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml
index df39091fe7e966ab00a1f60f66d7f3ece581ec2f..348178e6cb761a7ecb99d61346191ec5602415d6 100644
--- a/roles/upgrade/tasks/main.yml
+++ b/roles/upgrade/tasks/main.yml
@@ -26,8 +26,8 @@
         - kmod-lustre-client
        - kernel-devel
     become: true
-  - name: remove mellanox rpms
-    yum:
+  - name: remove mellanox packages
+    package:
       state: absent
       name:
        - mlnx-ofa_kernel
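
The new keepalived templates and the reworked heal-check script reference several variables that are expected to be defined elsewhere in the repository (NAT_master, NAT_backup1, NAT_backup2, NAT_virtualIP, NAT_virtualIP_subnet, admin_email). The sketch below is illustrative only, assuming those vars live in something like a group_vars file for the NAT servers; all hostnames, addresses and the email address are placeholders, not values taken from this change.

# group_vars sketch (hypothetical file and values, not part of this diff)
NAT_master: nat01                   # host that should normally hold the VIP
NAT_backup1: nat02                  # first failover candidate
NAT_backup2: nat03                  # second failover candidate
NAT_virtualIP: 192.168.0.1          # virtual IP announced on mlx0 by keepalived
NAT_virtualIP_subnet: 24            # prefix length used in keepalived.conf.j2
admin_email: hpc-admin@example.org  # recipient for glusterfsheltest.sh alerts

# Note: the umount plays added to maintenance.yml are tagged "never", so they
# only run when their tag is requested explicitly, e.g.:
#   ansible-playbook -i <inventory> maintenance.yml --tags umount_home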