Commit 0e90e605 authored by Trung Nguyen

Merge branch 'upcomingMaintenance' into 'master'

Upcoming maintenance

See merge request !460
parents 1fe96c79 20045cf1
- hosts: 'ComputeNodes,DGXRHELNodes'
gather_facts: smart # not sure if false is clever here
gather_facts: false
tasks:
- include_vars: vars/ldapConfig.yml
- include_vars: vars/filesystems.yml
@@ -102,17 +102,16 @@
tasks:
- { name: disable_lustre_service, service: name=lustre-client enabled=no, tags: [never,disable_lustre_service] }
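# tags: [never, ...] means the task only runs when one of its tags is requested explicitly, e.g. --tags disable_lustre_service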
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes'
gather_facts: false
tasks:
- { name: umount /home, mount: path=/home state=unmounted, become: true, become_user: root, tags: [never,umount_home] }
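# mount state=unmounted detaches the filesystem but leaves its fstab entry in place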
#- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes'
# gather_facts: false
# tasks:
# - name: Mount up device by UUID
# mount:
# path: /home
# src: UUID=b3e48f45-f933-4c8e-a700-22a159ec9077
# fstype: xfs
# opts: noatime
# state: present
# This should not really end up in the main branch, but it does not hurt if it does.
- hosts: 'ComputeNodes,LoginNodes,DGXRHELNodes,ManagementNodes'
gather_facts: false
tasks:
- { name: umount local-legacy, mount: path=/usr/local-legacy state=absent, become: true, become_user: root, tags: [never,umount_locallegacy] }
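# mount state=absent unmounts the path and also removes its entry from fstab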
#!/bin/sh
#
......
@@ -23,6 +23,12 @@
become_user: root
when: ansible_os_family == 'RedHat'
- name: start glusterfs
service: name=glusterfsd enabled=yes state=started
become: true
become_user: root
when: ansible_os_family == 'RedHat'
- name: start daemon
service: name=glusterfs-server enabled=yes state=started
become: true
......
@@ -55,3 +55,38 @@
become: true
become_user: root
when: sys_conf.changed
- name: Install keepalived
package:
name: keepalived
state: present
become: true
become_user: root
- name: Template keepalived config
template: dest=/etc/keepalived/keepalived.conf src=keepalived.conf.j2
become: true
become_user: root
- name: Template keepalived service file
template: dest=/etc/systemd/system/keepalived.service src=keepalived.service.j2
become: true
become_user: root
register: keepalivedconfig
- name: Enable and start keepalived
service:
name: keepalived
state: started
enabled: yes
become: true
become_user: root
when: keepalivedconfig.changed
- name: Restart iptables service
service:
name: iptables
state: restarted
become: true
become_user: root
when: keepalivedconfig.changed
@@ -27,6 +27,9 @@ COMMIT
-A INPUT -p icmp -j ACCEPT
-A INPUT -i lo -j ACCEPT
-A INPUT -p tcp -m state --state NEW -m tcp --dport 22 -j ACCEPT
# Allow VRRP for NAT service
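# IP protocol 112 is VRRP itself (multicast group 224.0.0.18); protocol 51 is IPsec AH, used when keepalived authentication is set to AH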
-A INPUT -p 112 -d 224.0.0.18 -j ACCEPT
-A INPUT -p 51 -d 224.0.0.18 -j ACCEPT
-A INPUT -j REJECT --reject-with icmp-host-prohibited
-A FORWARD -i mlx0 -j ACCEPT
-A FORWARD -s {{ PRIVATE_NETWORK_CIDR }} -j ACCEPT
......
! Configuration File for keepalived
global_defs {
}
vrrp_instance VI_1 {
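{# choose the VRRP role and priority depending on which NAT host this template is rendered on #}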
{% if inventory_hostname == NAT_master|string() %}
{% set role = 'MASTER' %}
{% set priority = '100' %}
{% elif inventory_hostname == NAT_backup1|string() %}
{% set role = 'BACKUP' %}
{% set priority = '101' %}
{% elif inventory_hostname == NAT_backup2|string() %}
{% set role = 'BACKUP' %}
{% set priority = '102' %}
{% endif %}
state {{ role }}
interface mlx0
virtual_router_id 51
priority {{ priority }}
advert_int 1
authentication {
auth_type PASS
auth_pass 1111
}
virtual_ipaddress {
{{ NAT_virtualIP }}/{{ NAT_virtualIP_subnet }} dev mlx0
}
}
# systemd service unit file for Keepalived
[Unit]
Description=Keepalived service for High Availability with VRRP
After=network.target network-online.target
ConditionFileNotEmpty=/etc/keepalived/keepalived.conf
[Service]
Type=simple
ExecStart=/usr/sbin/keepalived --dont-fork
ExecReload=/bin/kill -s HUP $MAINPID
# KillMode defines how processes belonging to this unit are killed: only the main keepalived process is stopped directly
KillMode=process
[Install]
WantedBy=multi-user.target
@@ -44,7 +44,7 @@
register: slurmd_service_installed
- name: deploy glusterfsheltest
template: dest=/etc/systemd/system/glusterfsheltest.sh src=glusterfsheltest.sh.j2 mode=744
template: dest=/etc/systemd/system/glusterfsheltest.sh src=templates/glusterfsheltest.sh.j2 mode=744
become: true
when: use_systemd is defined and start_slurmctld is defined and use_glusterfs
register: slurmctld_service_installed
@@ -98,6 +98,10 @@
become: true
when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed
- name: stop slurmctld on servicefile change
service: name=slurmctld state=stopped enabled={{ start_slurmctld }}
become: true
when: use_systemd is defined and start_slurmctld is defined and not start_slurmctld and slurmctld_service_installed.changed
- name: start slurmctld
service: name=slurmctld state=started
become: true
@@ -108,14 +112,16 @@
register: slurm_cluster_count
check_mode: no
changed_when: false
when: 'slurmctrl == inventory_hostname'
- debug:
var: slurm_cluster_count
when: slurmctrl == inventory_hostname
- name: "create cluster in slurm db" #needs munge to run
shell: "{{ slurm_dir }}/bin/sacctmgr -i create cluster {{ clustername }}"
become: true
when: slurm_cluster_count.stdout == '1' and slurmctrl == inventory_hostname
when: 'slurmctrl == inventory_hostname and slurm_cluster_count.stdout == "1"'
- name: start slurmd
service: name=slurmd enabled={{ slurmd_enabled }}
......
#!/bin/bash
exitCode=[[ "3" == `gluster volume heal gv info | grep -c "Number of entries: 0"` ]]
healinfo=`gluster volume heal gv info | grep -c "Number of entries: 0"`
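# healinfo counts the bricks reporting "Number of entries: 0", i.e. nothing left to heal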
HEADER="glusterfs preventing start of slurmctld on `hostname`"
MAILTO="{{ EMAIL_DEST }}"
echo $exitCode
if [ $exitCode -eq 0 ]
MAILTO={{ admin_email }}
if [ $healinfo != "4" ]
then
EmailBody="glusterfs preventing start of slurmctld on `hostname` on `date` \ncheck via gluster volume heal gv info"
echo -e "$EmailBody" | mail -s "$HEADER" "$MAILTO"
fi
return $exitCode
\ No newline at end of file
[[ $healinfo == "3" ]]
@@ -9,6 +9,7 @@ Type=forking
KillMode=process
LimitMEMLOCK=infinity
#EnvironmentFile=/etc/default/slurmd
#ExecStart={{ slurm_dir }}/sbin/slurmd $SLURMD_OPTIONS # before slurm20
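# with --conf-server, slurmd runs "configless" and pulls its configuration from slurmctld (Slurm 20.02+)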
ExecStart={{ slurm_dir }}/sbin/slurmd --conf-server {{ slurmctrl }}:6817
PIDFile={{ slurmpiddir }}/slurmd.pid
......
@@ -3,6 +3,7 @@
copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf
become: true
become_user: root
when: inventory_hostname in groups.ManagementNodes or inventory_hostname in groups.LoginNodes
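# presumably compute nodes now fetch slurm.conf from the controller (configless mode), so only management and login nodes need a local copy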
- name: setup plugin
template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=755
......
@@ -26,8 +26,8 @@
- kmod-lustre-client
- kernel-devel
become: true
- name: remove mellanox rpms
yum:
- name: remove mellanox packages
package:
state: absent
name:
- mlnx-ofa_kernel
......