Commit ad7002da authored by Chris Hines

Merge branch 'mlaas-ci' of gitlab.erc.monash.edu.au:hpc-team/HPCasCode into mlaas-ci

parents 1ceea7ce e896843b
......@@ -6,6 +6,7 @@ log_path = ./ansible.log
display_args_to_stdout = True
roles_path = HPCasCode/roles
files_path = ./files
forks=25
[ssh_connection]
ssh_args = -F ./ssh.cfg
......@@ -4,13 +4,16 @@ def init_ssh_keys(inventory):
    # Run ssh-keygen to generate host keys for login and compute nodes
    key_types = ['rsa','dsa','ecdsa','ed25519']
    #groups = list(filter(lambda x: x != 'hostvars', inventory['all']['children'].keys()))
-   groups = ['LoginNodes','ComputeNodes']
+   groups = ['host']
    fp={}
    for t in key_types:
        fp[t]={}
        for g in groups:
            output = 'files/ssh_{}_{}_key'.format(g,t)
-           if os.path.exists(output):
-               continue
-           p = subprocess.call(['ssh-keygen','-N','','-f',output,'-t',t])
+           if not os.path.exists(output):
+               p = subprocess.call(['ssh-keygen','-N','','-f',output,'-t',t])
            fp[t][g] = subprocess.check_output(['cat','{}.pub'.format(output)])
    return fp
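After this change `init_ssh_keys` returns the generated public keys so that `make_known_hosts` can consume them (see `init_cluster` below). A minimal sketch of the returned structure, with key material abbreviated and hypothetical:

```python
# fp maps key type -> group -> public key bytes read from the .pub file.
# With groups = ['host'], every key type has a single 'host' entry, e.g.:
fp = {
    'rsa':     {'host': b'ssh-rsa AAAAB3Nza... root@build\n'},
    'ed25519': {'host': b'ssh-ed25519 AAAAC3Nza... root@build\n'},
    # ...one entry per type in key_types
}
```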
def init_passwords():
    import yaml
......@@ -105,6 +108,20 @@ def groups_from_inventory(inventory):
        groups[g[0]] = g[1]['hosts'].keys()
    return groups

def make_job_container_conf(choices, inventory):
    import jinja2
    with open('pre_templates/job_container.conf.j2') as f:
        template = jinja2.Template(f.read())
    with open('files/job_container.conf','w') as f:
        f.write(template.render())

def make_cgroup_conf(choices, inventory):
    import jinja2
    with open('pre_templates/cgroup.conf.j2') as f:
        template = jinja2.Template(f.read())
    with open('files/cgroup.conf','w') as f:
        f.write(template.render())

def make_slurm_config(choices, inventory):
    # Generate a slurm.conf from template
    # should include putting the compute nodes in the correct place
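Both new helpers follow the same render-a-template-to-file pattern, currently with an empty render context. A minimal standalone sketch of that pattern (the helper name is hypothetical), leaving room for future variables:

```python
import jinja2

def render_to_file(template_path, output_path, **context):
    # Read the Jinja2 source, render it (optionally with variables),
    # and write the result out verbatim.
    with open(template_path) as f:
        template = jinja2.Template(f.read())
    with open(output_path, 'w') as f:
        f.write(template.render(**context))

# e.g. render_to_file('pre_templates/cgroup.conf.j2', 'files/cgroup.conf')
```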
......@@ -147,9 +164,12 @@ def make_nhc_config(inventory):
    with open('files/nhc.conf','w') as f:
        f.write(template.render())

-def make_known_hosts(inventory):
+def make_known_hosts(inventory, fp, clustername):
    with open('files/ssh_known_hosts','w') as f:
        f.write('\n')
        for t in fp.keys():
            f.write("{}* {}".format(clustername, fp[t]['host'].decode()))
            f.write("localhost {}".format(fp[t]['host'].decode()))

def make_gres_conf(inventory):
    with open('files/gres.conf','w') as f:
        f.write('\n')
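For every key type the loop emits a cluster-wide wildcard entry plus a localhost entry (the line break between them comes from the trailing newline in the `.pub` output). With a hypothetical `clustername` of `mycluster`, the generated `files/ssh_known_hosts` would look roughly like:

```
mycluster* ssh-rsa AAAAB3Nza...
localhost ssh-rsa AAAAB3Nza...
mycluster* ssh-ed25519 AAAAC3Nza...
localhost ssh-ed25519 AAAAC3Nza...
```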
......@@ -172,11 +192,12 @@ def make_vars_vars(choices, inventory):
-def init_cluster(inventory):
+def init_cluster(inventory, clustername):
    #
    # These values should not change, but they are chosen at random, so only do it once
    #
-   init_ssh_keys(inventory)
+   fp = init_ssh_keys(inventory)
    make_known_hosts(inventory, fp, clustername)
    init_passwords()

def derive_ansible_constants(choices, inventory):
......@@ -186,9 +207,11 @@ def derive_ansible_constants(choices, inventory):
    #
    make_hosts(inventory)
    make_slurm_config(choices, inventory)
    make_job_container_conf(choices, inventory)
    make_cgroup_conf(choices, inventory)
    make_vars_filesystems(choices, inventory)
    make_nfs_exports(inventory)
-   make_known_hosts(inventory)
+   #make_known_hosts(choices, inventory, fp)
    make_nhc_config(inventory)
    make_gres_conf(inventory)
    make_vars_vars(choices, inventory)
......@@ -207,7 +230,8 @@ def main():
    choices = { **choices, **versions }

-   init_cluster(inventory)
+   clustername = choices['clustername']
+   init_cluster(inventory, clustername)

    with open('vars/passwords.yml') as f:
        passwords = yaml.safe_load(f.read())
......
......@@ -37,5 +37,6 @@
    - { role: ssh-password-login, tags: [ authentication ] }
    - { role: enable_sudo_group, tags: [ authentication, sudo ] }
    - { role: move_homedir }
-   - { role: SSHKnownHosts, tags: [ known_hosts ] }
+   - { role: ssh_host_keys, tags: [ ssh ] }
+   - { role: SSHKnownHosts, tags: [ ssh, known_hosts ] }
......@@ -75,6 +75,8 @@
    #- { role: extra_packages, tags: [ other, extra_packages ] } # commented because it takes forever! good enough if this gets tested on clusterbuild
    #- { role: telegraf, telegraf_install_rpm_url: 'http://consistency0/src/telegraf-1.12.6-1.x86_64.rpm', tags: [ monitoring,SiteSpecific ] }
    - { role: pam_sshd, computenodepam: true, tags: [ authentication, pamd ] }
    - { role: user_ssh_ca, tags: [ authentication ] }
    - { role: additional_paths }
- hosts: 'VisNodes'
  roles:
......@@ -87,4 +89,6 @@
- hosts: 'LoginNodes'
  roles:
    - { role: pam_sshd, tags: [ authentication, pamd ], when: ansible_os_family == 'RedHat' }
    - { role: user_ssh_ca, tags: [ authentication ] }
    - { role: additional_paths }
......@@ -29,7 +29,7 @@
- include_role:
    name: nfs-server
  vars:
-   exportList: export.exportList
+   exportList: "{{ export.exportList }}"
  when: export.host == inventory_hostname
  loop: "{{ nfsexports }}"
  loop_control:
......
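This one-character-class fix matters: without the `{{ }}`, the role receives the literal string `export.exportList` rather than the list it names. A minimal illustration using Jinja2 directly (the data values are hypothetical):

```python
import jinja2

export = {'exportList': ['/home', '/usr/local']}

# Untemplated: the consumer sees a literal string.
plain = "export.exportList"
print(plain)  # export.exportList

# Templated: Jinja2 resolves the variable to the actual list.
tmpl = jinja2.Template("{{ export.exportList }}")
print(tmpl.render(export=export))  # ['/home', '/usr/local']
```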
CgroupAutomount=yes
ConstrainDevices=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainKmemSpace=no

AutoBasePath=true
BasePath=/mnt/privatedir
......@@ -32,7 +32,6 @@ SlurmdPidFile=/opt/slurm/var/run/slurmd.pid
#ProctrackType=proctrack/linuxproc
ProctrackType=proctrack/cgroup
#PluginDir=
-CacheGroups=0
#FirstJobId=
ReturnToService=1
RebootProgram=/sbin/reboot
......@@ -76,9 +75,6 @@ SchedulerType="sched/backfill"
SelectType="select/cons_tres"
SelectTypeParameters=CR_Core_Memory
+JobContainerType=job_container/tmpfs
-{% if slurmselecttype.find("cons_tres") > 0 %}
-SelectTypeParameters=CR_Core_Memory
-{% endif %}
PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
......
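Dropping the conditional also removes a fragile test: `str.find` returns an index, not a boolean, so the `> 0` guard misfires whenever the needle starts at position 0. A short illustration:

```python
"select/cons_tres".find("cons_tres")  # 7  -> 7 > 0 is True
"cons_tres".find("cons_tres")         # 0  -> 0 > 0 is False (silently skipped!)
"select/linear".find("cons_tres")     # -1 -> -1 > 0 is False
"cons_tres" in "select/cons_tres"     # True: the robust membership test
```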
......@@ -51,7 +51,7 @@ slurmfairshare: {def: false, val: 10000}
slurmdatadir: "/opt/slurm/var/spool"
slurmstatedir: "/opt/slurm/var/state"
slurmpiddir: "/opt/slurm/var/run"
slurmselecttype: "select/linear"
slurmselecttype: "select/cons_tres"
slurmfastschedule: "1"
slurmschedulertype: "sched/backfill"
restartServerList:
......
......@@ -4,11 +4,11 @@ nhc_src_checksum: "sha1:766762d2c8cd81204b92d4921fb5b66616351412"
nhc_src_dir: /opt/src/nhc-1.4.2
nhc_dir: /opt/nhc-1.4.2
-slurm_version: 20.02.6
-slurm_src_url: https://github.com/SchedMD/slurm/archive/refs/tags/slurm-20-02-6-1.tar.gz
-slurm_src_checksum: "sha1:99f635b528ce120f10efd432019b3b7270a0f9b3"
-slurm_src_dir: /opt/src/slurm-slurm-20-02-6-1
-slurm_dir: /opt/slurm-20.02.6
+slurm_version: 21.08.3
+slurm_src_url: https://download.schedmd.com/slurm/slurm-21.08.3.tar.bz2
+slurm_src_checksum: "sha1:4ed4978b06e8916a6188acedf31b46e42e362ee9"
+slurm_src_dir: /opt/src/slurm-21.08.3
+slurm_dir: /opt/slurm-21.08.3
ucx_version: 1.8.0
ucx_src_url: https://github.com/openucx/ucx/releases/download/v1.8.0/ucx-1.8.0.tar.gz
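The `sha1:` prefix follows the checksum convention used by Ansible's `get_url`. For reference, a minimal sketch of the equivalent verification in plain Python (the tarball path is hypothetical):

```python
import hashlib

def sha1_of(path, chunk_size=1 << 20):
    # Stream the file so large tarballs don't need to fit in memory.
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

assert sha1_of('slurm-21.08.3.tar.bz2') == '4ed4978b06e8916a6188acedf31b46e42e362ee9'
```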
......@@ -21,3 +21,13 @@ munge_src_url: https://github.com/dun/munge/archive/refs/tags/munge-0.5.14.tar.g
munge_src_checksum: "sha1:70f6062b696c6d4f17b1d3bdc47c3f5eca24757c"
munge_dir: /opt/munge-0.5.14
munge_src_dir: /opt/src/munge-munge-0.5.14
nvidia_mig_parted_version: 0.1.3
nvidia_mig_parted_src_url: https://github.com/NVIDIA/mig-parted/archive/refs/tags/v0.1.3.tar.gz
nvidia_mig_parted_src_checksum: "sha1:50597b4a94348c3d52b3234bb22783fa236f1d53"
nvidia_mig_parted_src_dir: /opt/src/mig-parted-0.1.3
nvidia_mig_slurm_discovery_version: master
nvidia_mig_slurm_discovery_src_url: https://gitlab.com/nvidia/hpc/slurm-mig-discovery.git
nvidia_mig_slurm_discovery_src_dir: /opt/src/mig-slurm_discovery
- name: setup additional PATHs in /etc/profile.d
  template:
    src: additional_paths.sh.j2
    dest: /etc/profile.d/additional_paths.sh
  become: true
  when: additional_paths is defined
export PATH=$PATH:{{ additional_paths|join(":") }}
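The template relies on Jinja2's `join` filter to splice the configured directories into a single `PATH` export. Rendering it with a hypothetical list shows the result:

```python
import jinja2

tmpl = jinja2.Template('export PATH=$PATH:{{ additional_paths|join(":") }}')
print(tmpl.render(additional_paths=['/opt/slurm-21.08.3/bin', '/opt/nhc-1.4.2/sbin']))
# export PATH=$PATH:/opt/slurm-21.08.3/bin:/opt/nhc-1.4.2/sbin
```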
- name: run nvidia smi
  command: "nvidia-smi"
  become: true
  check_mode: no
  changed_when: false

- name: assert mig config
  command: "./nvidia-mig-parted assert -f examples/config.yaml -c {{ mig_config }}"
  args:
    chdir: "{{ nvidia_mig_parted_src_dir }}"
  become: true
  check_mode: no
  changed_when: false
  ignore_errors: true
  register: mig_state_assert

- name: change mig config
  block:
    - name: apply mig config
      command: "./nvidia-mig-parted apply -f examples/config.yaml -c {{ mig_config }}"
      args:
        chdir: "{{ nvidia_mig_parted_src_dir }}"
      become: true
    - name: generate mig config
      command: "./mig"
      args:
        chdir: "{{ nvidia_mig_slurm_discovery_src_dir }}"
      become: true
    - name: install gres.conf
      copy:
        remote_src: yes
        src: "{{ nvidia_mig_slurm_discovery_src_dir }}/gres.conf"
        dest: "{{ slurm_dir }}/etc/gres.conf"
      become: true
    - name: install cgroup
      copy:
        remote_src: yes
        src: "{{ nvidia_mig_slurm_discovery_src_dir }}/cgroup_allowed_devices_file.conf"
        dest: "{{ slurm_dir }}/etc/cgroup_allowed_devices.conf"
      become: true
  # when: mig_state_assert.failed
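The block implements a check-then-fix pattern: `assert` probes whether the GPUs already match the requested MIG layout, and only a mismatch should trigger `apply` plus regeneration of the Slurm GRES files (note the `when: mig_state_assert.failed` guard is still commented out, so the block currently always runs). A compact sketch of the intended control flow, using the directories from the vars file:

```python
import subprocess

def ensure_mig(mig_config, parted_dir, discovery_dir):
    # 'assert' exits non-zero when the live MIG state differs from the config.
    ok = subprocess.call(
        ['./nvidia-mig-parted', 'assert', '-f', 'examples/config.yaml', '-c', mig_config],
        cwd=parted_dir) == 0
    if not ok:
        subprocess.check_call(
            ['./nvidia-mig-parted', 'apply', '-f', 'examples/config.yaml', '-c', mig_config],
            cwd=parted_dir)
        # slurm-mig-discovery's ./mig regenerates gres.conf for the new layout.
        subprocess.check_call(['./mig'], cwd=discovery_dir)
```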
- name: Create nvidia mig parted dir if it does not exist
  file:
    path: "{{ nvidia_mig_parted_src_dir | dirname }}"
    state: directory
    owner: "{{ ansible_user }}"
    group: root
    mode: u=rwx,g=rx,o=rx
  become: true

- name: install required packages
  package:
    state: present
    name: golang-1.16-go
  become: true
  when: ansible_os_family == "Debian"

- name: fetch mig parted code
  get_url:
    url: "{{ nvidia_mig_parted_src_url }}"
    checksum: "{{ nvidia_mig_parted_src_checksum }}"
    dest: "{{ nvidia_mig_parted_src_dir | dirname }}/mig_parted_src"

- name: unarchive
  unarchive:
    src: "{{ nvidia_mig_parted_src_dir | dirname }}/mig_parted_src"
    dest: "{{ nvidia_mig_parted_src_dir | dirname }}"
    remote_src: yes
    creates: "{{ nvidia_mig_parted_src_dir }}"

- name: compile
  command: /usr/lib/go-1.16/bin/go build ./cmd/nvidia-mig-parted
  args:
    chdir: "{{ nvidia_mig_parted_src_dir }}"
    creates: "{{ nvidia_mig_parted_src_dir }}/nvidia-mig-parted"
- name: install cuda
  block:
    - name: get PIN
      get_url:
        url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
        dest: /etc/apt/preferences.d/cuda-repository-pin-600
      become: true
    - name: install key
      apt_key:
        url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub
        state: present
      become: true
    - name: add repo
      apt_repository:
        repo: deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /
      become: true
    - name: install cuda
      apt:
        name: cuda
        state: present
        update_cache: yes
      become: true
    - name: install cudnn
      apt:
        name: libcudnn8-dev
        state: present
        update_cache: yes
      become: true
  when: ansible_os_family == "Debian"

- name: Create mig_slurm_discovery dir if it does not exist
  file:
    path: "{{ nvidia_mig_slurm_discovery_src_dir }}"
    state: directory
    owner: "{{ ansible_user }}"
    group: root
    mode: u=rwx,g=rx,o=rx
  become: true

- name: fetch mig slurm discovery code
  git:
    repo: "{{ nvidia_mig_slurm_discovery_src_url }}"
    dest: "{{ nvidia_mig_slurm_discovery_src_dir }}"

- name: compile
  command: gcc -g -o mig -I/usr/local/cuda/include -I/usr/cuda/include mig.c -L/usr/lib/x86_64-linux-gnu/ -l:libnvidia-ml.so.1
  args:
    chdir: "{{ nvidia_mig_slurm_discovery_src_dir }}"
    creates: "{{ nvidia_mig_slurm_discovery_src_dir }}/mig"
......@@ -13,10 +13,10 @@
  when: ansible_os_family == "Debian"
  become: True

-- name: config cgroup.conf file
-  template: dest={{ slurm_dir }}/etc/cgroup.conf src=cgroup.conf.j2 mode=644
-  become: True
+#- name: config cgroup.conf file
+#  template: dest={{ slurm_dir }}/etc/cgroup.conf src=cgroup.conf.j2 mode=644
+#  become: True

-- name: config cgroup_allowed_devices.conf file
-  template: dest={{ slurm_dir }}/etc/cgroup_allowed_devices.conf src=cgroup_allowed_devices.conf.j2 mode=644
-  become: True
+#- name: config cgroup_allowed_devices.conf file
+#  template: dest={{ slurm_dir }}/etc/cgroup_allowed_devices.conf src=cgroup_allowed_devices.conf.j2 mode=644
+#  become: True
......@@ -107,23 +107,27 @@
  become: true
  # when: not stat_ucx.stat.exists

- name: install pmix from apt
  block:
    - name: install
      apt:
        name: libpmix-dev
        state: present
      become: true
    - name: set dir
      set_fact:
        pmix_dir: "/usr/lib/x86_64-linux-gnu/pmix"
  when:
    - ansible_os_family == 'Debian'

-- name: configure slurm centos
+- name: configure slurm
   command: "{{ slurm_src_dir }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam --with-pmix={{ pmix_dir }} --with-ucx={{ ucx_dir }}"
   args:
     creates: "{{ slurm_dir }}/bin/srun"
     chdir: "{{ slurm_src_dir }}"
   when:
     - force_slurm_recompile is defined or not stat_srun.stat.exists
-    - ansible_os_family == 'RedHat'
-- name: configure slurm ubuntu
-  command: "{{ slurm_src_dir }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam --with-pmix --with-ucx={{ ucx_dir }}"
-  when:
-    - force_slurm_recompile is defined or not stat_srun.stat.exists
-    - ansible_os_family == 'Debian'

- name: build slurm
  command: make
......
......@@ -9,6 +9,7 @@
    enabled: false
  become: true
  when: services["firewalld.service"] is defined
  ignore_errors: true

- name: set use_systemd
  set_fact:
......@@ -16,6 +17,11 @@
  when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and
        ( ansible_distribution_major_version == "7")

- name: set use_systemd
  set_fact:
    use_systemd: True
  when: ansible_distribution == "Ubuntu"

- name: set slurmd_enabled (default enabled)
  set_fact:
    slurmd_enabled: True
......@@ -114,10 +120,10 @@
  service: name=slurmctld state=stopped enabled={{ start_slurmctld }}
  when: use_systemd is defined and start_slurmctld is defined and not start_slurmctld and slurmctld_service_installed.changed

-- name: start slurmctld
+- name: start slurmctld on primary
   service: name=slurmctld state=started
   become: true
-  when: use_systemd is defined and start_slurmctld is defined and start_slurmctld
+  when: 'use_systemd is defined and start_slurmctld is defined and start_slurmctld and slurmctrl == inventory_hostname'

- name: "count clusters in slurm db"
  shell: "{{ slurm_dir }}/bin/sacctmgr show cluster -p | wc -l"
......@@ -134,8 +140,25 @@
shell: "{{ slurm_dir }}/bin/sacctmgr -i create cluster {{ clustername }}"
become: true
when: 'slurmctrl == inventory_hostname and slurm_cluster_count.stdout == "1"'
- name: wait for slurmctld primary to start
wait_for:
host: "{{ slurmctrl }}"
port: 6817
delay: 5
timeout: 300
- name: stop systemd-logind
- name: start slurmctld on secondary
service: name=slurmctld state=started
retries: 20
delay: 10
register: result
until: result is not failed
become: true
when: 'use_systemd is defined and start_slurmctld is defined and start_slurmctld and slurmctrl != inventory_hostname'
- name: stop systemd-logind it conflicts with pam_slurm_adopt on compute nodes
systemd: name=systemd-logind state=stopped masked=yes
become: true
when: use_systemd is defined and slurmd_enabled is defined
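Port 6817 is slurmctld's default listen port, so the `wait_for` task amounts to polling a TCP connect against the primary before the secondary controller is started. A rough Python equivalent of that task (the hostname is hypothetical):

```python
import socket
import time

def wait_for_port(host, port, delay=5, timeout=300):
    # Mirror Ansible's wait_for: initial delay, then poll until timeout.
    time.sleep(delay)
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return
        except OSError:
            time.sleep(2)
    raise TimeoutError(f"{host}:{port} did not open within {timeout}s")

# wait_for_port('mgmt0', 6817)
```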
......@@ -155,7 +178,13 @@
  become: true
  tags: [never,DEPLOYSLURMDSERVICEBACKUP]

- name: stop systemd-logind
  service: name=systemd-logind state=stopped enabled=no masked=yes
  become: true
  when: use_systemd is not defined and start_slurmd is defined

- name: start slurm
  service: name=slurm state=restarted enabled={{ slurmd_enabled }}
  become: true
  when: use_systemd is not defined and start_slurmd is defined
......@@ -10,9 +10,18 @@
  become: true
  when: inventory_hostname in groups.ManagementNodes

- name: install cgroup.conf
  copy: src=files/cgroup.conf dest={{ slurm_dir }}/etc/cgroup.conf
  become: true
  when: inventory_hostname in groups.ManagementNodes

- name: install job_container.conf
  copy: src=files/job_container.conf dest={{ slurm_dir }}/etc/job_container.conf
  become: true
  when: inventory_hostname in groups.ManagementNodes

- name: setup plugin
  template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=755
  become: true
  become_user: root
  when: slurm_lua is defined and slurm_lua==True
......@@ -55,7 +55,7 @@
dest: "{{ slurm_dir }}/etc/slurmdbd.conf"
owner: slurm
group: slurm
mode: u+rw,g-wx,o-rwx
mode: u+rw,g-rwx,o-rwx
become: true
when: slurm_dir is defined
......
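The tightened mode matters because slurmdbd.conf carries database credentials: slurmdbd complains and, in recent versions, refuses to start if the file is readable by group or others, so it should end up effectively 0600. A quick way to confirm the resulting permissions (install path taken from the vars above):

```python
import os
import stat

st = os.stat('/opt/slurm-21.08.3/etc/slurmdbd.conf')
print(oct(stat.S_IMODE(st.st_mode)))  # expect 0o600 after this change
```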
---
- name: copy ssh host keys
  template: src=files/ssh_host_{{ item }}_key dest=/etc/ssh/ssh_host_{{ item }}_key mode=600 owner=root group=root
  become: true
  become_user: root
  with_items:
    - rsa
    - dsa
    - ecdsa
    - ed25519
  register: host_keys

- name: restart sshd
  service: name=sshd state=restarted
  become: true
  become_user: root
  when: host_keys is changed