Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects: hpc-team/HPCasCode, chines/ansible_cluster_in_a_box
Commits on Source (2428)
Showing 1221 additions and 0 deletions
*.swp
*.retry
*-openrc.sh
*~
gc_key.pem
CICD/files/slurm.conf
CICD/files/slurmdbd.conf
CICD/files/ssh_known_hosts
variables:
GIT_SUBMODULE_STRATEGY: recursive
STACKNAME: CICD-${CI_PROJECT_NAME}-gitlabci #-$CI_COMMIT_REF_NAME
NECTAR_ALLOCATION: HPCCICD
ANSIBLE_HOST_KEY_CHECKING: "False"
DEFAULT_PATH: "CICD"
stages:
- lint
- openstack_create
- configure_cluster
- tests
- integration_test #https://docs.gitlab.com/ee/ci/triggers/
- openstack_destroy
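# The trigger_pipeline_in_* jobs below use the GitLab pipeline trigger API to start downstream
# builds (Clusterbuild and monarch) against this commit via TRIGGER_CI_COMMIT_SHA.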
trigger_pipeline_in_Clusterbuild:
stage: integration_test
tags:
- ansible
script:
- echo ${CI_JOB_TOKEN}
- curl --request POST --form token=${CI_JOB_TOKEN} --form "variables[TRIGGER_CI_COMMIT_SHA]=${CI_COMMIT_SHA}" --form ref=master https://gitlab.erc.monash.edu.au/api/v4/projects/193/trigger/pipeline # ID is from clusterbuild
only:
- ${CI_PROJECT_NAME} == 'HPCasCode'
trigger_pipeline_in_monarch:
stage: integration_test
tags:
- ansible
script:
- echo ${CI_JOB_TOKEN}
- curl --request POST --form token=${CI_JOB_TOKEN} --form "variables[TRIGGER_CI_COMMIT_SHA]=${CI_COMMIT_SHA}" --form ref=master https://gitlab.erc.monash.edu.au/api/v4/projects/385/trigger/pipeline # ID is from monarch
only:
- ${CI_PROJECT_NAME} == 'HPCasCode'
yamllint:
stage: lint
allow_failure: true
tags:
- yamllint
script:
- echo "stage yamllint"
- cd CICD
- yamllint -c ./.yamllintheat.yaml ./heat
ansiblelint:
allow_failure: true
stage: lint
tags:
- ansiblelint
script:
- echo "stage ansiblelint"
- cd CICD
- python3 ansiblelint/run_lint.py --targets master_playbook.yml
- python3 ansiblelint/run_lint.py --targets ../qa.yml
- python3 ansiblelint/run_lint.py --targets ../maintenance.yml
remove_infra:
stage: openstack_destroy
image: ubuntu
allow_failure: false
tags:
- heat
before_script:
- cd $DEFAULT_PATH
- echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh
script:
- whoami
- apt update
- apt -y upgrade
- apt -y install openssh-client python3-pip
- pip3 install joblib python-cinderclient python-keystoneclient python-novaclient python-openstackclient PyYAML ansible
- source ./$NECTAR_ALLOCATION-openrc.sh
- echo "Generating a random SSH key for our cluster"
- cd infra
#- ansible-playbook os_delete.yml
build_infra:
stage: openstack_create
image: ubuntu
allow_failure: false
tags:
- heat
before_script:
- cd $DEFAULT_PATH
- echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh
script:
- whoami
- apt update
- apt -y upgrade
- apt -y install openssh-client python3-pip
- pip3 install openstacksdk==0.61.0 joblib python-cinderclient python-keystoneclient python-novaclient python-openstackclient PyYAML ansible
- source ./$NECTAR_ALLOCATION-openrc.sh
- echo "Generating a random SSH key for our cluster"
- ssh-keygen -f tmp_key -N "" -t ed25519
- export SSH_CA=`cat tmp_key.pub`
- cd infra
- python3 ./template.py
- ansible-playbook os_delete.yml
- ansible-playbook os_create.yml
- python3 ../../scripts/make_inventory.py $CI_PROJECT_NAME-ci-$CI_COMMIT_REF_NAME > ../inventory.yml
- cd ..
artifacts:
when: always
paths:
- CICD/inventory.yml
- CICD/tmp_key
- CICD/tmp_key.pub
- CICD/infra/os_vars.yml
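# inventory.yml, the throwaway SSH key and os_vars.yml are exported as artifacts so the
# configure_cluster and tests stages can reach the newly created instances.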
configure_cluster:
stage: configure_cluster
image: ubuntu
tags:
- ansible
artifacts:
when: always
paths:
- CICD/inventory.yml
- CICD/tmp_key
- CICD/tmp_key.pub
- CICD/infra/os_vars.yml
- CICD/files
- CICD/vars
- CICD/master.log.json
- CICD/ssh.cfg
script:
- whoami
- apt update
- apt -y upgrade
- apt -y install openssh-client python3-pip
- pip3 install joblib python-cinderclient python-keystoneclient python-novaclient python-openstackclient PyYAML ansible
- cd CICD
- python3 ./make_files.py ./inventory.yml ./infra/os_vars.yml ./vars/versions.yml
- mkdir -p ~/.ssh
- ssh-keyscan -H `cat ssh.cfg | grep Proxy | cut -f 2 -d "@"` >> ~/.ssh/known_hosts
- eval `ssh-agent`
- ssh-add ./tmp_key
- ssh -vvv `cat ssh.cfg | grep Proxy | cut -f 2 -d "="` exit 0
- export ANSIBLE_HOST_KEY_CHECKING=False
- export ANSIBLE_CONFIG=`pwd`/ansible.cfg
- ansible -i inventory.yml -m ping 'all'
- ansible-playbook -i inventory.yml upgrade_and_reboot.yml || true
- export ANSIBLE_STDOUT_CALLBACK=json
- ansible-playbook -i inventory.yml master_playbook.yml > master.log.json
# I don't think this is necessary any more
#- ansible -i inventory.yml -b -a "systemctl restart slurmdbd" ManagementNodes
#- ansible -i inventory.yml -b -a "systemctl restart slurmctld" ManagementNodes
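# The tests stage below reuses those artifacts (inventory.yml, tmp_key, ssh.cfg) to run service
# checks and the run_tests.sh suites against the cluster.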
tests:
stage: tests
image: ubuntu
tags:
- ansible
before_script:
- whoami
- apt update
- apt -y upgrade
- apt -y install openssh-client python3-pip
- pip3 install joblib python-cinderclient python-keystoneclient python-novaclient python-openstackclient PyYAML ansible
- cd CICD
- mkdir -p ~/.ssh
- ssh-keyscan -H `cat ssh.cfg | grep Proxy | cut -f 2 -d "@"` >> ~/.ssh/known_hosts
- eval `ssh-agent`
- ssh-add ./tmp_key
- export ANSIBLE_HOST_KEY_CHECKING=False
- export ANSIBLE_CONFIG=`pwd`/ansible.cfg
script:
#- ansible-playbook -i inventory.yml ./tests/mockSlurmData.yml
- ansible-playbook -i inventory.yml ./tests/mockSlurmData.yml -e @./vars/slurm.yml
- ansible -B 30 -i inventory.yml -a "/opt/slurm-latest/bin/sinfo" ManagementNodes
- ansible -B 30 -i inventory.yml -a "/opt/slurm-latest/bin/squeue" ManagementNodes
- ansible -B 30 -i inventory.yml -a "/opt/slurm-latest/bin/scontrol ping" LoginNodes
- ansible -B 30 -i inventory.yml -a "systemctl is-active --quiet ntpd" CentosNodes
- ansible -B 30 -i inventory.yml -a "systemctl is-active --quiet ntp" UbuntuNodes
- ansible -B 30 -i inventory.yml -a "systemctl is-active --quiet mariadb" SQLNodes
- ansible -B 30 -i inventory.yml -a "systemctl is-active --quiet slurmctld" ManagementNodes
- ansible -B 30 -i inventory.yml -a "systemctl is-active --quiet slurmdbd" ManagementNodes
- bash -e ./tests/run_tests.sh all "inventory.yml"
- bash -e ./tests/run_tests.sh ComputeNodes "inventory.yml"
- bash -e ./tests/run_tests.sh LoginNodes "inventory.yml"
- bash -e ./tests/run_tests.sh ManagementNodes "inventory.yml"
- bash -e ./tests/run_tests.sh NFSNodes "inventory.yml"
- bash -e ./tests/run_tests.sh SQLNodes "inventory.yml"
# Note to self: deactivated because it is broken. please fix it again - bash -e ./tests/run_tests.sh slurm "files/inventory.$STACKNAME" "../gc_key.pem"
#- ansible -i inventory.yml -a 'sudo su - user1 -c whoami' LoginNodes,ComputeNodes # to test ldap # CH deactivated because this user doesn't exist on a default CI pipeline cluster
#- sshpass -p 'redhat' ssh -o StrictHostKeyChecking=no user1@server.example.com
# copied from luhan
extends: default
rules:
braces:
level: warning
max-spaces-inside: 1
brackets:
level: warning
max-spaces-inside: 1
colons:
level: warning
commas:
level: warning
comments: disable
comments-indentation: disable
document-start: disable
empty-lines:
level: warning
hyphens:
level: warning
indentation:
level: warning
indent-sequences: consistent
line-length:
level: warning
allow-non-breakable-inline-mappings: true
truthy: disable
trailing-spaces:
level: warning
---
extends: default
rules:
braces: {min-spaces-inside: 0, max-spaces-inside: 1}
brackets: {min-spaces-inside: 0, max-spaces-inside: 1}
comments: disable
comments-indentation: disable
document-start: disable
indentation: disable
line-length: disable
[defaults]
#remote_tmp = /tmp/.ansible/tmp
host_key_checking = False
pipelining=True
log_path = ./ansible.log
display_args_to_stdout = True
roles_path = HPCasCode/roles
files_path = ./files
forks=25
[ssh_connection]
ssh_args = -F ./ssh.cfg
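# ssh.cfg is generated by make_files.py (make_ssh_cfg) and routes Ansible's SSH connections
# through the bastion host.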
logdir/*
import yaml
from argparse import ArgumentParser
import subprocess
from pathlib import Path
import re
import sys
import os
from collections import defaultdict


def parse_argument():
    parser = ArgumentParser("ansible lint runner with customized spec")
    parser.add_argument('--targets', type=str, nargs='*',
                        help="path to roles or playbook targets")
    parser.add_argument('--logdir', type=Path, default=Path(__file__ + '/../logdir').resolve(), nargs='?',
                        help='log directory, defaults to ./ansiblelint/logdir')
    args = parser.parse_args()
    args.logdir.mkdir(exist_ok=True)
    return args


def parse_rule_output(line):
    # (filepath, line, rule, severity, rule_desc)
    expression = r'(.*\.yml):([0-9]+): \[(.*)\] \[(.*)\] (.*$)'
    matched = re.match(expression, line)
    return matched.groups()


def group_by(output, idx):
    res = defaultdict(list)
    for i in output:
        res[i[idx]].append(i)
    return res


cmd_template = "ansible-lint --parseable-severity --nocolor "
outputs = defaultdict()


def main():
    exit_code = 0
    args = parse_argument()
    for item in args.logdir.iterdir():
        item.unlink()
    cmd = cmd_template
    if args.targets is not None:
        cmd += ' ' + ' '.join(args.targets)
    else:
        rolenames = [str(i.resolve())
                     for i in Path(__file__ + '/../../plays/roles').resolve().iterdir() if i.is_dir()]
        cmd += ' ' + ' '.join(rolenames)
    logfile = args.logdir.joinpath('logfile')
    cmd += ' 2>&1 | tee {}'.format(str(logfile.resolve()))
    output = subprocess.check_output(cmd, shell=True)
    print(output.decode())
    output = output.decode().splitlines()
    output = [parse_rule_output(line) for line in output]
    # group by severity
    output = group_by(output, 3)
    for k, v in output.items():
        if k == 'VERY_HIGH' and len(v) != 0:
            exit_code = 1
        current_log = args.logdir.joinpath(k).resolve()
        with current_log.open(mode='w') as f:
            f.writelines(['filepath\tline\trule\tseverity\trule description\n'])
            f.writelines(['\t'.join(list(i)) + '\n' for i in v])
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
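# Invoked from the lint stage, e.g.:
#   python3 ansiblelint/run_lint.py --targets master_playbook.yml
# Exits non-zero only when rules of severity VERY_HIGH are reported.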
---
# https://docs.ansible.com/ansible-lint/rules/default_rules.html
error:
- 101
- 102
- 103
- 104
- 202
- 304
- 306
- 401
- 402
- 403
- 404
- 501
- 502
- 701
warning:
- 105
- 201
- 203
- 204
- 205
- 206
- 301
- 302
- 303
- 305
- 503
- 504
- 601
- 602
- 702
- 703
- 704
nhc.conf
ssh_known_hosts
slurm.conf
slurmdbd.conf
etcHosts
inventory.*
---
- name: load vars
hosts: localhost
tasks:
- include_vars: os_vars.yml
tags: always
- name: create network
hosts: localhost
tasks:
- name: network
openstack.cloud.network:
state: present
name: "{{ clustername }}-network"
register: network
- name: debug network
debug:
var: network
- openstack.cloud.subnet:
state: present
name: "{{ clustername }}-subnet"
network_name: "{{ network.network.name }}"
ip_version: 4
cidr: 192.168.0.0/24
dns_nameservers:
- 8.8.8.7
- 8.8.8.8
register: subnet
- openstack.cloud.router:
state: present
name: "{{ clustername }}-router"
network: "{{ ext_network }}"
interfaces:
- "{{ clustername }}-subnet"
- openstack.cloud.security_group:
state: present
name: "{{ clustername }}-secgroup"
register: secgroup
- name: debug secgroup
debug:
var: secgroup
- openstack.cloud.security_group_rule:
security_group: "{{ clustername}}-secgroup"
remote_group: "{{ clustername}}-secgroup"
protocol: "tcp"
- openstack.cloud.security_group_rule:
security_group: "{{ clustername}}-secgroup"
remote_group: "{{ clustername}}-secgroup"
protocol: "icmp"
- openstack.cloud.security_group_rule:
security_group: "{{ clustername}}-secgroup"
remote_ip_prefix: "0.0.0.0/0"
protocol: "tcp"
port_range_min: 22
port_range_max: 22
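# Together these rules allow all TCP and ICMP traffic between members of
# {{ clustername }}-secgroup, plus SSH (port 22) from anywhere.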
- name: debug subnet
debug:
var: subnet
- name: create bastion node
hosts: localhost
tasks:
- name: bastionnodes
openstack.cloud.server:
state: present
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
- "{{ clustername}}-secgroup"
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "BastionNodes" ]'
nics:
- net-id: "{{ network.network.id }}"
userdata: "{{ os_userdata }}"
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername}}-bastion0", flavor: t3.xsmall }
- name: create login nodes
hosts: localhost
tasks:
- name: Loginnodes
openstack.cloud.server:
state: present
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
- "{{ clustername}}-secgroup"
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "LoginNodes", "ManagementNodes", "LdapNodes" ]'
nics:
- net-id: "{{ network.network.id }}"
userdata: "{{ os_userdata }}"
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername }}-login0", flavor: m3.small }
- { vmname: "{{ clustername }}-login1", flavor: m3.small }
- name: create volumes for slurmstate and homedir
hosts: localhost
tasks:
- name: slurm volume
openstack.cloud.volume:
state: present
availability_zone: "{{ availability_zone }}"
size: 2
display_name: "{{clustername}}_slurm_state"
register: slurm_state_volume
- name: slurm volume
openstack.cloud.volume:
state: present
availability_zone: "{{ availability_zone }}"
size: 10
display_name: "{{ clustername }}_userdata"
register: user_volume
- name: debug slurm_state
debug:
var: slurm_state_volume
- name: create sql node
hosts: localhost
tasks:
- name: SQLnodes
openstack.cloud.server:
state: present
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
- "{{ clustername}}-secgroup"
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "SQLNodes", "NFSNodes" ]'
nics:
- net-id: "{{ network.network.id }}"
userdata: "{{ os_userdata }}"
name: "{{ item.vmname }}"
volumes: [ "{{ slurm_state_volume.volume.id }}", "{{ user_volume.volume.id }}"]
loop:
- { vmname: "{{ clustername }}-sql0", flavor: m3.small }
- name: create compute nodes
openstack.cloud.server:
state: present
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
- "{{ clustername}}-secgroup"
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "ComputeNodes" ]'
nics:
- net-id: "{{ network.network.id }}"
userdata: "{{ os_userdata }}"
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername }}-node00", flavor: m3.small }
- { vmname: "{{ clustername }}-node01", flavor: m3.small }
- name: attach floating ip
openstack.cloud.floating_ip:
state: present
server: "{{ clustername }}-bastion0"
network: "{{ ext_network }}"
reuse: true
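# Only the bastion receives a floating IP; all other instances are created with auto_ip: false
# and are reached through the bastion (the generated ssh.cfg proxies connections via its external IP).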
---
- name: load vars
hosts: localhost
tasks:
- include_vars: os_vars.yml
tags: always
- name: create bastion node
hosts: localhost
tasks:
- name: bastionnodes
openstack.cloud.server:
state: absent
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "BastionNodes" ]'
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername}}-bastion0", flavor: t3.xsmall }
- name: create login nodes
hosts: localhost
tasks:
- name: Loginnodes
openstack.cloud.server:
state: absent
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "LoginNodes", "ManagementNodes", "LdapNodes" ]'
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername }}-login0", flavor: t3.medium }
- { vmname: "{{ clustername }}-login1", flavor: t3.medium }
- name: create sql node
hosts: localhost
tasks:
- name: SQLnodes
openstack.cloud.server:
state: absent
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "SQLNodes", "NFSNodes" ]'
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername }}-sql0", flavor: t3.medium }
- name: create volumes for slurmstate and homedir
hosts: localhost
tasks:
- name: slurm volume
openstack.cloud.volume:
state: absent
availability_zone: "{{ availability_zone }}"
size: 2
display_name: "{{clustername}}_slurm_state"
- name: slurm volume
openstack.cloud.volume:
state: absent
availability_zone: "{{ availability_zone }}"
size: 10
display_name: "{{clustername}}_userdata"
register: user_volume
- name: create compute nodes
openstack.cloud.server:
state: absent
timeout: 600
availability_zone: "{{ availability_zone }}"
image: "{{ image }}"
flavor: "{{ item.flavor }}"
auto_ip: false
security_groups:
- default
meta:
clustername: "{{ clustername }}"
ansible_user: ubuntu
ansible_host_groups: '[ "ComputeNodes" ]'
name: "{{ item.vmname }}"
loop:
- { vmname: "{{ clustername }}-node00", flavor: t3.medium }
- { vmname: "{{ clustername }}-node01", flavor: t3.medium }
- openstack.cloud.router:
state: absent
name: "{{ clustername }}-router"
- openstack.cloud.subnet:
state: absent
name: "{{ clustername }}-subnet"
- openstack.cloud.security_group:
state: absent
name: "{{ clustername }}-secgroup"
- name: create network
hosts: localhost
tasks:
- name: network
openstack.cloud.network:
state: absent
name: "{{ clustername }}-network"
register: network
- name: debug network
debug:
var: network
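# Note: the play and task names above still say "create ...", but every resource in this playbook
# is declared with state: absent, so running it deletes the cluster.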
---
# variables for openstack
clustername: "{{ clustername }}"
image: 356ff1ed-5960-4ac2-96a1-0c0198e6a999
availability_zone: monash-02
ext_network: monash
# The line that says #cloud-config is not a COMMENT, it's part of a multiline string sent to OpenStack
os_userdata: |
#cloud-config
users:
- default
- name: ubuntu
ssh_authorized_keys: '{{ ssh_ca }}'
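# ssh_ca is substituted by template.py with the public half of the throwaway tmp_key generated
# in the build_infra job, so the ubuntu user on every instance accepts that key.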
import os
import jinja2

ref_name = os.environ.get('CI_COMMIT_REF_NAME')
git_name = os.environ.get('CI_PROJECT_NAME')
ssh_ca = os.environ.get('SSH_CA')
cluster_name = "{}-ci-{}".format(git_name, ref_name)

with open('os_vars.yml.j2', 'r') as f:
    template = jinja2.Template(f.read())
with open('os_vars.yml', 'w') as f:
    f.write(template.render(clustername=cluster_name, ssh_ca=ssh_ca))
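# template.py runs from CICD/infra during the build_infra job, where CI_COMMIT_REF_NAME,
# CI_PROJECT_NAME and SSH_CA (the tmp_key public key) have already been exported.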
import os
def init_ssh_keys(inventory):
import subprocess
# Run ssh-keygen to generate host keys for login and compute nodes
key_types = ['rsa','dsa','ecdsa','ed25519']
#groups = list(filter(lambda x: x != 'hostvars', inventory['all']['children'].keys()))
groups = ['host']
fp={}
for t in key_types:
fp[t]={}
for g in groups:
output = 'files/ssh_{}_{}_key'.format(g,t)
if not os.path.exists(output):
p = subprocess.call(['ssh-keygen','-N','','-f',output,'-t',t])
fp[t][g] = subprocess.check_output(['cat','{}.pub'.format(output)])
return fp
def init_passwords():
import yaml
def gen_password(n=32): # Munge key needs a minimum 32 bytes
import string
import random
characters = string.ascii_letters + string.digits
return ''.join(random.choice(characters) for i in range(n))
required_passwords = [
"mungekey",
"slurmdb_passwd",
"default_user_password",
"influxdb_password",
]
passwords = {}
for p in required_passwords:
passwords[p] = gen_password()
outputfile = 'vars/passwords.yml'
if os.path.exists(outputfile):
return
with open(outputfile,'w') as f:
f.write(yaml.dump(passwords))
def make_hosts(inventory):
# create a file to template /etc/hosts with each nodes IP address
hostsdata = """
127.0.0.1 localhost
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
"""
for h in inventory['all']['children']['hostvars']['hosts'].items():
hostsdata = hostsdata + "{} {}\n".format(h[1]['ansible_host'],h[0])
outputfile = 'files/etcHosts'
with open(outputfile,'w') as f:
f.write(hostsdata)
def make_vars_filesystems(choices, inventory):
import jinja2
data = {}
data['clustername'] = choices['clustername']
data['domain'] = "{}.org.au".format(choices['clustername'])
clustername = choices['clustername']
# ATM we have only one NFS server, and its hostname ends in -sql because it does double duty as the SQL server for slurm accounting logs
nfsserver = "{}-sql0".format(choices['clustername'])
#for vol in ['userdata','slurm_state', 'userdata2']:
data['exports']=[]
data['disks'] = {}
for (name,value) in inventory['all']['children']['hostvars']['hosts'][nfsserver]['ansible_host_volumes'].items():
try:
clusternamelen = len(choices['clustername'])
vol = name[clusternamelen+1:]
data['{}_disk'.format(vol)] = inventory['all']['children']['hostvars']['hosts'][nfsserver]['ansible_host_volumes']["{}_{}".format(clustername,vol)]['dev']
data['exports'].append(vol)
data['disks'][vol] = inventory['all']['children']['hostvars']['hosts'][nfsserver]['ansible_host_volumes']["{}_{}".format(clustername,vol)]['dev']
except Exception as e:
import traceback
print(traceback.format_exc())
print(e)
#pass
with open('pre_templates/filesystems_yml.j2') as f:
template = jinja2.Template(f.read())
with open('vars/filesystems.yml','w') as f:
f.write(template.render(**data))
def make_ssh_cfg(choices, inventory):
import jinja2
#data = choices | inventory
data = { **choices, **inventory }
bastion_server = "{}-bastion0".format(choices['clustername'])
ansible_user = inventory['all']['children']['hostvars']['hosts'][bastion_server]['ansible_user']
bastion_floating_ip = inventory['all']['children']['hostvars']['hosts'][bastion_server]['ext_ip']
data['ansible_user'] = ansible_user
data['bastion_floating_ip'] = bastion_floating_ip
with open('pre_templates/ssh_cfg.j2') as f:
template = jinja2.Template(f.read())
with open('ssh.cfg','w') as f:
f.write(template.render(**data))
def make_nfs_exports(inventory):
# Generate an /etc/exports file for our NFS server to allow compute nodes to connect
import yaml
srvopts = "rw,no_root_squash"
with open('vars/filesystems.yml','r') as f:
fs = yaml.safe_load(f.read())
for host in fs['nfsexports']:
with open('files/etcExports','w') as f:
for e in host['exportList']:
f.write('{} {}({})\n'.format(e['src'],'192.168.0.0/24',srvopts))
def groups_from_inventory(inventory):
groups = {}
for g in inventory['all']['children'].items():
groups[g[0]] = g[1]['hosts'].keys()
return groups
def make_job_container_conf(choices, inventory):
import jinja2
with open('pre_templates/job_container.conf.j2') as f:
template = jinja2.Template(f.read())
with open('files/job_container.conf','w') as f:
f.write(template.render())
def make_cgroup_conf(choices, inventory):
import jinja2
with open('pre_templates/cgroup.conf.j2') as f:
template = jinja2.Template(f.read())
with open('files/cgroup.conf','w') as f:
f.write(template.render())
with open('files/cgroup.conf.j2','w') as f:
f.write(template.render())
def make_slurm_config(choices, inventory):
# Generate a slurm.conf from template
# should include putting the compute nodes in the correct place
# and listing the mgmt nodes as slurm controllers
import jinja2
import yaml
groups = groups_from_inventory(inventory)
mgmtnodes = list(inventory['all']['children']['ManagementNodes']['hosts'].keys())
sqlnodes = list(inventory['all']['children']['SQLNodes']['hosts'].keys())
mgmtnodes.sort()
choices['controller']=mgmtnodes[0]
choices['backup']=mgmtnodes[1]
choices['sqlnode']=sqlnodes[0]
choices['domain'] = "{}.org.au".format(choices['clustername'])
with open('pre_templates/slurm_vars.j2') as f:
template = jinja2.Template(f.read())
with open('vars/slurm.yml','w') as f:
f.write(template.render(**choices))
with open('vars/slurm.yml') as f:
slurmvars = yaml.safe_load(f.read())
slurmvars['groups'] = groups
#slurmvars = slurmvars | choices
slurmvars = { **slurmvars, **choices }
with open('pre_templates/slurm.conf.j2') as f:
template = jinja2.Template(f.read())
with open('files/slurm.conf','w') as f:
f.write(template.render(**slurmvars))
with open('pre_templates/slurmdbd.conf.j2') as f:
template = jinja2.Template(f.read())
with open('files/slurmdbd.conf','w') as f:
f.write(template.render(**slurmvars))
def make_nhc_config(inventory):
import jinja2
with open('pre_templates/nhc.conf.j2') as f:
template = jinja2.Template(f.read())
with open('files/nhc.conf','w') as f:
f.write(template.render())
def make_known_hosts(inventory, fp, clustername):
with open('files/ssh_known_hosts','w') as f:
for t in fp.keys():
f.write("{}* {}".format(clustername, fp[t]['host'].decode()))
f.write("localhost {}".format(fp[t]['host'].decode()))
def make_gres_conf(inventory):
with open('files/gres.conf','w') as f:
f.write('\n')
def make_vars_vars(choices, inventory):
import jinja2
groups = groups_from_inventory(inventory)
mgmtnodes = list(inventory['all']['children']['ManagementNodes']['hosts'].keys())
sqlnodes = list(inventory['all']['children']['SQLNodes']['hosts'].keys())
mgmtnodes.sort()
choices['controller']=mgmtnodes[0]
choices['backup']=mgmtnodes[1]
choices['sqlnode']=sqlnodes[0]
with open('pre_templates/vars_yml.j2') as f:
template = jinja2.Template(f.read())
with open('vars/vars.yml','w') as f:
f.write(template.render(**choices))
with open('vars/vars_centos79.yml','w') as f:
f.write('# empty file for compatibility with HPCasCode\n')
def init_cluster(inventory, clustername):
#
# These values should not change, but they are chosen at random, so only do it once
#
fp = init_ssh_keys(inventory)
make_known_hosts(inventory, fp, clustername)
init_passwords()
def derive_ansible_constants(choices, inventory):
#
# These values are derived from the inventory and various choices made about the cluster
# No matter how many times you run this it won't change
#
make_hosts(inventory)
make_slurm_config(choices, inventory)
make_job_container_conf(choices, inventory)
make_cgroup_conf(choices, inventory)
make_vars_filesystems(choices, inventory)
make_nfs_exports(inventory)
#make_known_hosts(choices, inventory, fp)
make_nhc_config(inventory)
make_gres_conf(inventory)
make_vars_vars(choices, inventory)
make_ssh_cfg(choices, inventory)
def main():
import sys
import yaml
with open(sys.argv[1]) as f:
inventory = yaml.safe_load(f.read())
with open(sys.argv[2]) as f:
choices = yaml.safe_load(f.read())
with open(sys.argv[3]) as f:
versions = yaml.safe_load(f.read())
choices = { **choices, **versions }
clustername = choices['clustername']
init_cluster(inventory, clustername)
with open('vars/passwords.yml') as f:
passwords = yaml.safe_load(f.read())
choices = { **choices, **passwords }
derive_ansible_constants(choices, inventory)
if __name__ == "__main__":
main()
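# Invoked by the configure_cluster job as:
#   python3 ./make_files.py ./inventory.yml ./infra/os_vars.yml ./vars/versions.yml
# It populates files/ and vars/ (slurm.conf, slurmdbd.conf, etcHosts, ssh.cfg, passwords, ...)
# for master_playbook.yml.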
---
- import_playbook: plays/allnodes.yml
tags: [allnodes]
- import_playbook: plays/filesystems.yml
tags: [filesystems]
- import_playbook: plays/nfssqlnodes.yml
tags: [nfssql]
- import_playbook: plays/mgmtnodes.yml
tags: [mgmtnodesplaybook]
- import_playbook: plays/computenodes.yml
tags: [computenodesplaybook]
- import_playbook: plays/loginnodes.yml
tags: [loginnodesplaybook]
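# master_playbook.yml only chains the per-node-group playbooks; individual parts can be re-run
# selectively via the tags listed above.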
- hosts: 'all'
vars_files:
- vars/passwords.yml
# - vars/ldapConfig.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
tasks:
- { name: set use shared state, set_fact: usesharedstatedir=False }
- { name: set hostgroup, set_fact: hostgroup='ComputeNodes' }
tags: [ always ]
- hosts: 'all'
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
- vars/vars_centos79.yml
strategy: free
roles:
# - { role: disable_selinux, tags: [ disableselinux ] }
- { role: etcHosts, tags: [ networking ] }
# - { role: config_repos, tags: [ repos ] }
# - { role: upgrade, tags: [ upgrade ]}
- { role: logrotate, tags: [ logrotate, other ] }
- hosts: 'DesktopNodes,ComputeNodes,LoginNodes,ManagementNodes'
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
strategy: free
roles:
- { role: disable_selinux, tags: [ disableselinux ] }
- { role: ssh-password-login, tags: [ authentication ] }
- { role: enable_sudo_group, tags: [ authentication, sudo ] }
- { role: move_homedir }
- { role: ssh_host_keys, tags: [ ssh ] }
- { role: SSHKnownHosts, tags: [ ssh, known_hosts ] }
- hosts: 'DesktopNodes,ComputeNodes,LoginNodes,VisNodes'
gather_facts: True
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
tasks:
- include_vars: vars/passwords.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- include_vars: vars/versions.yml
#- include_vars: vars/c7packages.yml
- { name: set use shared state, set_fact: usesharedstatedir=False }
tags: [ always ]
- hosts: ComputeNodes
vars_files:
- ./vars/slurm.yml
- ./vars/versions.yml
roles:
- { role: nvidia_mig_tools, tags: [ nvidia_mig] }
- { role: nvidia_mig_configure, mig_config: mig_config.yml, mig_setting: all-cryosparc, tags: [ nvidia_mig] }
- hosts: 'DesktopNodes,ComputeNodes,LoginNodes'
gather_facts: False
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
strategy: free
roles:
- { role: move_homedir, tags: [ authentication, filesystems ] }
# - { role: nfs-client, nfsMounts: "{{ computeNfsMounts }}", tags: [ filesystems ] }
- { role: slurm-common, tags: [ slurm, slurm-common ] }
#- { role: lmod, tags: [ other ] } # actually preferred on ubuntu but mutually exclusive with environment-modules
- { role: enable_modules, default_modules: "modulecmd", tags: [ other ] }
# - { role: postfix, tags: [ mail, other ] }
- { role: set_semaphore_count, tags: [ semaphore ] }
# - { role: ldapclient, ssl: false, tags: [ ldapclient ] }
# - { role: rsyslog_client, tags: [ syslog ] }
- { role: ssh-keepalive, tags: [ ssh ] }
- { role: enable_sudo_group, tags: [ authentication ] }
- hosts: 'VisNodes'
gather_facts: False
vars_files:
- vars/vars.yml
- vars/versions.yml
roles:
- { role: gpu, tags: [ gpu ] }
- hosts: 'DesktopNodes,ComputeNodes,LoginNodes'
gather_facts: False
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
roles:
- { role: slurm_config, tags: [slurm, slurm_config] }
- hosts: 'VisNodes'
tasks:
- { name: set cuda monitoring, set_fact: cudamonitor=true }
tags: [ always ]
- hosts: 'ComputeNodes,DesktopNodes'
vars_files:
- vars/slurm.yml
roles:
- { role: slurm_config, tags: [ slurm_config, slurm ] }
- { role: nhc, tags: [ nhc, slurm ] }
- { role: slurm-start, start_slurmd: True, tags: [ slurm, slurm-start ] }
# - { role: jasons_ssh_ca, tags: [ other ] }
#- { role: extra_packages, tags: [ other, extra_packages ] } # commented because it takes forever! good enough if this gets tested on clusterbuild
- { role: pam_sshd, computenodepam: true, tags: [ authentication, pamd ] }
- { role: user_ssh_ca, tags: [ authentication ] }
- { role: additional_paths }
- hosts: 'VisNodes'
roles:
- { role: systemd-nvidia-uvm, tags: [ uvm,SiteSpecific ] }
- hosts: 'VisNodes'
roles:
- { role: deploy-xorg, tags: [ deploy-xorg ] }
- hosts: 'LoginNodes'
roles:
- { role: pam_sshd, tags: [ authentication, pamd ], when: ansible_os_family == 'RedHat' }
- { role: user_ssh_ca, tags: [ authentication ] }
- { role: additional_paths }
../files
- hosts: 'all'
tasks:
- include_vars: vars/filesystems.yml
- name: Ubuntu quota packages
block:
- name: install packages
package:
state: present
name: "{{ item }}"
become: true
with_items:
- quota
- quotatool
- linux-image-extra-virtual
register: installed
- name: enable service
service:
name: quotarpc
state: started
enabled: yes
become: true
- name: reboot server
reboot: {}
when: installed.changed
become: true
when: ansible_distribution == 'Ubuntu'
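# The reboot above only runs when the quota packages were freshly installed (installed.changed),
# so already-configured Ubuntu nodes are not rebooted on re-runs.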
- hosts: 'all'
tasks:
- filesystem:
fstype: ext4
dev: "{{ volume.dev }}"
when: volume.host == inventory_hostname
loop: "{{ volumes }}"
loop_control:
loop_var: "volume"
become: true
- mount:
fstype: ext4
src: "{{ volume.dev }}"
state: mounted
opts: usrquota,grpquota
path: "{{ volume.mnt }}"
when: volume.host == inventory_hostname
loop: "{{ volumes }}"
loop_control:
loop_var: "volume"
become: true
- hosts: 'all'
tasks:
- include_role:
name: nfs-server
vars:
exportList: "{{ export.exportList }}"
when: export.host == inventory_hostname
loop: "{{ nfsexports }}"
loop_control:
loop_var: "export"
- hosts: 'all'
tasks:
- include_role:
name: nfs-client
vars:
nfsMounts: "{{ mount.nfsMounts }}"
when: mount.group in group_names
loop: "{{ nfsmounts }}"
loop_control:
loop_var: "mount"
---
- hosts: 'all'
tasks:
- include_vars: vars/passwords.yml
- include_vars: vars/names.yml
- include_vars: vars/ldapConfig.yml
- include_vars: vars/filesystems.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- hosts: 'all'
tasks:
- { name: setup, setup: }
- hosts: 'ManagementNodes'
roles:
- { role: calculateSlurmConf }