diff --git a/.gitignore b/.gitignore index ab476012443e79ef3aa8cd439ce730fd763f39f8..4dadd58568931d59eb760aa69aaeeb2ac8992fca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ +*.swp *.retry +*-openrc.sh +gc_key.pem +CICD/files/slurm.conf +CICD/files/slurmdbd.conf +CICD/files/ssh_known_hosts diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..18c7d93c31bcf075157165f0bda32eddfe6074a5 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,220 @@ +variables: + GIT_SUBMODULE_STRATEGY: recursive + STACKNAME: CICD_reporef$CI_COMMIT_REF_NAME + NECTAR_ALLOCATION: HPCCICD + ANSIBLE_HOST_KEY_CHECKING: "False" + +stages: +# - integration_test_downstream # working but unwanted here +# - trigger_pipeline_in_B # working but unwanted here + - lint + #- delete_stack_manual + - extended + #- heat_test + - heat + - ansible_create_cluster_stage + - push_button_spawn_cluster +# - e2e + - tests + - clean # manually delete stack + + + +#trigger_pipeline_in_B: +# stage: integration_test_downstream +# tags: +# - ansible +# script: +# - "curl --request POST --form token=${CI_JOB_TOKEN} --form ref=master https://gitlab.erc.monash.edu.au/api/v4/projects/1085/trigger/pipeline" # ID is from pysshauthz + +# heat_test: +# stage: heat_test +# allow_failure: false +# tags: +# - heat +# before_script: +# - echo "$GC_KEY" > gc_key.pem +# - chmod 400 gc_key.pem +# - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh +# - source ./$NECTAR_ALLOCATION-openrc.sh +# - export HEAT_TEST_STACKNAME=_TESTING_HEAT +# - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $HEAT_TEST_STACKNAME +# - sleep 60 +# script: +# - echo "heat_test stage" +# - source ./$NECTAR_ALLOCATION-openrc.sh +# - bash -x ./CICD/heat/heatcicdwrapper.sh create $HEAT_TEST_STACKNAME +# - openstack stack list +# - bash -x ./CICD/heat/heatcicdwrapper.sh update $HEAT_TEST_STACKNAME +# - openstack stack list +# - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $HEAT_TEST_STACKNAME +# - openstack stack list +# after_script: +# - sleep 20 # artifically wait a bit to make sure it is really dead +# when: manual + +yamllint: + stage: lint + allow_failure: true + tags: + - yamllint + script: + - echo "stage yamllint" + - cd CICD + # - ansible-lint -c .yamllintconf.yaml -x ANSIBLE0002 master_playbook.yml + - yamllint -c ./.yamllintheat.yaml ./heat + +# delete_stack_manual: +# stage: delete_stack_manual +# tags: +# - heat +# before_script: +# - echo "$GC_KEY" > gc_key.pem +# - chmod 400 gc_key.pem +# - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh +# script: +# - echo "heat stage" +# - source ./$NECTAR_ALLOCATION-openrc.sh +# - openstack stack list +# - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $STACKNAME +# when: manual + +ansiblelint: + allow_failure: true + stage: lint + tags: + - ansiblelint + script: + - echo "stage ansiblelint" + - cd CICD + - python3 ansiblelint/run_lint.py --targets master_playbook.yml + + +build_cluster_cicd: + stage: heat + allow_failure: false + tags: + - heat + before_script: + - echo "$GC_KEY" > gc_key.pem + - chmod 400 gc_key.pem + - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh + script: + - echo "heat stage" + - source ./$NECTAR_ALLOCATION-openrc.sh + - openstack stack list + - bash -x ./CICD/heat/heatcicdwrapper.sh create_or_update $STACKNAME + after_script: + - sleep 20 # artifically wait a bit to give the nodes time to boot +# only: +# changes: #https://docs.gitlab.com/ee/ci/yaml/#onlychangesexceptchanges +# - "heat/*HOT*.yaml" +# 
- schedules +# - ./.gitlab-ci.yml + +ansible_create_cluster_stage: + stage: ansible_create_cluster_stage + tags: + - ansible + before_script: + - echo "$GC_KEY" > gc_key.pem + - chmod 400 gc_key.pem + - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh + script: + - echo "ansible_create_cluster_stage" + - bash -x ./CICD/ansible_create_cluster_script.sh + #after_script: + #- rm ./files/inventory.$STACKNAME + #only: + # changes: #https://docs.gitlab.com/ee/ci/yaml/#onlychangesexceptchanges + # - "master_playbook.yml" + # - "vars/*.{yml,yaml}" + # - schedules + # - CICD/.gitlab-ci.yml + +tests: + stage: tests + tags: + - ansible + before_script: + - echo "$GC_KEY" > gc_key.pem + - chmod 400 gc_key.pem + - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh + script: + - echo "tests stage" + - source ./$NECTAR_ALLOCATION-openrc.sh + - openstack stack list + - cd CICD + - python3 ../scripts/make_inventory.py static $STACKNAME | tee ./files/inventory.$STACKNAME && chmod 755 ./files/inventory.$STACKNAME + - grep -qv "I could not find any resouces tagged with project_name:" ./files/inventory.$STACKNAME #fail if inventory file is empty + - ansible -m ping -i files/inventory.$STACKNAME --key-file ../gc_key.pem all + - ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sudo ls" all + + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sinfo" ManagementNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "squeue" ManagementNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet mariadb" SQLNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet slurmctld" ManagementNodes + - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet slurmdbd" ManagementNodes + + - bash -e ./tests/run_tests.sh all "files/inventory.$STACKNAME" "../gc_key.pem" + - bash -e ./tests/run_tests.sh ComputeNodes "files/inventory.$STACKNAME" "../gc_key.pem" + - bash -e ./tests/run_tests.sh LoginNodes "files/inventory.$STACKNAME" "../gc_key.pem" + - bash -e ./tests/run_tests.sh ManagementNodes "files/inventory.$STACKNAME" "../gc_key.pem" + - bash -e ./tests/run_tests.sh NFSNodes "files/inventory.$STACKNAME" "../gc_key.pem" + - bash -e ./tests/run_tests.sh SQLNodes "files/inventory.$STACKNAME" "../gc_key.pem" + + # licensing https://gitlab.erc.monash.edu.au/hpc-team/license_server/tree/master/roles/avizo_license_monitor + +manual_cluster_spawn: + stage: push_button_spawn_cluster + tags: + - heat + - ansible + before_script: + - echo "press button spawn cluster." 
+ - echo "for this to work you have to provide a variable called manual stackname" + - echo I still need to handle os password + - echo $MANUAL_STACKNAME + - echo "$GC_KEY" > gc_key.pem + - chmod 400 gc_key.pem + - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh + script: + - source ./$NECTAR_ALLOCATION-openrc.sh + - bash -x ./CICD/heat/heatcicdwrapper.sh create $MANUAL_STACKNAME + - openstack stack list + - export STACKNAME=$MANUAL_STACKNAME + - sleep 25 + - bash -x CICD/ansible_create_cluster_script.sh + when: manual + only: + refs: + - "cicd" + +extended: + stage: extended + tags: + - heat + - ansible + before_script: + - echo "cleanup stack" + - sleep 30 + - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh + script: + - source ./$NECTAR_ALLOCATION-openrc.sh + - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $STACKNAME + only: + variables: + - $EXTENDED != null + +clean: + stage: clean + tags: + - heat + before_script: + - echo "cleanup stack" + - sleep 30 + - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh + script: + - source ./$NECTAR_ALLOCATION-openrc.sh + - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $STACKNAME + #when: manual \ No newline at end of file diff --git a/CICD/ansible_create_cluster_script.sh b/CICD/ansible_create_cluster_script.sh new file mode 100755 index 0000000000000000000000000000000000000000..b062d4f21e71371698683164a1c81a9ef40a39b2 --- /dev/null +++ b/CICD/ansible_create_cluster_script.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +export ANSIBLE_HOST_KEY_CHECKING=False + +source ./$NECTAR_ALLOCATION-openrc.sh +openstack stack list + +cd CICD + +python3 ../scripts/make_inventory.py static $STACKNAME | tee ./files/inventory.$STACKNAME && chmod 755 ./files/inventory.$STACKNAME +grep -qv "I could not find any resouces tagged with project_name:" ./files/inventory.$STACKNAME #fail if inventory file is empty +ansible -m ping -i files/inventory.$STACKNAME --key-file ../gc_key.pem all +ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sudo ls" all + +#cd roles + #- "egrep -lRZ 'sudo: true' . | xargs -0 -l sed -i -e 's/sudo: true/become: true/g' " +#cd .. 
+ansible-playbook -i files/inventory.$STACKNAME --key-file ../gc_key.pem master_playbook.yml +sleep 15 +echo uglyuglyfix +ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -b -a "systemctl restart slurmdbd" ManagementNodes \ No newline at end of file diff --git a/CICD/ansiblelint/.gitignore b/CICD/ansiblelint/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..34d524d13cda88e46754f4ad563d95a135d94748 --- /dev/null +++ b/CICD/ansiblelint/.gitignore @@ -0,0 +1 @@ +logdir/* diff --git a/CICD/ansiblelint/run_lint.py b/CICD/ansiblelint/run_lint.py new file mode 100644 index 0000000000000000000000000000000000000000..cea3e7d6ae52e212c3ee0b151e58b0cdf361af1f --- /dev/null +++ b/CICD/ansiblelint/run_lint.py @@ -0,0 +1,72 @@ +import yaml +from argparse import ArgumentParser +import subprocess +from pathlib import Path +import re +import sys +import os +from collections import defaultdict +def parse_argument(): + parser = ArgumentParser("ansible lint runner with customized spec") + parser.add_argument('--targets', type=str, nargs='*', + help="path to roles or playbook targets") + parser.add_argument('--logdir', type=Path, default=Path( __file__ + '/../logdir').resolve(), nargs='?', help='log directory default to ./ansiblelint/logdir') + + args = parser.parse_args() + args.logdir.mkdir(exist_ok=True) + return args +def parse_rule_output(line): + # (filepath, line, rule, severity, rule_desc) + expression = '(.*\.yml):([0-9]+): \[(.*)\] \[(.*)\] (.*$)' + matched = re.match(expression, line) + # print(line) + matched_groups = matched.groups() + return matched_groups + +def group_by(output, idx): + res = defaultdict(list) + for i in output: + # print(i) + res[i[idx]].append(i) + return res +cmd_template = "ansible-lint --parseable-severity --nocolor " +outputs = defaultdict() +def main(): + exit_code = 0 + args = parse_argument() + for item in args.logdir.iterdir(): + item.unlink() + cmd = cmd_template + if args.targets is not None: + cmd += ' ' + ' '.join(args.targets) + else: + rolenames = [str(i.resolve()) + for i in Path(__file__ + '/../../plays/roles').resolve().iterdir() if i.is_dir()] + cmd += ' ' + ' '.join(rolenames) + # print(cmd) + logfile = args.logdir.joinpath('logfile') + cmd += ' 2>&1 | tee {}'.format(str(logfile.resolve())) + # print(cmd) + output = subprocess.check_output(cmd, shell=True) + print(output.decode()) + output = output.decode().splitlines() + # print(output) + output = [parse_rule_output(line) for line in output] + + # group by serverity + output = group_by(output, 3) + # print(output.keys()) + # print(output.keys()) + for k,v in output.items(): + # print(k, v) + if (k=='VERY_HIGH') and len(v) != 0: + exit_code = 1 + current_log = args.logdir.joinpath(k).resolve() + + with current_log.open(mode='w') as f: + f.writelines(['filepath\tline\trule\tserverity\trule description\n']) + f.writelines(['\t'.join(list(i)) + '\n' for i in v]) + sys.exit(exit_code) + # return +if __name__ == "__main__": + main() diff --git a/CICD/ansiblelint/spec.yml b/CICD/ansiblelint/spec.yml new file mode 100644 index 0000000000000000000000000000000000000000..53ee4a04a8c583cef6d601cd268d0399bbf6cdba --- /dev/null +++ b/CICD/ansiblelint/spec.yml @@ -0,0 +1,37 @@ +--- +# https://docs.ansibl.com/ansibl-lint/ruls/dfault_ruls.html +error: + - 101 + - 102 + - 103 + - 104 + - 202 + - 304 + - 306 + - 401 + - 402 + - 403 + - 404 + - 501 + - 502 + - 701 + +warning: + - 105 + - 201 + - 203 + - 204 + - 205 + - 206 + - 301 + - 302 + - 303 + - 305 + - 503 + - 504 + - 601 + - 
602 + - 702 + - 703 + - 704 + diff --git a/CICD/files/.gitignore b/CICD/files/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..37e22cdfa443a09339be8c5dc62c492e2914cce0 --- /dev/null +++ b/CICD/files/.gitignore @@ -0,0 +1,4 @@ +ssh_known_hosts +*.conf +etcHosts +inventory.* diff --git a/CICD/files/etcExports b/CICD/files/etcExports new file mode 100644 index 0000000000000000000000000000000000000000..0867fd1b7bb1aff1d6be948f7c4fd40ee07f199a --- /dev/null +++ b/CICD/files/etcExports @@ -0,0 +1,4 @@ +/nfsvol/home *(fsid=1,rw,no_root_squash) +/slurmstate *(fsid=2,rw,no_root_squash) +/nfsvol/projects *(fsid=4,rw,no_root_squash) +/nfsvol/scratch *(fsid=5,rw,no_root_squash) diff --git a/CICD/heat/gc_HOT.yaml b/CICD/heat/gc_HOT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d2935aef33a7e480cba89da6d3bd58cff815f95 --- /dev/null +++ b/CICD/heat/gc_HOT.yaml @@ -0,0 +1,269 @@ +--- +heat_template_version: 2013-05-23 +description: "A simple template to boot a cluster of desktops (LoginNode, ManagementNodes and Desktop Nodes)" +# avz parameters disabled. they are working but I want just more options than monash-02. I would like to have a parameter that says "I don't care" + +parameters: + ubuntu_1804_image_id: + type: string + label: Image ID + description: Ubuntu Image + default: 99d9449a-084f-4901-8bd8-c04aebd589ca + centos_7_image_id: + type: string + label: Image ID + description: Centos Image + default: 12da1997-5122-4be3-a2a9-2f44961c1b16 + ssh_key: + type: string + default: gc_key + avz: + type: string + default: monash-02 + project_name: + type: string + NetID: + type: string + default: Classic Provider + Flavour: + type: string + default: t3.xsmall + + +resources: + + SlurmSecGroup: + type: "OS::Neutron::SecurityGroup" + properties: + name: "heatslurmsecgroup" + rules: [ { protocol: tcp, + port_range_min: 12000, + port_range_max: 12999}, + { protocol: tcp, + port_range_min: 6817, + port_range_max: 6819}, + { protocol: tcp, + port_range_min: 1019, + port_range_max: 1019}] + NFSSecGroup: + type: "OS::Neutron::SecurityGroup" + properties: + name: "heatnfssecgroup" + rules: [ { protocol: tcp, + port_range_min: 2049, + port_range_max: 2049}, + { protocol: tcp, + port_range_min: 111, + port_range_max: 111}, + { protocol: udp, + port_range_min: 2049, + port_range_max: 2049}, + { protocol: udp, + port_range_min: 111, + port_range_max: 111}] + MySQLSecGroup: + type: "OS::Neutron::SecurityGroup" + properties: + name: "heatmysqlsecgroup" + rules: [ { protocol: tcp, + port_range_min: 3306, + port_range_max: 3306} ] + SSHMonashSecGroup: + type: "OS::Neutron::SecurityGroup" + properties: + name: "SSHMonashSecGroup" + rules: [ { protocol: tcp, + port_range_min: 22, + port_range_max: 22, + direction: ingress, + remote_ip_prefix: 118.138.240.0/21 + } ] +# SSHInternalSecGroup: +# type: "OS::Neutron::SecurityGroup" +# properties: +# name: "SSHInternalSecGroup" +# rules: [ { protocol: tcp, +# port_range_min: 22, +# port_range_max: 22, +# direction: ingress} ] + #remote_ip_prefix: { get_param: REMOTE_IP }, direction: ingress + webaccess: + type: "OS::Neutron::SecurityGroup" + properties: + name: "webaccess" + rules: [ { protocol: tcp, + port_range_min: 80, + port_range_max: 80}, + { protocol: tcp, + port_range_min: 443, + port_range_max: 443} ] + + SQLNode0: + type: "OS::Nova::Server" + properties: + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'sql0' ]] + availability_zone: { get_param: avz } + flavor: t3.small + image: { get_param: 
centos_7_image_id } + key_name: { get_param: ssh_key } + security_groups: [ { get_resource: SSHMonashSecGroup }, { get_resource: SlurmSecGroup }, { get_resource: MySQLSecGroup }, { get_resource: NFSSecGroup } ] + metadata: + ansible_host_groups: [ SQLNodes, NFSNodes ] + ansible_ssh_user: ec2-user + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + + NFSVolume: + type: OS::Cinder::Volume + properties: + availability_zone: { get_param: avz } + size: 1 + name: nfsvol + NFSVolumeAttachment: + type: "OS::Cinder::VolumeAttachment" + properties: + volume_id: { get_resource: NFSVolume } + instance_uuid: { get_resource: SQLNode0 } + + SLURMSTATEVolume: + type: OS::Cinder::Volume + properties: + availability_zone: { get_param: avz } + size: 1 + name: slurmstate + SLURMSTATEVolumeAttachment: + type: "OS::Cinder::VolumeAttachment" + properties: + volume_id: { get_resource: SLURMSTATEVolume } + instance_uuid: { get_resource: SQLNode0 } + + DBVolume: + type: OS::Cinder::Volume + properties: + availability_zone: { get_param: avz } + size: 10 + name: dbvol + DBVolumeAttachment: + type: "OS::Cinder::VolumeAttachment" + properties: + volume_id: { get_resource: DBVolume } + instance_uuid: { get_resource: SQLNode0 } + + MgmtNodes: + type: "OS::Heat::ResourceGroup" + properties: + count: 2 + resource_def: + type: My::Server::MgmtNode + properties: + #avz: { get_param: avz } + image: { get_param: centos_7_image_id } + ansible_ssh_user: ec2-user + mynodename: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'mgmt%index%' ]] + ssh_key: { get_param: ssh_key } + security_groups: [ default, { get_resource: SSHMonashSecGroup }, { get_resource: SlurmSecGroup }, { get_resource: NFSSecGroup }, { get_resource: MySQLSecGroup } ] + project_name: { get_param: project_name } + + LoginNodes: + type: "OS::Heat::ResourceGroup" + properties: + count: 1 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: t3.small + image: { get_param: centos_7_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'login%index%' ]] + security_groups: [ default, { get_resource: SSHMonashSecGroup }, { get_resource: SlurmSecGroup }, { get_resource: NFSSecGroup } ] + metadata: + ansible_host_groups: [ LoginNodes ] + ansible_ssh_user: ec2-user + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + + DesktopNodes: + type: "OS::Heat::ResourceGroup" + properties: + count: 0 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: t3.small + image: { get_param: centos_7_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'desktopc%index%' ]] + security_groups: [ default, { get_resource: SSHMonashSecGroup }, { get_resource: SlurmSecGroup }, { get_resource: NFSSecGroup } ] + metadata: + ansible_host_groups: [ DesktopNodes, VisNodes, ComputeNodes ] + ansible_ssh_user: ec2-user + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + + ComputeNodes: + type: "OS::Heat::ResourceGroup" + properties: + count: 1 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: t3.small + image: { get_param: centos_7_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'computec%index%' ]] + security_groups: [ default, { get_resource: 
SSHMonashSecGroup }, { get_resource: SlurmSecGroup }, { get_resource: NFSSecGroup } ] + metadata: + ansible_host_groups: [ ComputeNodes ] + ansible_ssh_user: ec2-user + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + + UbuntuDesktopNodes: + type: "OS::Heat::ResourceGroup" + properties: + count: 0 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: t3.small + image: { get_param: ubuntu_1804_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'desktopu%index%' ]] + security_groups: [ default, { get_resource: SSHMonashSecGroup }, { get_resource: SlurmSecGroup }, { get_resource: NFSSecGroup } ] + metadata: + ansible_host_groups: [ DesktopNodes ] + ansible_ssh_user: ubuntu + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + +# PySSHauthz: +# type: "OS::Nova::Server" +# properties: +# name: +# list_join: [ '-', [ { get_param: "OS::stack_name" }, 'pysshautz' ]] +# availability_zone: { get_param: avz } +# flavor: t3.xsmall +# image: { get_param: ubuntu_1804_image_id } +# key_name: { get_param: ssh_key } +# security_groups: [ { get_resource: SSHMonashSecGroup }, { get_resource: webaccess } ] +# metadata: +# ansible_host_groups: [ PySSHauthz ] +# ansible_ssh_user: ubuntu +# project_name: { get_param: project_name } +# networks: +# - network: { get_param: NetID } diff --git a/CICD/heat/heatcicdwrapper.sh b/CICD/heat/heatcicdwrapper.sh new file mode 100644 index 0000000000000000000000000000000000000000..abbd2ee6e7734b6126cc217bad45f858f5ee1958 --- /dev/null +++ b/CICD/heat/heatcicdwrapper.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# This script does not check available ressources on nectar! + + +function usage { + echo $"Usage: $0 {create|update|show|create_or_update,delete_if_exists} STACKNAME" + exit 1 +} + +if [ "$#" -ne 2 ]; then + echo "Illegal number of parameters expecting 2" + usage +fi + +STACKNAME=$2 + + +if [[ "$STACKNAME" == "CICD"* ]]; then + echo "CICD found in stackname. doing nothing" +else + STACKNAME="CICD"$STACKNAME +fi + + + +echo "[heatcicdwrapper] Prefixing Stackname with CICD. This is a safety feature because this script can also delete stacks" $STACKNAME + +function check_stack_exists { + if openstack stack list | grep -w $STACKNAME; + then + echo "stack found"; + else + echo "stack not found"; + return 1 + fi +} + + +function func_delete_if_exists { + if ! check_stack_exists + then + exit 0 + fi + openstack stack delete -y --wait $STACKNAME + ret=$? + if [ $ret -ne "0" ] + then + sleep 15 + openstack stack delete -y --wait $STACKNAME + ret=$? + fi + exit $ret +} + +function create_stack { + + if check_stack_exists + then + echo "I will NOT create existing stack maybe use update" + exit -44 + fi + openstack stack create --wait --template ./CICD/heat/gc_HOT.yaml --parameter "project_name=$STACKNAME" -e ./CICD/heat/resource_registry.yaml $STACKNAME + createreturn=$? + if [ $createreturn -ne "0" ] + then + openstack stack delete -y --wait $STACKNAME + echo "creation failed. trying to delete" + exit -47 + fi + exit $createreturn +} + + +case "$1" in + create) + create_stack + ;; + + update) + if ! check_stack_exists + then + echo "I cannot update a stack which does not exist" + exit -45 + fi + openstack stack update --wait --template ./CICD/heat/gc_HOT.yaml --parameter "project_name=$STACKNAME" -e ./CICD/heat/resource_registry.yaml $STACKNAME + ret=$? 
+ exit $ret + ;; + create_or_update) + if check_stack_exists + then + openstack stack update --wait --template ./CICD/heat/gc_HOT.yaml --parameter "project_name=$STACKNAME" -e ./CICD/heat/resource_registry.yaml $STACKNAME + ret=$? + exit $ret + fi + create_stack + + ;; + delete_if_exists) + func_delete_if_exists + + ;; + + + show) + check_stack_exists + echo $? + OUTPUT=$(openstack stack show $STACKNAME| grep -w stack_status) + echo $OUTPUT + ;; + + *) + usage + +esac diff --git a/CICD/heat/mgmtnode_HOT.yaml b/CICD/heat/mgmtnode_HOT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4f9858acef22bf8237d4833f6ab353cb050db3c --- /dev/null +++ b/CICD/heat/mgmtnode_HOT.yaml @@ -0,0 +1,38 @@ +heat_template_version: 2013-05-23 +parameters: + mynodename: + type: string + ssh_key: + type: string + image: + type: string + #avz: + # type: string + project_name: + type: string + ansible_ssh_user: + type: string + security_groups: + type: json + NetID: + type: string + #default: 915a3d96-693d-4c9d-a2ef-04996ab085d3 + default: Classic Provider + +resources: + + instance: + type: OS::Nova::Server + properties: + #availability_zone: { get_param: avz } + flavor: t3.xsmall + image: { get_param: image } + key_name: { get_param: ssh_key } + security_groups: { get_param: security_groups } + name: { get_param: mynodename } + metadata: + ansible_host_groups: [ ManagementNodes ] + ansible_ssh_user: { get_param: ansible_ssh_user } + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } diff --git a/CICD/heat/resource_registry.yaml b/CICD/heat/resource_registry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0638b887c8c09d5d6a98f51a34d3b4eeb6e9aafb --- /dev/null +++ b/CICD/heat/resource_registry.yaml @@ -0,0 +1,2 @@ +resource_registry: + My::Server::MgmtNode: mgmtnode_HOT.yaml diff --git a/CICD/master_playbook.yml b/CICD/master_playbook.yml new file mode 100644 index 0000000000000000000000000000000000000000..04dc7e747b8894c60a049697e93c8ac89a2b8dc1 --- /dev/null +++ b/CICD/master_playbook.yml @@ -0,0 +1,7 @@ +--- +- import_playbook: plays/make_files.yml +- import_playbook: plays/allnodes.yml +- import_playbook: plays/init_slurmconf.yml # this requires management nodes +- import_playbook: plays/nfssqlnodes.yml +- import_playbook: plays/mgmtnodes.yml +- import_playbook: plays/computenodes.yml diff --git a/CICD/plays/allnodes.yml b/CICD/plays/allnodes.yml new file mode 100644 index 0000000000000000000000000000000000000000..406bdb797108aa4d19a505742cb4df02f5df6909 --- /dev/null +++ b/CICD/plays/allnodes.yml @@ -0,0 +1,48 @@ +- hosts: 'all' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + tasks: + - { name: set use shared state, set_fact: usesharedstatedir=False } + - { name: set hostgroup, set_fact: hostgroup='ComputeNodes' } + tags: [ always ] + +- hosts: 'all' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + strategy: free + roles: +# - { role: disable_selinux, tags: [ disableselinux ] } + - { role: etcHosts, tags: [ networking ] } + - { role: config_repos, tags: [ repos ] } + - { role: upgrade } + - { role: set_password } + + +- hosts: 'DesktopNodes,ComputeNodes,LoginNodes,ManagementNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + strategy: free + 
roles: + - { role: disable_selinux, tags: [ disableselinux ] } + #- { role: ldapclient, tags: [ authentication ] } + - { role: ssh-password-login, tags: [ authentication ] } + - { role: enable_sudo_group, tags: [ authentication, sudo ] } + - { role: move_homedir } + - { role: calculateKnownHosts, tags: [ calculateKnownHosts ] } + - { role: SSHKnownHosts, tags: [ known_hosts ] } + - { role: jasons_ssh_ca, tags: [ ssh_ca ] } diff --git a/CICD/plays/computenodes.yml b/CICD/plays/computenodes.yml new file mode 100644 index 0000000000000000000000000000000000000000..a43a5a927506f800d0f8d5a1eb3da208d404f4b5 --- /dev/null +++ b/CICD/plays/computenodes.yml @@ -0,0 +1,64 @@ + +- hosts: 'DesktopNodes,ComputeNodes,LoginNodes,VisNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + tasks: + - { name: set use shared state, set_fact: usesharedstatedir=False } + tags: [ always ] + +- hosts: 'DesktopNodes,ComputeNodes,LoginNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + strategy: free + roles: + - { role: move_homedir, tags: [ authentication, filesystems ] } + - { role: nfs-client, nfsMounts: "{{ computeNfsMounts }}", tags: [ filesystems ] } + - { role: slurm-common, tags: [ slurm, slurm-common ] } + - { role: lmod, tags: [ other ] } + - { role: enable_modules, default_modules: "lmod", tags: [ other ] } + - { role: postfix, tags: [ mail, other ] } + +- hosts: 'VisNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + roles: + - { role: gpu, tags: [ gpu ] } + +- hosts: 'DesktopNodes,ComputeNodes,LoginNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + roles: + - { role: slurm_config, tags: [slurm, slurm_config] } + +- hosts: 'DesktopNodes,ComputeNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + strategy: free + roles: + - { role: slurm-start, start_slurmd: True, tags: [ slurm, slurmstart ] } + #- { role: mate-de-install, tags: [ mate-de-install ] } # TODO this crashes for everything except cmca \ No newline at end of file diff --git a/CICD/plays/files b/CICD/plays/files new file mode 120000 index 0000000000000000000000000000000000000000..feb122881ce2321d72ad6b867bd2a3d01eadaac3 --- /dev/null +++ b/CICD/plays/files @@ -0,0 +1 @@ +../files \ No newline at end of file diff --git a/CICD/plays/init_slurmconf.yml b/CICD/plays/init_slurmconf.yml new file mode 100644 index 0000000000000000000000000000000000000000..30667ac53b5b6c387af0bdacb609f09cc8bfa5c3 --- /dev/null +++ b/CICD/plays/init_slurmconf.yml @@ -0,0 +1,15 @@ +--- +- hosts: 'all' + tasks: + - include_vars: vars/passwords.yml + - include_vars: vars/names.yml + - include_vars: vars/ldapConfig.yml + - include_vars: vars/filesystems.yml + - include_vars: vars/slurm.yml + - include_vars: vars/vars.yml +- hosts: 'all' + tasks: + - { name: setup, setup: } +- hosts: 'ManagementNodes' + roles: + - { role: calculateSlurmConf } diff --git a/CICD/plays/make_files.yml b/CICD/plays/make_files.yml new file mode 100644 index 0000000000000000000000000000000000000000..b05925ce73f9be136bb46128961990b938c07910 --- /dev/null +++ b/CICD/plays/make_files.yml @@ -0,0 +1,22 @@ +--- +# just 
calculates an etc hosts +- hosts: 'all' + tasks: + - include_vars: vars/passwords.yml + - include_vars: vars/names.yml + - include_vars: vars/ldapConfig.yml + - include_vars: vars/filesystems.yml + - include_vars: vars/slurm.yml + - include_vars: vars/vars.yml +- hosts: 'all' + tasks: + - { name: setup, setup: } +- hosts: 'ManagementNodes' + roles: + - { role: calculateEtcHosts } + +#- hosts: 'NFSNodes' +# roles: +# - { role: calculateExports } + + diff --git a/CICD/plays/mgmtnodes.yml b/CICD/plays/mgmtnodes.yml new file mode 100644 index 0000000000000000000000000000000000000000..c890a5456b5306f1478070e3f329fc57adc51340 --- /dev/null +++ b/CICD/plays/mgmtnodes.yml @@ -0,0 +1,43 @@ +# Basic stuff to make the nodes functionl +# i.e. upgrade operating systems, etc +# + +- hosts: 'ManagementNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + tasks: + # - { name: set hostgroup, set_fact: hostgroup='ManagementNodes' } + - { name: set use shared state, set_fact: usesharedstatedir=True } + tags: [ always ] + +- hosts: 'ManagementNodes' + strategy: free + gather_facts: False + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + roles: +# - { role: ldapclient, tags: [ authentication ] } +# - { role: ssh-password-login } +# - { role: enable_sudo_group } +# - { role: make_filesystems, volumes: "{{ glustervolumes }}" } +# - { role: gluster_server, volname: "gv", brickmnt: '/gbrick', gluster_servers: "{{ groups['ManagementNodes'] }}", replicas: 2, tags: [ gluster_server ] } +# - { role: gluster_volcreate, volname: "gv", gluster_servers: "{{ groups['ManagementNodes'] }}", brickmnt: '/gbrick', replicas: 2 } +# - { role: gluster_client, volname: "gv", gluster_servers: ['mgmt0','mgmt1','sql0'], volmnt: '/glusterVolume' } + - { role: nfs-client, nfsMounts: "{{ mgmtNfsMounts }}", tags: [ nfs ] } + - { role: slurmdb-config, tags: [ slurm, slurmdb-config ] } + - { role: slurm-common, tags: [ slurm, slurm-common ] } + - { role: slurm_config, tags: [ slurm, slurm-config ] } + - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, tags: [ slurm-start ] } +# - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] } +# - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] } + diff --git a/CICD/plays/nfssqlnodes.yml b/CICD/plays/nfssqlnodes.yml new file mode 100644 index 0000000000000000000000000000000000000000..24a7338397f32ac7e0ca448935f394c77b112d86 --- /dev/null +++ b/CICD/plays/nfssqlnodes.yml @@ -0,0 +1,83 @@ +# Role to initialize nfs and SQL Nodes +# +# + +- hosts: 'all' + tasks: + - { name: setup, setup: } + tags: [ always ] + +#we need this here to gather facts and fill required variables. 
+- hosts: 'ManagementNodes' + gather_facts: True + tasks: + - include_vars: vars/passwords.yml + - include_vars: vars/names.yml + - include_vars: vars/ldapConfig.yml + - include_vars: vars/filesystems.yml + - include_vars: vars/slurm.yml + - include_vars: vars/vars.yml + - { name: set hostgroup, set_fact: hostgroup='ManagementNodes' } + - { name: set use shared state, set_fact: usesharedstatedir=True } + tags: [ always ] + +- hosts: 'SQLNodes,NFSNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + pre_tasks: + - { name: set hostgroup, set_fact: hostgroup='SQLNodes', tags: [ always ] } + - { name: set use shared state, set_fact: usesharedstatedir=True, tags: [ always ] } + +- hosts: 'SQLNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + strategy: free + gather_facts: True + roles: + - { role: upgrade, tags: [ upgrade ] } + - { role: make_filesystems, volumes: "{{ dbvolumes }}" } + - { role: mysql, mysql_type: mysql_server, mysql_root_password: "{{ sqlrootPasswd }}", mysql_user_name: slurmdb, mysql_user_db_name: slurm_acct_db, mysql_user_hosts_group: "{{ groups['ManagementNodes'] }}", mysql_user_password: "{{ slurmdb_passwd }}", tags: [ database ] } + - { role: slurm-mysql-config, tags: [database,slurmdb] } + tags: [ sql ] + +- hosts: 'NFSNodes' + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + strategy: free + gather_facts: False + roles: + - { role: make_filesystems, volumes: "{{ nfsvolumes }}" } + tasks: + - { name: make homedir, file: { path: /nfsvol/home, state: directory }, become: true, become_user: root } + - { name: make projects, file: { path: /nfsvol/projects, state: directory }, become: true, become_user: root } + - { name: make projects, file: { path: /nfsvol/scratch, state: directory }, become: true, become_user: root } + tags: [ nfs ] + +- hosts: 'NFSNodes' + strategy: free + gather_facts: False + vars_files: + - vars/passwords.yml + - vars/names.yml + - vars/ldapConfig.yml + - vars/filesystems.yml + - vars/slurm.yml + - vars/vars.yml + roles: + - { role: nfs-server } + tags: [ nfs,nfs-server ] diff --git a/CICD/plays/roles b/CICD/plays/roles new file mode 120000 index 0000000000000000000000000000000000000000..b741aa3dbce62c5259099ec357a14dfd1ac7e2ff --- /dev/null +++ b/CICD/plays/roles @@ -0,0 +1 @@ +../../roles \ No newline at end of file diff --git a/CICD/plays/vars b/CICD/plays/vars new file mode 120000 index 0000000000000000000000000000000000000000..e8d9a6429b3aaab679b98557469104f0f7cc952b --- /dev/null +++ b/CICD/plays/vars @@ -0,0 +1 @@ +../vars \ No newline at end of file diff --git a/CICD/tests/LoginNodes/run_slurm_testsuite.inactive b/CICD/tests/LoginNodes/run_slurm_testsuite.inactive new file mode 100755 index 0000000000000000000000000000000000000000..c5d2f24f1f3cf99c3f7481f3bc467907444425d6 --- /dev/null +++ b/CICD/tests/LoginNodes/run_slurm_testsuite.inactive @@ -0,0 +1,29 @@ +#!/bin/bash +OUTPUT_LOG=$(realpath ${1-slurmtest.out}) +if ! 
type "scontrol" > /dev/null; then + echo "cannot find slurm" + exit 1 +fi +SLURM_DIR=${2-$(dirname $(dirname $(which scontrol)))} +#SLURM_DIR=$slurm_dir + +#if [[ -d $2 ]];then +# SLURM_SRC_DIR=$2 +#else +# SLURM_SRC_DIR=./slurm_src +# git clone https://github.com/SchedMD/slurm.git $SLURM_SRC_DIR +# cd $SLURM_SRC_DIR && ./configure +#fi +#cd $SLURM_SRC_DIR/testsuite/expect +#echo -en "set slurm_dir=$SLURM_DIR\nset max_job_delay 300\n" > globals.local +#make +#echo "log is written to $OUTPUT_LOG" +#echo "slurm dir is defined as $SLURM_DIR" +./regression > /dev/null 2> $OUTPUT_LOG +failures="$(sed -n 's/Failures: \(.*\)/\1/p' $OUTPUT_LOG)" +if (( $failures > 0 ));then + echo "$failures failures found, refer to $OUTPUT_LOG for log" + exit 1 +fi +exit 0 + diff --git a/CICD/tests/ManagementNodes/check.yml b/CICD/tests/ManagementNodes/check.yml new file mode 100644 index 0000000000000000000000000000000000000000..95e06a0a034c32c5e8ae30c2a58c40e10a738afc --- /dev/null +++ b/CICD/tests/ManagementNodes/check.yml @@ -0,0 +1,8 @@ +--- +- hosts: ManagementNodes + gather_facts: false + tasks: + - name: have ssh running + service: + name: sshd + state: started \ No newline at end of file diff --git a/CICD/tests/Readme.md b/CICD/tests/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..feca06268d107c2aeae9a6a8f61a2ed59e8648bc --- /dev/null +++ b/CICD/tests/Readme.md @@ -0,0 +1,7 @@ +this folder should contain tests that will be run automatically by the CICD pipeline + +all files with fileending .sh will be executed by a shell +all files with fileending yml will be executed by ansible-playbook +./tmp can be used as temporary folder and will be cleaned after execution + +because I can I am prefixing tests with 0-9 to give the execution some priority \ No newline at end of file diff --git a/CICD/tests/all/0_EXAMPLE_FALSE.sh b/CICD/tests/all/0_EXAMPLE_FALSE.sh new file mode 100755 index 0000000000000000000000000000000000000000..10c48607688d030fbbf054b1046e18d431b869c3 --- /dev/null +++ b/CICD/tests/all/0_EXAMPLE_FALSE.sh @@ -0,0 +1,5 @@ +#!/bin/bash +/bin/false + +status=$? +[ $status -eq 1 ] \ No newline at end of file diff --git a/CICD/tests/all/0_EXAMPLE_TRUE.sh b/CICD/tests/all/0_EXAMPLE_TRUE.sh new file mode 100755 index 0000000000000000000000000000000000000000..3634c7aa3076c2e8cd2159aca337adb35f1f31cf --- /dev/null +++ b/CICD/tests/all/0_EXAMPLE_TRUE.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +/bin/true + +status=$? +[ $status -eq 0 ] diff --git a/CICD/tests/run_tests.sh b/CICD/tests/run_tests.sh new file mode 100644 index 0000000000000000000000000000000000000000..d063e98d1d7e4617882bb14a5e1c51d9e8cda381 --- /dev/null +++ b/CICD/tests/run_tests.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +function usage { + echo $"Usage: $0 {all, ComputeNodes, LoginNodes, ManagementNodes, NFSNodes, sql}" INVENTORY_FILE KEY + exit 1 +} + +function run_them () +{ + #limit='--limit '"$1" + #if [ "$1" = "all" ] + #then + # limit="all" + #fi + for filename in ./tests/$1/*.sh; do # this is not sorted yet + [ -e "$filename" ] || continue + #/bin/bash -x $filename # local execution. nice for dev + ansible -i $2 --key-file $3 -m script -a "$filename" $1 + done + for filename in ./tests/$1/*.yml; do # this is not sorted yet + [ -e "$filename" ] || continue + ansible-playbook -i $2 --key-file $3 $filename # I am assuming the playbook cares about visibility here. 
might have to change later + done +} + +# I think I am just checking the if $1 is one of the listes strings (see usage) not proud of this at all but works +case "$1" in + all) + ;; + ComputeNodes) + ;; + ManagementNodes) + ;; + NFSNodes) + ;; + SQLNodes) + ;; + LoginNodes) + ;; + *) + usage +esac + +run_them $1 $2 $3 \ No newline at end of file diff --git a/CICD/vars/filesystems.yml b/CICD/vars/filesystems.yml new file mode 100644 index 0000000000000000000000000000000000000000..62d917425c4565d5653797e41947f98b2987375f --- /dev/null +++ b/CICD/vars/filesystems.yml @@ -0,0 +1,21 @@ +--- +computeNfsMounts: + - { name: '/home', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/home", 'opts': 'defaults,nofail', 'fstype':'nfs4' } + - { name: '/usr/local', ipv4: "118.138.235.37", src: "/usr_local", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } + - { name: '/projects', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/projects", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } + - { name: '/scratch', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/scratch", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } +mgmtNfsMounts: + - { name: '/mnt/home', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/home", 'opts': 'defaults,nofail', 'fstype':'nfs4' } + - { name: '/slurmstate', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/slurmstate", 'opts': 'defaults,nofail', 'fstype':'nfs4' } +dbvolumes: + - { fstype: 'ext4', name: 'dbvol', mntpt: '/dbvol', linkto: '/var/lib/mysql' } +nfsvolumes: + - { fstype: 'ext4', name: 'nfsvol', mntpt: '/nfsvol' } + - { fstype: 'ext4', name: 'slurmstate', mntpt: '/slurmstate' } +exportList: + - { name: '/home', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/home", 'opts': 'defaults,nofail', 'fstype':'nfs4' } + - { name: '/usr/local', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/usr_local_centos7", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } + - { name: '/projects', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/projects", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } + - { name: '/scratch', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/scratch", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } + - { name: '/slurmstate', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/slurmstate", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' } + diff --git a/CICD/vars/ldapConfig.yml b/CICD/vars/ldapConfig.yml new file mode 100644 index 0000000000000000000000000000000000000000..3ccb6e0fafe68e7b3b601c7ed1efb537eda33c74 --- /dev/null +++ b/CICD/vars/ldapConfig.yml @@ -0,0 +1,50 @@ +--- +ldapServerHostIpLine: "118.138.241.196 hpcldap0.erc.monash.edu.au" +ldapCaCertContents: | + -----BEGIN CERTIFICATE----- + MIIGODCCBCCgAwIBAgIJAJPlOnRdsYibMA0GCSqGSIb3DQEBCwUAMIGoMQswCQYD + VQQGEwJBVTERMA8GA1UECAwIVmljdG9yaWExEDAOBgNVBAcMB0NsYXl0b24xIDAe + BgNVBAoMF01vbmFzaCBlUmVzZWFyY2ggQ2VudGVyMREwDwYDVQQLDAhIUEMgVGVh + bTEeMBwGA1UEAwwVTWVSQyBIUEMgVGVhbSBSb290IENBMR8wHQYJKoZIhvcNAQkB + FhBoZWxwQG1hc3NpdmUub3JnMB4XDTE1MDgxOTAyNDczOFoXDTM1MDgxNDAyNDcz + OFowgagxCzAJBgNVBAYTAkFVMREwDwYDVQQIDAhWaWN0b3JpYTEQMA4GA1UEBwwH + Q2xheXRvbjEgMB4GA1UECgwXTW9uYXNoIGVSZXNlYXJjaCBDZW50ZXIxETAPBgNV + BAsMCEhQQyBUZWFtMR4wHAYDVQQDDBVNZVJDIEhQQyBUZWFtIFJvb3QgQ0ExHzAd + BgkqhkiG9w0BCQEWEGhlbHBAbWFzc2l2ZS5vcmcwggIiMA0GCSqGSIb3DQEBAQUA + A4ICDwAwggIKAoICAQDJxc194E9MGucoutUvmVvT04D6M3S7LlySwQ5XJd4ec22z + csmpoEep+IPVjChVKTN0mRYagAlh5UZ6VYtNA29Lkd4GC5Q2IAlrR9+pgXupuD5v + Qv1pFGEuWEPp5PHn4053gYtdVQ0pZQ7ytkVqSW5TJPNcR9AwHpW7JuQkU1jRGCO0 + t8dthC1msT62UnfjXStznjATm+M253y5PF4IquGb1K6ArR79Os2Ds78NeLyZ24vC + 
ik2AA6QpzkOZOLzRZLyWn4Gdz/jyblZP/A/zjM83symIdn3dv0wC8A3hZsHP771X + tw2f6uyiXPftiJt0YuPQdw9kdbDda0Dp7UwiTdaUdzBsQYUGuCQhw3T3NurPZu83 + K4ftVnIez9VO+5buJQxX0dc0/w0fwIZVtMesdMt+08x6Cf9nVmDrheArTKYWOq0r + 5eNntg16JAVBixRMwiV+KL4VP/pSKXQK2a9WptzEjVHLSsN0oMAoHkBVz47fSIdD + O79jYak+yvPORMkqd0iwMnt0F+wg9JrMVhhCmU5vdqgwQy60LCHn23IX7x821YTt + inQM43FsvRCAwWabWinn1prPHLpzaeMgE0wSVBtd4CvPqQ0fW5HJjdOjzyKRim8d + 1jN+1opa7CbcM2byfUU0yd1wU4jp5DSeZokV8ECr43pUymcc2dJwmTNApcg92wID + AQABo2MwYTAdBgNVHQ4EFgQUJ4sfHiRrNF3i/yAaV+OnIvfOAwgwHwYDVR0jBBgw + FoAUJ4sfHiRrNF3i/yAaV+OnIvfOAwgwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8B + Af8EBAMCAYYwDQYJKoZIhvcNAQELBQADggIBAF/gyOaUKw0AUxfoWfC4+hsD/QFg + h+GvOTrT+xA5Z8qpaPJDJijVQ8zAVFRyUsZZ9ZXe+QkIqP1WXnX0ROeDJ3LRnaoO + Vq/jy1OratWDsoNCvhjY5ZY2eZh2CXQVj40BD6iZJpfgNayDsId7wUKTraBaZ+k4 + NXu65f6objeIx8febnazV7s9C0Ola2fpdv7/JmiiizFMn62codyztA6J9+HXirc5 + Pq+RKVqPvBEWRi2LKAsbOubFklXTwe8cTwmMFUT2BPp6gpwIXtaSOpBQX/Ynthp5 + LRGU/koLZSKAeYIoUPH4pJHe89fpgtOuKBjRlOFdnUjJ90xIh2dyZm3G4JyINwKF + HrdGsu+RunUtE1AfT5S21ilcSjqLvQUfciWEyRcnmAyi/9o7upJlQCNGcPy3l5kJ + VdpRBtmVK08k1S9HtvQvqY82fDEnbxzFOla2uPDQ3sE1LodvY4KUZrA9ML3EUyeG + F5mvvhUOSMkmB8VouE2gt0g4rFXtHL6nHQ7rr1Ha/xcm/dVQY4e4Z43OYEflRkNV + R6VdSNWq3Voh4ASrLfuv4/5Mbt5BnLKvzvnZVeNmJIh2Rc/eYfao1K7K6siAUhP2 + ONklIbbx/WISO5Vchcw65DclkEBZos2KqRoMb/Rxn5sFIvRWgrXvzw39o8agWO0K + 9jGyW0SYdK9x4Qxn + -----END CERTIFICATE----- +ldapCaCertFile: /etc/ssl/certs/cacert.crt +ldapDomain: "erc.monash.edu.au" +ldapURI: "ldaps://hpcldap0.erc.monash.edu.au:636" +ldapROURI: "ldaps://hpcldap1.erc.monash.edu.au:636" +ldapBindDN: "cn=ldapuser,ou=People,dc=erc,dc=monash,dc=edu,dc=au" +ldapBindDNPassword: "thisisafakepassword" +ldapManagerDN: "cn=Manager,dc=erc,dc=monash,dc=edu,dc=au" +ldapBase: "dc=erc,dc=monash,dc=edu,dc=au" +ldapGroupBase: "ou=Groups,dc=erc,dc=monash,dc=edu,dc=au" +ldapRfc2307Pam: "" +ldap_access_filter: "(&(objectClass=posixAccount)(memberOf=cn=m3,ou=aclgroups,dc=erc,dc=monash,dc=edu,dc=au))" diff --git a/CICD/vars/names.yml b/CICD/vars/names.yml new file mode 100644 index 0000000000000000000000000000000000000000..fa7063762a3477f082cd454fce0101dbcb8a0bbc --- /dev/null +++ b/CICD/vars/names.yml @@ -0,0 +1,3 @@ +--- +domain: massive.org.au +smtp_smarthost: smtp.monash.edu.au diff --git a/CICD/vars/passwords.yml b/CICD/vars/passwords.yml new file mode 100644 index 0000000000000000000000000000000000000000..3d9b84303496ba0dc41b2869783b3183310ba271 --- /dev/null +++ b/CICD/vars/passwords.yml @@ -0,0 +1,7 @@ +--- +mungekey: ySdSOpFMyLihx4tQlR0znm07UlvALxB1 +slurmdb_passwd: ySdSOpFMyLihx4tQlR0znm07UlvALxB2 +sqlrootPasswd: ySdSOpFMyLihx4tQlR0znm07UlvALxB3 +sudo_group: systems +default_user_password_clear: ySdSOpFMyLihx4tQlR0znm07UlvALxBL +default_user_password: ySdSOpFMyLihx4tQlR0znm07UlvALxBL diff --git a/CICD/vars/slurm.yml b/CICD/vars/slurm.yml new file mode 100644 index 0000000000000000000000000000000000000000..65def4d949685d32b7f6b705a6390c9a6dfdab2a --- /dev/null +++ b/CICD/vars/slurm.yml @@ -0,0 +1,45 @@ +--- +desktopNodeList: + - { name : 'DesktopNodes', interface : 'eth0' } +clustername: "m3" +projectname: "m3" +slurm_version: 19.05.3-2 +munge_version: 0.5.11 +nhc_version: 1.4.2 +munge_dir: /opt/munge-{{ munge_version }} +slurm_dir: /opt/slurm-{{ slurm_version }} +nhc_dir: /opt/nhc-{{ nhc_version }} +nhc_config_file: nhc.conf +nhc_log_level: 0 +nhc_emails: nobody@nowhere.nowhere +nhc_email_subject: "Node Health Check" +openmpi_version: 1.8.3 +mysql_host: "{{ groups['SQLNodes'][0] }}" +slurmctrl: "{{ groups['ManagementNodes'][0] }}" +slurmctrlbackup: 
"{{ groups['ManagementNodes'][1] }}" +slurmdbd: "{{ groups['ManagementNodes'][0] }}" +slurmdbdbackup: "{{ groups['ManagementNodes'][1] }}" +slurm_use_vpn: false +slurm_lua: true +slurmqueues: + - {name: batch, group: ComputeNodes, default: yes} +# - {name: vis, group: DesktopNodes, default: no} +slurmlogin: "{{ groups['LoginNodes'][0] }}" +slurmlogdir: "/var/log" +slurmctlddebug: {level: 5, log: '/mnt/slurm-logs/slurmctld.log'} +slurmddebug: {level: 5, log: '/var/log/slurmd.log'} +slurmschedlog: {level: 5, log: '/mnt/slurm-logs/slurmsched.log'} +slurmdbdlog: {level: 5, log: '/mnt/slurm-logs/slurmdbd.log'} +slurmfairshare: {def: false, val: 10000} +slurmdatadir: "/opt/slurm/var/spool" +slurmstatedir: "/opt/slurm/var/state" +slurmsharedstatedir: "/slurmstate" +slurmpiddir: "/opt/slurm-latest/var/run" +slurmdbdpiddir: "/opt/slurm/var/run" +slurmaccount_create_user: "/usr/local/sbin/slurmuseraccount.sh" +slurm_provision: "/cinderVolume/local/sbin/slurm_provision.sh" +slurmselecttype: "select/linear" +slurmfastschedule: "1" +slurmschedulertype: "sched/backfill" +restartServerList: + - slurm diff --git a/CICD/vars/vars.yml b/CICD/vars/vars.yml new file mode 100644 index 0000000000000000000000000000000000000000..d1dc95cfbdf366fc202881732465d8f55bbda36b --- /dev/null +++ b/CICD/vars/vars.yml @@ -0,0 +1,10 @@ +--- +sudo_group: systems +nagios_home: "/var/lib/nagios" +nvidia_version: "390.46" + +gpumap: + 'K1': 'K1' + 'K80': 'K80' + 'P100-PCIE-16GB': 'P100' + 'V100-PCIE-16GB': 'V100' diff --git a/README.md b/README.md index df8c0a84b279f979830ad1230ffff2060cd239a6..f06edfcda4693187e6cff8e9daaa00f40a8ba763 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ We are working from https://docs.google.com/a/monash.edu/spreadsheets/d/1IZNE7vMid_SHYxImGVtQcNUiUIrs_Nu1xqolyblr0AE/edit#gid=0 as our architecture document. +[](https://gitlab.erc.monash.edu.au/hpc-team/ansible_cluster_in_a_box/commits/cicd) + We aim to make these roles as generic as possible. You should be able to start from an inventory file, an ssh key and a git clone of this and end up with a working cluster. In the longer term we might branch to include utilities to make an inventory file using NeCTAR credentials. 
If you need a password use get_or_make_password.py (delegated to the passwword server/localhost) to generate a random one that can be shared between nodes diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml index 7e2d16a9b83a2bf3d47340ead9b62de1618f5fd0..02bc89770d1639870f12678aa05b218318dcb01a 100644 --- a/roles/config_repos/tasks/main.yml +++ b/roles/config_repos/tasks/main.yml @@ -1,8 +1,11 @@ --- - name: make sure out repo server is resolvable - lineinfile: dest=/etc/hosts line="118.138.244.7 consistency0" - become: true + lineinfile: + dest: /etc/hosts + line: "118.138.244.7 consistency0" + owner: root + group: root #- name: remove default repos diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 86eb7381aa2b5d5651799f1d5e9b23d1d9be2dbc..f76796f1881d3a0efb9b3eb3974e261e2b9dab58 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -130,8 +130,15 @@ template: src=xserver.j2 dest=/etc/pam.d/xserver become: true -- name: build nvidia driver - shell: chmod 755 /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run; /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run --silent +- name: chmod nvidia driver builder + file: + path: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run + mode: 0755 + become: true + when: install_driver + +- name: build nvidia driver + shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run become: true when: install_driver diff --git a/roles/mysql/tasks/mysql_server.yml b/roles/mysql/tasks/mysql_server.yml index 8ea439879957fb68a8e845732367b63082e9c0a9..f8edd4e66ceed4323aa0ad83364ec890b93e80c1 100644 --- a/roles/mysql/tasks/mysql_server.yml +++ b/roles/mysql/tasks/mysql_server.yml @@ -10,17 +10,18 @@ with_items: "{{ server_packages }}" become: true when: ansible_os_family == "RedHat" + +- name: make sure mysql conf directory exists + file: dest=/etc/mysql/conf.d state=directory + become: true + register: mysqldb_confdir_create - name: "Starting MySQL" service: name={{ sqlServiceName }} state=started enabled=true become: true -- name: make sure mysql conf directory exists - file: dest=/etc/mysql/conf.d state=directory - become: true - #- name: "Adding root" -# sudo: true +# become: true # mysql_user: name=root host="{{ item }}" password="{{ mysql_root_password }}" login_user=root login_password="{{ mysql_root_password }}" check_implicit_admin=yes # with_items: # - "{{ ansible_hostname }}" @@ -28,9 +29,10 @@ # - ::1 # - localhost -- name: update mysql root password for all root accounts +- name: update mysql root password for all root accounts # this will only work if a completely fresh db gets installed because it gets shipped with a blank root pw mysql_user: name=root host=localhost password={{ mysql_root_password }} login_user=root - + when: mysqldb_confdir_create.changed + - name: "Adding user database" mysql_db: name={{ mysql_user_db_name }} state=present login_user=root login_password={{ mysql_root_password }} diff --git a/roles/slurm-common/tasks/createSlurmDirectories.yml b/roles/slurm-common/tasks/createSlurmDirectories.yml index 1f4aabac17986038c2f22ef8eb274d30e7ac8aa3..738956823167ca062efe85940774a45c9a547423 100644 --- a/roles/slurm-common/tasks/createSlurmDirectories.yml +++ b/roles/slurm-common/tasks/createSlurmDirectories.yml @@ -37,7 +37,7 @@ - name: create slurmdbdpiddir directory file: path={{ slurmdbdpiddir }} state=directory owner=slurm group=slurm mode=755 - sudo: true + become: true - name: create shared state directory file: path={{slurmsharedstatedir }} state=directory owner=slurm 
group=slurm mode=750 diff --git a/roles/slurm-common/tasks/installSlurmFromSource.yml b/roles/slurm-common/tasks/installSlurmFromSource.yml index 90816f18893593e92e11587eb71e88f307a3feef..9d1a326c634ede300ccbe6571b6123b88903cf50 100644 --- a/roles/slurm-common/tasks/installSlurmFromSource.yml +++ b/roles/slurm-common/tasks/installSlurmFromSource.yml @@ -1,10 +1,14 @@ - name: remove all install - shell: rm -rf /tmp/slurm-{{ slurm_version }} - become: true + file: + path: "/tmp/slurm-{{ slurm_version }}" + state: absent + become: true when: force_slurm_recompile is defined - name: remove all install - shell: rm -rf {{ slurm_dir }} + file: + path: "{{ slurm_dir }}" + state: absent become: true when: force_slurm_recompile is defined @@ -94,12 +98,17 @@ become: true - name: remove exist-slurm-latest-link - shell: rm -f /opt/slurm-latest + file: + path: /opt/slurm-latest + state: absent become: true when: force_slurm_recompile is defined or not stat_srun.stat.exists - name: put slurm-latest-link - shell: ln -s {{ slurm_dir }} /opt/slurm-latest + file: + src: "{{ slurm_dir }}" + dest: /opt/slurm-latest + state: link become: true when: force_slurm_recompile is defined or not stat_srun.stat.exists diff --git a/roles/slurmdb-config/tasks/main.yml b/roles/slurmdb-config/tasks/main.yml index 1a50bdaf5c99f0f6e91d4c1c017a22637f48a3da..c189183bab51ca97da66ddbae06aba5c73931bed 100644 --- a/roles/slurmdb-config/tasks/main.yml +++ b/roles/slurmdb-config/tasks/main.yml @@ -23,11 +23,16 @@ - name: create slurm group group: name=slurm system=yes gid=497 - sudo: true + become: true - name: create slurm user # this is duplicated from slurm-common - user: name=slurm group=slurm system=yes createhome=no uid=497 - sudo: true + user: + name: slurm + group: slurm + system: yes + createhome: no + uid: 497 + become: true - name: install slurmdb.conf copy: diff --git a/scripts/make_inventory.py b/scripts/make_inventory.py index 9e7997bcf72dc43b633fa3ae53d979bdea2acd06..48bd21d85e1a7314d0982d062227c33ac2b87783 100755 --- a/scripts/make_inventory.py +++ b/scripts/make_inventory.py @@ -34,9 +34,11 @@ def gatherInfo(md_key,md_value,authDict,project_id,inventory): if groupName not in inventory: inventory[groupName] = [] inventory[groupName].append(hostname) # Add other metadata + if not hostname in inventory['_meta']['hostvars']: + inventory['_meta']['hostvars'][hostname] = {} for md in server.metadata.items(): if md[0] not in (md_key,'ansible_host_groups'): - inventory['_meta']['hostvars'][hostname] = { md[0]:md[1] } + inventory['_meta']['hostvars'][hostname].update({ md[0]:md[1] }) if novaVolumes: volDict = {} for volume in novaVolumes: @@ -51,6 +53,8 @@ def gatherInfo(md_key,md_value,authDict,project_id,inventory): for nn in server.networks.keys(): if 'internal' in nn: network_name = nn + else: + inventory['_meta']['hostvars'][hostname]['public_host'] = server.networks[nn][0] if network_name == None: network_name = list(server.networks.keys())[0]
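The `make_inventory.py` changes above fix two things: the per-host metadata loop previously reassigned `inventory['_meta']['hostvars'][hostname]` on every iteration, so only the last metadata key survived, whereas it now accumulates them with `update()`; and the first address on any network whose name does not contain "internal" is now recorded as `public_host`. The inventory contents come straight from the server metadata set in the Heat templates (`ansible_host_groups`, `ansible_ssh_user`, `project_name`), so a quick way to see what the script will pick up for a stack is to query that metadata directly. A sketch, assuming an existing stack and sourced credentials; the `mgmt0` suffix follows the `list_join` naming in `gc_HOT.yaml`:

```bash
# List the stack's servers, then show one of them; the Heat-supplied metadata
# (ansible_host_groups, ansible_ssh_user, project_name) appears under "properties".
source ./$NECTAR_ALLOCATION-openrc.sh
openstack server list --name "$STACKNAME"
openstack server show "${STACKNAME}-mgmt0"
```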