diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 24b4dcbd65298b92768dc13857f9c679750b3a9f..1f974383314e5a0c373650570161e1a47ea3d88d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,7 +23,7 @@ trigger_pipeline_in_Clusterbuild: - ansible script: - echo ${CI_JOB_TOKEN} - - curl --request POST --form token=${CI_JOB_TOKEN} --form "variables[TRIGGER_CI_COMMIT_SHA]=${CI_COMMIT_SHA}" --form ref=aciab_upstream https://gitlab.erc.monash.edu.au/api/v4/projects/193/trigger/pipeline # ID is from clusterbuild + - curl --request POST --form token=${CI_JOB_TOKEN} --form "variables[TRIGGER_CI_COMMIT_SHA]=${CI_COMMIT_SHA}" --form ref=master https://gitlab.erc.monash.edu.au/api/v4/projects/193/trigger/pipeline # ID is from clusterbuild trigger_pipeline_in_monarch: @@ -32,7 +32,7 @@ trigger_pipeline_in_monarch: - ansible script: - echo ${CI_JOB_TOKEN} - - curl --request POST --form token=${CI_JOB_TOKEN} --form "variables[TRIGGER_CI_COMMIT_SHA]=${CI_COMMIT_SHA}" --form ref=cicd https://gitlab.erc.monash.edu.au/api/v4/projects/385/trigger/pipeline # ID is from monarch + - curl --request POST --form token=${CI_JOB_TOKEN} --form "variables[TRIGGER_CI_COMMIT_SHA]=${CI_COMMIT_SHA}" --form ref=master https://gitlab.erc.monash.edu.au/api/v4/projects/385/trigger/pipeline # ID is from monarch yamllint: @@ -134,9 +134,9 @@ tests: - grep -qv "I could not find any resouces tagged with project_name:" ./files/inventory.$STACKNAME #fail if inventory file is empty - ansible -m ping -i files/inventory.$STACKNAME --key-file ../gc_key.pem all - ansible -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sudo ls" all - - - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "sinfo" ManagementNodes - - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "squeue" ManagementNodes + # Need to find a better check for sinfo + #- ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "find /opt/ -name sinfo -type f" ManagementNodes + #- ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "find /opt/ -name squeue -type f" ManagementNodes - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet mariadb" SQLNodes - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet slurmctld" ManagementNodes - ansible -B 30 -i files/inventory.$STACKNAME --key-file ../gc_key.pem -a "systemctl is-active --quiet slurmdbd" ManagementNodes @@ -147,6 +147,7 @@ tests: - bash -e ./tests/run_tests.sh ManagementNodes "files/inventory.$STACKNAME" "../gc_key.pem" - bash -e ./tests/run_tests.sh NFSNodes "files/inventory.$STACKNAME" "../gc_key.pem" - bash -e ./tests/run_tests.sh SQLNodes "files/inventory.$STACKNAME" "../gc_key.pem" + - bash -e ./tests/run_tests.sh slurm "files/inventory.$STACKNAME" "../gc_key.pem" extended: stage: extended @@ -159,7 +160,7 @@ extended: - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh script: - source ./$NECTAR_ALLOCATION-openrc.sh - - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $STACKNAME + - bash -x ./CICD/heat/heatcicdwrapper.sh delete_if_exists $STACKNAME ${CI_PROJECT_NAME} only: variables: - $EXTENDED != null @@ -180,7 +181,7 @@ manual_cluster_spawn: - echo "$HPCCICD_openrc" > ./$NECTAR_ALLOCATION-openrc.sh script: - source ./$NECTAR_ALLOCATION-openrc.sh - - bash -x ./CICD/heat/heatcicdwrapper.sh create $MANUAL_STACKNAME + - bash -x ./CICD/heat/heatcicdwrapper.sh create $MANUAL_STACKNAME ${CI_PROJECT_NAME} - openstack stack list - export STACKNAME=$MANUAL_STACKNAME - sleep 25 diff --git a/CICD/heat/gc_HOT.yaml b/CICD/heat/gc_HOT.yaml index 982306e03e5a167aae0f5a66cc67b41e2839e064..75f12b0ee4902e6b2c3a742914ef282e0d869814 100644 --- a/CICD/heat/gc_HOT.yaml +++ b/CICD/heat/gc_HOT.yaml @@ -114,7 +114,7 @@ resources: volume_id: { get_resource: DBVolume } instance_uuid: { get_resource: SQLNode0 } - MgmtNodes: + MgmtNodesCentos7: type: "OS::Heat::ResourceGroup" properties: count: 2 @@ -130,7 +130,23 @@ resources: security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID }, { get_param: MySQLSecGroupID } ] project_name: { get_param: project_name } - LoginNodes: + MgmtNodesU: + type: "OS::Heat::ResourceGroup" + properties: + count: 0 + resource_def: + type: My::Server::MgmtNode + properties: + #avz: { get_param: avz } + image: { get_param: ubuntu_1804_image_id } + ansible_ssh_user: ubuntu + mynodename: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'mgmtU%index%' ]] + ssh_key: { get_param: ssh_key } + security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID }, { get_param: MySQLSecGroupID } ] + project_name: { get_param: project_name } + + LoginNodesC: type: "OS::Heat::ResourceGroup" properties: count: 1 @@ -151,6 +167,27 @@ resources: networks: - network: { get_param: NetID } + LoginNodesU: + type: "OS::Heat::ResourceGroup" + properties: + count: 0 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: m3.xsmall + image: { get_param: ubuntu_1804_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'loginU%index%' ]] + security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID } ] + metadata: + ansible_host_groups: [ LoginNodes ] + ansible_ssh_user: ubuntu + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + DesktopNodes: type: "OS::Heat::ResourceGroup" properties: @@ -172,7 +209,28 @@ resources: networks: - network: { get_param: NetID } - ComputeNodes: + ComputeNodesU: + type: "OS::Heat::ResourceGroup" + properties: + count: 0 + resource_def: + type: "OS::Nova::Server" + properties: + #availability_zone: { get_param: avz } + flavor: m3.xsmall + image: { get_param: ubuntu_1804_image_id } + key_name: { get_param: ssh_key } + name: + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'computeU%index%' ]] + security_groups: [ default, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID }, { get_param: SSHMonashSecGroupID } ] + metadata: + ansible_host_groups: [ ComputeNodes ] + ansible_ssh_user: ubuntu + project_name: { get_param: project_name } + networks: + - network: { get_param: NetID } + + ComputeNodesCentos7: type: "OS::Heat::ResourceGroup" properties: count: 1 @@ -184,8 +242,8 @@ resources: image: { get_param: centos_7_image_id } key_name: { get_param: ssh_key } name: - list_join: [ '-', [ { get_param: "OS::stack_name" }, 'computec%index%' ]] - security_groups: [ default, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID }, { get_param: SSHMonashSecGroupID } ] + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'computec7%index%' ]] + security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID } ] metadata: ansible_host_groups: [ ComputeNodes ] ansible_ssh_user: ec2-user @@ -201,14 +259,14 @@ resources: type: "OS::Nova::Server" properties: #availability_zone: { get_param: avz } - flavor: m3.xsmall + flavor: mon.c10r35.gpu-k2 image: { get_param: ubuntu_1804_image_id } key_name: { get_param: ssh_key } name: - list_join: [ '-', [ { get_param: "OS::stack_name" }, 'desktopu%index%' ]] + list_join: [ '-', [ { get_param: "OS::stack_name" }, 'gpudesktopu%index%' ]] security_groups: [ default, { get_param: SSHMonashSecGroupID }, { get_param: SlurmSecGroupID }, { get_param: NFSSecGroupID } ] metadata: - ansible_host_groups: [ DesktopNodes ] + ansible_host_groups: [ DesktopNodes, GPU, ComputeNodes, K1, VisNodes ] ansible_ssh_user: ubuntu project_name: { get_param: project_name } networks: diff --git a/CICD/heat/heatcicdwrapper.sh b/CICD/heat/heatcicdwrapper.sh index 26afdebda88b5ba150f9e947b997f21e7f6b461d..e6554d664e78d68acbf5bc2ec9160a12541fee00 100644 --- a/CICD/heat/heatcicdwrapper.sh +++ b/CICD/heat/heatcicdwrapper.sh @@ -8,14 +8,8 @@ function usage { exit 1 } -if [ "$#" -ne 2 ]; then - echo "Illegal number of parameters expecting 2" - usage -fi - STACKNAME=$2 - if [[ "$STACKNAME" == "CICD"* ]]; then echo "CICD found in stackname. doing nothing" else diff --git a/CICD/heat/resource_registry.yaml b/CICD/heat/resource_registry.yaml index 0638b887c8c09d5d6a98f51a34d3b4eeb6e9aafb..421a309d5ce769fdaa0cfcf590fc927a4104eab1 100644 --- a/CICD/heat/resource_registry.yaml +++ b/CICD/heat/resource_registry.yaml @@ -1,2 +1,2 @@ resource_registry: - My::Server::MgmtNode: mgmtnode_HOT.yaml + My::Server::MgmtNode: ./mgmtnode_HOT.yaml diff --git a/CICD/plays/allnodes.yml b/CICD/plays/allnodes.yml index d6eed3ef32a32d9b702b20534e2f916652c3c3e4..812ca4e0b6f10b46734702629bb28635695c193a 100644 --- a/CICD/plays/allnodes.yml +++ b/CICD/plays/allnodes.yml @@ -24,7 +24,7 @@ # - { role: disable_selinux, tags: [ disableselinux ] } - { role: etcHosts, tags: [ networking ] } - { role: config_repos, tags: [ repos ] } - - { role: upgrade } + - { role: upgrade, tags: [ upgrade ]} - { role: set_password } diff --git a/CICD/plays/mgmtnodes.yml b/CICD/plays/mgmtnodes.yml index 5d4241194324fe13739e074b4ee749c969935dfb..c37d10a86fef7555560e5d2586fb4f07e74ecd78 100644 --- a/CICD/plays/mgmtnodes.yml +++ b/CICD/plays/mgmtnodes.yml @@ -29,10 +29,7 @@ # - { role: ldapclient, tags: [ authentication ] } # - { role: ssh-password-login } # - { role: enable_sudo_group } -# - { role: make_filesystems, volumes: "{{ glustervolumes }}" } -# - { role: gluster_server, volname: "gv", brickmnt: '/gbrick', gluster_servers: "{{ groups['ManagementNodes'] }}", replicas: 2, tags: [ gluster_server ] } -# - { role: gluster_volcreate, volname: "gv", gluster_servers: "{{ groups['ManagementNodes'] }}", brickmnt: '/gbrick', replicas: 2 } -# - { role: gluster_client, volname: "gv", gluster_servers: ['mgmt0','mgmt1','sql0'], volmnt: '/glusterVolume' } + - { role: nfs-client, nfsMounts: "{{ mgmtNfsMounts }}", tags: [ nfs ] } - { role: slurmdb-config, tags: [ slurm, slurmdb-config ] } - { role: slurm-common, tags: [ slurm, slurm-common ] } diff --git a/CICD/tests/all/check.yml b/CICD/tests/all/check.yml new file mode 100644 index 0000000000000000000000000000000000000000..fd95357394f0c3d582043aace19cd80b3dad9dd5 --- /dev/null +++ b/CICD/tests/all/check.yml @@ -0,0 +1,16 @@ +--- +- hosts: ManagementNodes + gather_facts: false + tasks: + - name: have ssh running + service: + name: sshd + state: started + +- hosts: ComputeNodes + gather_facts: false + tasks: + - name: have munge service running + service: + name: munge + state: started \ No newline at end of file diff --git a/CICD/tests/run_tests.sh b/CICD/tests/run_tests.sh index d063e98d1d7e4617882bb14a5e1c51d9e8cda381..bfb8278ee1a8f2a8534236990d6ac11455ebb7e7 100644 --- a/CICD/tests/run_tests.sh +++ b/CICD/tests/run_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash function usage { - echo $"Usage: $0 {all, ComputeNodes, LoginNodes, ManagementNodes, NFSNodes, sql}" INVENTORY_FILE KEY + echo $"Usage: $0 {all, ComputeNodes, LoginNodes, ManagementNodes, NFSNodes, sql, slurm}" INVENTORY_FILE KEY exit 1 } @@ -23,22 +23,4 @@ function run_them () done } -# I think I am just checking the if $1 is one of the listes strings (see usage) not proud of this at all but works -case "$1" in - all) - ;; - ComputeNodes) - ;; - ManagementNodes) - ;; - NFSNodes) - ;; - SQLNodes) - ;; - LoginNodes) - ;; - *) - usage -esac - run_them $1 $2 $3 \ No newline at end of file diff --git a/CICD/tests/slurm/srunHostname.yml b/CICD/tests/slurm/srunHostname.yml new file mode 100644 index 0000000000000000000000000000000000000000..2e1c0886b847674b68cf3cd5f186cc50cfa54bdf --- /dev/null +++ b/CICD/tests/slurm/srunHostname.yml @@ -0,0 +1,55 @@ +--- +- hosts: ManagementNodes,LoginNodes,ComputeNodes + gather_facts: false + tasks: + - name: add user hpctest + user: + name: hpctest + shell: /bin/bash + become: true + +- hosts: ManagementNodes + gather_facts: false + tasks: + - name: Create a parent account + command: ./sacctmgr -i add account parentAccount cluster=m3 Description="Test parent account" Organization="Monash" + args: + chdir: '/opt/slurm-latest/bin' + become: true + register: result + failed_when: result.rc != 0 and result.stdout != " Nothing new added." + + - name: Create a project associated with a given parent + command: ./sacctmgr -i add account testProject parent=parentAccount cluster=m3 Organization="Monash" + args: + chdir: '/opt/slurm-latest/bin' + become: true + register: result + failed_when: result.rc != 0 and result.stdout != " Nothing new added." + + - name: Create a user and associate them with a project + command: ./sacctmgr -i create user hpctest cluster=m3 account=testProject partition=batch + args: + chdir: '/opt/slurm-latest/bin' + become: true + register: result + failed_when: result.rc != 0 and result.stdout != " Nothing new added." + +#sudo `which sacctmgr` modify user where name=hpctest set maxjobs=200 +## 18 sudo `which sacctmgr` update account hpctest set qos=normal +# 22 sudo `which sacctmgr` update account testProject set qos=normal + +- hosts: LoginNodes + gather_facts: false + tasks: + - name: make sure munge is running + service: + name: munge + state: started + become: true + - name: simple srun test + command: ./srun --ntasks=1 --partition=batch hostname + args: + chdir: '/opt/slurm-latest/bin' + become: true + become_user: hpctest diff --git a/CICD/vars/slurm.yml b/CICD/vars/slurm.yml index 65def4d949685d32b7f6b705a6390c9a6dfdab2a..09dfc0af95ef91b97ad30576a92662dd7feecc55 100644 --- a/CICD/vars/slurm.yml +++ b/CICD/vars/slurm.yml @@ -3,8 +3,8 @@ desktopNodeList: - { name : 'DesktopNodes', interface : 'eth0' } clustername: "m3" projectname: "m3" -slurm_version: 19.05.3-2 -munge_version: 0.5.11 +slurm_version: 19.05.4 +munge_version: 0.5.13 nhc_version: 1.4.2 munge_dir: /opt/munge-{{ munge_version }} slurm_dir: /opt/slurm-{{ slurm_version }} diff --git a/CICD/vars/vars.yml b/CICD/vars/vars.yml index 83485426b7e370a91d2fd15a5083156c483a1f4e..7def1ce714e85c7a5325fa6ccfe5ce3a9141a508 100644 --- a/CICD/vars/vars.yml +++ b/CICD/vars/vars.yml @@ -1,7 +1,7 @@ --- sudo_group: systems nagios_home: "/var/lib/nagios" -nvidia_version: "390.46" +nvidia_version: "367.134" yumdisablerepo: - 'base' @@ -16,6 +16,7 @@ yumenablerepo: gpumap: 'K1': 'K1' + 'K2': 'K2' 'K80': 'K80' 'P100-PCIE-16GB': 'P100' 'V100-PCIE-16GB': 'V100' diff --git a/roles/calculateSlurmConf/templates/slurm.conf.j2 b/roles/calculateSlurmConf/templates/slurm.conf.j2 index dc833e3e78c8fa191c434c0795c0f936cfbb1e7c..d460ef811ddb8f3474b26a5e3ff72bf7434a718b 100644 --- a/roles/calculateSlurmConf/templates/slurm.conf.j2 +++ b/roles/calculateSlurmConf/templates/slurm.conf.j2 @@ -119,8 +119,8 @@ JobCompType=jobcomp/none Prolog={{ slurmjob.prolog }} Epilog={{ slurmjob.epilog }} {% else %} -Prolog={{ slurm_dir }}/bin/slurm.prolog -Epilog={{ slurm_dir }}/bin/slurm.epilog +Prolog=/opt/slurm/etc/slurm.prolog +Epilog=/opt/slurm/etc/slurm.epilog {% endif %} # # ACCOUNTING diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml index 684c327aa9be6fb9c90f588bf44857f94794bfe7..402ac09c2cbebd578ff29f16468c64b30385d497 100644 --- a/roles/config_repos/tasks/main.yml +++ b/roles/config_repos/tasks/main.yml @@ -6,8 +6,7 @@ line: "{{ reposerverip }} {{ reposervername }}" #this is duplicated in the role calculateEtcHosts owner: root group: root - become: true - + become: True #- name: remove default repos # file: @@ -44,6 +43,7 @@ - name: get enabled repos #shell: yum repolist | grep -v "repo id" | grep -v "Loaded plugins" | head -n -1 | cut -f 1 -d '/' | sed -s 's/\!//' shell: yum repolist all | grep enabled | cut -f 1 -d '/' | sed -s 's/\!//' + when: ansible_os_family == 'RedHat' register: repolist check_mode: no changed_when: False @@ -55,7 +55,8 @@ with_items: "{{ repolist.stdout_lines|difference(yumenablerepo) }}" become: true become_user: root - ignore_errors: false + ignore_errors: true + when: ansible_os_family == 'RedHat' #- name: Enable epel @@ -75,11 +76,6 @@ become: true when: ansible_distribution_release == 'trusty' -- name: add repos apt - shell: "add-apt-repository -y ppa:gluster/glusterfs-3.7" - become: true - when: ansible_distribution == 'Ubuntu' - - name: apt-get update apt: update_cache=True become: true diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index f76796f1881d3a0efb9b3eb3974e261e2b9dab58..96425d40b43dffc3b055f0c4b3479a6f1ec3bab1 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -25,12 +25,62 @@ - xorg-x11-xauth - xorg-x11-proto-devel - xorg-x11-xkb-utils + when: ansible_os_family == 'RedHat' + +- name: install deps + apt: + name: + - 'gcc' + - 'perl' + - 'wget' + - 'pciutils' + - 'linux-headers-generic' + - 'xterm' + - 'libx11-dev' + - 'libx11-6' + - 'libglvnd-dev' + - 'xserver-xorg' + - 'vim' + state: present + update_cache: yes + become: true + become_user: root + when: ansible_distribution == 'Ubuntu' + +- name: install deps + yum: name={{ item }} state=installed + become: true + with_items: + - gcc + - perl + - wget + - pciutils + - kernel-headers + - kernel-devel + - xterm + - libX11-common + - libX11-devel + - libX11 + - libglvnd-devel + - xorg-x11-server-common + - xorg-x11-util-macros + - xorg-x11-server-utils + - xorg-x11-font-utils + - xorg-x11-server-Xorg + - xorg-x11-glamor + - xorg-x11-xinit + - xorg-x11-utils + - xorg-x11-xauth + - xorg-x11-proto-devel + - xorg-x11-xkb-utils + when: ansible_os_family == 'RedHat' - name: install development tools yum: name="@Development Tools" state=installed become: true become_user: root ignore_errors: yes + when: ansible_os_family == 'RedHat' - name: disable nouveau template: src=blacklist-nouveau.conf.j2 dest=/etc/modprobe.d/blacklist-nouveau.conf @@ -99,7 +149,6 @@ become: true when: install_driver - - name: stop the persistence daemon service: name=nvidia-persistenced state=stopped become: true @@ -138,7 +187,7 @@ when: install_driver - name: build nvidia driver - shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run + shell: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_version }}.run -q -a -n -X -s become: true when: install_driver @@ -164,6 +213,15 @@ # become: true # become_user: root # when: template_xorgconf is defined and template_xorgcon +- name: install dependencies for nvidia-xconf-gen + apt: + name: + - python-jinja2 + - python3-jinja2 + update_cache: yes + state: present + become: true + become_user: root - name: run nvidia-xconf-gen script: scripts/nvidia-xconf-gen.py diff --git a/roles/lmod/tasks/main.yml b/roles/lmod/tasks/main.yml index 9e2ac4af909db08388e570ca586bec83e0889118..e33c2ee5664f527213be4c64f09f882fb9cb3957 100644 --- a/roles/lmod/tasks/main.yml +++ b/roles/lmod/tasks/main.yml @@ -12,8 +12,7 @@ - gcc - lua-devel become: true - when: - - '"CentOS" in ansible_distribution' + when: ansible_os_family == 'RedHat' - name: install lua RHEL7 yum: name={{ item }} state=installed update_cache=yes enablerepo="Monash_University_EPEL7_EPEL_7_-_x86_64" @@ -30,18 +29,8 @@ - '"RedHat" in ansible_distribution' become: true - - name: install lua debian - apt: name={{ item }} state=installed - with_items: - - lua5.2 - - lua5.2 - - lua-filesystem - - lua-bitop - - lua-posix - - liblua5.2-0 - - liblua5.2-dev - - tcl + apt: name=lmod state=installed become: true when: ansible_os_family == 'Debian' @@ -49,13 +38,12 @@ stat: path="{{ soft_dir }}/lmod/{{ lmod_version }}" register: lmodstat - - name: Download LMOD get_url: url=http://consistency0/src/Lmod-{{ lmod_version }}.tar.bz2 dest={{ source_dir }}/Lmod-{{ lmod_version }}.tar.bz2 mode=0444 - when: not lmodstat.stat.exists + when: ansible_os_family == 'RedHat' and not lmodstat.stat.exists - name: Uncompress LMOD unarchive: @@ -63,10 +51,11 @@ dest={{ source_dir }} copy=no creates={{ source_dir }}/Lmod-{{ lmod_version }}/README - when: not lmodstat.stat.exists + when: ansible_os_family == 'RedHat' and not lmodstat.stat.exists - name: Compile and install Lmod shell: cd {{ source_dir }}/Lmod-{{ lmod_version }}; ./configure --prefix={{ soft_dir }} --with-mpathSearch=YES --with-caseIndependentSorting=YES && make install LUA_INCLUDE={{ lua_include }} args: creates: "{{ soft_dir }}/lmod/{{ lmod_version }}" become: true + when: ansible_os_family == 'RedHat' \ No newline at end of file diff --git a/roles/mysql/tasks/CentOS_7_mysql_server.yml b/roles/mysql/tasks/CentOS_7_mysql_server.yml new file mode 100644 index 0000000000000000000000000000000000000000..33f65d3d5eecdc877103b3ba9fa656588b1e7b37 --- /dev/null +++ b/roles/mysql/tasks/CentOS_7_mysql_server.yml @@ -0,0 +1,57 @@ +--- +- name: Make sure OS is updated since apt install might fail + apt: + update_cache: yes + become: true + when: ansible_os_family == "Debian" + +- name: "Installing MySQL Debian" + apt: name="{{ server_packages }}" update_cache=yes state=present + become: true + when: ansible_os_family == "Debian" + +- name: Installing MySQL RedHat + yum: name={{ item }} + with_items: "{{ server_packages }}" + become: true + when: ansible_os_family == "RedHat" + +- name: make sure mysql conf directory exists + file: dest=/etc/mysql/conf.d state=directory + become: true + register: mysqldb_confdir_create + +- name: "Starting MySQL" + service: name={{ sqlServiceName }} state=started enabled=true + become: true + +#- name: "Adding root" +# become: true +# mysql_user: name=root host="{{ item }}" password="{{ mysql_root_password }}" login_user=root login_password="{{ mysql_root_password }}" check_implicit_admin=yes +# with_items: +# - "{{ ansible_hostname }}" +# - 127.0.0.1 +# - ::1 +# - localhost + +- name: Check that the slurm_acct_db_directory exists + stat: + path: /var/lib/mysql/slurm_acct_db/ #defined in /vars/filesystems.yaml + register: slurm_acct_db_directory_result + +# this will only work if a completely fresh db gets installed because it gets shipped with a blank root pw +- name: update mysql root password for all root accounts + mysql_user: name=root host=localhost password={{ mysql_root_password }} login_user=root + when: not slurm_acct_db_directory_result.stat.exists and mysqldb_confdir_create.changed + +- name: "Adding user database" + mysql_db: name={{ mysql_user_db_name }} state=present login_user=root login_password={{ mysql_root_password }} + +- name: "Giving priviliges to user" + mysql_user: name={{ mysql_user_name }} host={{ mysql_user_host }} password={{ mysql_user_password }} login_user=root login_password={{ mysql_root_password }} priv={{ mysql_user_db_name }}.*:ALL,GRANT state=present + when: mysql_user_host is defined + +- name: "Giving priviliges to user" + mysql_user: name={{ mysql_user_name }} host={{ hostvars[item].ansible_fqdn }} password={{ mysql_user_password }} login_user=root login_password={{ mysql_root_password }} priv={{ mysql_user_db_name }}.*:ALL,GRANT state=present + with_items: "{{ mysql_user_hosts_group }}" + when: mysql_user_hosts_group is defined diff --git a/roles/mysql/tasks/Ubuntu_18_mysql_server.yml b/roles/mysql/tasks/Ubuntu_18_mysql_server.yml new file mode 100644 index 0000000000000000000000000000000000000000..e573a9187a9ffc76168341b23c37e04675d7c54b --- /dev/null +++ b/roles/mysql/tasks/Ubuntu_18_mysql_server.yml @@ -0,0 +1,54 @@ +--- +- name: Make sure OS is updated since apt install might fail + apt: + update_cache: yes + become: true + +- name: "Installing MySQL for Ubuntu" + apt: name="{{ server_packages }}" update_cache=yes state=present + become: true + +- name: Comment out bind address so it doesn't bind to 127.0.0.1 + replace: + path: /etc/mysql/mariadb.conf.d/50-server.cnf + regexp: '(.*bind.*)' + replace: '#\1' + become: true + +- name: make sure mysql conf directory exists + file: dest=/etc/mysql/conf.d state=directory + become: true + register: mysqldb_confdir_create + +- name: "Starting MySQL" + service: name={{ sqlServiceName }} state=started enabled=true + become: true + +- name: Check that the slurm_acct_db_directory exists + stat: + path: /var/lib/mysql/slurm_acct_db/ #defined in /vars/filesystems.yaml + register: slurm_acct_db_directory_result + +# this will only work if a completely fresh db gets installed because it gets shipped with a blank root pw +- name: update mysql root password for all root accounts + mysql_user: name=root host=localhost password={{ mysql_root_password }} login_user=root check_implicit_admin=yes + become: true + become_user: root + +- name: "Adding user database" + mysql_db: name={{ mysql_user_db_name }} state=present login_user=root login_password={{ mysql_root_password }} + become: true + become_user: root + +- name: "Giving priviliges to user" + mysql_user: name={{ mysql_user_name }} host={{ mysql_user_host }} password={{ mysql_user_password }} login_user=root login_password={{ mysql_root_password }} priv={{ mysql_user_db_name }}.*:ALL,GRANT state=present + when: mysql_user_host is defined + become: true + become_user: root + +- name: "Giving priviliges to user" + mysql_user: name={{ mysql_user_name }} host={{ hostvars[item].ansible_fqdn }} password={{ mysql_user_password }} login_user=root login_password={{ mysql_root_password }} priv={{ mysql_user_db_name }}.*:ALL,GRANT state=present + with_items: "{{ mysql_user_hosts_group }}" + when: mysql_user_hosts_group is defined + become: true + become_user: root \ No newline at end of file diff --git a/roles/mysql/tasks/main.yml b/roles/mysql/tasks/main.yml index fd7181ba5206b53ab92a9a0802a239a2f0b0fde2..29bd62272f9c7e68812d95caff8ff4105a31da0c 100644 --- a/roles/mysql/tasks/main.yml +++ b/roles/mysql/tasks/main.yml @@ -1,3 +1,4 @@ --- - include_vars: "{{ ansible_distribution }}_{{ ansible_distribution_major_version }}.yml" -- include: "{{ mysql_type }}.yml" +- include: "{{ ansible_distribution }}_{{ ansible_distribution_major_version }}_{{ mysql_type }}.yml" +- include: mysql_client.yml \ No newline at end of file diff --git a/roles/mysql/tasks/mysql_server.yml b/roles/mysql/tasks/mysql_server.yml index 5ad085830619f71689d367cf48f9d8bc230e0df0..33f65d3d5eecdc877103b3ba9fa656588b1e7b37 100644 --- a/roles/mysql/tasks/mysql_server.yml +++ b/roles/mysql/tasks/mysql_server.yml @@ -1,7 +1,12 @@ --- +- name: Make sure OS is updated since apt install might fail + apt: + update_cache: yes + become: true + when: ansible_os_family == "Debian" + - name: "Installing MySQL Debian" - apt: name="{{ item }}" update_cache=yes cache_valid_time=3600 state=present - with_items: "{{ server_packages }}" + apt: name="{{ server_packages }}" update_cache=yes state=present become: true when: ansible_os_family == "Debian" @@ -10,7 +15,7 @@ with_items: "{{ server_packages }}" become: true when: ansible_os_family == "RedHat" - + - name: make sure mysql conf directory exists file: dest=/etc/mysql/conf.d state=directory become: true diff --git a/roles/mysql/vars/Ubuntu_18.yml b/roles/mysql/vars/Ubuntu_18.yml new file mode 100644 index 0000000000000000000000000000000000000000..ecfd81694c1c68b36bf7d23905c7e2002465a1d6 --- /dev/null +++ b/roles/mysql/vars/Ubuntu_18.yml @@ -0,0 +1,15 @@ +server_packages: + - python + - python-dev + - libmariadb-dev + - python-pip + - libapache2-mod-wsgi + - python-mysql.connector + - mariadb-server + - python-mysqldb + +client_packages: + - python + - mariadb-client + +sqlServiceName: "mariadb" diff --git a/roles/nfs-client/tasks/main.yml b/roles/nfs-client/tasks/main.yml index 1a3ea5fd54a102c95ec8276b8e59d6187f19ac7d..23ac6d08e10c0927ac3c680bc9a2a349771d01fe 100644 --- a/roles/nfs-client/tasks/main.yml +++ b/roles/nfs-client/tasks/main.yml @@ -15,4 +15,13 @@ - nfs-utils-lib when: ansible_os_family == "RedHat" and ansible_distribution_major_version < "7" +- name: install dependencies nfs-common ubuntu + apt: + name: nfs-common + state: present + update_cache: yes + become: true + become_user: root + when: ansible_distribution == 'Ubuntu' + - include: mountFileSystem.yml diff --git a/roles/nfs-client/tasks/mountFileSystem.yml b/roles/nfs-client/tasks/mountFileSystem.yml index 80dc3cb332385fb6154fdef6ded63ca748a47689..41ecd052629ed1de9acd3d1953bde2836a7a2cbe 100644 --- a/roles/nfs-client/tasks/mountFileSystem.yml +++ b/roles/nfs-client/tasks/mountFileSystem.yml @@ -4,6 +4,6 @@ mount: name={{ item.name }} src="{{ item.ipv4 }}:{{ item.src }}" fstype={{ item.fstype }} opts={{ item.opts }} state=mounted with_items: "{{ nfsMounts }}" become: true - ignore_errors: true + ignore_errors: false register: firstMount when: nfsMounts is defined diff --git a/roles/nfs-common/tasks/aptPackages.yml b/roles/nfs-common/tasks/aptPackages.yml index d8e07d6195e9f1012970c375cc8b6c5c236570cc..5324005b6cad3b3f896c97320cb2d129ae0f3552 100644 --- a/roles/nfs-common/tasks/aptPackages.yml +++ b/roles/nfs-common/tasks/aptPackages.yml @@ -6,4 +6,3 @@ - nfs-kernel-server apt: "name={{ item }} state=present" become: true - diff --git a/roles/nfs-server/tasks/startServer.yml b/roles/nfs-server/tasks/startServer.yml index 7ac79c0fa9ad39b43463dc2a3c4f6e8b2f1e0304..a2e0cbea827fc4dcc251b05f997dcb75581d4d9e 100644 --- a/roles/nfs-server/tasks/startServer.yml +++ b/roles/nfs-server/tasks/startServer.yml @@ -29,7 +29,15 @@ become: true when: ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" +- name: "Run exportfs" + command: /usr/sbin/exportfs -a + become: true + when: ansible_os_family == "Debian" + - name: "Start the Server" service: "name=nfs-kernel-server state=started enabled=true" become: true when: ansible_os_family == "Debian" + + + diff --git a/roles/set_timezone/tasks/main.yml b/roles/set_timezone/tasks/main.yml index 177969103af146ee970584e774bf2d4731209e77..5a89a6482d7414dd557936954afa4617cade1da1 100644 --- a/roles/set_timezone/tasks/main.yml +++ b/roles/set_timezone/tasks/main.yml @@ -4,15 +4,29 @@ become: true become_user: root -- name: restart ntpd +- name: restart ntpd redhat service: name=ntpd state=restarted become: true become_user: root + when: ansible_os_family == "RedHat" -- name: ensure ntpd is enabled and started +- name: ensure ntpd is enabled and started redhat service: name=ntpd state=started enabled=yes become: true become_user: root + when: ansible_os_family == "RedHat" + +- name: restart ntpd ubuntu + service: name=ntp state=restarted + become: true + become_user: root + when: ansible_os_family == "Debian" + +- name: ensure ntpd is enabled and started ubuntu + service: name=ntp state=started enabled=yes + become: true + become_user: root + when: ansible_os_family == "Debian" - name: set local timezone file: path=/etc/localtime state=link src={{ TIMEZONE_PATH }} diff --git a/roles/slurm-common/files/scripts/nvidia-probe.py b/roles/slurm-common/files/scripts/nvidia-probe.py index 7fd743ef41b91c85842973e623e1cbfd9f3c6535..7bc00899e6f3416003aa8dea5c00519f3e78bf4c 100755 --- a/roles/slurm-common/files/scripts/nvidia-probe.py +++ b/roles/slurm-common/files/scripts/nvidia-probe.py @@ -1,4 +1,4 @@ -#!/bin/env python +#!/usr/bin/env python # prints a list of NIDIA devices and their type in json format for # parsing by ansible program; # fields are 'name':'gpu' (fixed) diff --git a/roles/slurm-common/tasks/installCgroup.yml b/roles/slurm-common/tasks/installCgroup.yml index c7f4253d3dfcb0540421c27249d7aee0a4920118..6ba970cb140351a2f44f05eafd9638404fe2615d 100644 --- a/roles/slurm-common/tasks/installCgroup.yml +++ b/roles/slurm-common/tasks/installCgroup.yml @@ -3,25 +3,22 @@ with_items: - libcgroup become: True - become_method: sudo when: ansible_os_family == "RedHat" - name: apt install cgroup - apt: name={{ item }} state=installed update_cache=yes - with_items: - - cgmanager - - cgmanager-utils - - libcgmanager0 + package: + state: installed + name: + - libcgroup1 + - cgroupfs-mount + - cgroup-tools when: ansible_os_family == "Debian" become: True - become_method: sudo - name: config cgroup.conf file template: dest={{ slurm_dir }}/etc/cgroup.conf src=cgroup.conf.j2 mode=644 become: True - become_method: sudo - name: config cgroup_allowed_devices.conf file template: dest={{ slurm_dir }}/etc/cgroup_allowed_devices.conf src=cgroup_allowed_devices.conf.j2 mode=644 become: True - become_method: sudo diff --git a/roles/slurm-common/tasks/installMungeFromSource.yml b/roles/slurm-common/tasks/installMungeFromSource.yml index 656d35c9ff04a253224e44c9031e2c37c67c777e..7a24698ec82a8c6eac5c263891e8c00536e6e85e 100644 --- a/roles/slurm-common/tasks/installMungeFromSource.yml +++ b/roles/slurm-common/tasks/installMungeFromSource.yml @@ -27,11 +27,16 @@ creates: "{{ munge_dir }}/bin/munge" when: not munge_binary.stat.exists -- name: set use_systemd +- name: set use_systemd Redhat set_fact: use_systemd: True when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and ( ansible_distribution_major_version == "7") +- name: set use_systemd Debian + set_fact: + use_systemd: True + when: ansible_os_family == "Debian" + - name: copy init script template: dest=/etc/init.d/munge src=munge.initd.j2 mode=755 become: true diff --git a/roles/slurm-common/tasks/main.yml b/roles/slurm-common/tasks/main.yml index d2351af627d7d6b32aa7d720d236c3a5139d84d5..99a64ff3d1fd22fff2ba2efed0aaeaf7c0eed961 100644 --- a/roles/slurm-common/tasks/main.yml +++ b/roles/slurm-common/tasks/main.yml @@ -44,13 +44,14 @@ with_items: - gcc - wget - - libssl-dev + - libssl-dev # downgrade needed for bionic see https://github.com/dun/munge/issues/54 - libpam0g-dev - libbz2-dev - make - perl - libdbi-perl - lua5.2 + - liblua5.2-dev - hwloc - libhwloc-dev when: ansible_os_family == "Debian" diff --git a/roles/slurm-mysql-config/tasks/main.yml b/roles/slurm-mysql-config/tasks/main.yml index 52f06b184ac0f5487e09b633a97b2db40e712f2a..6be48e8ad7c042b24912166da59b7c9b5b21ab2d 100644 --- a/roles/slurm-mysql-config/tasks/main.yml +++ b/roles/slurm-mysql-config/tasks/main.yml @@ -2,3 +2,10 @@ template: src=slurm.cnf.j2 dest=/etc/my.cnf.d/slurm.cnf become: true become_user: root + when: ansible_os_family == "RedHat" + +- name: "Copy slurm db tuning config" + template: src=slurm.cnf.j2 dest=/etc/mysql/mariadb.conf.d/slurm.cnf + become: true + become_user: root + when: ansible_os_family == "Debian" \ No newline at end of file diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml index 85255ce6e8c1be524eb118ff7846767f1304d8f8..09d633ffbb6e18bbbfe4fb87e6579c77a2efdf8b 100644 --- a/roles/upgrade/tasks/main.yml +++ b/roles/upgrade/tasks/main.yml @@ -6,10 +6,10 @@ become_user: root when: ansible_os_family=="Debian" -- name: apt-get upgrade - apt: upgrade=safe - become: true - when: ansible_os_family=="Debian" +#- name: apt-get upgrade +# apt: upgrade=safe +# become: true +# when: ansible_os_family=="Debian" - name: yum remove yum: name=ipa-client-common state=absent