Patch summary (text before the first "diff --git" header is commentary and
is not part of any hunk).

What the patch changes:
  * CICD/master_playbook.yml: gives every imported playbook its own tag and
    terminates the file with a newline.
  * CICD/plays/mgmtnodes.yml: turns on fact gathering; when "vdb" is not in
    ansible_devices, removes the /mnt mount of /dev/vdb and recreates /mnt as
    a root-owned directory; passes slurmd_enabled, start_slurmd,
    use_glusterfs and EMAIL_DEST to the slurm-start role.
  * CICD/vars/*: new address for the /usr/local NFS export, example LDAP
    passwords, slurm_version 20.02.6 and select/cons_tres.
  * roles/calculateSlurmConf/templates/slurm.conf.j2: adds
    SlurmctldParameters=enable_configless, enables RebootProgram and
    GresTypes=gpu, removes the FastSchedule line.
  * roles/slurm-common: UCX directory and "latest" link owned by root;
    configure is given an explicit --with-ucx path.
  * roles/slurm-start: the glusterfs health-check script and its ExecStartPre
    hook are only used when use_glusterfs is true; enabling and starting
    slurmctld are now two separate tasks.

Review fixes folded into the hunks below:
  * UCX directory/link mode is 'u=rwx,g=rx,o=rx', the symbolic form of the
    '0755' it replaces (u=rw would remove the owner's search permission).
  * The SelectTypeParameters guard uses "in" and accepts both cons_res and
    cons_tres, so select/cons_res keeps working as it did before the patch.

NOTE(review): the blob ids on the "index" lines predate the review fixes.
NOTE(review): leading whitespace of context lines and the position of blank
context lines were reconstructed -- TODO confirm against the tree, or apply
with "git apply --ignore-whitespace".

diff --git a/CICD/master_playbook.yml b/CICD/master_playbook.yml
index 17c90a33d6a318ae5dad43502dc96f4e3fa772f1..07cd51e8647893c94c37c9d83a386e27be446998 100644
--- a/CICD/master_playbook.yml
+++ b/CICD/master_playbook.yml
@@ -1,9 +1,18 @@
 ---
 - import_playbook: plays/make_files.yml
+  tags: [make_files]
 - import_playbook: plays/allnodes.yml
+  tags: [allnodes]
 - import_playbook: plays/init_slurmconf.yml # this requires management nodes
+  tags: [init_slurm]
 - import_playbook: plays/nfssqlnodes.yml
+  tags: [nfssql]
 - import_playbook: plays/mockldap.yml
+  tags: [mockldap]
 - import_playbook: plays/mgmtnodes.yml
+  tags: [mgmtnodesplaybook]
 - import_playbook: plays/computenodes.yml
-- import_playbook: plays/loginnodes.yml
\ No newline at end of file
+  tags: [computenodesplaybook]
+- import_playbook: plays/loginnodes.yml
+  tags: [loginnodesplaybook]
+
diff --git a/CICD/plays/mgmtnodes.yml b/CICD/plays/mgmtnodes.yml
index 838ae35575bb70f73b8dc4597c2849e773beed25..9aee41f94f94cc53a7b08696f13bd250574c2d76 100644
--- a/CICD/plays/mgmtnodes.yml
+++ b/CICD/plays/mgmtnodes.yml
@@ -3,6 +3,7 @@
 
 #
 - hosts: 'ManagementNodes'
+  gather_facts: True
   vars_files:
   - vars/passwords.yml
   - vars/names.yml
@@ -12,7 +13,10 @@
   - vars/vars.yml
   - vars/vars_centos78.yml
   tasks:
-  # - { name: set hostgroup, set_fact: hostgroup='ManagementNodes' }
+  - { name: unmount vdb if absent, mount: { path: "/mnt", src: "/dev/vdb", state: absent},
+      when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined', become: true }
+  - { name: keep mnt present, file: { path: "/mnt", owner: root, group: root, mode: "u=rwx,g=rx,o=rx", state: directory},
+      when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined', become: true }
   - { name: set use shared state, set_fact: usesharedstatedir=True }
   tags: [ always ]
 
@@ -35,7 +39,7 @@
     - { role: slurmdb-config, tags: [ slurm, slurmdb-config ] }
     - { role: slurm-common, tags: [ slurm, slurm-common ] }
     - { role: slurm_config, tags: [ slurm, slurm-config ] }
-    - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, tags: [ slurm-start ] }
+    - { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, slurmd_enabled: False, start_slurmd: False, use_glusterfs: False, EMAIL_DEST: "nobody@nowhere.com", tags: [ slurm-start ] }
     - { role: telegraf, telegraf_install_rpm_url: 'http://consistency0/src/telegraf-1.12.6-1.x86_64.rpm', tags: [ monitoring, SiteSpecific ] }
     # - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] }
     # - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] }
diff --git a/CICD/vars/filesystems.yml b/CICD/vars/filesystems.yml
index 62d917425c4565d5653797e41947f98b2987375f..268b6f1099838a03ffe404941397f9e8a0ccade3 100644
--- a/CICD/vars/filesystems.yml
+++ b/CICD/vars/filesystems.yml
@@ -1,7 +1,7 @@
 ---
 computeNfsMounts:
   - { name: '/home', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/home", 'opts': 'defaults,nofail', 'fstype':'nfs4' }
-  - { name: '/usr/local', ipv4: "118.138.235.37", src: "/usr_local", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' }
+  - { name: '/usr/local', ipv4: "118.138.235.55", src: "/usr_local", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' }
   - { name: '/projects', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/projects", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' }
   - { name: '/scratch', ipv4: "{{ groups['NFSNodes'][0] }}", src: "/nfsvol/scratch", 'opts': 'defaults,rw,nofail', 'fstype':'nfs4' }
 mgmtNfsMounts:
diff --git a/CICD/vars/passwords.yml b/CICD/vars/passwords.yml
index 393458dd312fdb17c0909d1729a681d8816e7c68..a522c96015c86f9d71e43aedfe38d69cda8a0b80 100644
--- a/CICD/vars/passwords.yml
+++ b/CICD/vars/passwords.yml
@@ -5,6 +5,7 @@ sqlrootPasswd: EXAMPLESQLROOTPASSWORD
 sudo_group: systems
 default_user_password_clear: EXAMPLEDEFAULTUSERPASSWORDCLEAR
 default_user_password: EXAMPLEDEFAULTUSERPASSWORD
-ldapManagerDNPassword: EXAMPLELDAPMANAGERDNPASSWORD
-ldapManagerPassword: EXAMPLELDAPMANAGERPASSWORD
-ldapBindDNPassword: EXAMPLEBINDDNPASSWORD
+# the redhat passwords are also example passwords
+ldapManagerDNPassword: redhat
+ldapManagerPassword: redhat
+ldapBindDNPassword: redhat
diff --git a/CICD/vars/slurm.yml b/CICD/vars/slurm.yml
index 78e7dc8e1e61a219f4e24fdebabadff81bcc3286..f9bdad2d073523273b69607ba3de0dfee864c041 100644
--- a/CICD/vars/slurm.yml
+++ b/CICD/vars/slurm.yml
@@ -3,7 +3,7 @@ desktopNodeList:
   - { name : 'DesktopNodes', interface : 'eth0' }
 clustername: "cicd"
 projectname: "cicd"
-slurm_version: 19.05.4
+slurm_version: 20.02.6
 munge_version: 0.5.13
 nhc_version: 1.4.2
 munge_dir: /opt/munge-{{ munge_version }}
@@ -38,7 +38,7 @@ slurmsharedstatedir: "/slurmstate"
 slurmpiddir: "/opt/slurm/var/run"
 slurmaccount_create_user: "/usr/local/sbin/slurmuseraccount.sh"
 slurm_provision: "/cinderVolume/local/sbin/slurm_provision.sh"
-slurmselecttype: "select/linear"
+slurmselecttype: "select/cons_tres"
 slurmfastschedule: "1"
 slurmschedulertype: "sched/backfill"
 restartServerList:
diff --git a/roles/calculateSlurmConf/templates/slurm.conf.j2 b/roles/calculateSlurmConf/templates/slurm.conf.j2
index 9bbe9263f6aae4d3da7fd4b8151867c9efe1db59..3911638ef4e48018f53b90ed1b103435fab27844 100644
--- a/roles/calculateSlurmConf/templates/slurm.conf.j2
+++ b/roles/calculateSlurmConf/templates/slurm.conf.j2
@@ -17,6 +17,7 @@ BackupController={{ slurmctrlbackup }}
 #BackupController=
 #BackupAddr=
 #
+SlurmctldParameters=enable_configless
 SlurmUser=slurm
 SlurmdUser=root
 SlurmctldPort=6817
@@ -36,7 +37,7 @@ ProctrackType=proctrack/cgroup
 CacheGroups=0
 #FirstJobId=
 ReturnToService=1
-#RebootProgram=/sbin/reboot
+RebootProgram=/sbin/reboot
 #ResumeTimeout=300
 #MaxJobCount=
 #PlugStackConfig=
@@ -77,10 +78,9 @@ SchedulerType={{ slurmschedulertype }}
 #SchedulerPort=
 #SchedulerRootFilter=
 SelectType={{ slurmselecttype }}
-{% if slurmselecttype.find("cons_res") > 0 %}
+{% if "cons_res" in slurmselecttype or "cons_tres" in slurmselecttype %}
 SelectTypeParameters=CR_Core_Memory
 {% endif %}
-FastSchedule={{ slurmfastschedule }}
 PriorityType=priority/multifactor
 #PriorityFlags=Ticket_Based
 #PriorityCalcPeriod=5
@@ -140,7 +140,7 @@ AccountingStorageEnforce=limits,safe
 #AccountingStorageUser=
 #
 #GRES
-#GresTypes=gpu
+GresTypes=gpu
 #
 
 HealthCheckInterval=300
diff --git a/roles/slurm-common/tasks/installSlurmFromSource.yml b/roles/slurm-common/tasks/installSlurmFromSource.yml
index 4885cb68d4f608082b5e55a0091f791dbba47194..04304d902ea1d5abe67c6da6d17c63603d39b6ea 100644
--- a/roles/slurm-common/tasks/installSlurmFromSource.yml
+++ b/roles/slurm-common/tasks/installSlurmFromSource.yml
@@ -31,9 +31,9 @@
   file:
     path: '/usr/local/ucx/'
     state: directory
-    owner: damienl
-    group: systems
-    mode: '0755'
+    owner: root
+    group: root
+    mode: 'u=rwx,g=rx,o=rx'
   become: true
   when: not stat_ucx.stat.exists
 
@@ -52,9 +52,9 @@
     src: '/usr/local/ucx/1.8.0'
     dest: '/usr/local/ucx/latest'
     state: link
-    owner: damienl
-    group: systems
-    mode: '0755'
+    owner: root
+    group: root
+    mode: 'u=rwx,g=rx,o=rx'
   become: true
   when: newucx.changed
 
@@ -68,7 +68,7 @@
     - ansible_os_family == 'RedHat'
 
 - name: configure slurm ubuntu
-  command: /tmp/slurm-{{ slurm_version }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam --with-pmix --with-ucx
+  command: /tmp/slurm-{{ slurm_version }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam --with-pmix --with-ucx=/usr/local/ucx/1.8.0
   args:
     creates: "{{ slurm_dir }}/bin/srun"
     chdir: /tmp/slurm-{{ slurm_version }}
diff --git a/roles/slurm-start/tasks/main.yml b/roles/slurm-start/tasks/main.yml
index 4d0ce6264cf796e4b90c9a6d45f4d1a90ca4ad0c..f0bf5e86b76efc5ce44c7180bb21299a01c2f34a 100644
--- a/roles/slurm-start/tasks/main.yml
+++ b/roles/slurm-start/tasks/main.yml
@@ -44,9 +44,9 @@
   register: slurmd_service_installed
 
 - name: deploy glusterfsheltest
-  template: dest=/etc/systemd/system/glusterfsheltest.sh src=glusterfsheltest.j2 mode=744
+  template: dest=/etc/systemd/system/glusterfsheltest.sh src=glusterfsheltest.sh.j2 mode=744
   become: true
-  when: use_systemd is defined and start_slurmctld is defined
+  when: use_systemd is defined and start_slurmctld is defined and use_glusterfs
   register: slurmctld_service_installed
 
 - name: slurmctld.service
@@ -93,15 +93,15 @@
   become: true
   when: start_slurmdbd is defined and slurmdbd_service_installed.changed
 
-- name: start slurmctld
+- name: enable slurmctld
   service: name=slurmctld state=stopped enabled={{ start_slurmctld }}
   become: true
-  when: use_systemd is defined and start_slurmctld is defined
+  when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed
 
-- name: restart slurmctld
-  service: name=slurmctld state=stopped enabled={{ start_slurmctld }}
+- name: start slurmctld
+  service: name=slurmctld state=started
   become: true
-  when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed
+  when: use_systemd is defined and start_slurmctld is defined and start_slurmctld
 
 - name: "count clusters in slurm db"
   shell: "{{ slurm_dir }}/bin/sacctmgr show cluster -p | wc -l"
@@ -130,4 +130,4 @@
 - name: start slurm
   service: name=slurm state=restarted enabled={{ slurmd_enabled }}
   become: true
-  when: use_systemd is not defined and ( start_slurmd is defined or start_slurmctld is defined )
+  when: use_systemd is not defined and start_slurmd is defined
diff --git a/roles/slurm-start/templates/slurmctld.service.j2 b/roles/slurm-start/templates/slurmctld.service.j2
index cc914758a5f91d5d4647eb330cb4e220d8ff25ef..c8ae48e4809f75f84cbc2026e70991f9fb254347 100644
--- a/roles/slurm-start/templates/slurmctld.service.j2
+++ b/roles/slurm-start/templates/slurmctld.service.j2
@@ -6,7 +6,9 @@ ConditionPathExists={{ slurm_dir }}/etc/slurm.conf
 [Service]
 Type=forking
 #EnvironmentFile=/etc/default/slurmctld
+{% if use_glusterfs %}
 ExecStartPre=/etc/systemd/system/glusterfsheltest.sh
+{% endif %}
 ExecStart={{ slurm_dir }}/sbin/slurmctld $SLURMCTLD_OPTIONS
 PIDFile={{ slurmpiddir }}/slurmctld.pid
 [Install]