diff --git a/roles/calculateNhcConfig/tasks/main.yml b/roles/calculateNhcConfig/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b6f9f9f7e6c0ce8de690265995faeab47328879e
--- /dev/null
+++ b/roles/calculateNhcConfig/tasks/main.yml
@@ -0,0 +1,8 @@
+---
+- name: "Templating nhc.conf"
+  template: src=nhc.conf.j2 dest=/tmp/nhc.conf owner=root group=root mode=644
+  sudo: true
+
+- name: fetch nhc.conf
+  fetch: src=/tmp/nhc.conf dest=files/nhc.conf flat=yes
+
diff --git a/roles/calculateNhcConfig/templates/nhc.conf.j2 b/roles/calculateNhcConfig/templates/nhc.conf.j2
new file mode 100644
index 0000000000000000000000000000000000000000..a5921218b83415835be2f8c168520bd9d7b05866
--- /dev/null
+++ b/roles/calculateNhcConfig/templates/nhc.conf.j2
@@ -0,0 +1,29 @@
+
+
+#######################################################################
+###
+### Filesystem checks
+###
+# * || check_fs_mount_rw -t "fuse.glusterfs" -s "mgmt0:/gv" -f "/glusterVolume"
+ * || check_fs_used / 90%
+# * || check_fs_used /glusterVolume 90%
+ * || check_fs_iused / 100%
+# * || check_fs_iused /glusterVolume 100%
+
+
+#######################################################################
+###
+### Hardware checks
+###
+ * || check_hw_cpuinfo 1 1 1
+# * || check_hw_physmem 4048416kB 4048416kB 3%
+ * || check_hw_swap 0kB 0kB 3%
+ * || check_hw_eth eth0
+ * || check_hw_eth lo
+
+
+#######################################################################
+###
+### Process checks
+###
+ * || check_ps_service -S -u root sshd
diff --git a/roles/calculateSlurmConf/templates/slurm.conf.j2 b/roles/calculateSlurmConf/templates/slurm.conf.j2
index 50e869b57f0bdbb4f4870d961cfbfc032d44d824..dc833e3e78c8fa191c434c0795c0f936cfbb1e7c 100644
--- a/roles/calculateSlurmConf/templates/slurm.conf.j2
+++ b/roles/calculateSlurmConf/templates/slurm.conf.j2
@@ -165,13 +165,9 @@
 MpiParams=ports=12000-12999
 {% endfor %}
 {% endfor %}
 {% for node in nodelist|unique %}
-NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
+NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total - 1024 }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
 {% endfor %}
-#monarch specific to stop stupid warning messages
-NodeName={{ hostvars[groups['LoginNodes'][0]]['ansible_hostname'] }} State=DOWN
-NodeName={{ slurmctrl }} State=DOWN
-
 {% for queue in slurmqueues %}
 {% set nodenames = [] %}
 {% for node in groups[queue.group] %}
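
The Weight chain in the NodeName template line above is a step function of the node's vCPU count (buckets: 1, 2-16, 17-20, 21-40, 41-64, 65-128, >128). A minimal Python sketch of the same bucketing, illustrative only (slurm_weight is a hypothetical name, not part of this patch):

    def slurm_weight(vcpus):
        # Threshold/weight pairs mirror the {% if %} chain in slurm.conf.j2.
        for limit, weight in [(1, 1), (16, 3), (20, 5), (40, 7), (64, 8), (128, 9)]:
            if vcpus <= limit:
                return weight
        return 10  # anything above 128 vCPUs

    assert slurm_weight(1) == 1
    assert slurm_weight(24) == 7
    assert slurm_weight(256) == 10
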
diff --git a/roles/config_repos/files/monashhpc_base.repo b/roles/config_repos/files/monashhpc_base.repo
new file mode 100644
index 0000000000000000000000000000000000000000..8f0d9aeee8c78a5a4d56b826e8ab100833d88bd0
--- /dev/null
+++ b/roles/config_repos/files/monashhpc_base.repo
@@ -0,0 +1,25 @@
+# Place this file in your /etc/yum.repos.d/ directory
+
+[monashhpc_base]
+name=MonashHPC base repository mirrored to control the update process
+baseurl=https://consistency0/centos/$releasever/os/$basearch/
+enabled=1
+sslverify=false
+
+[monashhpc_updates]
+name=MonashHPC updates repository mirrored to control the update process
+baseurl=https://consistency0/centos/$releasever/updates/$basearch/
+enabled=1
+sslverify=false
+
+[monashhpc_extras]
+name=MonashHPC extras repository mirrored to control the update process
+baseurl=https://consistency0/centos/$releasever/extras/$basearch/
+enabled=1
+sslverify=false
+
+[monashhpc_centosplus]
+name=MonashHPC centosplus repository mirrored to control the update process
+baseurl=https://consistency0/centos/$releasever/centosplus/$basearch/
+enabled=1
+sslverify=false
diff --git a/roles/config_repos/files/monashhpc_others.repo b/roles/config_repos/files/monashhpc_others.repo
new file mode 100644
index 0000000000000000000000000000000000000000..e78702bf53f5fe0a1284c0474aac75bba615aabd
--- /dev/null
+++ b/roles/config_repos/files/monashhpc_others.repo
@@ -0,0 +1,8 @@
+# Place this file in your /etc/yum.repos.d/ directory
+
+[monashhpc_otherstuff]
+name=MonashHPC hpcsystems repository mirrored to control the update process
+baseurl=https://consistency0/centos/hpcsystems/$releasever/$basearch/
+enabled=1
+sslverify=false
+gpgcheck=0
diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml
index 6d2efb6b166f5be9d2e4d3efaeee94ecef7e3058..7feaa7de6355f7e648ed437cdf6b86b3b76a3431 100644
--- a/roles/config_repos/tasks/main.yml
+++ b/roles/config_repos/tasks/main.yml
@@ -1,18 +1,33 @@
 ---
 
-# this repository was broken on some CentOS images. Remove it.
-- name: Removing the RDO repository
-  file: path=/etc/yum.repos.d/rdo-release.repo state=absent
-  sudo: true
-
-- name: add gluster repo
-  copy: src=glusterfs-epel.repo dest=/etc/yum.repos.d/glusterfs-epel.repo
+- name: make sure our repo server is resolvable
+  lineinfile: dest=/etc/hosts line="118.138.244.7 consistency0"
   sudo: true
-  when: ansible_os_family == 'RedHat'
 
-- name: Install epel-release
-  yum: name=epel-release-7-5.noarch state=present
+- name: remove default repos
+  file:
+    path: /etc/yum.repos.d/{{ item }}
+    state: absent
+  become: true
+  become_user: root
+  with_items:
+    - CentOS-Base.repo
+    - CentOS-Debuginfo.repo
+    - CentOS-fasttrack.repo
+    - CentOS-Sources.repo
+    - CentOS-Vault.repo
+    - foreman.repo
+    - puppetlabs.repo
+    - rdo-release.repo
+  when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == "7"
+
+- name: add our repos
+  copy: src={{ item }} dest=/etc/yum.repos.d/{{ item }}
   sudo: true
-  when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7"
+  when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == "7"
+  with_items:
+    - monashhpc_base.repo
+    - monashhpc_others.repo
+
 
 #- name: Enable epel
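
The tasks above point yum at an internal mirror by pinning consistency0 in /etc/hosts, and sslverify=false in the repo files means the mirror's certificate is never checked. A quick smoke test of that setup, illustrative only (Python 2 to match the code elsewhere in this patch; the exact repodata path is an assumption derived from the baseurl pattern):

    import socket, ssl, urllib2

    print socket.gethostbyname("consistency0")          # expect 118.138.244.7
    ctx = ssl._create_unverified_context()              # mirrors sslverify=false
    url = "https://consistency0/centos/7/os/x86_64/repodata/repomd.xml"
    print urllib2.urlopen(url, context=ctx).getcode()   # expect 200
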
diff --git a/roles/gluster_client/tasks/main.yml b/roles/gluster_client/tasks/main.yml
index 7c769da5dbf8558cf13ac35d6b6c0db5e2098642..e3939459cea19ff7569b41be9f2f6238d4f1d668 100644
--- a/roles/gluster_client/tasks/main.yml
+++ b/roles/gluster_client/tasks/main.yml
@@ -14,5 +14,5 @@
 
 - name: mount volume
 #mount: name="{{ volmnt }}" src="{{ gluster_servers[0] }}:/{{ volname }}" state="mounted" fstype="glusterfs" opts="defaults,acl,_netdev,backupvolfile-server={{ gluster_servers[1] }}"
-  mount: name="{{ volmnt }}" src="{{ gluster_servers[0] }}:/{{ volname }}" state="mounted" fstype="glusterfs" opts="defaults,acl,backupvolfile-server={{ gluster_servers[1] }},noauto,comment=systemd.automount"
+  mount: name="{{ volmnt }}" src="{{ gluster_servers[0] }}:/{{ volname }}" state="mounted" fstype="glusterfs" opts="defaults,acl,backupvolfile-server={{ gluster_servers[1] }},comment=systemd.automount"
   sudo: true
diff --git a/roles/lmod/tasks/main.yml b/roles/lmod/tasks/main.yml
index 393a4f0058132827daa29be45ad5fe6b9df5f53f..d62f9fad88ca149730917753b032c74a09a11ee3 100644
--- a/roles/lmod/tasks/main.yml
+++ b/roles/lmod/tasks/main.yml
@@ -1,8 +1,8 @@
 ---
 - include_vars: "{{ ansible_os_family }}.yml"
 
-- name: install lua
-  yum: name={{ item }} state=installed
+- name: install lua centos
+  yum: name={{ item }} state=installed update_cache=yes
   with_items:
     - lua
     - lua-filesystem
@@ -14,7 +14,7 @@
   sudo: true
   when: ansible_os_family == 'RedHat'
 
-- name: install lua
+- name: install lua debian
   apt: name={{ item }} state=installed
   with_items:
     - lua5.2
diff --git a/roles/nagios_monitored/tasks/main.yml b/roles/nagios_monitored/tasks/main.yml
index 5ff9185dc4c0ae7eae41056e439bc1da9d573d95..8e4ac078a1d9fe3ad3d1e908ebb1aa236a180bf2 100644
--- a/roles/nagios_monitored/tasks/main.yml
+++ b/roles/nagios_monitored/tasks/main.yml
@@ -16,7 +16,20 @@
   sudo: true
 
 - name: install monitor scripts
-  copy: dest={{ nagios_home }}/scripts/{{ item }} src=scripts/{{ item }} mode=755
-  with_items: "{% set script_list = [] %}{% for s in nagios_services %}{%for g in hostvars[ansible_hostname].group_names %}{% if g in s.groups and s.script %}{% if script_list.append(s.script) %}{% endif %}{% endif %}{% endfor %}{% endfor %}{{ script_list }}"
+  copy: dest={{ nagios_home }}/scripts/{{ item }} src=files/scripts/{{ item }} mode=755
+  with_items:
+    - check_apache2
+    - check_blocked_vis_jobs
+    - check_ldap_client
+    - check_mysql
+    - check_slurmdbd
+    - check_blocked_beamline_jobs
+    - check_disk
+    - check_localfs.sh
+    - check_ntp
+    - check_blocked_compute_jobs
+    - check_ldap
+    - check_munge
+    - check_slurm
   sudo: true
 
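
The scripts listed above follow the Nagios plugin convention: exit 0 for OK, 1 for WARNING, 2 for CRITICAL, 3 for UNKNOWN. A minimal sketch of what one of them might look like, illustrative only (the munge round-trip body is an assumption; the real check_munge script is not shown in this patch):

    import subprocess, sys

    OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

    def check_munge():
        # Round-trip a credential through munge; any failure is CRITICAL.
        p = subprocess.Popen("munge -n | unmunge", shell=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.communicate()
        return OK if p.returncode == 0 else CRITICAL

    sys.exit(check_munge())
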
diff --git a/roles/provision_homedir/tasks/main.yml b/roles/provision_homedir/tasks/main.yml
index a057f44fc0932ef5f67a2f2a5de9e1531a1120b5..ced40255ba4037daf4129e2d200c74e59d3bc4ab 100644
--- a/roles/provision_homedir/tasks/main.yml
+++ b/roles/provision_homedir/tasks/main.yml
@@ -2,7 +2,6 @@
 - name: make dir
   file: path="{{ provision_homedir | dirname }}" state=directory mode=755 owner=root
   sudo: true
-  run_once: true
 
 - name: install python packages
   yum: name=python-ldap state=installed
@@ -17,7 +16,6 @@
 - name: copy provision_homedir template
   template: src=provision_homedir.py.j2 dest={{ provision_homedir }} mode=700 owner=root
   sudo: true
-  run_once: true
 
 # the lockfile for making home directories should be located on the shared directory where the home directories will be created. Otherwise it will be racy.
 - name: provision_homedir cron job
diff --git a/roles/provision_slurm/templates/provision_slurm.py.j2 b/roles/provision_slurm/templates/provision_slurm.py.j2
index a8a6bca3cb4b0cb180364e13755b57595735410d..986d7c153d9fb73f10bb1642f21adb80a2029500 100644
--- a/roles/provision_slurm/templates/provision_slurm.py.j2
+++ b/roles/provision_slurm/templates/provision_slurm.py.j2
@@ -68,9 +68,21 @@
 s.searchFilter = "{{ search_filter }}"
 users=get_users(s)
 mk_slurmaccount("default")
+usergrouplist=[]
+userlist=[]
+i=0
 for user in users:
+    if i==200:
+        i=0
+        usergrouplist.append(",".join(userlist))
+        userlist=[]
+    i=i+1
+    userlist.append(users[user].entry['uid'][0])
+usergrouplist.append(",".join(userlist))
+for usergroup in usergrouplist:
+
     try:
-        mk_slurmuser(users[user].entry['uid'][0],"default")
+        mk_slurmuser(usergroup,"default")
     except:
         print traceback.format_exc()
         pass
diff --git a/roles/slurm-common/tasks/installNhc.yml b/roles/slurm-common/tasks/installNhc.yml
index 8889a929be9c6956ca8ef19d8fd6623b4024eef9..1b093c6d2964dc1b894e626c17715dd97ccdff81 100644
--- a/roles/slurm-common/tasks/installNhc.yml
+++ b/roles/slurm-common/tasks/installNhc.yml
@@ -47,20 +47,7 @@
   template: dest=/etc/logrotate.d/nhc src=nhclog.j2 mode=644
   sudo: true
 
-- name: check configure file
-  shell: ls {{ nhc_dir }}/etc/nhc/{{ nhc_config_file }}
-  ignore_errors: true
-  register: generate_nhc_config_file
-
-- name: generate config file
-  shell: "{{ nhc_dir }}/sbin/nhc-genconf -c {{ nhc_dir }}/etc/nhc/{{ nhc_config_file }} CONFDIR={{ nhc_dir }}/etc/nhc"
-  sudo: true
-  ignore_errors: true
-  when: generate_nhc_config_file
-
-- name: config file extension
-  lineinfile: dest="{{ nhc_dir }}/etc/nhc/{{ nhc_config_file }}" line="{{ item }}"
-  with_items:
-    nhc_user_conf
-  sudo: true
-  when: nhc_user_conf is defined and generate_nhc_config_file
\ No newline at end of file
+- name: install nhc config file
+  copy: src=nhc.conf dest={{ nhc_dir }}/etc/nhc/{{ nhc_config_file }}
+  become: true
+  become_user: root
diff --git a/roles/slurm-common/templates/cgroup.conf.j2 b/roles/slurm-common/templates/cgroup.conf.j2
index c19d5c5f41e16f4187fe5cf1a889a7e796806497..bb603827d7f4d643c938c7a7b24b8e97a799bc94 100644
--- a/roles/slurm-common/templates/cgroup.conf.j2
+++ b/roles/slurm-common/templates/cgroup.conf.j2
@@ -2,5 +2,6 @@
 CgroupAutomount=yes
 ConstrainDevices=yes
 TaskAffinity=yes
 ConstrainCores=yes
+ConstrainRAMSpace=yes
 AllowedDevicesFile={{ slurm_dir }}/etc/cgroup_allowed_devices.conf
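
The provision_slurm.py.j2 change above batches LDAP uids into comma-separated groups of 200, so each mk_slurmuser call (presumably one sacctmgr invocation underneath) adds many users at once instead of one per call. The same logic restated as a self-contained sketch (batch_uids is a hypothetical name, not part of this patch):

    def batch_uids(uids, size=200):
        # Comma-join uids in groups of at most `size` entries.
        return [",".join(uids[start:start + size])
                for start in range(0, len(uids), size)]

    batches = batch_uids(["u%d" % n for n in range(450)])
    print len(batches)   # 3 batches: 200 + 200 + 50

Note the template appends the final batch unconditionally, so with an empty user list it would call mk_slurmuser with an empty string; the try/except it keeps around the call absorbs that case.
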
diff --git a/roles/strudel_config/templates/generic_slurm_config.json.j2 b/roles/strudel_config/templates/generic_slurm_config.json.j2
index 976557af150bb75b6fbc75891a895099d1a6ce66..6ae93bc423409cff6a54b516853b159382007189 100644
--- a/roles/strudel_config/templates/generic_slurm_config.json.j2
+++ b/roles/strudel_config/templates/generic_slurm_config.json.j2
@@ -124,7 +124,7 @@
         "__class__": "cmdRegEx",
         "__module__": "siteConfig",
         "async": false,
-        "cmd": "{{ slurm_dir }}/bin/squeue -u {username} -o \\\"%i %L\\\" | tail -n -1",
+        "cmd": "\"{{ slurm_dir }}/bin/squeue -u {username} -o \\\"%i %L\\\" | tail -n -1\"",
         "failFatal": true,
         "formatFatal": false,
         "host": "login",
diff --git a/roles/strudel_config/templates/turbo_slurm_config.json.j2 b/roles/strudel_config/templates/turbo_slurm_config.json.j2
index 3acb443e96311ceec9d09bed0739a2be95c32884..bb00e81e42803eb1f8ed7850b041d89f5ad4bf88 100644
--- a/roles/strudel_config/templates/turbo_slurm_config.json.j2
+++ b/roles/strudel_config/templates/turbo_slurm_config.json.j2
@@ -121,7 +121,7 @@
         "__class__": "cmdRegEx",
         "__module__": "siteConfig",
         "async": false,
-        "cmd": "squeue -u {username} -o \\\"%i %L\\\" | tail -n -1",
+        "cmd": "\"squeue -u {username} -o \\\"%i %L\\\" | tail -n -1\"",
         "failFatal": true,
         "formatFatal": false,
         "host": "login",
@@ -449,4 +449,4 @@
     }
   }
 }
-]
\ No newline at end of file
+]
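
Both squeue hunks above wrap the pipeline in an extra, literal pair of double quotes inside the JSON string, presumably so the command survives an additional level of shell word-splitting on the login host. An illustrative round-trip (Python 2; the raw string holds the exact bytes from the template above):

    import json

    new_cmd = json.loads(r'"\"squeue -u {username} -o \\\"%i %L\\\" | tail -n -1\""')
    print new_cmd   # "squeue -u {username} -o \"%i %L\" | tail -n -1"
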