Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project:
  • hpc-team/HPCasCode
  • chines/ansible_cluster_in_a_box

Showing 571 additions and 0 deletions
../tests/ManagementNodes/check.yml
\ No newline at end of file
---
# Just calculates the /etc/hosts entries
- hosts: 'all'
tasks:
- include_vars: vars/passwords.yml
- include_vars: vars/names.yml
- include_vars: vars/ldapConfig.yml
- include_vars: vars/filesystems.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- hosts: 'all'
tasks:
- { name: setup, setup: }
- hosts: 'ManagementNodes'
roles:
- { role: calculateEtcHosts }
#- hosts: 'NFSNodes'
# roles:
# - { role: calculateExports }
# Basic setup to make the nodes functional,
# i.e. upgrade operating systems, etc.
#
- hosts: 'ManagementNodes'
gather_facts: True
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/vars_centos79.yml
- vars/versions.yml
tasks:
- { name: unmount /mnt when vdb is absent, mount: { path: "/mnt", src: "/dev/vdb", state: absent},
when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined', become: true }
- { name: keep mnt present, file: { path: "/mnt", owner: root, group: root, mode: "u=rwx,g=rx,o=rx", state: directory},
when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined', become: true }
- { name: set use shared state, set_fact: usesharedstatedir=True }
tags: [ always ]
- hosts: 'ManagementNodes'
gather_facts: False
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
roles:
# - { role: ldapclient, tags: [ authentication ] }
# - { role: ssh-password-login }
# - { role: enable_sudo_group }
- { role: slurmdb-config, tags: [ slurm, slurmdb-config ] }
- { role: slurm-common, tags: [ slurm, slurm-common ] }
- { role: slurm_config, tags: [ slurm, slurm-config ] }
- { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, slurmd_enabled: False, start_slurmd: False, use_glusterfs: False, EMAIL_DEST: "nobody@nowhere.com", tags: [ slurm-start ] }
# - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] }
# - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] }
---
- hosts: LDAPServer
vars_files:
- vars/passwords.yml
- vars/ldapConfig.yml
gather_facts: true
tasks:
- include_vars: vars/passwords.yml
roles:
- { role: ldapservertest, ssl: false, tags: [ ldapserver ], become: true }
#- hosts: ComputeNodes, LoginNodes
# vars_files:
# - vars/passwords.yml
# - vars/ldapConfig.yml
# gather_facts: true
# tasks:
# - include_vars: vars/passwords.yml
# roles:
# - { role: ldapclient, ssl: false
# , tags: [ ldapclient ] }
\ No newline at end of file
# Plays to initialise the NFS and SQL nodes
#
#
- hosts: 'all'
tasks:
- { name: setup, setup: }
tags: [ always ]
# We need this play to gather facts and populate the required variables.
- hosts: 'ManagementNodes'
gather_facts: True
tasks:
- include_vars: vars/passwords.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- include_vars: vars/versions.yml
- include_vars: vars/vars_centos79.yml
- { name: set hostgroup, set_fact: hostgroup='ManagementNodes' }
- { name: set use shared state, set_fact: usesharedstatedir=True }
tags: [ always ]
- hosts: 'SQLNodes,NFSNodes'
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
pre_tasks:
- { name: set hostgroup, set_fact: hostgroup='SQLNodes', tags: [ always ] }
- { name: set use shared state, set_fact: usesharedstatedir=True, tags: [ always ] }
- hosts: 'SQLNodes'
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
- vars/vars_centos79.yml
strategy: free
gather_facts: True
roles:
# - { role: upgrade, tags: [ upgrade ] }
- { role: mysql, mysql_type: mysql_server, mysql_user_name: slurmdb, mysql_user_db_name: slurm_acct_db, mysql_user_hosts_group: "{{ groups['ManagementNodes'] }}", mysql_user_password: "{{ slurmdb_passwd }}", tags: [ database ] }
- { role: slurm-mysql-config, tags: [database,slurmdb] }
tags: [ sql ]
- hosts: 'LogNodes'
roles:
- { role: etcHosts, tags: [ networking, etcHosts ] }
- { role: rsyslog_server }
../../roles
\ No newline at end of file
../files
\ No newline at end of file
options lnet networks=tcp0(eth0)
\ No newline at end of file
#!/bin/sh
if [ ! -c /dev/lnet ] ; then
exec /sbin/modprobe lnet >/dev/null 2>&1
fi
#!/bin/sh
/sbin/lsmod | /bin/grep lustre 1>/dev/null 2>&1
if [ $? -ne 0 ] ; then
/sbin/modprobe lustre >/dev/null 2>&1
fi
[lustre-server]
name=lustre-server
baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/server
# exclude=*debuginfo*
gpgcheck=0
[lustre-client]
name=lustre-client
baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/client
# exclude=*debuginfo*
gpgcheck=0
[e2fsprogs-wc]
name=e2fsprogs-wc
baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7
# exclude=*debuginfo*
gpgcheck=0
\ No newline at end of file
../roles
\ No newline at end of file
---
- hosts: 'LustreONodes,LustreMNodes'
tasks:
- name: install epel release repo
package:
name: epel-release
state: present
become: true
- name: Enable ZFS on Linux repo
command: yum localinstall --nogpgcheck -y http://download.zfsonlinux.org/epel/zfs-release.el7_6.noarch.rpm
become: true
- name: add lustre repos
copy:
src: lustre.repo
dest: /etc/yum.repos.d/lustre.repo
become: true
- name: Upgrade e2fsprogs
package:
name: e2fsprogs
state: latest
become: true
- name: install the lustre-tests package
package:
name: lustre-tests
state: present
become: true
- name: Create the lnet module options file /etc/modprobe.d/lnet.conf
copy:
src: lnet.conf
dest: /etc/modprobe.d/lnet.conf
become: true
- hosts: 'LustreONodes,LustreMNodes'
tasks:
- name: On the MGS and OSS only, create the file /etc/sysconfig/modules/lnet.modules
copy:
src: lustre.repo
dest: /etc/sysconfig/modules/lnet.modules
become: true
register: lnetmodules
- name: Reboot a slow machine that might have lots of updates to apply
reboot:
reboot_timeout: 30 # still tweaking this !
become: true
when: lnetmodules.changed
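# A minimal alternative sketch (illustrative, not part of this change set): the
# copy-then-reboot-on-change flow above expressed with a handler instead of
# register/when; "some.modules" stands in for whichever modules file is copied.
#- hosts: 'LustreONodes,LustreMNodes'
#  tasks:
#    - name: install a modules file
#      copy:
#        src: some.modules
#        dest: /etc/sysconfig/modules/some.modules
#      become: true
#      notify: reboot for module changes
#  handlers:
#    - name: reboot for module changes
#      reboot:
#        reboot_timeout: 600
#      become: true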
- hosts: 'LustreMNodes' #TODO
tasks:
- name: Initialise a disk or partition to use for lustre
#command: mkfs.lustre --fsname=whatevs --mgs --mdt --index=0 /dev/disk/by-id/virtio-f6705d5a-62d8-4d93-b # change ID, done before?
command: mkfs.lustre --fsname=whatevs --mgs --mdt --index=0 {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }} # change ID, done before?
become: true
ignore_errors: yes # lazy shortcut: keeps a rerun from failing when the filesystem already exists
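# A sketched idempotent alternative (illustrative; assumes blkid is available on
# the node): probe the device first and only format it when no filesystem
# signature is found, instead of relying on ignore_errors for reruns.
#- name: check whether the MDT device already has a filesystem
#  command: blkid {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }}
#  register: mdt_blkid
#  changed_when: false
#  failed_when: false
#  become: true
#- name: format the MDT only when the device is blank
#  command: mkfs.lustre --fsname=whatevs --mgs --mdt --index=0 {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }}
#  when: mdt_blkid.rc != 0
#  become: true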
- name: Create a directory if it does not exist
file:
path: /mnt/mdt
state: directory
mode: '0755'
become: true
ignore_errors: yes # this step can fail if the filesystem is already mounted here
- name: Create a mount point and mount the lustre FS
#command: mount -t lustre {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }} /mnt/mdt
become: true
mount:
path: /mnt/mdt
src: "{{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }}"
fstype: lustre
state: mounted
- hosts: 'LustreONodes'
tasks:
- name: Create a lustre OST
#command: mkfs.lustre --ost --fsname=whatevs --mgsnode=118.138.233.250@tcp0 --index=0 /dev/disk/by-id/virtio-3c2223cd-e3d3-4264-9
command: mkfs.lustre --ost --fsname=whatevs --mgsnode={{ hostvars[groups['LustreMNodes'][0]]['ansible_host'] }}@tcp0 --index=0 {{ hostvars[inventory_hostname]['ansible_host_volumes']['ostvol']['dev'] }} # /dev/disk/by-id/virtio-3c2223cd-e3d3-4264-9
become: true
ignore_errors: yes # if filesystem already exists do nothing
register: ost_mkfs
- name: Create a directory if it does not exist
file:
path: /ostoss_mount
state: directory
mode: '0755'
become: true
when: ost_mkfs.changed
ignore_errors: yes
- name: Create a mount point and mount the lustre FS
#command: mount -t lustre {{ hostvars[inventory_hostname]['ansible_host_volumes']['ostvol']['dev'] }} /ostoss_mount
become: true
mount:
path: /ostoss_mount
src: "{{ hostvars[inventory_hostname]['ansible_host_volumes']['ostvol']['dev'] }}"
fstype: lustre
state: mounted
- hosts: 'LoginNodes' # this does not work until config_repos has run first
tasks:
- include_vars: vars/vars_centos78.yml
- hosts: 'LoginNodes'
roles:
- { role: config_repos, tags: [ repos ] }
- { role: upgrade, tags: [ upgrade ] } # upgrade from centos7.6 base image to centos7.8
- { role: mellanox_drivers, start_roce_service: false }
- hosts: 'LoginNodes' # this does not work until config_repos has run first
tasks:
- name: install rpms
package:
name:
- kmod-lustre-client
- lustre-client
become: true
- name: Add the lustre module
modprobe:
name: lustre
state: present
become: true
- name: Create the file /etc/sysconfig/modules/lustre.modules to load the lustre module on boot
copy:
src: lustre.modules
dest: /etc/sysconfig/modules/lustre.modules
become: true
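# A sketched variant (illustrative; assumes the boot-time scripts in
# /etc/sysconfig/modules/ need to be executable to run on CentOS 7): the same
# copy with an explicit mode.
#- name: Create /etc/sysconfig/modules/lustre.modules with an executable mode
#  copy:
#    src: lustre.modules
#    dest: /etc/sysconfig/modules/lustre.modules
#    mode: '0755'
#  become: true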
- name: Create a directory if it does not exist
file:
path: /mnt/lustre
state: directory
mode: '0755'
become: true
- name: Create a mount point and mount the lustre FS
command: mount -t lustre {{ hostvars[groups['LustreMNodes'][0]]['ansible_host'] }}@tcp0:/whatevs /mnt/lustre
become: true
- name: Create a directory for the users group (GID 100)
command: mkdir -p /mnt/lustre/users
become: true
- name: set a quota
command: lfs setquota -g 100 -b 1G -B 1.1G /mnt/lustre/users/
become: true
- name: poll the quota
command: lfs quota -gh 100 /mnt/lustre/users/
register: quota
become: true
failed_when: "'is using default block quota setting' in quota.stdout"
\ No newline at end of file
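A sketch (illustrative, not part of this change set) of how the client mount above could be made idempotent and persistent with the mount module, mirroring the pattern the playbook already uses for the MDT and OST mounts:
#- name: mount the lustre FS on the login nodes via fstab
#  mount:
#    path: /mnt/lustre
#    src: "{{ hostvars[groups['LustreMNodes'][0]]['ansible_host'] }}@tcp0:/whatevs"
#    fstype: lustre
#    state: mounted
#  become: true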
../../vars
\ No newline at end of file
../vars
\ No newline at end of file
CgroupAutomount=yes
ConstrainDevices=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainKmemSpace=no
domain: {{ domain }}
volumes:
{% for name in exports %}
- host: {{ clustername }}-sql0
dev: {{ disks[name] }}
mnt: /mnt/{{ name }}
{% endfor %}
nfsexports:
- host: {{ clustername }}-sql0
exportList:
{% for name in exports %} - src: /mnt/{{ name }}
{% endfor %}
{% for name in ['/home','/nfs/opt'] %} - src: {{ name }}
{% endfor %}
nfsmounts:
- group: LoginNodes
nfsMounts:
{% for name in exports %}
- name: /mnt/{{ name }}
ipv4: {{ clustername }}-sql0
src: /mnt/{{ name }}
fstype: nfs4
opts: "defaults,rw"
{% endfor %}
{% for name in ['/home','/nfs/opt'] %}
- name: {{ name }}
ipv4: {{ clustername }}-sql0
src: {{ name }}
fstype: nfs4
opts: "defaults,rw"
{% endfor %}
- group: ComputeNodes
nfsMounts:
{% for name in exports %}
- name: /mnt/{{ name }}
ipv4: {{ clustername }}-sql0
src: /mnt/{{ name }}
fstype: nfs4
opts: "defaults,rw" {% endfor %}
- name: /home
ipv4: {{ clustername }}-sql0
src: /home
fstype: nfs4
opts: "defaults,rw"
- group: ManagementNodes
nfsMounts:
{% for name in exports %}
- name: /mnt/{{ name }}
ipv4: {{ clustername }}-sql0
src: /mnt/{{ name }}
fstype: nfs4
opts: "defaults,rw" {% endfor %}
AutoBasePath=true
BasePath=/mnt/privatedir
#######################################################################
###
### Filesystem checks
###
# * || check_fs_mount_rw -t "fuse.glusterfs" -s "mgmt0:/gv" -f "/glusterVolume"
* || check_fs_used / 90%
# * || check_fs_used /glusterVolume 90%
* || check_fs_iused / 100%
# * || check_fs_iused /glusterVolume 100%
#######################################################################
###
### Hardware checks
###
# * || check_hw_cpuinfo 1 1 1
# * || check_hw_physmem 4048416kB 4048416kB 3%
# * || check_hw_swap 0kB 0kB 3%
# * || check_hw_eth eth0
# * || check_hw_eth lo
#######################################################################
###
### Process checks
###
# * || check_ps_service -S -u root sshd
@@ -9,11 +9,13 @@
# See the slurm.conf man page for more information.
#
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
ControlMachine={{ controller }}
BackupController={{ backup }}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmctldParameters=enable_configless
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
@@ -21,17 +23,19 @@ SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmdatadir }}
SlurmdSpoolDir={{ slurmdatadir }}
StateSaveLocation=/opt/slurm/var/state
SlurmdSpoolDir=/opt/slurm/var/spool
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
MpiDefault=pmi2
SlurmctldPidFile=/opt/slurm/var/run/slurmctld.pid
SlurmdPidFile=/opt/slurm/var/run/slurmd.pid
#ProctrackType=proctrack/linuxproc
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
ReturnToService=1
RebootProgram=/sbin/reboot
#ResumeTimeout=300
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
@@ -43,35 +47,44 @@ ReturnToService=0
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
TaskPlugin=task/affinity,task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
#JobSubmitPlugins=lua
OverTimeLimit=1
CompleteWait=10
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
SlurmctldTimeout=3000
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
SchedulerType="sched/backfill"
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
SelectType="select/cons_tres"
SelectTypeParameters=CR_Core_Memory
JobContainerType=job_container/tmpfs
PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
PriorityWeightFairshare=10000
PriorityWeightAge=10000
PriorityWeightPartition=10000
PriorityWeightJobSize=10000
PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
@@ -98,22 +111,35 @@ SlurmSchedLogFile={{ slurmschedlog.log }}
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% else %}
Prolog=/opt/slurm/etc/slurm.prolog
Epilog=/opt/slurm/etc/slurm.epilog
{% endif %}
PrologFlags=contain
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmctrl }}
#AccountingStorageEnforce=limits,safe
AccountingStorageHost={{ controller }}
{% if slurmdbdbackup is defined %}
AccountingStorageBackupHost={{ backup }}
{% endif %}
AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
#
# Fair share
{% if slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
HealthCheckInterval=300
HealthCheckProgram={{ nhc_dir }}/sbin/nhc
DisableRootJobs=YES
MpiParams=ports=12000-12999
@@ -126,9 +152,14 @@ MpiParams=ports=12000-12999
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_cores'] }} State=UNKNOWN
NodeName={{ node }} Procs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:0 Weight=1 State=UNKNOWN
{% endfor %}
{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }}
{% set nodenames = [] %}
{% for node in groups[queue.group] %}
{% if nodenames.append(node) %}
{% endif %}
{% endfor %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ nodenames|join(',') }} {% if queue.DefaultTime is defined %} DefaultTime={{ queue.DefaultTime }} {% endif %} {% if queue.DefMemPerCPU is defined %} DefMemPerCPU={{ queue.DefMemPerCPU }} {% endif %} {% if queue.MaxTime is defined %} MaxTime={{ queue.MaxTime }} {% endif %} State=UP
{% endfor %}
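For illustration only (assumed values: nodelist containing node01 and node02, and a single queue named batch whose group resolves to the same two nodes, with default=true and MaxTime=7-00:00:00), the node and partition loops above render roughly as:

NodeName=node01 Procs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:0 Weight=1 State=UNKNOWN
NodeName=node02 Procs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:0 Weight=1 State=UNKNOWN
PartitionName=batch Default=yes Nodes=node01,node02 MaxTime=7-00:00:00 State=UP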