---
- name: create package hold files in apt preferences
  template:
    src: hold.pref.j2
    dest: /etc/apt/preferences.d/hold-{{ item.pkg_name }}.pref
  become: true
  when: ansible_os_family == "Debian"
  loop: "{{ apt_preferences }}"
# example data
#apt_preferences:
# - { pkg_name: 'yaru-theme-sound', pkg_version: "20.04.10.1", pkg_priority: "1002" }
# - { pkg_name: 'yaru-theme-gnome-shell', pkg_version: "20.04.10.1" }
# This file is generated by ansible role apt_preferences
Explanation: {{ item.pkg_name }}
Package: {{ item.pkg_name }}
Pin: version {{ item.pkg_version }}
Pin-Priority: {{ item.pkg_priority|default("1001", true) }}
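# Rendered example: with the first apt_preferences entry from the example data in
# the tasks file (pkg_name 'yaru-theme-sound', pkg_version "20.04.10.1",
# pkg_priority "1002"), this template would produce roughly:
#   Explanation: yaru-theme-sound
#   Package: yaru-theme-sound
#   Pin: version 20.04.10.1
#   Pin-Priority: 1002
# The second example entry omits pkg_priority and falls back to the default of 1001.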
---
lustreVersion: v2_12_7
---
# see documentation at https://sites.google.com/a/monash.edu/hpc-services/work-instructions/system-configuration/lustre/compile-lustre-client-from-source
- name: install dependencies
  package:
    name: git
    state: present
  become: true

- name: get lustre sources for latest tags see https://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
  git:
    repo: git://git.whamcloud.com/fs/lustre-release.git
    dest: /tmp/lustre-release
    version: "{{ lustreVersion }}"
  #command: git clone git://git.whamcloud.com/fs/lustre-release.git -b v2_12_5
  #  chdir: /tmp/lustre-release
  #  creates: /tmp/lustre-release

#- name: get a magic patch
#  command: git fetch https://review.whamcloud.com/fs/lustre-release refs/changes/67/32267/1 && git checkout FETCH_HEAD
#  chdir: /tmp/lustre-release
- name: install dependencies
  package: #libelf-dev, libelf-devel or elfutils-libelf-devel
    name:
      - libyaml
      - libyaml-devel
      - libtool
      - elfutils-libelf-devel
      - libselinux-devel
      #- kernel-rpm-macros #centos8.2
      - kernel-abi-whitelists
    state: present
  become: true
  when: ansible_os_family == 'RedHat'

- name: install dependencies
  package: #libelf-dev, libelf-devel or elfutils-libelf-devel
    name:
      #- libyaml
      - libyaml-dev
      - libtool
      #- elfutils-libelf-devel
      - libselinux1-dev
      - libpython3.8-dev #unconfirmed
      #- kernel-abi-whitelists
      - zlib1g-dev
      - libssl-dev
      - libiberty-dev
      - module-assistant
      - libreadline-dev
      - libsnmp-dev
      - mpi-default-dev
    state: present
  become: true
  when: ansible_os_family == 'Debian'
- name: autogen
  command:
    cmd: '/bin/sh ./autogen.sh'
    chdir: /tmp/lustre-release

- name: configure
  command:
    cmd: './configure --disable-server --with-o2ib=/usr/src/ofa_kernel/default'
    chdir: /tmp/lustre-release
  when: "'fitdgx' not in inventory_hostname"
- name: fail on fitdgx for a manual process
  fail:
    msg: "Check the linux src directory (likely with uname -r), then comment out this fail task and edit the task below. Sorry, this is manual for now."
  when: "'fitdgx' in inventory_hostname"
- name: configure on fitdgx
  command:
    cmd: './configure --with-linux=/usr/src/linux-headers-5.4.0-74-generic --disable-server --with-o2ib=yes --disable-doc --disable-tests'
    chdir: /tmp/lustre-release
  when: "'fitdgx' in inventory_hostname"

- name: make
  make:
    chdir: /tmp/lustre-release
    params:
      NUM_THREADS: 6

#- name: make rpms
#  command: make rpms
#    chdir: /tmp/lustre-release
- name: make rpms
  make:
    target: rpms
    chdir: /tmp/lustre-release
  when: ansible_os_family == 'RedHat'

- name: make debs
  make:
    target: debs
    chdir: /tmp/lustre-release
  when: ansible_os_family == 'Debian'
#!/usr/bin/python
import sys
import json
import socket

filename = sys.argv[1]
try:
    domain = sys.argv[2]
except IndexError:
    domain = None

f = open(filename, 'r')
s = f.read()
d = json.loads(s)
f.close()

hosts = {}
for group in d['groups'].keys():
    i = 0
    for h in d['groups'][group]:
        name = d['hostvars'][h]['ansible_hostname']
        name = h
        if not domain:
            hosts[h] = [name]
        else:
            hosts[h] = ['%s.%s %s' % (name, domain, name)]

for h in sorted(hosts.keys()):
    if d['hostvars'].has_key(h):
        for addr in d['hostvars'][h]['ansible_all_ipv4_addresses']:
            if "172.16.200" in addr:
                string = "%s" % addr
                for name in hosts[h]:
                    string = string + " %s" % (name)
                print string

for h in sorted(hosts.keys()):
    if d['hostvars'].has_key(h):
        string = "%s" % (d['hostvars'][h]['ansible_default_ipv4']['address'])
        for name in hosts[h]:
            string = string + " %s" % (name)
        print string

for h in sorted(hosts.keys()):
    if d['hostvars'].has_key(h):
        if d['hostvars'][h].has_key('ansible_tun0'):
            string = "%s" % (d['hostvars'][h]['ansible_tun0']['ipv4']['address'])
            string = string + " %s-vpn" % h
            print string
#!/usr/bin/python3
import sys
import json
import socket

filename = sys.argv[1]
try:
    domain = sys.argv[2]
except IndexError:
    domain = None

f = open(filename, 'r')
s = f.read()
d = json.loads(s)
f.close()

hosts = {}
for group in list(d['groups'].keys()):
    i = 0
    for h in d['groups'][group]:
        name = d['hostvars'][h]['ansible_hostname']
        name = h
        if not domain:
            hosts[h] = [name]
        else:
            hosts[h] = ['%s.%s %s' % (name, domain, name)]

for h in sorted(hosts.keys()):
    if h in d['hostvars']:
        for addr in d['hostvars'][h]['ansible_all_ipv4_addresses']:
            if "172.16.200" in addr:
                string = "%s" % addr
                for name in hosts[h]:
                    string = string + " %s" % (name)
                print(string)

for h in sorted(hosts.keys()):
    if h in d['hostvars']:
        string = "%s" % (d['hostvars'][h]['ansible_default_ipv4']['address'])
        for name in hosts[h]:
            string = string + " %s" % (name)
        print(string)

for h in sorted(hosts.keys()):
    if h in d['hostvars']:
        if 'ansible_tun0' in d['hostvars'][h]:
            string = "%s" % (d['hostvars'][h]['ansible_tun0']['ipv4']['address'])
            string = string + " %s-vpn" % h
            print(string)
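# Example invocation, matching how the makehosts tasks below call this script
# (the JSON input is rendered from groups.j2; the domain argument is optional):
#   ./makehosts.py /tmp/groups example.org
# Each output line is an /etc/hosts entry, e.g. (address illustrative):
#   172.16.200.10 node01.example.org node01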
---
- name: get_groups_json
  template: dest=/tmp/groups src=groups.j2

- name: copy script
  copy: src=makehosts.py dest=/tmp/makehosts.py mode=755
  when: ansible_os_family == "RedHat"

- name: copy script
  copy: src=makehostsp3.py dest=/tmp/makehosts.py mode=755
  when: ansible_os_family == "Debian"

- name: make hosts data
  command: /tmp/makehosts.py /tmp/groups {{ domain }}
  register: hosts_data

- name: write hosts file
  template: dest=/tmp/etcHosts src=etcHosts.j2

- name: fetch hosts file
  fetch: src=/tmp/etcHosts dest=files/etcHosts flat=yes

- name: make sure our repo server is resolvable
  lineinfile:
    path: files/etcHosts
    line: "{{ reposerverip }} {{ reposervername }}"
    owner: root
    group: root
  become: True
  when: reposerverip is defined and reposervername is defined
127.0.0.1 localhost
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
{% for item in hosts_data.stdout_lines %}
{{ item }}
{% endfor %}
{
"groups": {{ groups | to_nice_json }},
"hostvars": {
{% for host in groups['all'] %}
"{{ host }}" : {{ hostvars[host]|to_nice_json }}
{% if not loop.last %}
,
{% endif %}
{% endfor %}
}
}
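{# Rough shape of the JSON this template renders, which makehosts.py consumes
   (host names, group names and variables are illustrative):
   { "groups": { "all": ["node01", "node02"], "ManagementNodes": ["node01"] },
     "hostvars": { "node01": { "ansible_hostname": "node01", ... }, ... } } #}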
- name: "Templating /etc/exports"
  template: src=exports.j2 dest=/tmp/exports owner=root group=root mode=644
  become: true

- name: "Fetch etcExports"
  fetch: src=/tmp/exports dest=files/etcExports flat=yes
{% for export in exportList %}
{% set iplist = [] %}
{% for group in export.group %}
{% for node in groups[group] %}
{% if hostvars[node]['ansible_'+export.interface] is defined %}
{% if iplist.append(hostvars[node]['ansible_'+export.interface]['ipv4']['address']) %}
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
{{ export.src }} {% for ip in iplist|unique %}{{ ip }}({{ export.srvopts }}) {% endfor %}
{% endfor %}
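{# Example rendered line, assuming a hypothetical exportList entry like
   { src: '/srv/share', group: ['ComputeNodes'], interface: 'eth0', srvopts: 'rw,no_root_squash,sync' }:
   /srv/share 172.16.200.11(rw,no_root_squash,sync) 172.16.200.12(rw,no_root_squash,sync) #}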
- name: "Templating /etc/ssh/known_hosts"
  template: src=known_hosts.j2 dest=/tmp/ssh_known_hosts owner=root group=root mode=644
  become: true
  register: sshknownhost

- name: fetch known_hosts file
  fetch: src=/tmp/ssh_known_hosts dest=files/ssh_known_hosts flat=yes

- name: delete ssh_known_hosts
  file: path=/tmp/ssh_known_hosts state=absent
  become: true
{% set keytypes = [ { 'type': 'ssh-rsa', 'fact': 'ansible_ssh_host_key_rsa_public' }, { 'type': 'ecdsa-sha2-nistp256', 'fact': 'ansible_ssh_host_key_ecdsa_public'} ] %}
{% for node in groups['all'] %}
{% for hostkey in keytypes %}
{% if hostkey['fact'] in hostvars[node] %}
{{ node }} {{ hostkey['type'] }} {{ hostvars[node][hostkey['fact']] }}
{% for ip in hostvars[node]['ansible_all_ipv4_addresses'] %}
{{ ip }} {{ hostkey['type'] }} {{ hostvars[node][hostkey['fact']] }}
{% endfor %}
{% endif %}
{% endfor %}
{% endfor %}
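{# Each node contributes one line per available host key type for its hostname and
   one per IPv4 address, e.g. (key material truncated, values illustrative):
   node01 ssh-rsa AAAAB3Nza...
   172.16.200.11 ssh-rsa AAAAB3Nza... #}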
---
- name: "Templating nhc.conf"
  template: src=nhc.conf.j2 dest=/tmp/nhc.conf owner=root group=root mode=644
  become: true

- name: fetch nhc.conf
  fetch: src=/tmp/nhc.conf dest=files/nhc.conf flat=yes
#######################################################################
###
### Filesystem checks
###
# * || check_fs_mount_rw -t "fuse.glusterfs" -s "mgmt0:/gv" -f "/glusterVolume"
* || check_fs_used / 90%
# * || check_fs_used /glusterVolume 90%
* || check_fs_iused / 100%
# * || check_fs_iused /glusterVolume 100%
#######################################################################
###
### Hardware checks
###
* || check_hw_cpuinfo 1 1 1
# * || check_hw_physmem 4048416kB 4048416kB 3%
* || check_hw_swap 0kB 0kB 3%
* || check_hw_eth eth0
* || check_hw_eth lo
#######################################################################
###
### Process checks
###
* || check_ps_service -S -u root sshd
- name: "Templating slurm.conf"
  template: src=slurm.conf.j2 dest=/tmp/slurm.conf owner=root group=root mode=644
  become: true

- name: fetch slurm.conf
  fetch: src=/tmp/slurm.conf dest=files/slurm.conf flat=yes

- name: "Templating slurmdbd.conf"
  template: src=slurmdbd.conf.j2 dest=/tmp/slurmdbd.conf owner=root group=root mode=644
  become: true

- name: fetch slurmdbd.conf
  fetch: src=/tmp/slurmdbd.conf dest=files/slurmdbd.conf flat=yes
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
{% if slurmctrlbackup is defined %}
BackupController={{ slurmctrlbackup }}
{% endif %}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmctldParameters=enable_configless
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmstatedir }}
SlurmdSpoolDir={{ slurmdatadir }}
SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
#ProctrackType=proctrack/linuxproc
ProctrackType=proctrack/cgroup
#PluginDir=
#FirstJobId=
ReturnToService=1
RebootProgram=/sbin/reboot
#ResumeTimeout=300
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
{% if slurm_lua is defined %}
JobSubmitPlugins=lua
{% endif %}
OverTimeLimit=1
CompleteWait=10
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=3000
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType={{ slurmschedulertype }}
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType={{ slurmselecttype }}
{% if slurmselecttype.find("cons_tres") != -1 %}
SelectTypeParameters=CR_Core_Memory
{% endif %}
PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
PriorityWeightFairshare=10000
PriorityWeightAge=10000
PriorityWeightPartition=10000
PriorityWeightJobSize=10000
PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
SlurmctldDebug={{ slurmctlddebug.level }}
SlurmctldLogFile={{ slurmctlddebug.log }}
{% else %}
#SlurmctldDebug=
#SlurmctldLogFile=
{% endif %}
{% if slurmddebug %}
SlurmdDebug={{ slurmddebug.level }}
SlurmdLogFile={{ slurmddebug.log }}
{% else %}
#SlurmdDebug=
#SlurmdLogFile=
{% endif %}
{% if slurmschedlog %}
SlurmSchedlogLevel={{ slurmschedlog.level }}
SlurmSchedLogFile={{ slurmschedlog.log }}
{% else %}
#SlurmSchedlogLevel=
#SlurmSchedLogFile=
{% endif %}
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% else %}
Prolog=/opt/slurm/etc/slurm.prolog
Epilog=/opt/slurm/etc/slurm.epilog
{% endif %}
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmdbd }}
{% if slurmdbdbackup is defined %}
AccountingStorageBackupHost={{ slurmdbdbackup }}
{% endif %}
AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
#
HealthCheckInterval=300
HealthCheckProgram={{ nhc_dir }}/sbin/nhc
#array jobs. max number
{% if slurm_max_array_size is defined %}
MaxArraySize={{ slurm_max_array_size }}
{% endif %}
# Fair share
{% if slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
DisableRootJobs=YES
MpiParams=ports=12000-12999
# COMPUTE NODES
{% set nodelist = [] %}
{% for queue in slurmqueues %}
{% for node in groups[queue.group] %}
{% if nodelist.append(node) %}
{% endif %}
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total - 1024 }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
{% endfor %}
{% for queue in slurmqueues %}
{% set nodenames = [] %}
{% for node in groups[queue.group] %}
{% if nodenames.append(hostvars[node]['ansible_hostname']) %}
{% endif %}
{% endfor %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ nodenames|join(',') }} {% if queue.DefaultTime is defined %} DefaultTime={{ queue.DefaultTime }} {% endif %} {% if queue.DefMemPerCPU is defined %} DefMemPerCPU={{ queue.DefMemPerCPU }} {% endif %} {% if queue.MaxTime is defined %} MaxTime={{ queue.MaxTime}} {% endif %} State=UP
{% endfor %}
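{# Illustrative rendered output for the two loops above, assuming a 16-vCPU compute
   node named node01 that reports 65536 MB of RAM and one thread per core, in a
   default "batch" queue together with node02:
   NodeName=node01 Procs=16 RealMemory=64512 Sockets=16 CoresPerSocket=1 ThreadsPerCore=1 Weight=3 Feature=stage1 State=UNKNOWN
   PartitionName=batch Default=yes Nodes=node01,node02 State=UP #}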
#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
#DbdAddr=
DbdHost={{ slurmdbd }}
{% if slurmdbdbackup is defined %}
DbdBackupHost={{ slurmdbdbackup }}
{% endif %}
#DbdPort=7031
SlurmUser=slurm
#MessageTimeout=300
#DefaultQOS=normal,standby
{% if slurmdbdlog is defined %}
DebugLevel={{ slurmdbdlog.level }}
LogFile={{ slurmdbdlog.log }}
{% else %}
#DebugLevel=
#LogFile=
{% endif %}
PidFile=/opt/slurm/var/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost={{ mysql_host }}
#StoragePort=1234
StoragePass={{ slurmdb_passwd }}
StorageUser=slurmdb
StorageLoc=slurm_acct_db
---
-
  name: Removing the RDO repository
  file: path=/etc/yum.repos.d/rdo-release.repo state=absent
  become: true
-
  name: Install epel-release
  yum: name=epel-release-7-5.noarch state=present
  become: true
-
  name: Enable epel
  command: yum-config-manager --enable epel
  become: true
-
  name: Installing Base Packages
  yum: name={{ item }} state=present
  with_items:
    - yum-utils
    - deltarpm-3.6-3.el7.x86_64
    - yum-plugin-versionlock
  become: true
-
  name: Installing Core packages
  yum: name="{{ item.software }}-{{ item.version }}.{{ item.arch }}" state=present
  with_items: package_list
  become: true
-
  name: Performing version lock on the packages
  shell: yum versionlock \*
  become: true
---
- include: installBasePackages.yml