Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target projects: hpc-team/HPCasCode, chines/ansible_cluster_in_a_box

Showing with 321 additions and 37 deletions
@@ -6,27 +6,35 @@
- name: "Create apache key directory"
file: path={{ x509_key | dirname }} state=directory owner={{ apache_user }} mode=700
sudo: true
become: true
- name: "Create apache cert directory"
file: path={{ x509_cert | dirname }} state=directory owner={{ apache_user }} mode=755
sudo: true
become: true
- name: "Copying the apache key file"
template: src="files/{{ apache_key_file }}" dest="{{ x509_key }}" mode=0600 owner={{ apache_user }} group={{ apache_group }}
sudo: true
become: true
when: apache_key_file is defined
- name: "Copying the apache cert file"
template: src="files/{{ apache_cert_file }}" dest="{{ x509_cert }}" mode=0644 owner={{ apache_user }} group={{ apache_group }}
sudo: true
become: true
when: apache_cert_file is defined
- name: "Create log directory, start aoacge will have errors without it"
file: dest=/etc/apache2/logs state=directory
sudo: true
become: true
-
name: "Change permissions for /var/www"
file: path=/var/www state=directory owner=root group={{ apache_user }} mode=0775
sudo: true
become: true
-
name: "Starting Apache2"
service: name=apache2 state=started enabled=yes
become: true
when: ansible_os_family=="Debian"
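For context, the apache tasks above rely on a handful of variables; a minimal sketch of what they might look like (the names come from the tasks, while the values and the Debian-style user/group are illustrative assumptions, not taken from this diff):

apache_user: www-data            # illustrative; RedHat hosts would typically use apache
apache_group: www-data
x509_key: /etc/apache2/ssl/server.key    # hypothetical path
x509_cert: /etc/apache2/ssl/server.crt   # hypothetical path
apache_key_file: server.key      # looked up under files/ on the control host
apache_cert_file: server.crt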
---
- name: apt-get update
apt: update_cache=True
sudo: true
when: ansible_os_family=="Debian"
- name: apt-get upgrade
apt: upgrade=safe
sudo: true
become: true
when: ansible_os_family=="Debian"
---
- name: create package hold files in apt preferences
template:
src: hold.pref.j2
dest: /etc/apt/preferences.d/hold-{{ item.pkg_name }}.pref
become: true
when: ansible_os_family=="Debian"
loop: "{{ apt_preferences }}"
# example data
#apt_preferences:
# - { pkg_name: 'yaru-theme-sound', pkg_version: "20.04.10.1", pkg_priority: "1002" }
# - { pkg_name: 'yaru-theme-gnome-shell', pkg_version: "20.04.10.1" }
# This file is generated by ansible role apt_preferences
Explanation: {{ item.pkg_name }}
Package: {{ item.pkg_name }}
Pin: version {{ item.pkg_version }}
Pin-Priority: {{ item.pkg_priority|default("1001", true) }}
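As an illustration, rendering hold.pref.j2 with the first entry of the example apt_preferences data above would produce roughly this file at /etc/apt/preferences.d/hold-yaru-theme-sound.pref (a sketch, not part of the diff):

# This file is generated by ansible role apt_preferences
Explanation: yaru-theme-sound
Package: yaru-theme-sound
Pin: version 20.04.10.1
Pin-Priority: 1002

The second example entry omits pkg_priority, so it would fall back to the default Pin-Priority of 1001.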
---
lustreVersion: v2_12_7
---
# see documentation at https://sites.google.com/a/monash.edu/hpc-services/work-instructions/system-configuration/lustre/compile-lustre-client-from-source
- name: install dependencies
package:
name: git
state: present
become: true
- name: get lustre sources (for the latest tags see https://git.whamcloud.com/?p=fs/lustre-release.git;a=summary)
git:
repo: git://git.whamcloud.com/fs/lustre-release.git
dest: /tmp/lustre-release
version: "{{ lustreVersion }}"
#command: git clone git://git.whamcloud.com/fs/lustre-release.git -b v2_12_5
# chdir: /tmp/lustre-release
# creates: /tmp/lustre-release
#- name: get a magic patch
# command: git fetch https://review.whamcloud.com/fs/lustre-release refs/changes/67/32267/1 && git checkout FETCH_HEAD
# chdir: /tmp/lustre-release
- name: install dependencies
package: #libelf-dev, libelf-devel or elfutils-libelf-devel
name:
- libyaml
- libyaml-devel
- libtool
- elfutils-libelf-devel
- libselinux-devel
#- kernel-rpm-macros #centos8.2
- kernel-abi-whitelists
state: present
become: true
when: ansible_os_family == 'RedHat'
- name: install dependencies
package: #libelf-dev, libelf-devel or elfutils-libelf-devel
name:
#- libyaml
- libyaml-dev
- libtool
#- elfutils-libelf-devel
- libselinux1-dev
- libpython3.8-dev #unconfirmed
#- kernel-abi-whitelists
- zlib1g-dev
- libssl-dev
- libiberty-dev
- module-assistant
- libreadline-dev
- libsnmp-dev
- mpi-default-dev
state: present
become: true
when: ansible_os_family == 'Debian'
- name: autogen
command:
cmd: '/bin/sh ./autogen.sh'
chdir: /tmp/lustre-release
- name: configure
command:
cmd: './configure --disable-server --with-o2ib=/usr/src/ofa_kernel/default'
chdir: /tmp/lustre-release
when: "'fitdgx' not in inventory_hostname"
- name: fail on fitdgx for a manual process
fail:
msg: "check the linux src directory. likely with uname -r. and comment this fail and edit the task below. sorry this is manual for now."
when: "'fitdgx' in inventory_hostname"
- name: configure on fitdgx
command:
cmd: './configure --with-linux=/usr/src/linux-headers-5.4.0-74-generic --disable-server --with-o2ib=yes --disable-doc --disable-tests'
chdir: /tmp/lustre-release
when: "'fitdgx' in inventory_hostname"
- name: make
make:
chdir: /tmp/lustre-release
params:
NUM_THREADS: 6
#- name: make rpms
# command: make rpms
# chdir: /tmp/lustre-release
- name: make rpms
make:
target: rpms
chdir: /tmp/lustre-release
when: ansible_os_family == 'RedHat'
- name: make debs
make:
target: debs
chdir: /tmp/lustre-release
when: ansible_os_family == 'Debian'
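A usage sketch for this role (the role name lustre_client and the host group are assumptions, not confirmed by the diff):

- hosts: ComputeNodes
  become: true
  vars:
    lustreVersion: v2_12_7   # overrides the default from the vars file above
  roles:
    - lustre_client

The role only builds the packages; the RPMs or debs land under the /tmp/lustre-release tree, and installing them is a separate step.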
@@ -17,19 +17,29 @@ for group in d['groups'].keys():
i=0
for h in d['groups'][group]:
name = d['hostvars'][h]['ansible_hostname']
name = h
if not domain:
hosts[h] = [name]
else:
hosts[h] = ['%s.%s %s'%(name,domain,name)]
for h in hosts.keys():
for h in sorted(hosts.keys()):
if d['hostvars'].has_key(h):
string="%s"%(d['hostvars'][h]['ansible_eth0']['ipv4']['address'])
for addr in d['hostvars'][h]['ansible_all_ipv4_addresses']:
if "172.16.200" in addr:
string="%s"%addr
for name in hosts[h]:
string=string+" %s"%(name)
print string
for h in sorted(hosts.keys()):
if d['hostvars'].has_key(h):
string="%s"%(d['hostvars'][h]['ansible_default_ipv4']['address'])
for name in hosts[h]:
string=string+" %s"%(name)
print string
for h in hosts.keys():
for h in sorted(hosts.keys()):
if d['hostvars'].has_key(h):
if d['hostvars'][h].has_key('ansible_tun0'):
string="%s"%(d['hostvars'][h]['ansible_tun0']['ipv4']['address'])
#!/usr/bin/python3
import sys
import json
import socket
filename = sys.argv[1]
try:
domain = sys.argv[2]
except IndexError:
domain = None
f=open(filename,'r')
s=f.read()
d=json.loads(s)
f.close()
hosts={}
for group in list(d['groups'].keys()):
i=0
for h in d['groups'][group]:
name = d['hostvars'][h]['ansible_hostname']
name = h
if not domain:
hosts[h] = [name]
else:
hosts[h] = ['%s.%s %s'%(name,domain,name)]
for h in sorted(hosts.keys()):
if h in d['hostvars']:
for addr in d['hostvars'][h]['ansible_all_ipv4_addresses']:
if "172.16.200" in addr:
string="%s"%addr
for name in hosts[h]:
string=string+" %s"%(name)
print(string)
for h in sorted(hosts.keys()):
if h in d['hostvars']:
string="%s"%(d['hostvars'][h]['ansible_default_ipv4']['address'])
for name in hosts[h]:
string=string+" %s"%(name)
print(string)
for h in sorted(hosts.keys()):
if h in d['hostvars']:
if 'ansible_tun0' in d['hostvars'][h]:
string="%s"%(d['hostvars'][h]['ansible_tun0']['ipv4']['address'])
string=string+" %s-vpn"%h
print(string)
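To illustrate what the new script emits, given a /tmp/groups file such as the following (host name, addresses and domain are hypothetical):

{
  "groups": { "all": ["node01"] },
  "hostvars": {
    "node01": {
      "ansible_hostname": "node01",
      "ansible_all_ipv4_addresses": ["172.16.200.11", "10.0.0.11"],
      "ansible_default_ipv4": { "address": "10.0.0.11" }
    }
  }
}

running /tmp/makehosts.py /tmp/groups example.org would print:

172.16.200.11 node01.example.org node01
10.0.0.11 node01.example.org node01

i.e. one line per host for its 172.16.200.x management address, one for its default IPv4 address, and, when ansible_tun0 is present, an extra "<address> <host>-vpn" line.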
---
- name: get_groups_json
template: dest=/tmp/groups src=groups.j2
- name: copy script
copy: src=makehosts.py dest=/tmp/makehosts.py mode=755
when: ansible_os_family == "RedHat"
- name: copy script
copy: src=makehostsp3.py dest=/tmp/makehosts.py mode=755
when: ansible_os_family == "Debian"
- name: make hosts data
command: /tmp/makehosts.py /tmp/groups {{ domain }}
register: hosts_data
- name: write hosts file
template: dest=/tmp/etcHosts src=etcHosts.j2
- name: fetch hosts file
fetch: src=/tmp/etcHosts dest=files/etcHosts flat=yes
- name: make sure our repo server is resolvable
lineinfile:
path: files/etcHosts
line: "{{ reposerverip }} {{ reposervername }}"
owner: root
group: root
become: True
when: reposerverip is defined and reposervername is defined
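The role expects a domain variable for the makehosts call, and optionally reposerverip/reposervername for the lineinfile task; a hypothetical vars snippet:

domain: example.org
reposerverip: 10.0.0.5             # hypothetical
reposervername: repo.example.org   # hypothetical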
@@ -4,6 +4,7 @@ fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
{% for item in hosts_data.stdout_lines %}
{{ item }}
{% endfor %}
{
"groups": {{ groups | to_nice_json }},
"hostvars": {{ hostvars | to_nice_json }}
"hostvars": {
{% for host in groups['all'] %}
"{{ host }}" : {{ hostvars[host]|to_nice_json }}
{% if not loop.last %}
,
{% endif %}
{% endfor %}
}
}
- name: "Templating /etc/exports"
template: src=exports.j2 dest=/tmp/exports owner=root group=root mode=644
sudo: true
become: true
- name: "Fetch etcExports"
fetch: src=/tmp/exports dest=files/etcExports flat=yes
- name: "Templating /etc/ssh/known_hosts"
template: src=known_hosts.j2 dest=/tmp/ssh_known_hosts owner=root group=root mode=644
become: true
register: sshknownhost
- name: fetch known_hosts file
fetch: src=/tmp/ssh_known_hosts dest=files/ssh_known_hosts flat=yes
- name: delete ssh_known_hosts
file: path=/tmp/ssh_known_hosts state=absent
become: true
{% set keytypes = [ { 'type': 'ssh-rsa', 'fact': 'ansible_ssh_host_key_rsa_public' }, { 'type': 'ecdsa-sha2-nistp256', 'fact': 'ansible_ssh_host_key_ecdsa_public'} ] %}
{% for node in groups['all'] %}
{% for hostkey in keytypes %}
{% if hostkey['fact'] in hostvars[node] %}
{{ node }} {{ hostkey['type'] }} {{ hostvars[node][hostkey['fact']] }}
{% for ip in hostvars[node]['ansible_all_ipv4_addresses'] %}
{{ ip }} {{ hostkey['type'] }} {{ hostvars[node][hostkey['fact']] }}
{% endfor %}
{% endif %}
{% endfor %}
{% endfor %}
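Rendered, each host that has published its host key facts contributes one line per key type for its hostname and for every IPv4 address, along these lines (hostname, address and key material are made up and truncated):

node01 ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAB...
10.0.0.11 ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAB...
node01 ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTY...
10.0.0.11 ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTY...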
---
- name: "Templating nhc.conf"
template: src=nhc.conf.j2 dest=/tmp/nhc.conf owner=root group=root mode=644
become: true
- name: fetch nhc.conf
fetch: src=/tmp/nhc.conf dest=files/nhc.conf flat=yes
#######################################################################
###
### Filesystem checks
###
# * || check_fs_mount_rw -t "fuse.glusterfs" -s "mgmt0:/gv" -f "/glusterVolume"
* || check_fs_used / 90%
# * || check_fs_used /glusterVolume 90%
* || check_fs_iused / 100%
# * || check_fs_iused /glusterVolume 100%
#######################################################################
###
### Hardware checks
###
* || check_hw_cpuinfo 1 1 1
# * || check_hw_physmem 4048416kB 4048416kB 3%
* || check_hw_swap 0kB 0kB 3%
* || check_hw_eth eth0
* || check_hw_eth lo
#######################################################################
###
### Process checks
###
* || check_ps_service -S -u root sshd
- name: "Templating slurm.conf"
template: src=slurm.conf.j2 dest=/tmp/slurm.conf owner=root group=root mode=644
sudo: true
become: true
- name: fetch slurm.conf
fetch: src=/tmp/slurm.conf dest=files/slurm.conf flat=yes
- name: "Templating slurmdbd.conf"
template: src=slurmdbd.conf.j2 dest=/tmp/slurmdbd.conf owner=root group=root mode=644
sudo: true
become: true
- name: fetch slurmdbd.conf
fetch: src=/tmp/slurmdbd.conf dest=files/slurmdbd.conf flat=yes
@@ -17,6 +17,7 @@ BackupController={{ slurmctrlbackup }}
#BackupController=
#BackupAddr=
#
SlurmctldParameters=enable_configless
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
@@ -30,11 +31,13 @@ SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
#ProctrackType=proctrack/linuxproc
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
RebootProgram=/sbin/reboot
#ResumeTimeout=300
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
@@ -61,7 +64,7 @@ CompleteWait=10
#UsePAM=
#
# TIMERS
#SlurmctldTimeout=300
SlurmctldTimeout=3000
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
@@ -74,20 +77,19 @@ SchedulerType={{ slurmschedulertype }}
#SchedulerPort=
#SchedulerRootFilter=
SelectType={{ slurmselecttype }}
{% if slurmselecttype.find("cons_res") > 0 %}
{% if slurmselecttype.find("cons_tres") > 0 %}
SelectTypeParameters=CR_Core_Memory
{% endif %}
FastSchedule={{ slurmfastschedule }}
#PriorityType=priority/multifactor
PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
##PriorityWeightFairshare=10000
#PriorityWeightAge=10000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=10000
#PriorityMaxAge=14-0
PriorityWeightFairshare=10000
PriorityWeightAge=10000
PriorityWeightPartition=10000
PriorityWeightJobSize=10000
PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
@@ -117,24 +119,36 @@ JobCompType=jobcomp/none
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% else %}
Prolog=/opt/slurm/etc/slurm.prolog
Epilog=/opt/slurm/etc/slurm.epilog
{% endif %}
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmdbd }}
{% if slurmdbdbackup is defined %}
AccountingStorageBackupHost={{ slurmdbdbackup }}
{% endif %}
#AccountingStorageEnforce=limits,safe
AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
#
HealthCheckInterval=300
HealthCheckProgram={{ nhc_dir }}/sbin/nhc
#array jobs. max number
{% if slurm_max_array_size is defined %}
MaxArraySize={{ slurm_max_array_size }}
{% endif %}
# Fair share
{% if slurmfairshare.def %}
@@ -152,7 +166,7 @@ MpiParams=ports=12000-12999
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total - 1024 }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
{% endfor %}
{% for queue in slurmqueues %}
@@ -161,5 +175,5 @@ NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansib
{% if nodenames.append(hostvars[node]['ansible_hostname']) %}
{% endif %}
{% endfor %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ nodenames|join(',') }} DefaultTime=24:00:00 State=UP
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ nodenames|join(',') }} {% if queue.DefaultTime is defined %} DefaultTime={{ queue.DefaultTime }} {% endif %} {% if queue.DefMemPerCPU is defined %} DefMemPerCPU={{ queue.DefMemPerCPU }} {% endif %} {% if queue.MaxTime is defined %} MaxTime={{ queue.MaxTime}} {% endif %} State=UP
{% endfor %}
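To make the templated node and partition lines concrete: a host with 48 vCPUs, one thread per core and 196608 MB of RAM, plus a default queue that sets the new per-queue options, would render roughly as follows (all facts and queue values here are hypothetical):

NodeName=node01 Procs=48 RealMemory=195584 Sockets=48 CoresPerSocket=1 ThreadsPerCore=1 Weight=8 Feature=stage1 State=UNKNOWN
PartitionName=batch Default=yes Nodes=node01,node02 DefaultTime=2-00:00:00 DefMemPerCPU=4000 MaxTime=7-00:00:00 State=UP

The 1024 MB subtracted from RealMemory presumably leaves headroom for the OS, and the Weight brackets (1, 3, 5, 7, 8, 9, 10) make Slurm prefer smaller nodes before larger ones.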
@@ -32,7 +32,7 @@ LogFile={{ slurmdbdlog.log }}
#DebugLevel=
#LogFile=
{% endif %}
PidFile=/var/run/slurmdbd.pid
PidFile=/opt/slurm/var/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
@@ -2,15 +2,15 @@
-
name: Removing the RDO repository
file: path=/etc/yum.repos.d/rdo-release.repo state=absent
sudo: true
become: true
-
name: Install epel-release
yum: name=epel-release-7-5.noarch state=present
sudo: true
become: true
-
name: Enable epel
command: yum-config-manager --enable epel
sudo: true
become: true
-
name: Installing Base Packages
yum: name={{ item }} state=present
@@ -18,14 +18,14 @@
- yum-utils
- deltarpm-3.6-3.el7.x86_64
- yum-plugin-versionlock
sudo: true
become: true
-
name: Installing Core packages
yum: name="{{ item.software }}-{{ item.version }}.{{ item.arch }}" state=present
with_items: package_list
sudo: true
become: true
-
name: Performing version lock on the packages
shell: yum versionlock \*
sudo: true
become: true
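The "Installing Core packages" task expects a package_list of pinned versions; a hypothetical entry would look like:

package_list:
  - { software: 'htop', version: '2.2.0-3.el7', arch: 'x86_64' }   # hypothetical package and version

yum then installs htop-2.2.0-3.el7.x86_64, and the final task version-locks everything currently installed.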