#- hosts: 'all'
#gather_facts: false # not sure if false is clever here
#tasks:
#- include_vars: vars/ldapConfig.yml
#- include_vars: vars/filesystems.yml
#- include_vars: vars/slurm.yml
#- include_vars: vars/vars.yml
#- { name: set use shared state, set_fact: usesharedstatedir=False }
#tags: [ always ]
# this playbook is roughly sorted by
# - host group topics like ComputeNodes or ComputeNodes,LoginNodes, and last VisNodes
# - "tag_groups", each starting after a #comment, e.g. #misc or the misc tag
- hosts: 'ComputeNodes'
  gather_facts: false
  tasks:
    # these are just templates.
    # Note the tag "never"! Everything tagged never is only executed if called explicitly, i.e. ansible-playbook --tags=foo,bar OR --tags=tag_group
    # (an invocation sketch follows this playbook)
    - { name: template_shell, shell: ls, tags: [never,tag_group,uniquetag_foo] }
    - { name: template_command, command: uname chdir=/bin, tags: [never,tag_group,uniquetag_bar] }
    - { name: template_script, script: ./scripts/qa/test.sh, tags: [never,tag_group,uniquetag_script] }
    #mpi stuff
    - { name: run mpi on one computenode, command: ls, args: {chdir: "/tmp"}, failed_when: "TODO is TRUE", tags: [never,mpi,mpi_local,TODO] }
    - { name: run mpi on two computenodes, command: ls, args: {chdir: "/tmp"}, failed_when: "TODO is TRUE", tags: [never,mpi,mpi_local_two,TODO] }
    #- { name: run mpi via sbatch, command: cmd=ls chdir="/tmp", failed_when: "TODO is TRUE", tags: [never,mpi,slurm_mpi,TODO] }
    #- { name: mpi_pinging, command: cmd=ls chdir="/tmp", failed_when: "TODO is TRUE", tags: [never,mpi,mpi_ping,TODO] }
    #module load openmpi/3.1.6-ucx;mpirun --mca btl self --mca pml ucx -x UCX_TLS=mm -n 24 /projects/pMOSP/mpi/parallel_mandelbrot/parallel/mandelbrot
    #module load openmpi/3.1.6-ucx;srun mpirun --mca btl self --mca pml ucx -x UCX_TLS=mm -n 24 /projects/pMOSP/mpi/parallel_mandelbrot/parallel/mandelbrot
    #slurm
    - { name: slurmd should be running, service: name=slurmd state=started, tags: [never,slurm,slurmd] }
    - { name: munged should be running, service: name=munged state=started, tags: [never,slurm,munged] }
    - { name: ensure connectivity to the controller, shell: scontrol ping, tags: [never,slurm,scontrol_ping] }
    - { name: the most simple srun test, shell: srun --reservation=AWX hostname, tags: [never,slurm,srun_hostname] }
    #nhc, run nhc manually because it contains many tests
    - { name: run nhc explicitly, command: /opt/nhc-1.4.2/sbin/nhc -c /opt/nhc-1.4.2/etc/nhc/nhc.conf, become: true, tags: [never,slurm,nhc] }
    # networking
    - { name: ping license server, shell: ls, tags: [never,network,ping_license] }
    - { name: ping something outside monash, command: ping -c 1 8.8.8.8, tags: [never,network,ping_external] }
#mounts
- hosts: 'ComputeNodes,LoginNodes'
  gather_facts: false
  tasks:
    - { name: check mount for usr_local, shell: "mount | grep -q local", tags: [never,mountpoints,mountpoints_local] }
    - { name: check mount for projects, shell: "lfs df -h", tags: [never,mountpoints_projects] }
    - { name: check mount for home, shell: "mount | grep -q home", tags: [never,mountpoints,mountpoints_home] }
    - { name: check mount for scratch, shell: "mount | grep -q scratch", tags: [never,mountpoints_scratch] }
    #misc
    - { name: check singularity, shell: module load octave && octave --version, tags: [never,misc,singularity3] }
    - { name: module test, shell: cmd="module load gcc" executable="/bin/bash", tags: [never,misc,modulecmd] }
    - { name: contact ldap, shell: maybe test ldapsearch, failed_when: "TODO is TRUE", tags: [never,misc,ldap,TODO] }
#gpu
- hosts: 'VisNodes'
  gather_facts: false
  tasks:
    - { name: run nvidia-smi to see if a gpu driver is present, command: "/bin/nvidia-smi", tags: [never,gpu,smi] }
    - { name: run gpu burn defaults to 30 seconds, command: "/usr/local/gpu_burn/1.0/run_silent.sh", tags: [never,gpu,long,gpuburn] }
# extended, time-consuming tests
# relion, see https://docs.massive.org.au/communities/cryo-em/tuning/tuning.html
# linpack
#module load openmpi/1.10.7-mlx;ldd /usr/local/openmpi/1.10.7-mlx/bin/* | grep -ic found
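Everything in this QA playbook carries the never tag, so nothing runs by default; a tag group has to be requested explicitly with --tags. A minimal invocation sketch, assuming the playbook is saved as qa.yml and the inventory is a file called hosts (both hypothetical names, not part of the repository):

# Minimal sketch (hypothetical file names qa.yml and hosts): run selected
# "never"-tagged QA groups from the playbook above via ansible-playbook --tags.
import subprocess
import sys

def run_qa(tags, playbook='qa.yml', inventory='hosts'):
    # only tasks whose tags are listed here will run; everything else stays skipped
    cmd = ['ansible-playbook', '-i', inventory, playbook, '--tags', ','.join(tags)]
    return subprocess.call(cmd)

if __name__ == '__main__':
    sys.exit(run_qa(['slurm', 'mountpoints']))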
#!/usr/bin/python
import subprocess
import sys

def getTime():
    # ask for a walltime in sbatch style, e.g. "2-12" or "2-12:00:00"
    print "How long do you think you need this computer for?"
    print "If you need the computer for 2 days and 12 hours please enter as 2-12 or 2-12:00:00"
    time=sys.stdin.readline().strip()
    try:
        (days,hours)=time.split('-')
    except:
        days=0
        hours=time
    try:
        (hours,minutes)=time.split(':')
    except:
        pass
    return (days,hours)
def getNCPUs():
    print "How many CPUs would you like?"
    cpus=None
    while cpus==None:
        cpustr=sys.stdin.readline().strip()
        try:
            cpus=int(cpustr)
        except:
            print "Sorry I can't interpret %s as a number"%cpustr
            print "How many CPUs would you like?"
    return cpus

def getRAM():
    print "How much RAM would you like (press enter for the default)?"
    ramstr=sys.stdin.readline().strip()
    while ramstr!=None and ramstr!="":
        try:
            ram=int(ramstr)
            return ram
        except:
            print "Sorry I can't interpret %s as a number"%ramstr
            print "How much RAM would you like?"
            ramstr=sys.stdin.readline()
    return None
def subjob(time,cpus,ram):
    # submit the batch job via sbatch and return the slurm job id
    if ram==None:
        ram=cpus*2000
    scriptpath='/home/chines'
    p=subprocess.Popen(['sbatch','--time=%s-%s'%(time[0],time[1]),'--nodes=1','--mincpu=%s'%cpus,'--mem=%s'%ram,'%s/mbpjob.sh'%scriptpath],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    (stdout,stderr)=p.communicate()
    import re
    m=re.match('Submitted batch job (?P<jobid>[0-9]+)',stdout)
    if m:
        return m.groupdict()['jobid']
def isState(jobid,state='RUNNING'):
    import re
    p=subprocess.Popen(['scontrol','show','job','-d',jobid],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    (stdout,stderr)=p.communicate()
    jobidre=re.compile('JobId=(?P<jobid>[0-9]+)\s')
    statere=re.compile('^\s+JobState=(?P<state>\S+)\s')
    currentjobid=None
    for l in stdout.splitlines():
        m=jobidre.match(l)
        if m:
            currentjobid=m.groupdict()['jobid']
        m=statere.match(l)
        if m:
            if m.groupdict()['state']==state:
                if jobid==currentjobid:
                    return True
            else:
                if jobid==currentjobid:
                    return False
    return False
def waitjob(jobid):
    import time
    while True:
        if isState(jobid,'RUNNING'):
            return
        else:
            print "job %s not running"%jobid
            time.sleep(1)
def listJobs():
    # list the current user's jobs as dicts with jobid, remaining time, name and cpus
    import re
    r=[]
    user=subprocess.check_output(['whoami']).strip()
    jobs=subprocess.check_output(['squeue','-u',user,'-h','-o','%i %L %j %c'])
    jobre=re.compile("(?P<jobid>[0-9]+) (?P<time>\S+) (?P<jobname>\S+) (?P<cpus>[0-9]+)$")
    for l in jobs.splitlines():
        m=jobre.search(l)
        if m:
            r.append(m.groupdict())
    return r
def getNode(jobid):
    import re
    stdout=subprocess.check_output(['scontrol','show','job','-d',jobid])
    for l in stdout.splitlines():
        m=re.search('^\s+Nodes=(?P<nodelist>\S+)\s',l)
        if m:
            nodes=m.groupdict()['nodelist'].split(',')
            return nodes[0]
def createJob(*args,**kwargs):
    time=getTime()
    #cpus=getNCPUs()
    cpus=1
    #ram=getRAM()
    ram=None
    subjob(time,cpus,ram)
def selectJob(jobidlist):
    if len(jobidlist)==1:
        return jobidlist[0]['jobid']
    else:
        print "Please select a job (or press enter to cancel)"
        i=1
        print "\tJob name\tNum CPUs\tRemaining Time"
        for j in jobidlist:
            print "%s\t%s\t%s\t%s"%(i,j['jobname'],j['cpus'],j['time'])
            i=i+1
        try:
            jobnum=int(sys.stdin.readline().strip())
            if (jobnum>0 and jobnum<=len(jobidlist)):
                return jobidlist[jobnum-1]['jobid']
        except:
            pass
        return None
def connect(*args,**kwargs):
    jobidlist=listJobs()
    jobid=selectJob(jobidlist)
    if jobid!=None:
        waitjob(jobid)
        node=getNode(jobid)
        print node

def stopjob(jobid):
    # cancel the selected slurm job
    subprocess.call(['scancel',jobid])

def stop(*args,**kwargs):
    jobidlist=listJobs()
    jobid=selectJob(jobidlist)
    if jobid!=None:
        stopjob(jobid)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers()
    start_parser = subparser.add_parser('start', help='allocate a node to the user')
    start_parser.set_defaults(func=createJob)
    connect_parser = subparser.add_parser('connect')
    connect_parser.set_defaults(func=connect)
    stop_parser = subparser.add_parser('stop')
    stop_parser.set_defaults(func=stop)
    args = parser.parse_args()
    args.func(args)
    try:
        jobidlist=listJobs()
        if len(jobidlist)>1:
            print "cancel all jobs here"
        jobidlist=listJobs()
        if len(jobidlist)==0:
            time=getTime()
            #cpus=getNCPUs()
            cpus=1
            #ram=getRAM()
            ram=None
            subjob(time,cpus,ram)
            jobidlist=listJobs()
        if len(jobidlist)==1:
            jobid=jobidlist[0]['jobid']
            waitjob(jobid)
            node=getNode(jobid)
            print node
            sys.exit(0)
    except Exception as e:
        print e
        import traceback
        print traceback.format_exc()
        sys.exit(1)

main()
#!/bin/bash
mbpctrl='/home/hines/mbp_script/get_node.py'
node=$( $mbpctrl $1 )
if [[ $node ]]; then
  ssh -t $node tmux attach-session
fi
---
- name: make sure /usr/local/bin exists
  file: path=/usr/local/bin state=directory mode=755 owner=root
  become: true
- name: install get_node.py
  copy: src=get_node.py dest=/usr/local/bin/get_node.py mode=755 owner=root
  become: true
- name: install mbp_node
  copy: src=mbp_node dest=/usr/local/bin/mbp_node mode=755 owner=root
  become: true
---
# This role fixes a misconfiguration of some OpenStack base images at Monash University.
# The misconfiguration: /dev/vdb is mounted in the image's fstab, but the OpenStack flavour does not provide a second disk.
- name: unmount vdb if absent
  mount:
    path: "/mnt"
    src: "/dev/vdb"
    state: absent
  become: true
  when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined'
- name: keep /mnt present
  file:
    path: "/mnt"
    owner: root
    group: root
    mode: "u=rwx,g=rx,o=rx"
    state: directory
  become: true
  when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined'
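The role only acts when Ansible's gathered facts report no vdb device on the host. A minimal stand-alone sketch of roughly the same guard (illustration only, not part of the role): it flags the case where /etc/fstab still references /dev/vdb although the flavour provided no such disk.

# Minimal sketch (illustration only, not part of the role): detect the image
# misconfiguration the tasks above repair, i.e. /dev/vdb in fstab without the device.
import os

def vdb_misconfigured(fstab='/etc/fstab'):
    has_device = os.path.exists('/dev/vdb')  # does the flavour provide a second disk?
    with open(fstab) as f:
        in_fstab = any(line.split()[:1] == ['/dev/vdb']
                       for line in f
                       if line.strip() and not line.startswith('#'))
    return in_fstab and not has_device

if __name__ == '__main__':
    print(vdb_misconfigured())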
---
- name: restart openvpn
  service: name=openvpn state=restarted
  become: true
---
-
  copy: "src=/tmp/{{ inventory_hostname }}/ca.crt dest=/etc/openvpn/ca.crt mode=644 owner=root group=root"
  name: "Copying CA certificate"
  when: "client_ca_cert.stat.exists == false"
-
  copy: "src=/tmp/{{ inventory_hostname }}/{{ inventory_hostname }}.crt dest=/etc/openvpn/{{ inventory_hostname }}.crt mode=644 owner=root group=root"
  name: "Copying Client certificate"
  when: "client_sign_cert.stat.exists == false"
-
  copy: "src=/tmp/{{ inventory_hostname }}/{{ inventory_hostname }}.key dest=/etc/openvpn/{{ inventory_hostname }}.key mode=600 owner=root group=root"
  name: "Copying Client key"
  when: "client_key.stat.exists == false"
---
- name: "Install OpenVPN"
  yum: "name=openvpn state=present"
  become: true
  notify: restart openvpn
- name: "Copying client.conf to the OpenVPN client"
  template: "src=client.conf.j2 dest=/etc/openvpn/client.conf"
  become: true
  notify: restart openvpn
- include: installOpenVPN.yml
- name: "Start OpenVPN"
  service: name=openvpn state=started enabled=yes
  become: true
---
- name: restart openvpn
  service: name=openvpn state=restarted
  become: true
---
- name: "Install OpenVPN"
  yum: "name=openvpn state=present"
  notify: "restart openvpn"
  become: true
- name: Create path
  shell: mkdir -p {{ dhparms_file | dirname }}
  args:
    creates: "{{ dhparms_file | dirname }}"
  become: true
- name: "Generate DH parameters"
  shell: openssl dhparam -out {{ dhparms_file }} 512
  args:
    creates: "{{ dhparms_file }}"
  become: true
- name: "Configure OpenVPN Server"
  template: "src=server.conf.j2 dest=/etc/openvpn/server.conf"
  notify: "restart openvpn"
  become: true
- include: installOpenVPN.yml
- name: "Start OpenVPN"
  service: name=openvpn state=started enabled=yes
  become: true
- name: install known hosts file
  copy: src=files/ssh_known_hosts dest=/etc/ssh/ssh_known_hosts owner=root mode=644
  become: true
  become_user: root
- name: setup additional PATHs in /etc/profile.d
  template:
    src: additional_paths.sh.j2
    dest: /etc/profile.d/additional_paths.sh
  become: true
  when: additional_paths is defined
export PATH=$PATH:{{ additional_paths|join(":") }}
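The line above is the whole additional_paths.sh.j2 template; the join(":") filter turns the additional_paths list into colon-separated PATH entries. A minimal sketch of what the rendering produces, using the jinja2 library directly with placeholder paths:

# Minimal sketch: render the additional_paths.sh.j2 one-liner outside Ansible.
# The two example paths are placeholders, not values shipped with the role.
from jinja2 import Template

template = Template('export PATH=$PATH:{{ additional_paths|join(":") }}')
print(template.render(additional_paths=['/usr/local/slurm/bin', '/opt/nhc-1.4.2/sbin']))
# -> export PATH=$PATH:/usr/local/slurm/bin:/opt/nhc-1.4.2/sbin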
---
- name: place /usr/local/ last in the PATH in /etc/profile
  lineinfile:
  args:
    dest: "/etc/profile"
    insertbefore: BOF
    line: "PATH=/bin:/usr/bin:/usr/local/bin"
  become: true
  become_user: root
- name: remove old line
  lineinfile:
  args:
    dest: "/etc/profile"
    regexp: "^PATH=/usr/local/bin:/bin:/usr/bin$"
    state: absent
  become: true
  become_user: root
- name: remove /usr/local/ from the PATH in /etc/profile
  lineinfile:
  args:
    dest: "/etc/profile"
    regexp: ".*pathmunge /usr/local.*"
    state: absent
  become: true
  become_user: root
- name: don't execute abrt-cli on login
  file: path=/etc/profile.d/abrt-console-notification.sh state=absent
  become: true
  become_user: root
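These lineinfile tasks pin /etc/profile's search order to /bin:/usr/bin:/usr/local/bin and drop the old pathmunge entry, so binaries under /usr/local can no longer shadow the system copies. A minimal sketch of that left-to-right lookup order (illustration only, not part of the role):

# Minimal sketch (illustration only): resolve a command name along the PATH
# order set above; with /usr/local/bin last, the system copy wins.
import os

def resolve(cmd, path="/bin:/usr/bin:/usr/local/bin"):
    for d in path.split(":"):          # walk PATH left to right, like the shell
        candidate = os.path.join(d, cmd)
        if os.access(candidate, os.X_OK):
            return candidate
    return None

if __name__ == '__main__':
    print(resolve("python"))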
---
-
  name: "Install Apache2"
  apt: name={{ item }} state=present
  with_items:
    - apache2
    - apache2-dev
  become: true
-
  name: "Templating default-ssl site"
  template: src=default-ssl.j2 dest=/etc/apache2/sites-available/default-ssl.conf owner=www-data group=www-data
  become: true
-
  name: "Templating default site"
  template: src=default.j2 dest=/etc/apache2/sites-available/000-default.conf owner=www-data group=www-data
  become: true
-
  name: "Enable ssl module"
  apache2_module: state=present name=ssl
  become: true
-
  name: "Enable default-ssl site"
  shell: a2ensite default-ssl
  become: true
  notify: restart apache2
---
-
  name: "Installing Apache"
  become: true
  yum: name={{ item }} state=present
  with_items:
    - mod_ssl
    - mod_wsgi
    - openssl
    - httpd
    - httpd-devel
-
  name: Setting httpd.conf
  become: true
  replace: dest=/etc/httpd/conf/httpd.conf regexp="^#ServerName www.example.com:80" replace="ServerName {{ ansible_fqdn }}"
-
  name: "Templating default-ssl site"
  template: src=default-ssl.j2 dest=/etc/httpd/conf.d/ssl.conf owner=apache group=apache
  become: true
-
  name: Templating wsgi.conf
  become: true
  template: src=wsgi.conf.j2 dest=/etc/httpd/conf.d/wsgi.conf owner=root group=root
-
  name: Restarting Apache
  become: true
  service: name=httpd state=restarted