#- hosts: 'all'
#gather_facts: false # not sure if false is clever here
#tasks:
#- include_vars: vars/ldapConfig.yml
#- include_vars: vars/filesystems.yml
#- include_vars: vars/slurm.yml
#- include_vars: vars/vars.yml
#- { name: set use shared state, set_fact: usesharedstatedir=False }
#tags: [ always ]
# - hostgroup plays such as ComputeNodes or ComputeNodes,LoginNodes, and lastly VisNodes
# - "tag_groups" of tasks, each starting after a #comment, e.g. the #misc comment / misc tag
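# tip: every tag defined below can be listed with --list-tags, e.g.
#   ansible-playbook <this_playbook>.yml --list-tags   (playbook filename assumed)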
- hosts: 'ComputeNodes'
gather_facts: false
tasks:
#Note the tag never! Everything tagged never is only executed when called explicitly, i.e. ansible-playbook --tags=foo,bar OR --tags=tag_group
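#Example invocation (inventory/playbook names assumed): ansible-playbook -i hosts <this_playbook>.yml --tags=slurm
#runs all slurm checks below; a single check can be selected via its unique tag, e.g. --tags=srun_hostname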
- { name: template_shell, shell: ls, tags: [never,tag_group,uniquetag_foo] }
- { name: template_command, command: uname, args: {chdir: /bin}, tags: [never,tag_group,uniquetag_bar] }
- { name: template_script, script: ./scripts/qa/test.sh, tags: [never,tag_group,uniquetag_script] }
#mpi stuff
- { name: run mpi on one computenode, command: ls, args: {chdir: "/tmp"} , failed_when: "TODO is TRUE", tags: [never,mpi,mpi_local,TODO] }
- { name: run mpi on two computenodes, command: ls, args: {chdir: "/tmp"} , failed_when: "TODO is TRUE", tags: [never,mpi,mpi_local_two,TODO] }
#- { name: run mpi via sbatch, command: cmd=ls chdir="/tmp" , failed_when: "TODO is TRUE", tags: [never,mpi,slurm_mpi,TODO] }
#- { name: mpi_pinging, command: cmd=ls chdir="/tmp" , failed_when: "TODO is TRUE", tags: [never,mpi,mpi_ping,TODO] }
#module load openmpi/3.1.6-ucx;mpirun --mca btl self --mca pml ucx -x UCX_TLS=mm -n 24 /projects/pMOSP/mpi/parallel_mandelbrot/parallel/mandelbrot
#module load openmpi/3.1.6-ucx;srun mpirun --mca btl self --mca pml ucx -x UCX_TLS=mm -n 24 /projects/pMOSP/mpi/parallel_mandelbrot/parallel/mandelbrot
#slurm
- { name: slurmd should be running, service: name=slurmd state=started, tags: [never,slurm,slurmd] }
- { name: munged should be running, service: name=munged state=started, tags: [never,slurm,munged] }
- { name: ensure connectivity to the controller, shell: scontrol ping, tags: [never,slurm,scontrol_ping] }
- { name: the most simple srun test, shell: srun --reservation=AWX hostname, tags: [never,slurm,srun_hostname] }
#nhc, manually run nhc because it contains many tests
- { name: run nhc explicitly, command: /opt/nhc-1.4.2/sbin/nhc -c /opt/nhc-1.4.2/etc/nhc/nhc.conf, become: true , tags: [never,slurm,nhc] }
# networking
- { name: ping license server, shell: ls, tags: [never,network,ping_license] }
- { name: ping something outside monash, command: ping -c 1 8.8.8.8, tags: [never,network,ping_external] }
#mounts
- hosts: 'ComputeNodes,LoginNodes'
gather_facts: false
tasks:
- { name: check mount for usr_local, shell: "mount | grep -q local", tags: [never,mountpoints,mountpoints_local] }
- { name: check mount for projects, shell: "lfs df -h", tags: [never,mountpoints_projects] }
- { name: check mount for home, shell: "mount | grep -q home", tags: [never,mountpoints,mountpoints_home] }
- { name: check mount for scratch, shell: "mount | grep -q scratch" , tags: [never,mountpoints_scratch] }
#misc
- { name: check singularity, shell: module load octave && octave --version, tags: [never,misc,singularity3] }
- { name: module test, shell: module load gcc, args: {executable: /bin/bash}, tags: [never,misc,modulecmd] }
- { name: contact ldap, shell: maybe test ldapsearch, failed_when: "TODO is TRUE", tags: [never,misc,ldap,TODO] }
#gpu
- hosts: 'VisNodes'
gather_facts: false
tasks:
- { name: run nvidia-smi to see if a gpu driver is present, command: "/bin/nvidia-smi", tags: [never,gpu,smi] }
- { name: run gpu_burn (defaults to 30 seconds), command: "/usr/local/gpu_burn/1.0/run_silent.sh", tags: [never,gpu,long,gpuburn] }
# extended time-consuming tests
# relion see https://docs.massive.org.au/communities/cryo-em/tuning/tuning.html
# linpack
#module load openmpi/1.10.7-mlx;ldd /usr/local/openmpi/1.10.7-mlx/bin/* | grep -ic found