#- hosts: 'all'
#  gather_facts: false # TODO: check whether gather_facts: false is appropriate here
#  tasks:
#  - include_vars: vars/ldapConfig.yml
#  - include_vars: vars/filesystems.yml
#  - include_vars: vars/slurm.yml
#  - include_vars: vars/vars.yml
#  - { name: set use shared state, set_fact: usesharedstatedir=False }
#    tags: [ always ]

# this playbook is roughly sorted by
# - host group topics like ComputeNodes or ComputeNodes,LoginNodes, last VisNodes
# - "tag_groups", each starting after a #comment, see #misc or the misc tag

- hosts: 'ComputeNodes'
  gather_facts: false
  tasks:
  # these are just templates. Note the tag "never"! Everything tagged never is
  # only executed when called explicitly, i.e. ansible-playbook --tags=foo,bar
  # or --tags=tag_group; see the example invocations after the #misc section below
  - { name: template_shell, shell: ls, tags: [never,tag_group,uniquetag_foo] }
  - { name: template_command, command: uname, args: {chdir: /bin}, tags: [never,tag_group,uniquetag_bar] }
  - { name: template_script, script: ./scripts/qa/test.sh, tags: [never,tag_group,uniquetag_script] }

  #mpi stuff
  - { name: run mpi on one computenode, command: ls, args: {chdir: "/tmp"}, failed_when: "TODO is TRUE", tags: [never,mpi,mpi_local,TODO] }
  - { name: run mpi on two computenodes, command: ls, args: {chdir: "/tmp"}, failed_when: "TODO is TRUE", tags: [never,mpi,mpi_local_two,TODO] }
  #- { name: run mpi via sbatch, command: ls, args: {chdir: "/tmp"}, failed_when: "TODO is TRUE", tags: [never,mpi,slurm_mpi,TODO] }
  #- { name: mpi_pinging, command: ls, args: {chdir: "/tmp"}, failed_when: "TODO is TRUE", tags: [never,mpi,mpi_ping,TODO] }
  #module load openmpi/3.1.6-ucx;mpirun --mca btl self --mca pml ucx -x UCX_TLS=mm -n 24 /projects/pMOSP/mpi/parallel_mandelbrot/parallel/mandelbrot
  #module load openmpi/3.1.6-ucx;srun mpirun --mca btl self --mca pml ucx -x UCX_TLS=mm -n 24 /projects/pMOSP/mpi/parallel_mandelbrot/parallel/mandelbrot

  #slurm
  - { name: slurmd should be running, service: name=slurmd state=started, tags: [never,slurm,slurmd] }
  - { name: munged should be running, service: name=munged state=started, tags: [never,slurm,munged] }
  - { name: ensure connectivity to the controller, shell: scontrol ping, tags: [never,slurm,scontrol_ping] }
  - { name: the most simple srun test, shell: srun --reservation=AWX hostname, tags: [never,slurm,srun_hostname] }

  #nhc, run nhc manually because it contains many tests
  - { name: run nhc explicitly, command: /opt/nhc-1.4.2/sbin/nhc -c /opt/nhc-1.4.2/etc/nhc/nhc.conf, become: true, tags: [never,slurm,nhc] }

  # networking
  - { name: ping license server, shell: ls, tags: [never,network,ping_license] }
  - { name: ping something outside monash, command: ping -c 1 8.8.8.8, tags: [never,network,ping_external] }

#mounts
- hosts: 'ComputeNodes,LoginNodes'
  gather_facts: false
  tasks:
  - { name: check mount for usr_local, shell: "mount | grep -q local", tags: [never,mountpoints,mountpoints_local] }
  - { name: check mount for projects, shell: "lfs df -h", tags: [never,mountpoints_projects] }
  - { name: check mount for home, shell: "mount | grep -q home", tags: [never,mountpoints,mountpoints_home] }
  - { name: check mount for scratch, shell: "mount | grep -q scratch", tags: [never,mountpoints_scratch] }

  #misc
  - { name: check singularity, shell: module load octave && octave --version, tags: [never,misc,singularity3] }
  - { name: module test, shell: module load gcc, args: {executable: /bin/bash}, tags: [never,misc,modulecmd] }
  - { name: contact ldap, shell: maybe test ldapsearch, failed_when: "TODO is TRUE", tags: [never,misc,ldap,TODO] }
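# Example invocations for the tagged tests above (a minimal sketch; the
# playbook filename "qa.yml" and the inventory path are assumptions, adjust
# them to this repo's layout). Without --tags nothing runs, because every
# task carries the "never" tag:
#   ansible-playbook -i inventory qa.yml --tags=mountpoints
#   ansible-playbook -i inventory qa.yml --tags=slurm,network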
#gpu
- hosts: 'VisNodes'
  gather_facts: false
  tasks:
  - { name: run nvidia-smi to see if a gpu driver is present, command: "/bin/nvidia-smi", tags: [never,gpu,smi] }
  - { name: run gpu burn with the default of 30 seconds, command: "/usr/local/gpu_burn/1.0/run_silent.sh", tags: [never,gpu,long,gpuburn] }

# extended, time-consuming tests
# relion, see https://docs.massive.org.au/communities/cryo-em/tuning/tuning.html
# linpack
#module load openmpi/1.10.7-mlx;ldd /usr/local/openmpi/1.10.7-mlx/bin/* | grep -ic found
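# A sketch of how the ldd check above could be wrapped as a tagged task in the
# style of this playbook (the module name and paths are taken from the comment
# above; the host group and tag names are assumptions), kept commented out like
# the other pending tests:
#- hosts: 'ComputeNodes'
#  gather_facts: false
#  tasks:
#  - { name: count unresolved libraries in openmpi binaries, shell: "module load openmpi/1.10.7-mlx; ldd /usr/local/openmpi/1.10.7-mlx/bin/* | grep -ic found", args: {executable: /bin/bash}, failed_when: "TODO is TRUE", tags: [never,extended,ldd_openmpi,TODO] }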