---
desktopNodeList:
- { name : 'DesktopNodes', interface : 'eth0' }
slurmqueues:
- {name: batch, group: ComputeNodes, default: yes}
slurmsharedstatedir: /slurm_state
slurm_version: {{ slurm_version }}
munge_version: {{ munge_version }}
nhc_version: {{ nhc_version }}
ucx_version: {{ ucx_version }}
clustername: "{{ clustername }}"
nhc_src_url: {{ nhc_src_url }}
nhc_src_checksum: {{ nhc_src_checksum }}
nhc_src_dir: {{ nhc_src_dir }}
nhc_dir: {{ nhc_dir }}
slurm_src_url: {{ slurm_src_url }}
slurm_src_checksum: {{ slurm_src_checksum }}
slurm_src_dir: {{ slurm_src_dir }}
slurm_dir: {{ slurm_dir }}
ucx_src_url: {{ ucx_src_url }}
ucx_src_checksum: {{ ucx_src_checksum }}
ucx_src_dir: {{ ucx_src_dir }}
ucx_dir: {{ ucx_dir }}
munge_src_url: {{ munge_src_url }}
munge_src_checksum: {{ munge_src_checksum }}
munge_src_dir: {{ munge_src_dir }}
munge_dir: {{ munge_dir }}
nhc_config_file: nhc.conf
nhc_log_level: 0
nhc_emails: nobody@nowhere.nowhere
nhc_email_subject: "Node Health Check"
mysql_host: "{{ sqlnode }}"
slurmctrl: "{{ controller }}"
slurmctrlbackup: "{{ backup }}"
slurmdbd: "{{ controller }}"
slurmdbdpiddir: "/opt/slurm/var/run"
slurmdbdbackup: "{{ backup }}"
slurm_lua: true
slurmlogin: "{{ controller }}"
slurmlogdir: "/var/log"
slurmctlddebug: {level: 5, log: '/mnt/slurm-logs/slurmctld.log'}
slurmddebug: {level: 5, log: '/var/log/slurmd.log'}
slurmschedlog: {level: 5, log: '/mnt/slurm-logs/slurmsched.log'}
slurmdbdlog: {level: 5, log: '/mnt/slurm-logs/slurmdbd.log'}
slurmfairshare: {def: false, val: 10000}
slurmdatadir: "/opt/slurm/var/spool"
slurmstatedir: "/opt/slurm/var/state"
slurmpiddir: "/opt/slurm/var/run"
slurmselecttype: "select/cons_tres"
slurmfastschedule: "1"
slurmschedulertype: "sched/backfill"
restartServerList:
- slurm
#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
#DbdAddr=
DbdHost={{ controller }}
DbdBackupHost={{ backup }}
#DbdPort=7031
SlurmUser=slurm
#MessageTimeout=300
#DefaultQOS=normal,standby
#DebugLevel=
#LogFile=
PidFile=/opt/slurm/var/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost={{ sqlnode }}
#StoragePort=1234
StoragePass={{ slurmdb_passwd }}
StorageUser=slurmdb
StorageLoc=slurm_acct_db
Host 192.168.*
  ProxyJump {{ ansible_user }}@{{ bastion_floating_ip }}
  StrictHostKeyChecking no
  ServerAliveInterval 60
  ControlMaster auto
  ControlPath ~/.ssh/%r@%h:%p
  UserKnownHostsFile ./known_hosts
  GlobalKnownHostsFile /dev/null
---
sudo_group: systems
nagios_home: "/var/lib/nagios"
nvidia_version: "450.51.06"
syslog_server: "{{ sqlnode }}"
#!/bin/bash
#source /etc/profile.d/modulecmd.sh
#source /etc/profile.d/modules.sh
# Ubuntu is very picky, so let's skip it
/bin/grep Ubuntu -q /etc/issue && exit 0
module purge
#module load gcc/8.1.0
module list
#gcc --version | grep 8.1.0
../ComputeNodes/modules.sh
#!/bin/bash
OUTPUT_LOG=$(realpath ${1-slurmtest.out})
if ! type "scontrol" > /dev/null; then
echo "cannot find slurm"
exit 1
fi
SLURM_DIR=${2-$(dirname $(dirname $(which scontrol)))}
#SLURM_DIR=$slurm_dir
#if [[ -d $2 ]];then
# SLURM_SRC_DIR=$2
#else
# SLURM_SRC_DIR=./slurm_src
# git clone https://github.com/SchedMD/slurm.git $SLURM_SRC_DIR
# cd $SLURM_SRC_DIR && ./configure
#fi
#cd $SLURM_SRC_DIR/testsuite/expect
#echo -en "set slurm_dir=$SLURM_DIR\nset max_job_delay 300\n" > globals.local
#make
#echo "log is written to $OUTPUT_LOG"
#echo "slurm dir is defined as $SLURM_DIR"
#./regression > /dev/null 2> $OUTPUT_LOG
#failures="$(sed -n 's/Failures: \(.*\)/\1/p' $OUTPUT_LOG)"
#if (( $failures > 0 ));then
# echo "$failures failures found, refer to $OUTPUT_LOG for log"
# exit 1
#fi
#exit 0
---
- hosts: ManagementNodes
  gather_facts: false
  tasks:
  - name: have ssh running
    service:
      name: sshd
      state: started
This folder should contain tests that will be run automatically by the CICD pipeline.
The trigger mechanism to execute these tests is `.gitlab-ci.yml` in the toplevel folder of this repository.
- *.sh will be executed by a shell, *.yml will be executed by ansible-playbook
- A test should return 0 on success and non-zero on failure. See 0_Example*
- Tests should aim not to alter the system, except under /tmp (see the sketch after this list)
- Prefix tests with 0-9 to give the execution some priority
- Tests should not need elevated privileges. A special user TODO is considered to be present
- TODO a way to deal with Test Payload. This will be determined on test submission
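A minimal sketch of a shell test that follows these conventions is shown below; the file name (say 5_tmp_write.sh) and the /tmp check are hypothetical illustrations, not existing tests in this folder.

#!/bin/bash
# Hypothetical example test: only touches /tmp and reports the result via its exit code.
TMPFILE=$(mktemp /tmp/citest.XXXXXX) || exit 1
echo "hello from the CICD pipeline" > "$TMPFILE" || { rm -f "$TMPFILE"; exit 1; }
rm -f "$TMPFILE"
exit 0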
#!/bin/bash
/bin/false
status=$?
[ $status -eq 1 ]
#!/bin/bash
/bin/true
status=$?
[ $status -eq 0 ]
---
- hosts: ManagementNodes
  gather_facts: false
  tasks:
  - name: have ssh running
    service:
      name: sshd
      state: started

- hosts: ComputeNodes
  gather_facts: false
  tasks:
  - name: have munge service running
    service:
      name: munge
      state: started
---
- hosts: ManagementNodes,LoginNodes,ComputeNodes
  gather_facts: false
  tasks:
  - name: add user hpctest
    user:
      name: hpctest
      shell: /bin/bash
    become: true

- hosts: ManagementNodes
  gather_facts: false
  tasks:
  - name: Create a parent account
    command: ./sacctmgr -i add account parentAccount cluster={{ clustername }} Description="Test parent account" Organization="Monash"
    args:
      chdir: '/opt/slurm-latest/bin'
    become: true
    register: result
    failed_when: result.rc != 0 and result.stdout != " Nothing new added."
  - name: Create a project associated with a given parent
    command: ./sacctmgr -i add account testProject parent=parentAccount cluster={{ clustername }} Organization="Monash"
    args:
      chdir: '/opt/slurm-latest/bin'
    become: true
    register: result
    failed_when: result.rc != 0 and result.stdout != " Nothing new added."
  - name: Create a user and associate them with a project
    command: ./sacctmgr -i create user hpctest cluster={{ clustername }} account=testProject partition=batch
    args:
      chdir: '/opt/slurm-latest/bin'
    become: true
    register: result
    failed_when: result.rc != 0 and result.stdout != " Nothing new added."
# - name: restart slurmctld
#   service:
#     name: slurmctld
#     state: restarted
#   become: true
#- name: reconfigure scontrol
#  command: ./scontrol reconfigure
#  args:
#    chdir: '/opt/slurm-latest/bin'
#  become: true
#  become_user: slurm
#sudo `which sacctmgr` modify user where name=hpctest set maxjobs=200
## 18 sudo `which sacctmgr` update account hpctest set qos=normal
# 22 sudo `which sacctmgr` update account testProject set qos=normal

- hosts: LoginNodes,ComputeNodes,ManagementNodes
  gather_facts: false
  tasks:
  - name: make sure munge is running
    service:
      name: munge
      state: started
    become: true
MPI tests.
Run a test on a single (updated or rebuilt) node, and on two nodes.
run_two_node.sh --newnode=<node> --reservation=<res> [ --partition=<partition> --testnode=<testnode> ]
e.g. run_two_node.sh --newnode=gf00 --reservation=monMaintenance
e.g. run_two_node.sh --newnode=gf00 --reservation=monMaintenance --partition=gpu --testnode=ge00
Where
<node> is the name of the updated host we wish to test
<res> is the name of the SLURM reservation
<partition> is the name of the SLURM partition (defaults to comp)
<testnode> is a second (not updated) node for the two-server MPI run (defaults to gf01)
This code assumes:
- the SLURM controller is up (so there is an MPI environment to use)
- it uses srun to submit the job
- a SLURM reservation is present, and you must specify it
- it uses the openmpi/1.10.7-mlx module
- it uses a timer to kill the shell in case srun hangs; this timer still fires even when
the script exits normally, i.e. it activates after the script has finished (see the sketch below).
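The timer in the last point is a background watchdog subshell. A minimal sketch of the pattern follows; TIMEOUT matches the scripts below, while WATCHDOG_PID and the explicit cleanup are illustrative additions, not part of the existing scripts.

#!/bin/bash
# Watchdog sketch: kill this shell if it is still running after TIMEOUT (e.g. srun hangs).
TIMEOUT=2m
(sleep $TIMEOUT && kill $$) &
WATCHDOG_PID=$!
# ... compile and srun the MPI job here ...
# Optional cleanup: cancel the watchdog on a normal exit so it cannot fire afterwards.
kill $WATCHDOG_PID 2>/dev/null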
#include <iostream>
#include <exception>
#include <string>
#include <cstring>
using namespace std;
#include <stdlib.h>
#include <mpi.h>
#include <unistd.h>
//
//
// rotate exercise from NCI
// passes a number around the ranks of an MPI program
// used to test connectivity. We use non-blocking primitives just because we can
//
void do_work_non_blocking();
int mpirank=0;
const int MES_TAG=1;
#include<iostream>
#include<fstream>
#include<string>
#include<cstdio>   // std::tmpnam, L_tmpnam and remove() used by ssystem()
#include<cstdlib>
#include<sstream>
std::string ssystem (const char *command) {
char tmpname [L_tmpnam];
std::tmpnam ( tmpname );
std::string scommand = command;
std::string cmd = scommand + " >> " + tmpname;
std::system(cmd.c_str());
std::ifstream file(tmpname, std::ios::in );
std::string result;
if (file) {
int c;
while ((c = file.get()) != EOF) result.push_back(static_cast<char>(c)); // avoid appending the EOF marker
file.close();
}
remove(tmpname);
return result;
}
int main( int argc, char* argv[])
{
try
{
cout <<"rotate program called on host ";
size_t BUFFLEN= 256;
char hostBuffer[BUFFLEN];
hostBuffer[0]='\0';
gethostname(hostBuffer, BUFFLEN);
cout<<hostBuffer<<endl;
int mpierror, mpisize;
cout<<"Before MPI_Init"<<endl;
mpierror = MPI_Init(&argc,&argv);
if (mpierror != MPI_SUCCESS)
{
cerr <<"Error in mpi init "<<mpierror<<endl;
exit(mpierror);
}
cout<<"Before MPI_Comm_size"<<endl;
mpierror = MPI_Comm_size(MPI_COMM_WORLD,&mpisize);
if (mpierror != MPI_SUCCESS)
{
cerr <<"Error in mpi comm size "<<mpierror<<endl;
exit(mpierror);
}
cout<< "Mpi size is "<<mpisize<<endl;
mpierror = MPI_Comm_rank(MPI_COMM_WORLD,&mpirank);
if (mpierror != MPI_SUCCESS)
{
cerr <<"Error in mpi rank size "<<mpierror<<endl;
exit(mpierror);
}
//std::string CPU_Affinity = ssystem("cat /proc/self/status | grep -i cpus_allowed_list");
//std::string CPU_Affinity = ssystem("grep -i cpus_allowed_list /proc/self/status");
//cout<<"Hostname="<<hostBuffer<< ": Mpi rank is "<<mpirank<<" and mpusize is "<<mpisize<< " and CPU Affinity is "<<CPU_Affinity<<endl;
cout<<"Hostname="<<hostBuffer<< ": Mpi rank is "<<mpirank<<" and mpusize is "<<mpisize<< endl;
//
//a do work here
//
do_work_non_blocking();
cout <<"Before MPI_Finalize from rank "<<mpirank<<endl;
MPI_Finalize();
cout <<"Exit from rank "<<mpirank<<endl;
}
catch (exception& e)
{
cerr<<"Exception caught " << e.what()<< "from mpirank "<<mpirank << endl;
}
}//main
//------------------------------------------
void sendString(char* s, int destRank)
{
if (s==NULL)
{
cerr<<"MPI_Send. error null pointer sent!\n";
return;
}
cout<<"MPI_Send::["<<mpirank<<"=>"<<destRank<< "] string is \""<<s<<"\""<<endl;
MPI_Status status;
int error=MPI_Ssend(s,strlen(s)+1,MPI_CHAR,destRank,MES_TAG,MPI_COMM_WORLD);
if (error != MPI_SUCCESS)
{
cout<<"error MPI_Ssend: from "<<mpirank<<" to "<<destRank<<endl;
return;
}
cout<<"........MPI_Send::["<<mpirank<<"=>"<<destRank<< "] Successful send\n";
}//sendString
//------------------------------------------
void recvString(char* buffer, int MAX_BUFFER, int destRank)
{
cout<<"........MPI_Recv: rank "<<mpirank<< "<="<<destRank<<endl;
MPI_Status status;
int error=MPI_Recv(buffer,MAX_BUFFER,MPI_CHAR,destRank, MPI_ANY_TAG,MPI_COMM_WORLD,&status);
if (error != MPI_SUCCESS)
{
cout<<"error MPI_Recv: from "<<mpirank<<endl;
return;
}
int received;
MPI_Get_count(&status,MPI_CHAR,&received);
if (strlen(buffer)==0)
{
cerr<<"Error from rank "<<mpirank<<" no string found! "<<endl;
return;
}
if (strlen(buffer) < (size_t)MAX_BUFFER)
{
cout <<"MPI_Recv["<<mpirank<< "<=" <<status.MPI_SOURCE << "]: Successful receive of \""<<buffer<<"\" ("<<received<<" bytes)\n";
}
}//recvString
void do_work_non_blocking()
{
cout<< "do_work_non_blocking: "<<mpirank<<endl;
int error;
double start = MPI_Wtime();
bool finish=false;
int MPI_SIZE;
MPI_Comm_size(MPI_COMM_WORLD,&MPI_SIZE);
char my_cpu_name[BUFSIZ];
int my_name_length;
MPI_Request requestSend;
MPI_Request requestReceive;
MPI_Status status;
int sendBuffer;
int receiveBuffer;
int nextRank;
if (mpirank==MPI_SIZE-1)
{
nextRank=0;
}
else
{
nextRank=mpirank+1;
}
MPI_Get_processor_name(my_cpu_name, &my_name_length);
cout<<"Rank "<<mpirank<<" Processor name is "<<my_cpu_name<<endl;
bool finished=false;
int currentRank=mpirank;
while (!finished)
{
//send rank
sendBuffer=currentRank;
cout<<"Send from rank "<<mpirank<<"=>"<<nextRank<<" sendBuffer "<<sendBuffer<<endl;
error =MPI_Isend(&sendBuffer,1,MPI_INT,nextRank, MES_TAG,MPI_COMM_WORLD,&requestSend);
if (error != MPI_SUCCESS)
{
cout<<"error MPI_Isend(int): from "<<mpirank<<endl;
return;
}
//receive rank
error=MPI_Irecv(&receiveBuffer,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&requestReceive);
if (error != MPI_SUCCESS)
{
cout<<"error MPI_Irecv(int): from "<<mpirank<<endl;
return;
}
//wait
MPI_Status status;
cout<<"Before MPI_Wait for rank "<<mpirank<<endl;
MPI_Wait(&requestSend,&status);
MPI_Wait(&requestReceive,&status);
cout<<"2*waits finished for rank "<<mpirank<<" and received value is "<<receiveBuffer<<endl;
currentRank=receiveBuffer;
if (currentRank==mpirank)
{
finished=true;
cout<<"SUCCESS from "<<mpirank<<endl;
}
}//while
double end = MPI_Wtime();
cout<<"Time of work["<< mpirank << "] "<< (end-start) << " seconds "<<endl;
}
//------------------------------------------
//------------------------------------------
#!/bin/bash
#
# run single_node MPI test
# Author Simon Michnowicz
# 20 Aug 20
#
# usage: run_single_node.sh --newnode=<node> --reservation=<res> [ --partition=<partition> ]
# check that the command line includes a host
hostname=""
reservation=""
partition=comp
################################################
function printUsage
{
echo "$0 To test a new or updated node, it runs a simple MPI job on it "
echo "Usage: run_single_node.sh --newnode=<node> --reservation=<res> [ --partition=<partition> ] "
echo "e.g. run_single_node.sh --newnode=gf00 --reservation=monMaintenance"
echo "e.g. run_single_node.sh --newnode=gf00 --reservation=monMaintenance --partition=gpu "
echo "Where"
echo "<node> is the name of the updated host we wish to test"
echo "<res> is the name of the SLURM reservation"
echo "<partition> is the name of the SLURM partition (defaults to comp)"
echo ""
#echo "We also support short options, i.e. -n -r -P "
exit 0
}
################################################
function parseARG
{
OPTS="--long help:: --long newnode: --long reservation: --long partition:: -o nr:,hP::"
#echo "getopt $OPTS -n $0 -- $@"
TEMP=`getopt $OPTS -n $0 -- "$@"`
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$TEMP"
#echo "Output is $TEMP"
while true; do
#echo "Parameter is $1"
case "$1" in
-n|--newnode) # name of the node to test
hostname=$2
shift 2
#echo "Newnode is $newnode"
;;
-r|--reservation) # SLURM reservation name
reservation=$2
shift 2
#echo "Reservation is $reservation"
;;
-P|--partition) # SLURM partition (defaults to comp)
partition=$2
shift 2
#echo "partition is ${partition}"
;;
#
# -- means the end of the arguments; drop this, and break out of the while loop
#
--) shift; break ;;
h | *) # Display help.
echo "In h option"
printUsage
exit 0
;;
esac
done
}
################################################
#main
################################################
#force a timeout if srun hangs
TIMEOUT=2m
echo "PID is $$"
(sleep $TIMEOUT && kill $$ ) &
#parse arguments
parseARG $0 "$@"
echo "hostname=$hostname"
echo "reservation=$reservation"
echo "partition=$partition"
if [ -z $hostname ]
then
echo "You need to specify a nodename "
printUsage
exit 1
fi
if [ -z $reservation ]
then
echo "You need to specify a reservation "
printUsage
exit 1
fi
#######################
# compile and run mpi job
#######################
module load openmpi/1.10.7-mlx
mpic++ -o rotateMPI.exe rotate.cpp
if [ ! -e rotateMPI.exe ]
then
echo "Error trying to compile rotateMPI.exe"
echo "Exiting"
exit 1
fi
SLURM_PARAMETERS="--nodes=1 --tasks-per-node=2 --cpus-per-task=1 --partition=${partition}"
SLURM_PARAMETERS="${SLURM_PARAMETERS} --reservation=${reservation} --nodelist=${hostname}"
echo "srun ${SLURM_PARAMETERS} rotateMPI.exe"
srun ${SLURM_PARAMETERS} rotateMPI.exe
retValue=$?
echo "retValue is $retValue"
if [ $retValue -ne "0" ]
then
echo "FAILURE TO RUN rotateMPI.exe as a slurm job"
exit $retValue
fi
echo "##############################"
echo "srun test OK"
echo "##############################"
exit 0
#!/bin/bash
#
# run_two_node MPI test
# Author Simon Michnowicz
# 20 Aug 20
#
# usage: run_two_node.sh --newnode=<node> --reservation=<res> [ --partition=<partition> --testnode=<testnode> ]
# check that the command line includes a host
function printUsage
{
echo "$0 To test a new or updated node, it runs a simple MPI job between it and a known test node"
echo "Usage: run_two_node.sh --newnode=<node> --reservation=<res> [ --partition=<partition> --testnode=<testnode> ] "
echo "e.g. run_two_node.sh --newNode=gf00 --reservation=monMaintenance"
echo "e.g. run_two_node.sh --newNode=gf00 --reservation=monMaintenance --partition=gpu --testnode=ge00"
echo "Where"
echo "<node> is the name of the updated host we wish to test"
echo "<res> is the name of the SLURM reservation"
echo "<partition> is the name of the SLURM partition (defaults to comp)"
echo "<testnode> is a second (unupdated) node to do 2 server MPI (defaults to gf01)"
echo ""
#echo "We also support short options, i.e. -n -r -P -t"
exit 0
}
newnode=""
reservation=""
partition=comp
testnode=gf01
function parseARG
{
OPTS="--long help:: --long newnode: --long reservation: --long partition:: --long testnode:: -o nr:,hPt::"
#echo "getopt $OPTS -n $0 -- $@"
TEMP=`getopt $OPTS -n $0 -- "$@"`
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$TEMP"
#echo "Output is $TEMP"
while true; do
#echo "Parameter is $1"
case "$1" in
#-n|*newnode*) # Specify p value.
-n|--newnode) # name of the node to test
newnode=$2
shift 2
#echo "Newnode is $newnode"
;;
-r|--reservation) # SLURM reservation name
reservation=$2
shift 2
#echo "Reservation is $reservation"
;;
-P|--partition) # SLURM partition (defaults to comp)
partition=$2
shift 2
#echo "partition is ${partition}"
;;
-t|--testnode) # Specify testnode
testnode=$2
shift 2
#echo "testnode is ${testnode}"
;;
#
# -- means the end of the arguments; drop this, and break out of the while loop
#
--) shift; break ;;
h | *) # Display help.
#echo "In h option"
printUsage
exit 0
;;
esac
done
}
#######################
# main routine here
#######################
#force a timeout if srun hangs
TIMEOUT=2m
echo "PID is $$"
(sleep $TIMEOUT && kill $$ ) &
#parse arguments
parseARG $0 "$@"
echo "newnode=$newnode"
echo "reservation=$reservation"
echo "partition=$partition"
echo "testnode=$testnode"
if [ -z $newnode ]
then
echo "You need to specify a nodename "
printUsage
exit 1
fi
if [ -z $reservation ]
then
echo "You need to specify a reservation "
printUsage
exit 1
fi
#######################
# compile and run mpi job
#######################
module load openmpi/1.10.7-mlx
mpic++ -o rotateMPI.exe rotate.cpp
if [ ! -e rotateMPI.exe ]
then
echo "Error trying to compile rotateMPI.exe"
echo "Exiting"
exit 1
fi
SLURM_PARAMETERS="--nodes=2 --tasks-per-node=2 --cpus-per-task=1 --partition=${partition} "
SLURM_PARAMETERS="${SLURM_PARAMETERS} --reservation=${reservation} --nodelist=${newnode},${testnode}"
SLURM_PARAMETERS="${SLURM_PARAMETERS} --time=00:01:00 --job-name=MPItest"
echo "srun ${SLURM_PARAMETERS} rotateMPI.exe"
srun ${SLURM_PARAMETERS} rotateMPI.exe
retValue=$?
echo "retValue is $retValue"
if [ $retValue -ne "0" ]
then
echo "FAILURE TO RUN rotateMPI.exe as a slurm job"
exit $retValue
fi
echo
echo "#####################"
echo "srun test OK"
echo "#####################"
exit 0
#!/bin/bash
function usage {
echo $"Usage: $0 {all, ComputeNodes, LoginNodes, ManagementNodes, NFSNodes, sql, slurm}" INVENTORY_FILE
exit 1
}
function run_them ()
{
#limit='--limit '"$1"
#if [ "$1" = "all" ]
#then
# limit="all"
#fi
for filename in ./tests/$1/*.sh; do # this is not sorted yet
[ -e "$filename" ] || continue
#/bin/bash -x $filename # local execution. nice for dev
ansible -i $2 -m script -a "$filename" $1
done
for filename in ./tests/$1/*.yml; do # this is not sorted yet
[ -e "$filename" ] || continue
ansible-playbook -i $2 $filename # I am assuming the playbook cares about visibility here. might have to change later
done
}
[ $# -ge 2 ] || usage
run_them $1 $2
---
- hosts: LoginNodes
  gather_facts: false
  tasks:
  - name: simple srun test
    command: ./srun --ntasks=1 --partition=batch hostname
    args:
      chdir: '/opt/slurm-latest/bin'
    become: true
    become_user: hpctest
- hosts: 'all:!BastionNodes'
  tasks:
  - { apt: { update_cache: yes, upgrade: dist }, register: upgrade, become: true }
  - { reboot: {}, when: upgrade.changed, become: true }

- hosts: 'BastionNodes'
  tasks:
  - { apt: { update_cache: yes, upgrade: dist }, register: upgrade, become: true }
  - { reboot: {}, when: upgrade.changed, become: true }