Commit 57806e43 authored by Trung Nguyen

DGX troubleshooting

Former-commit-id: 3fdb7c6a
parent 131c0440
export PATH={{ munge_dir }}/bin:{{ slurm_dir }}/bin:{{ slurm_dir }}/sbin:{{ nhc_dir }}/sbin:$PATH
export LD_LIBRARY_PATH={{ munge_dir }}/lib:{{ slurm_dir }}/lib:{{ slurm_dir }}/lib/slurm:$LD_LIBRARY_PATH
export SLURM_SERVER_HOME={{ slurm_dir }}
export MANPATH={{ slurm_dir }}/share/man:$MANPATH
{% if slurmctrl == inventory_hostname %}
{{ slurmctlddebug.log }}
{{ slurmschedlog.log }}
{% else %}
{{ slurmddebug.log }}
{% endif %}
{
compress
missingok
nocopytruncate
nocreate
nodelaycompress
nomail
notifempty
noolddir
rotate 5
sharedscripts
size=5M
create 640 slurm root
{% if ansible_os_family == 'RedHat' and ansible_distribution_version >= '7' %}
postrotate
{% if slurmctrl == inventory_hostname %}
systemctl kill -s HUP --kill-who=main slurmctld
{% else %}
systemctl kill -s HUP --kill-who=main slurmd
{% endif %}
{% else %}
postrotate
/etc/init.d/slurm reconfig
{% endif %}
endscript
}
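The rendered logrotate file can be sanity-checked without rotating anything by running logrotate in debug mode; the install path below is an assumption (wherever this template is deployed on the node), not something defined in this commit:

# Dry-run of the rendered config; reports what would be rotated and why.
logrotate -d /etc/logrotate.d/slurm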
---
- name: set use_systemd
  set_fact:
    use_systemd: True
  when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and
        ( ansible_distribution_major_version == "7")
- name: set slurmd_enabled (default enabled)
  set_fact:
    slurmd_enabled: True
  when: slurmd_enabled is not defined
- name: install slurmdbd init
  template: src=slurmdbd.initd.j2 dest=/etc/init.d/slurmdbd mode=755
  become: true
  when: use_systemd is not defined and start_slurmdbd is defined
- name: copy slurmdbd.service if OS uses systemd
  template: dest=/etc/systemd/system/slurmdbd.service src=slurmdbd.service.j2 mode=644
  become: true
  when: use_systemd is defined and start_slurmdbd is defined
  register: slurmdbd_service_installed
- name: copy slurm init script
  template: dest=/etc/init.d/slurm src=slurm.initd.j2 mode=755
  become: true
  when: use_systemd is not defined
- name: copy slurmd.service
  template: dest=/etc/systemd/system/slurmd.service src=slurmd.service.j2 mode=644
  become: true
  when: use_systemd is defined and start_slurmd is defined
  register: slurmd_service_installed
- name: copy slurmctld.service
  template: dest=/etc/systemd/system/slurmctld.service src=slurmctld.service.j2 mode=644
  become: true
  when: use_systemd is defined and start_slurmctld is defined
  register: slurmctld_service_installed
- name: reload systemd after slurmd.service install
  systemd:
    daemon_reload: yes
  become: true
  when: use_systemd is defined and start_slurmd is defined and slurmd_service_installed.changed
- name: reload systemd after slurmctld.service install
  systemd:
    daemon_reload: yes
  become: true
  when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed
- name: reload systemd after slurmdbd.service install
  systemd:
    daemon_reload: yes
  become: true
  when: use_systemd is defined and start_slurmdbd is defined and slurmdbd_service_installed.changed
- name: start munge
  service: name=munge state=restarted enabled=yes
  become: true
- name: start slurmdbd
  service: name=slurmdbd state=restarted enabled=no
  become: true
  when: start_slurmdbd is defined
- name: "create cluster in slurm db"
  shell: "{{slurm_dir}}/bin/sacctmgr -i create cluster {{ clustername }}"
  become: true
  ignore_errors: true
- name: start slurmctld
  service: name=slurmctld state=restarted enabled=no
  become: true
  when: use_systemd is defined and start_slurmctld is defined
- name: start slurmd
  service: name=slurmd state=restarted enabled={{ slurmd_enabled }}
  become: true
  when: use_systemd is defined and start_slurmd is defined
- name: start slurm
  service: name=slurm state=restarted enabled={{ slurmd_enabled }}
  become: true
  when: use_systemd is not defined and ( start_slurmd is defined or start_slurmctld is defined )
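As a rough sketch of how these tasks might be driven: which daemons are installed and started depends entirely on which of start_slurmd, start_slurmctld and start_slurmdbd are defined. The playbook and inventory names below are placeholders, and clustername/slurm_dir are example values taken from elsewhere in this commit:

# Hypothetical invocation; only the variable names come from the tasks above.
ansible-playbook -i inventory slurm-start.yml \
    -e start_slurmctld=True -e start_slurmdbd=True \
    -e clustername=m3 -e slurm_dir=/opt/slurm-18.08.6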
#!/bin/bash
#
# chkconfig: 345 90 10
# description: SLURM is a simple resource management system which \
# manages exclusive access to a set of compute \
# resources and distributes work to those resources.
#
# processname: ${exec_prefix}/sbin/slurmd
# pidfile: /var/run/slurmd.pid
#
# processname: ${exec_prefix}/sbin/slurmctld
# pidfile: /var/run/slurmctld.pid
#
# config: /etc/sysconfig/slurm
#
### BEGIN INIT INFO
# Provides: slurm
# Required-Start: $remote_fs $syslog $network munge
# Required-Stop: $remote_fs $syslog $network munge
# Should-Start: $named
# Should-Stop: $named
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: slurm daemon management
# Description: Start slurm to provide resource management
### END INIT INFO
munge_lib="{{ munge_dir }}/lib"
exec_prefix="{{ slurm_dir }}"
prefix="{{ slurm_dir }}"
BINDIR="${exec_prefix}/bin"
CONFDIR="${prefix}/etc"
LIBDIR="${exec_prefix}/lib:${munge_lib}"
SBINDIR="${exec_prefix}/sbin"
# Source function library.
if [ -f /etc/rc.status ]; then
. /etc/rc.status
SUSE=1
STARTPROC=startproc
rc_reset
else
# Read configuration defaults to override variables:
# $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD
##
for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do
[ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME"
done
[ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS"
[ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER
expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE"
[ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \
&& RELOAD=1 || unset RELOAD
if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then
SYSTEM="DEBIAN"
[ -r /etc/default/rcS ] && . /etc/default/rcS
[ -r /lib/init/vars.sh ] && . /lib/init/vars.sh
[ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions
STARTPROC="start_daemon"
elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then
SYSTEM="REDHAT"
. /etc/rc.d/init.d/functions
RH_LOCK="/var/lock/subsys/$INIT_NAME"
SUSE=0
STARTPROC=daemon
elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then
SYSTEM="SUSE"
. /etc/rc.status
rc_reset
elif [ -r /lib/lsb/init-functions ]; then
SYSTEM="LSB"
. /lib/lsb/init-functions
else
SYSTEM="OTHER"
fi
function rc_status() {
RETVAL=$?
}
function rc_exit () {
exit $RETVAL
}
RETVAL=0
fi
# We can not use a starter program without losing environment
# variables that are critical on Blue Gene systems
if [ -d /bgl/BlueLight/ppcfloor ]; then
STARTPROC=""
fi
# Source slurm specific configuration
# This can be used to alter limits for users jobs or set daemon options.
# For example, the limits for user jobs could be higher or lower than the
# default limits for user root (e.g. "ulimit -t unlimited" sets an unlimited
# CPU time limit for spawned user jobs).
# SLURMCTLD_OPTIONS defines slurmctld command line options. See "man slurmctld"
# SLURMD_OPTIONS defines slurmd command line options. See "man slurmd"
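# An illustrative /etc/sysconfig/slurm (example values only, not shipped by
# this role):
#
#   ulimit -l unlimited      # raise the locked-memory limit for spawned user jobs
#   SLURMCTLD_OPTIONS=""     # extra flags for slurmctld, see "man slurmctld"
#   SLURMD_OPTIONS=""        # extra flags for slurmd, see "man slurmd"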
if [ -f /etc/sysconfig/slurm ] ; then
. /etc/sysconfig/slurm
else
SLURMCTLD_OPTIONS=""
SLURMD_OPTIONS=""
fi
if [ ! -x $BINDIR/scontrol ]; then
echo "Could not find $BINDIR/scontrol. Bad path?"
exit 1
fi
if [ ! -f $CONFDIR/slurm.conf ]; then
echo "Could not find $CONFDIR/slurm.conf. Bad path?"
exit 1
fi
# setup library paths for slurm and munge support
export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
start() {
prog=$1
shift
echo -n "starting $prog: "
unset HOME MAIL USER USERNAME
$STARTPROC $SBINDIR/$prog $*
rc_status -v
echo
touch /var/lock/subsys/slurm
}
stop() {
echo -n "stopping $1: "
killproc $1 -TERM
rc_status -v
echo
rm -f /var/lock/subsys/slurm
}
startall() {
for prog in `$BINDIR/scontrol show daemons`; do
optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"`
if [[ ${MULTIPLE_SLURMD} == yes ]] && [[ ${prog} == slurmd ]]
then
for node in $($BINDIR/scontrol show aliases)
do
start $prog -N ${node} ${!optvar}
done
else
start $prog ${!optvar}
fi
done
}
#
# status() with slight modifications to take into account
# instantiations of job manager slurmd's, which should not be
# counted as "running"
#
slurmstatus() {
local base=${1##*/}
local pid
local rpid
local pidfile
local pidfiles
local rc
pidfile=`grep -i ${base}pid $CONFDIR/slurm.conf | grep -v '^ *#'`
if [ $? = 0 ]; then
pidfile=${pidfile##*=}
pidfile=${pidfile%#*}
pidfile=${pidfile//\"/}
else
pidfile=/var/run/${base}.pid
fi
pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \
pidof -o $$ -o $$PPID -o %PPID -x ${base}`
if [ "$base" == "slurmd" ] ; then
echo ${pidfile} | grep -q %n
if [[ $? -eq 0 ]]
then
for n in $($BINDIR/scontrol show aliases)
do
pidfiles="${pidfiles} $(echo ${pidfile} | sed "s/%n/$n/g")"
done
else
pidfiles=${pidfile}
fi
else
pidfiles=${pidfile}
fi
RETVAL=0
for pidfile in ${pidfiles}
do
rc=1
if [ -f $pidfile ]; then
read rpid < $pidfile
if [ "$rpid" != "" -a "$pid" != "" ]; then
for i in $pid ; do
if [ "$i" = "$rpid" ]; then
echo $"${base} (pid $rpid) is running..."
rc=0
break
fi
done
elif [ "$rpid" != "" -a "$pid" = "" ]; then
# Due to change in user id, pid file may persist
# after slurmctld terminates
if [ "$base" != "slurmctld" ] ; then
echo $"${base} dead but pid file exists"
else
echo $"${base} is stopped"
fi
RETVAL=1
fi
fi
if [[ $rc -eq 0 ]]
then
continue
fi
if [ "$base" = "slurmctld" -a "$pid" != "" ] ; then
echo $"${base} (pid $pid) is running..."
continue
fi
echo $"${base} is stopped"
if [ "$RETVAL" == "0" ]; then
RETVAL=3
fi
done
return $RETVAL
}
#
# stop slurm daemons,
# wait for termination to complete (up to 10 seconds) before returning
#
slurmstop() {
for prog in `$BINDIR/scontrol show daemons`; do
stop $prog
for i in 1 2 3 4
do
sleep $i
slurmstatus $prog
if [ $? != 0 ]; then
break
fi
done
done
# slurmstatus return 1 in case of stopped daemon
# and that is what we are looking for here
if [[ ${RETVAL} == "1" ]]
then
RETVAL=0
else
RETVAL=1
fi
}
#
# The pathname substitution in daemon command assumes prefix and
# exec_prefix are same. This is the default, unless the user requests
# otherwise.
#
# Any node can be a slurm controller and/or server.
#
case "$1" in
start)
startall
;;
startclean)
SLURMCTLD_OPTIONS="-c $SLURMCTLD_OPTIONS"
SLURMD_OPTIONS="-c $SLURMD_OPTIONS"
startall
;;
stop)
slurmstop
;;
status)
anystop=0
for prog in `$BINDIR/scontrol show daemons`; do
slurmstatus $prog
rc=$?
if [ $rc != 0 ] ; then
anystop=$rc
fi
done
RETVAL=$anystop
;;
restart)
$0 stop
$0 start
;;
condrestart)
if [ -f /var/lock/subsys/slurm ]; then
for prog in `$BINDIR/scontrol show daemons`; do
stop $prog
sleep 1
optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"`
if [[ ${MULTIPLE_SLURMD} == yes ]] && [[ ${prog} == slurmd ]]
then
for node in $($BINDIR/scontrol show aliases)
do
start $prog -N ${node}
done
else
start $prog ${!optvar}
fi
done
fi
;;
reconfig|reload)
for prog in `$BINDIR/scontrol show daemons`; do
echo -n $"Reloading $prog daemon configuration: "
killproc $prog -HUP
echo
done
;;
test)
for prog in `$BINDIR/scontrol show daemons`; do
echo "$prog runs here"
done
;;
*)
echo "Usage: $0 {start|startclean|stop|status|restart|reconfig|condrestart|test}"
exit 1
;;
esac
rc_exit
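On hosts without systemd, where the role installs this script as /etc/init.d/slurm (see the "copy slurm init script" task above), typical manual usage would look roughly like this:

# Register for the runlevels declared in the chkconfig header, then manage the daemons.
chkconfig --add slurm
service slurm start      # starts slurmctld and/or slurmd, per "scontrol show daemons"
service slurm status
service slurm reconfig   # sends SIGHUP so the daemons re-read slurm.conf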
[Unit]
Description=Slurm controller daemon
After=network.target glusterVolume.mount
ConditionPathExists={{ slurm_dir }}/etc/slurm.conf
[Service]
Type=forking
#EnvironmentFile=/etc/default/slurmctld
ExecStart={{ slurm_dir }}/sbin/slurmctld $SLURMCTLD_OPTIONS
PIDFile={{ slurmpiddir }}/slurmctld.pid
[Install]
WantedBy=multi-user.target
[Unit]
Description=Slurm node daemon
After=network.target
ConditionPathExists={{ slurm_dir }}/etc/slurm.conf
[Service]
Type=forking
KillMode=process
LimitMEMLOCK=infinity
#EnvironmentFile=/etc/default/slurmd
ExecStart={{ slurm_dir }}/sbin/slurmd $SLURMD_OPTIONS
PIDFile={{ slurmpiddir }}/slurmd.pid
[Install]
WantedBy=multi-user.target
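With the units in place and systemd reloaded (the role handles both via the tasks above), the daemons can be checked by hand; this is a verification sketch only, not something the role runs:

# Manual verification of the installed units.
systemctl daemon-reload
systemctl status slurmd slurmctld
journalctl -u slurmd --since "10 minutes ago"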
#!/bin/bash
#
# chkconfig: 345 90 10
# description: SLURMDBD is a database server interface for \
# SLURM (Simple Linux Utility for Resource Management).
#
# processname: ${exec_prefix}/sbin/slurmdbd
# pidfile: /var/run/slurmdbd.pid
#
# config: /etc/sysconfig/slurm
#
### BEGIN INIT INFO
# Provides: slurmdbd
# Required-Start: $remote_fs $syslog $network munge
# Required-Stop: $remote_fs $syslog $network munge
# Should-Start: $named
# Should-Stop: $named
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: SLURM database daemon
# Description: Start slurm to provide database server for SLURM
### END INIT INFO
munge_lib="{{ munge_dir }}/lib"
exec_prefix="{{ slurm_dir }}"
prefix="{{ slurm_dir }}"
CONFDIR="${prefix}/etc"
LIBDIR="${exec_prefix}/lib:${munge_lib}"
SBINDIR="${exec_prefix}/sbin"
# Source function library.
if [ -f /etc/rc.status ]; then
. /etc/rc.status
SUSE=1
STARTPROC=startproc
rc_reset
else
# Read configuration defaults to override variables:
# $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD
##
for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do
[ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME"
done
[ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS"
[ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER
expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE"
[ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \
&& RELOAD=1 || unset RELOAD
if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then
SYSTEM="DEBIAN"
[ -r /etc/default/rcS ] && . /etc/default/rcS
[ -r /lib/init/vars.sh ] && . /lib/init/vars.sh
[ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions
STARTPROC="start_daemon"
elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then
SYSTEM="REDHAT"
. /etc/rc.d/init.d/functions
RH_LOCK="/var/lock/subsys/$INIT_NAME"
SUSE=0
STARTPROC=daemon
elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then
SYSTEM="SUSE"
. /etc/rc.status
rc_reset
elif [ -r /lib/lsb/init-functions ]; then
SYSTEM="LSB"
. /lib/lsb/init-functions
else
SYSTEM="OTHER"
fi
function rc_status() {
RETVAL=$?
}
function rc_exit () {
exit $RETVAL
}
RETVAL=0
fi
# We can not use a starter program without losing environment
# variables that are critical on Blue Gene systems
if [ -d /bgl/BlueLight/ppcfloor ]; then
STARTPROC=""
fi
# Source slurm specific configuration
# SLURMDBD_OPTIONS defines slurmdbd command line options. See "man slurmdbd"
if [ -f /etc/sysconfig/slurm ] ; then
. /etc/sysconfig/slurm
else
SLURMDBD_OPTIONS=""
fi
if [ ! -f $CONFDIR/slurmdbd.conf ]; then
echo "Could not find $CONFDIR/slurmdbd.conf. Bad path?"
exit 1
fi
# setup library paths for slurm and munge support
export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
start() {
prog=$1
shift
echo -n "starting $prog: "
unset HOME MAIL USER USERNAME
$STARTPROC $SBINDIR/$prog $SLURMDBD_OPTIONS
rc_status -v
echo
touch /var/lock/subsys/slurmdbd
}
stop() {
echo -n "stopping $1: "
killproc $1 -TERM
rc_status -v
echo
rm -f /var/lock/subsys/slurmdbd
}
slurmstatus() {
local base=${1##*/}
local pid
local rpid
local pidfile
pidfile=`grep -i pidfile $CONFDIR/slurmdbd.conf | grep -v '^ *#'`
if [ $? = 0 ]; then
pidfile=${pidfile##*=}
pidfile=${pidfile%#*}
pidfile=${pidfile//\"/}
else
pidfile=/var/run/slurmdbd.pid
fi
pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \
pidof -o $$ -o $$PPID -o %PPID -x ${base}`
if [ -f $pidfile ]; then
read rpid < $pidfile
if [ "$rpid" != "" -a "$pid" != "" ]; then
for i in $pid ; do
if [ "$i" = "$rpid" ]; then
echo $"${base} (pid $pid) is running..."
return 0
fi
done
elif [ "$rpid" != "" -a "$pid" = "" ]; then
echo $"${base} dead but pid file exists"
return 1
fi
fi
if [ "$base" = "slurmdbd" -a "$pid" != "" ] ; then
echo $"${base} (pid $pid) is running..."
return 0
fi
echo $"${base} is stopped"
return 3
}
#
# stop slurm daemons,
# wait for termination to complete (up to 10 seconds) before returning
#
slurmstop() {
stop $1
for i in 1 2 3 4
do
sleep $i
slurmstatus $1
if [ $? != 0 ]; then
break
fi
done
}
#
# The pathname substitution in daemon command assumes prefix and
# exec_prefix are same. This is the default, unless the user requests
# otherwise.
#
# Any node can be a slurm controller and/or server.
#
case "$1" in
start)
start slurmdbd
;;
stop)
slurmstop slurmdbd
;;
status)
slurmstatus slurmdbd
rc_status -v
;;
restart)
$0 stop
$0 start
;;
condrestart)
if [ -f /var/lock/subsys/slurm ]; then
stop slurmdbd
start slurmdbd
fi
;;
reconfig|reload)
echo -n $"Reloading slurmdbd daemon configuration: "
killproc slurmdbd -HUP
echo
;;
*)
echo "Usage: $0 {start|stop|status|restart|condrestart|reconfig}"
exit 1
;;
esac
rc_exit
[Unit]
Description=Slurm DBD accounting daemon
After=network.target
ConditionPathExists={{ slurm_dir }}/etc/slurmdbd.conf
[Service]
Type=forking
ExecStart={{ slurm_dir }}/sbin/slurmdbd
PIDFile={{ slurmdbdpiddir }}/slurmdbd.pid
[Install]
WantedBy=multi-user.target
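Once slurmdbd is running and the "create cluster in slurm db" task has executed, accounting can be sanity-checked with sacctmgr; the cluster name is whatever {{ clustername }} was set to:

# Confirm the cluster is registered in the accounting database.
{{ slurm_dir }}/bin/sacctmgr list cluster
{{ slurm_dir }}/bin/sacctmgr show cluster {{ clustername }}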
This role sets up trigger events on your slurm cluster.
What you want the triggers to do is up to you, so you will probably modify the templated shell files.
Copy the role to a local role directory if you want to customise it.
Triggers used in this role as it stands:
- primary_slurmctld_failure
- primary_slurmctld_resumed_operation
- node_down
USAGE:
- hosts: 'ManagementNodes'
  tasks:
    - include_vars: vars/slurm.yml
- hosts: 'ManagementNodes'
  roles:
    - { role: slurm-trigger, slurm_dir: "/opt/slurm-18.08.6", admin_email: "hpc-alerts-warning-l@monash.edu", tags: [slurm, slurm-trigger] }
The role uses several variables that need to be defined:
{{ slurm_dir }}    The slurm install directory. Shell scripts are copied to its sbin
{{ admin_email }}  Email address (defined in slurm.yml, or defined some other way) to send alerts to
Each trigger has two files: one that responds to the trigger event, and one that (re)sets the trigger. The role runs the latter to arm each trigger.
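A quick way to confirm the triggers are armed after the role has run (executed as the slurm user, since that is who the set scripts run as; paths follow the USAGE example above, adjust for your slurm_dir):

# List the triggers currently registered with slurmctld.
sudo -u slurm /opt/slurm-18.08.6/bin/strigger --get
# Re-arm the node-down trigger by hand if it is missing.
sudo -u slurm /opt/slurm-18.08.6/sbin/set_node_trigger.sh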
---
############################
- name: template primary_slurmctld_failure
  template: dest="{{ slurm_dir }}/sbin/primary_slurmctld_failure.sh" src=primary_slurmctld_failure.sh.j2 mode="0755"
  become: true
  become_user: root
- name: template set primary_slurmctld_failure trigger
  template: dest="{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh" src=set_primary_slurmctld_failure_trigger.sh.j2 mode="0755"
  become: true
  become_user: root
- name: Execute set_primary_slurmctld_failure_trigger.sh
  command: "{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
  become: true
  become_user: slurm
  run_once: true
  ignore_errors: true
- name: template primary_slurmctld_resumed_operation
  template: dest="{{ slurm_dir }}/sbin/primary_slurmctld_resumed_operation.sh" src=primary_slurmctld_resumed_operation.sh.j2 mode="0755"
  become: true
  become_user: root
- name: template set primary_slurmctld_resumed trigger
  template: dest="{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh" src=set_primary_slurmctld_resumed_operation_trigger.sh.j2 mode="0755"
  become: true
  become_user: root
- name: Execute set_primary_slurmctld_resumed_operation_trigger.sh
  command: "{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
  become: true
  become_user: slurm
  run_once: true
  ignore_errors: true
- name: template node_down
  template: dest="{{ slurm_dir }}/sbin/node_down.sh" src=node_down.sh.j2 mode="0755"
  become: true
  become_user: root
- name: template node_down trigger command
  template: dest="{{ slurm_dir }}/sbin/set_node_trigger.sh" src=set_node_trigger.sh.j2 mode="0755"
  become: true
  become_user: root
- name: Execute set_node_trigger.sh
  command: "{{ slurm_dir }}/sbin/set_node_trigger.sh"
  become: true
  become_user: slurm
  run_once: true
  ignore_errors: true
#!/bin/bash
# Notify the administrator of the failure by e-mail
echo "On `hostname`:`date`:`whoami`: slurm-trigger event for NODE_FAILURE: $*" | `which mail` -s "NODE FAILURE $*" {{ admin_email }}
# Submit trigger for next primary slurmctld failure event
TRIGGER_CMD="{{ slurm_dir }}/sbin/set_node_trigger.sh"
FILE=/tmp/node_down.txt
#COMMAND="su slurm -c $TRIGGER_CMD"
echo "node_down.sh: `date`: `whoami`: $TRIGGER_CMD" >> $FILE
$TRIGGER_CMD >> $FILE 2>&1
#!/bin/bash
# Notify the administrator of the failure by e-mail
echo "On `hostname`:`date`:`whoami`: slurm-trigger event for Primary_SLURMCTLD_FAILURE" | `which mail` -s Primary_SLURMCTLD_FAILURE {{ admin_email }}
# Submit trigger for next primary slurmctld failure event
TRIGGER_CMD="{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
FILE=/tmp/primary_down.txt
#COMMAND="su slurm -c $TRIGGER_CMD"
echo "primary_slurmctld_failure.sh:`date`:`whoami`: $TRIGGER_CMD" >> $FILE
$TRIGGER_CMD >> $FILE 2>&1
#!/bin/bash
# Notify the administrator of the failure by e-mail
echo "On `hostname`:`date`:`whoami`: slurm-trigger event for Primary_SLURMCTLD_RESUMED" | `which mail` -s Primary_SLURMCTLD_RESUMED {{ admin_email }}
# Submit trigger for next primary slurmctld failure event
FILE=/tmp/primary_up.txt
#COMMAND="su slurm -c {{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
COMMAND="{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
echo "primary_slurmctld_resumed_operation.sh.sh:`date`:`whoami`: $COMMAND" >> $FILE
$COMMAND >> $FILE 2>&1
#!/bin/bash
TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --down --program={{ slurm_dir }}/sbin/node_down.sh"
echo "set_node_trigger.sh: `date`: $TRIGGER_CMD"
$TRIGGER_CMD
#!/bin/bash
TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --primary_slurmctld_failure --program={{ slurm_dir }}/sbin/primary_slurmctld_failure.sh"
echo "set_primary_slurmctld_failure_trigger.sh: `date`: $TRIGGER_CMD"
$TRIGGER_CMD
#!/bin/bash
TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --primary_slurmctld_resumed_operation --program={{ slurm_dir }}/sbin/primary_slurmctld_resumed_operation.sh"
echo "set_primary_slurmctld_resumed_operation_trigger.sh: `date`: $TRIGGER_CMD"
$TRIGGER_CMD
---
- name: install slurm.conf
  copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf
  become: true
  become_user: root
- name: setup plugin
  template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=755
  run_once: true
  become: true
  become_user: root
  when: slurm_lua is defined
--[[
Example lua script demonstrating the SLURM job_submit/lua interface.
This is only an example, not meant for use in its current form.
Leave the function names, arguments, local variables and setmetatable
set up logic in each function unchanged. Change only the logic after
the line containing "*** YOUR LOGIC GOES BELOW ***".
For use, this script should be copied into a file named "job_submit.lua"
in the same directory as the SLURM configuration file, slurm.conf.
--]]
function slurm_job_submit(job_desc, part_list, submit_uid)
    -- Check no default account
    if job_desc.account == "default" then
        slurm.log_user("You have to specify your project ID as part of your job submission. The account=default is now deprecated on M3 job scheduler.")
        return slurm.ERROR
    end
    -- Check Desktop requests with more than one node
    if ((job_desc.name == "desktop") and (job_desc.min_nodes > 1)) then
        slurm.log_user("The current M3 Desktop applications are unable to utilise more than one node, please select one node instead")
        return slurm.ERROR
    end
    -- Check for gres.gpu requirements in m3c, m3h and m3g, else move job to comp
    if ((job_desc.partition == "m3c") or (job_desc.partition == "m3h") or (job_desc.partition == "m3g")) then
        local partition = ""
        if (job_desc.gres == nil) then
            partition = "comp"
            slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition)
            job_desc.partition = partition
        end
        return slurm.SUCCESS
    end
    -- Check for QOS rtq in m3c, m3h, m3g and partition=nil, then forward job to rtqp,comp,m3g
    if ((job_desc.qos == "rtq") and (job_desc.partition == nil)) then
        local partition = "rtqp,comp,m3g"
        slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition)
        job_desc.partition = partition
        return slurm.SUCCESS
    end
    -- Accept any job that did not match a rule above
    return slurm.SUCCESS
end

function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
    return slurm.SUCCESS
end

slurm.log_info("initialized")

return slurm.SUCCESS
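For the plugin to take effect, slurm.conf (shipped via the "install slurm.conf" task above) must load the lua job-submit plugin. The sbatch line below is only an illustrative negative test of the account check in slurm_job_submit():

# In slurm.conf:
#   JobSubmitPlugins=lua
#
# This submission should be rejected with the log_user message defined above.
sbatch --account=default --wrap "hostname"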