Commit 57806e43 authored by Trung Nguyen

DGX troubleshooting

Former-commit-id: 3fdb7c6a
parent 131c0440
---
slurm_use_vpn: False
slurmddebug: {level: 5, log: '/var/log/slurm/slurmd.log'}
slurmctlddebug: {level: 5, log: '/mnt/slurm-logs/slurmctld.log'}
slurmdbdlog: {level: 5, log: '/mnt/slurm-logs/slurmdbd.log'}
slurmfairshare: {def: false, val: 10000}
slurmdatadir: "/var/spool/slurm"
slurmselecttype: "select/linear"
slurmfastschedule: "1"
slurmschedulertype: "sched/backfill"
#!/bin/env python
# Prints a list of NVIDIA devices and their type in JSON format for
# parsing by the ansible playbook.
# Fields are 'name': 'gpu' (fixed)
#            'file': devicePath (i.e. /dev/nvidia0)
#            'type': typeOfDevice (i.e. K80, parsed from nvidia-smi output)
# The program prints an empty JSON list upon error (i.e. no error messages).
# It also checks for the existence of /dev/nvidia? where ? is the index reported by nvidia-smi.
# An example of the resulting JSON appears at the end of this script.
# nvidia-smi -L produces output like:
#GPU 0: Tesla K80 (UUID: GPU-8bdb2956-4c10-7bd0-80d4-46da054663b4)
#GPU 1: Tesla K80 (UUID: GPU-19ed5f7c-435a-036e-54f0-f64209c3cede)
#GPU 2: Tesla K80 (UUID: GPU-a2f8cfe2-5bbc-de2a-8adc-4038f3379b5e)
#GPU 3: Tesla K80 (UUID: GPU-1c9c0d02-4590-c915-18d2-d709efb56d8d)
#GPU 4: Tesla K80 (UUID: GPU-b0f290c8-3b69-a518-ac77-22718f43e946)
#GPU 5: Tesla K80 (UUID: GPU-565ebca2-6b37-3bc0-a355-72330049a349)
#GPU 6: Tesla K80 (UUID: GPU-d8096845-d8a1-e3ef-ad00-c1d069c1b685)
#GPU 7: Tesla K80 (UUID: GPU-20ee0841-22b5-9974-66c0-b49e5be3e469)
import subprocess
import sys
import re
import os
import json

try:
    # run nvidia-smi -L and parse its output
    p = subprocess.Popen(['nvidia-smi', '-L'], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    if not isinstance(out, str):
        out = out.decode('utf-8')  # Python 3 returns bytes
    lines = out.strip().split('\n')
    deviceList = []  # list returned as JSON
    pe = re.compile(r'GPU\s+(\d+):\s+\S+\s+(\S+)')
    for line in lines:
        if not line:
            break
        m = pe.search(line)
        if not m:
            # no match found
            break
        numberOfDevice = m.group(1)
        typeOfDevice = m.group(2)
        # check that the corresponding device file exists
        devicePath = "/dev/nvidia" + numberOfDevice
        if os.path.exists(devicePath):
            deviceList.append({'name': 'gpu', 'file': devicePath, 'type': typeOfDevice})
        else:
            print("error looking for nvidia device")
            sys.exit(1)
    # now convert the list to JSON
    print(json.dumps(deviceList))
except OSError:
    # if nvidia-smi is not installed on the computer, subprocess.Popen raises this error
    print(json.dumps([]))
    sys.exit(0)
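# For the eight-GPU example in the header, and assuming /dev/nvidia0 .. /dev/nvidia7
# all exist, the script prints a single JSON line roughly like (hypothetical, trimmed):
# [{"name": "gpu", "file": "/dev/nvidia0", "type": "K80"}, ..., {"name": "gpu", "file": "/dev/nvidia7", "type": "K80"}]
# This is the value the playbook later stores in slurm_gres_list and feeds to gres.conf.j2.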
---
- name: restart munge
service: name=munge state=restarted
become: true
- name: restart slurm
service: name=slurm state=restarted
become: true
- name: restart slurmdbd
service: name=slurmdbd state=restarted
become: true
- name: scontrol reconfigure
shell: sleep 10 ; scontrol reconfigure
become: true
delegate_to: "{{ slurmctrl }}"
run_once: true
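# A minimal sketch of how a task would trigger the handlers above (hypothetical task,
# not part of this role):
#
# - name: install slurm.conf
#   template: src=slurm.conf.j2 dest={{ slurm_dir }}/etc/slurm.conf
#   become: true
#   notify: scontrol reconfigure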
---
- name: make sure slurmctld and slurmdbd log dir exists
file: dest=/mnt/slurm-logs state=directory owner=root group=root mode=755
become: true
- name: make sure slurm conf dir exists
file: dest={{ slurm_dir }}/etc state=directory
become: true
- name: make sure slurm lock dir exists
file: dest=/var/lock/subsys state=directory owner=root group=root mode=755
become: true
- name: stat run directory
stat: path={{ slurmdatadir }}
become_user: root
become: True
register: runstat
when: slurmdatadir is defined
- name: create data directory
file: path={{ slurmdatadir }} state=directory owner=slurm group=slurm mode=755
become: true
when: slurmdatadir is defined and not runstat.stat.exists
- name: stat pid directory
stat: path={{ slurmpiddir }}
become_user: root
become: True
register: pidstat
when: slurmpiddir is defined
- name: create pid directory
file: path={{ slurmpiddir }} state=directory owner=slurm group=slurm mode=755
become: true
when: slurmpiddir is defined and not pidstat.stat.exists
- name: create slurmdbdpiddir directory
file: path={{ slurmdbdpiddir }} state=directory owner=slurm group=slurm mode=755
become: true
- name: create shared state directory
file: path={{slurmsharedstatedir }} state=directory owner=slurm group=slurm mode=750
become: true
run_once: true
when: usesharedstatedir is defined and usesharedstatedir
- name: symlink shared state dir
file: path={{ slurmstatedir }} src={{ slurmsharedstatedir }} state=link
become: true
when: usesharedstatedir is defined and usesharedstatedir
- name: create state directory
file: path={{ slurmstatedir }} state=directory owner=slurm group=slurm mode=750
become: true
when: slurmstatedir is defined and not usesharedstatedir
- name: stat log directory
stat: path={{ slurmlogdir }}
become_user: root
become: True
register: logstat
when: slurmlogdir is defined
- name: create log directory
file: path={{ slurmlogdir }} state=directory owner=slurm group=slurm mode=750
become: true
when: slurmlogdir is defined and not logstat.stat.exists
- name: make sure slurm conf dir exists
file: dest={{ slurm_dir }}/etc state=directory
become: true
- name: create gres directory
file: path={{ slurm_dir }}/etc/gres state=directory owner=slurm group=slurm mode=755
become: true
- name: yum install cgroup
yum: name={{ item }} state=installed
with_items:
- libcgroup
become: True
become_method: sudo
when: ansible_os_family == "RedHat"
- name: apt install cgroup
apt: name={{ item }} state=installed update_cache=yes
with_items:
- cgmanager
- cgmanager-utils
- libcgmanager0
when: ansible_os_family == "Debian"
become: True
become_method: sudo
- name: config cgroup.conf file
template: dest={{ slurm_dir }}/etc/cgroup.conf src=cgroup.conf.j2 mode=644
become: True
become_method: sudo
- name: config cgroup_allowed_devices.conf file
template: dest={{ slurm_dir }}/etc/cgroup_allowed_devices.conf src=cgroup_allowed_devices.conf.j2 mode=644
become: True
become_method: sudo
- name: test if munge is already installed
stat: path="{{ munge_dir }}/bin/munge"
register: munge_binary
- name: unarchive munge
unarchive:
args:
src: "http://consistency0/src/munge-{{ munge_version }}.tar.bz2"
copy: no
dest: /tmp
creates: /tmp/munge-{{ munge_version }}/configure
when: not munge_binary.stat.exists
- name: build munge
shell: ./configure --prefix={{ munge_dir }} && make
args:
chdir: /tmp/munge-{{ munge_version }}
creates: /tmp/munge-{{ munge_version }}/src/munge/munge
when: not munge_binary.stat.exists
- name: install munge
shell: make install
become: true
args:
chdir: /tmp/munge-{{ munge_version }}
creates: "{{ munge_dir }}/bin/munge"
when: not munge_binary.stat.exists
- name: set use_systemd
set_fact:
use_systemd: True
when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and ( ansible_distribution_major_version == "7")
- name: copy init script
template: dest=/etc/init.d/munge src=munge.initd.j2 mode=755
become: true
when: use_systemd is not defined
- name: copy munge service file if OS uses systemd
template: dest=/etc/systemd/system/munge.service src=munge.service.j2 mode=644
become: true
register: systemd_script_installed
when: use_systemd is defined
- name: reload systemd
shell: systemctl daemon-reload
become: true
when: use_systemd is defined and systemd_script_installed.changed
- name: remove slurm build directory
file:
path: "/tmp/slurm-{{ slurm_version }}"
state: absent
become: true
when: force_slurm_recompile is defined
- name: remove slurm install directory
file:
path: "{{ slurm_dir }}"
state: absent
become: true
when: force_slurm_recompile is defined
- name: unarchive slurm
unarchive:
src: "http://consistency0/src/slurm-{{ slurm_version }}.tar.bz2"
dest: /tmp
remote_src: yes
creates: "{{ slurm_dir }}/bin/srun"
- name: stat srun
stat: path="{{ slurm_dir }}/bin/srun"
register: stat_srun
- name: configure slurm
command: /tmp/slurm-{{ slurm_version }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam --with-pmix=/usr/local/pmix/latest
args:
creates: "{{ slurm_dir }}/bin/srun"
chdir: /tmp/slurm-{{ slurm_version }}
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: build slurm
command: make
args:
creates: "{{ slurm_dir }}/bin/srun"
chdir: /tmp/slurm-{{ slurm_version }}
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: install slurm
shell: make install
become: true
args:
chdir: /tmp/slurm-{{ slurm_version }}
creates: "{{ slurm_dir }}/bin/srun"
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: build pmi
command: make
args:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: install pmi
shell: make install
become: true
args:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: build pmi2
command: make
args:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi2
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: install pmi2
shell: make install
become: true
args:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi2
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: build pam_slurm
command: make
args:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: install pam_slurm
shell: make install
become: true
args:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: build pam_slurm_adopt
make:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam_slurm_adopt
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: install pam_slurm_adopt
make:
chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam_slurm_adopt
target: install
when: force_slurm_recompile is defined or not stat_srun.stat.exists
become: true
- name: remove existing slurm-latest link
file:
path: /opt/slurm-latest
state: absent
become: true
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: create slurm-latest link
file:
src: "{{ slurm_dir }}"
dest: /opt/slurm-latest
state: link
become: true
when: force_slurm_recompile is defined or not stat_srun.stat.exists
- name: add slurm log rotate config
template: src=slurmlog.j2 dest=/etc/logrotate.d/slurm mode=644
become: true
---
- name: create munge group
group: name=munge system=yes gid=498
become: true
- name: create munge user
user: name=munge group=munge system=yes createhome=no uid=498
become: true
- name: create slurm group
group: name=slurm system=yes gid=497
become: true
- name: create slurm user
user: name=slurm group=slurm system=yes createhome=no uid=497
become: true
- include: createSlurmDirectories.yml
- name: install deps
yum: name={{ item }} state=present
with_items:
- perl
- perl-DBI
- openssl-devel
- gcc
- rpm-build
- wget
- openssl-devel
- readline-devel
- pam-devel
- perl-ExtUtils-MakeMaker
- bzip2-devel
- hwloc
- hwloc-devel
- lua
- lua-devel
become: true
when: ansible_os_family == "RedHat"
- name: install deps
apt: name={{ item }} state=installed update_cache=yes
become: true
with_items:
- gcc
- wget
- libssl-dev
- libpam0g-dev
- libbz2-dev
- make
- perl
- libdbi-perl
- lua5.2
- hwloc
- libhwloc-dev
when: ansible_os_family == "Debian"
- include: installMungeFromSource.yml
- name: chown mungedir
file: path={{ munge_dir }} state=directory owner=munge recurse=yes
become: true
- name: make munge logdir
file: path={{ munge_dir }}/var/log/munge state=directory owner=munge mode=700
become: true
- name: install munge key
template: src=munge_key.j2 dest={{ munge_dir }}/etc/munge/munge.key owner=munge mode=600
become: true
- name: enable munge on boot
service: name=munge enabled=yes
become: true
- include: installSlurmFromSource.yml
- include: createSlurmDirectories.yml
- name: check slurm generic resource
shell: "{{ slurm_gres_check }}"
register: slurm_generic_resource
ignore_errors: true
when: slurm_gres_check is defined
check_mode: no
changed_when: False
- name: Gres - Test for Nvidia devices
script: scripts/nvidia-probe.py
register: probeOutput
check_mode: no
changed_when: False
- name: get cpu count
shell: 'lscpu | grep "On-line CPU" | cut -f 2 -d ":" | sed "s/\ *//g"'
register: cpucount
check_mode: no
changed_when: False
- name: "set nvidiaprobe slurm_gres_list"
set_fact: "slurm_gres_list={{ probeOutput.stdout }}"
- name: template gres.conf file
template: src="gres.conf.j2" dest={{ slurm_dir }}/etc/gres.conf mode=644
become: true
- name: make slurm prolog dir
file: path=/opt/slurm/etc state=directory mode=755
become: true
become_user: root
- name: install slurm prolog
template: src=slurm.prolog.j2 dest=/opt/slurm/etc/slurm.prolog mode=755
become: true
- name: install slurm epilog
template: src=slurm.epilog.j2 dest=/opt/slurm/etc/slurm.epilog mode=755
become: true
- name: install slurm.conf
copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf
become: true
when: slurm_use_vpn==False
- name: install slurm.conf
template: src=slurm-vpn.conf.j2 dest={{ slurm_dir }}/etc/slurm.conf
become: true
when: slurm_use_vpn==True
#- name: install job_submit.lua
# copy: src=files/job_submit.lua dest={{ slurm_dir }}/etc/job_submit.lua
# become: true
# when: slurm_use_vpn==False
- name: set up environment variables
template: src=slurm_setup.sh.j2 dest=/etc/profile.d/slurm_setup.sh
become: true
- name: setup plugin
template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=755
#delegate_to: "{{ slurmctrl }}"
#run_once: true
become: true
when: slurm_lua==True
- include: installCgroup.yml
CgroupAutomount=yes
ConstrainDevices=yes
TaskAffinity=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainKmemSpace=no
AllowedDevicesFile={{ slurm_dir }}/etc/cgroup_allowed_devices.conf
/dev/vd*
/dev/null
/dev/zero
/dev/urandom
/dev/cpu/*/*
#slurm gres file for {{ ansible_hostname }}
#No Of Devices={{ slurm_gres_list | length }}
{% for gr in slurm_gres_list %}
Name={{ gr.name }} Type={{ gpumap[gr.type] }} File={{ gr.file }} CPUs={{ cpucount.stdout }}
{% endfor %}
{% for gr in slurm_gres_list %}
Name={{ gr.name }} File={{ gr.file }}
{% endfor %}
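# Example rendering of the templates above (hypothetical), assuming slurm_gres_list came
# from nvidia-probe.py on a two-GPU K80 node, gpumap maps "K80" to "k80", and
# cpucount.stdout is the on-line CPU list reported by lscpu (e.g. 0-23):
# Name=gpu Type=k80 File=/dev/nvidia0 CPUs=0-23
# Name=gpu Type=k80 File=/dev/nvidia1 CPUs=0-23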
--[[
Example lua script demonstrating the SLURM job_submit/lua interface.
This is only an example, not meant for use in its current form.
Leave the function names, arguments, local variables and setmetatable
set up logic in each function unchanged. Change only the logic after
the line containing "*** YOUR LOGIC GOES BELOW ***".
For use, this script should be copied into a file name "job_submit.lua"
in the same directory as the SLURM configuration file, slurm.conf.
--]]
function slurm_job_submit(job_desc, part_list, submit_uid)
-- Check no default account
if job_desc.account == "default" then
slurm.log_user("You have to specify your project ID as part of your job submission. The account=default is now deprecated on M3 job scheduler.")
return slurm.ERROR
end
-- Check Desktop requests with more than one node
if ((job_desc.name == "desktop") and (job_desc.min_nodes > 1 )) then
slurm.log_user("The current M3 Desktop applications are unable to utilise more than one node, please select one node instead")
return slurm.ERROR
end
-- Check for gres.gpu requirements in m3c, m3h and m3g, else move job to comp
if ((job_desc.partition == "m3c" ) or (job_desc.partition == "m3h" ) or (job_desc.partition == "m3g" )) then
local partition = ""
if (job_desc.gres == nil) then
partition = "comp"
slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition)
job_desc.partition = partition
end
return slurm.SUCCESS
end
-- Check for QOS rtq in m3c, m3h , m3g and partition=nil, then forward job to rtqp,comp,m3g
if ((job_desc.qos == "rtq") and (job_desc.partition == nil)) then
local partition = ""
partition = "rtqp,comp,m3g"
slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition)
job_desc.partition = partition
return slurm.SUCCESS
end
end
function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
return slurm.SUCCESS
end
slurm.log_info("initialized")
return slurm.SUCCESS
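-- Illustration of the checks above (hypothetical session, not part of the plugin):
--   $ sbatch --account=default job.sh
--   sbatch: error: You have to specify your project ID as part of your job submission. ...
--   $ sbatch -p m3g job.sh        (no --gres requested)
--   the job is silently redirected to the "comp" partition by slurm_job_submit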
#!/bin/sh
###############################################################################
# Written by Chris Dunlap <cdunlap@llnl.gov>.
# Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC.
# Copyright (C) 2002-2007 The Regents of the University of California.
# UCRL-CODE-155910.
###############################################################################
# chkconfig: - 66 33
# description: MUNGE Uid 'N' Gid Emporium authentication service
###############################################################################
### BEGIN INIT INFO
# Provides: munge
# Required-Start: $local_fs $remote_fs $network $time
# Required-Stop: $local_fs $remote_fs
# Should-Start: $named $syslog
# Should-Stop: $named $syslog
# Default-Start:
# Default-Stop:
# Short-Description: MUNGE Uid 'N' Gid Emporium authentication service
# Description: MUNGE (MUNGE Uid 'N' Gid Emporium) is a highly scalable
# authentication service for creating and validating
# credentials.
### END INIT INFO
###############################################################################
unset SERVICE_NAME DAEMON_EXEC DAEMON_ARGS CONFIG PIDFILE NICE USER GROUP \
SIGHUP_RELOAD VARRUNDIR
prefix="{{ munge_dir }}"
exec_prefix="${prefix}"
sbindir="${exec_prefix}/sbin"
sysconfdir="${prefix}/etc"
localstatedir="${prefix}/var"
SERVICE_NAME="MUNGE"
DAEMON_EXEC="$sbindir/munged"
DAEMON_ARGS="-S ${localstatedir}/run/munge/munge.socket.2"
#CONFIG=#_NOT_SUPPORTED_#
PIDFILE="$localstatedir/run/munge/munged.pid"
#NICE=
USER="munge"
GROUP="munge"
#SIGHUP_RELOAD=#_NOT_SUPPORTED_#
VARRUNDIR="$localstatedir/run/munge"
###############################################################################
service_init ()
{
# Determine the system type and initialize the environment.
#
# Note that the shell positional parameters must be preserved when calling
# this function in order for SuSE to initialize its environment properly.
##
PATH=/sbin:/usr/sbin:/bin:/usr/bin
INIT_NAME="`basename \"$0\" .init | sed 's/^[SK][0-9][0-9]*//'`"
DAEMON_NAME="`basename \"$DAEMON_EXEC\"`"
SIGTERM_TIMEOUT="3"
STATUS=0
# Read configuration defaults to override variables:
# $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD
##
for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do
[ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME"
done
[ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS"
[ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER
expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE"
[ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \
&& RELOAD=1 || unset RELOAD
if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then
SYSTEM="DEBIAN"
[ -x "$DAEMON_EXEC" ] || exit 0 # pkg removed but not purged
[ -r /etc/default/rcS ] && . /etc/default/rcS
[ -r /lib/init/vars.sh ] && . /lib/init/vars.sh
[ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions
elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then
SYSTEM="REDHAT"
. /etc/rc.d/init.d/functions
RH_LOCK="/var/lock/subsys/$INIT_NAME"
elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then
SYSTEM="SUSE"
. /etc/rc.status
rc_reset
elif [ -r /lib/lsb/init-functions ]; then
SYSTEM="LSB"
. /lib/lsb/init-functions
else
SYSTEM="OTHER"
fi
# Exit if the package has been removed.
##
[ -x "$DAEMON_EXEC" ] || exit 5 # LSB: program not installed
# Exit if the configuration has been removed.
##
[ -z "$CONFIG" -o -r "$CONFIG" ] || exit 6 # LSB: program not configured
}
service_fini ()
{
# Return the exit status.
##
case $SYSTEM in
SUSE)
rc_exit
;;
DEBIAN|REDHAT|LSB|*)
exit $STATUS
;;
esac
}
service_start ()
{
# Start the service.
#
# Required by LSB, where running "start" on a service already running should be
# considered successful.
##
log_init "Starting $SERVICE_NAME" "$DAEMON_NAME"
if [ -n "$VARRUNDIR" -a ! -d "$VARRUNDIR" ]; then
mkdir -m 755 -p "$VARRUNDIR"
[ -n "$USER" ] && chown "$USER" "$VARRUNDIR"
[ -n "$GROUP" ] && chgrp "$GROUP" "$VARRUNDIR"
fi
case $SYSTEM in
DEBIAN)
if $0 status >/dev/null 2>&1; then
STATUS=0
else
ERRMSG=`start-stop-daemon --start --quiet \
${NICE:+"--nicelevel"} ${NICE:+"$NICE"} \
${USER:+"--chuid"} ${USER:+"$USER"} \
${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \
--exec "$DAEMON_EXEC" -- $DAEMON_ARGS 2>&1`
STATUS=$?
echo $ERRMSG
fi
;;
REDHAT)
if $0 status >/dev/null 2>&1; then
STATUS=0
else
daemon ${NICE:+"$NICE"} ${USER:+"--user"} ${USER:+"$USER"} \
"$DAEMON_EXEC" $DAEMON_ARGS
STATUS=$?
fi
[ $STATUS -eq 0 ] && touch "$RH_LOCK" >/dev/null 2>&1
;;
SUSE)
ERRMSG=`startproc ${NICE:+"-n"} ${NICE:+"$NICE"} \
${USER:+"-u"} ${USER:+"$USER"} \
${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \
"$DAEMON_EXEC" $DAEMON_ARGS 2>&1`
rc_status -v
STATUS=$?
;;
LSB)
if [ -n "$USER" ]; then
ERRMSG=`su "$USER" -c "/sbin/start_daemon \
${NICE:+\"-n\"} ${NICE:+\"$NICE\"} \
${PIDFILE:+\"-p\"} ${PIDFILE:+\"$PIDFILE\"} \
\"$DAEMON_EXEC\" $DAEMON_ARGS" 2>&1`
else
ERRMSG=`start_daemon ${NICE:+"-n"} ${NICE:+"$NICE"} \
${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \
"$DAEMON_EXEC" $DAEMON_ARGS 2>&1`
fi
STATUS=$?
;;
*)
if $0 status >/dev/null 2>&1; then
STATUS=0
else
[ -n "$NICE" ] && nice="nice -n $NICE"
if [ -n "$USER" ]; then
ERRMSG=`su "$USER" -c "$nice \"$DAEMON_EXEC\" $DAEMON_ARGS" 2>&1`
else
ERRMSG=`$nice "$DAEMON_EXEC" $DAEMON_ARGS 2>&1`
fi
STATUS=$?
fi
;;
esac
log_fini "$STATUS" "$ERRMSG"
}
service_stop ()
{
# Stop the service.
#
# Required by LSB, where running "stop" on a service already stopped or not
# running should be considered successful.
##
log_init "Stopping $SERVICE_NAME" "$DAEMON_NAME"
case $SYSTEM in
DEBIAN)
if ! $0 status >/dev/null 2>&1; then
STATUS=0
else
start-stop-daemon --stop --quiet \
${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \
--name "$DAEMON_NAME" ${SIGTERM_TIMEOUT:+"--retry"} \
${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} >/dev/null 2>&1
STATUS=$?
fi
;;
REDHAT)
if ! $0 status >/dev/null 2>&1; then
STATUS=0
else
killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \
${SIGTERM_TIMEOUT:+"-d"} ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} \
"$DAEMON_EXEC"
STATUS=$?
fi
[ $STATUS -eq 0 ] && rm -f "$RH_LOCK" >/dev/null 2>&1
;;
SUSE)
killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \
${SIGTERM_TIMEOUT:+"-t"} ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} \
"$DAEMON_EXEC"
rc_status -v
;;
LSB)
killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC"
STATUS=$?
;;
*)
signal_process "$DAEMON_EXEC"
rc=$?
[ $rc -eq 0 -o $rc -eq 2 ] && STATUS=0 || STATUS=1
;;
esac
log_fini "$STATUS"
[ -f "$PIDFILE" ] && rm -f "$PIDFILE"
}
service_restart ()
{
# Stop and restart the service if it is already running;
# otherwise, start the service.
#
# Required by LSB, where running "restart" on a service already stopped or not
# running should be considered successful.
##
if $0 status >/dev/null 2>&1; then
$0 stop && $0 start
else
$0 start
fi
case $SYSTEM in
SUSE)
rc_status
;;
DEBIAN|REDHAT|LSB|*)
STATUS=$?
;;
esac
}
service_try_restart ()
{
# Restart the service if it is already running.
#
# Optional for LSB, where running "try-restart" on a service already stopped or
# not running should be considered successful.
# Also known as "condrestart" by RedHat.
##
case $SYSTEM in
REDHAT)
[ -f "$RH_LOCK" ] && $0 restart || :
STATUS=$?
;;
SUSE)
$0 status >/dev/null 2>&1 && $0 restart || rc_reset
rc_status
;;
DEBIAN|LSB|*)
$0 status >/dev/null 2>&1 && $0 restart || :
STATUS=$?
;;
esac
}
service_reload ()
{
# Reload the configuration without stopping and restarting the service.
#
# Optional for LSB.
##
[ -z "$RELOAD" ] && STATUS=3 # LSB: unimplemented feature
log_init "Reloading $SERVICE_NAME" "$DAEMON_NAME"
case $SYSTEM in
DEBIAN)
if [ -n "$RELOAD" ]; then
start-stop-daemon --stop --quiet --signal HUP \
${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \
--name "$DAEMON_NAME" >/dev/null 2>&1
STATUS=$?
fi
;;
REDHAT)
if [ -n "$RELOAD" ]; then
killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" -HUP
STATUS=$?
else
echo_failure
fi
;;
SUSE)
if [ -n "$RELOAD" ]; then
killproc -HUP ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC"
else
rc_failed $STATUS
fi
rc_status -v
;;
LSB)
if [ -n "$RELOAD" ]; then
killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" -HUP
STATUS=$?
fi
;;
*)
if [ -n "$RELOAD" ]; then
signal_process "$DAEMON_EXEC" "HUP"
STATUS=$?
fi
;;
esac
log_fini "$STATUS"
}
service_force_reload ()
{
# Reload the configuration if the service supports this;
# otherwise, restart the service if it is already running.
#
# Required by LSB, where running "force-reload" on a service already stopped or
# not running should be considered successful.
##
if [ -n "$RELOAD" ]; then
$0 reload
else
$0 try-restart
fi
case $SYSTEM in
SUSE)
rc_status
;;
DEBIAN|REDHAT|LSB|*)
STATUS=$?
;;
esac
}
service_status ()
{
# Print the current status of the service.
#
# Required by LSB.
##
case $SYSTEM in
REDHAT)
status ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC"
STATUS=$?
;;
SUSE)
printf "Checking for service $SERVICE_NAME: "
checkproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC"
rc_status -v
;;
LSB)
printf "Checking status of $SERVICE_NAME: "
pids=`pidofproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \
"$DAEMON_EXEC" 2>/dev/null`
STATUS=$?
if [ $STATUS -eq 0 -a -n "$pids" ]; then
echo "running."
elif [ $STATUS -ne 0 -a -s "$PIDFILE" ]; then
echo "dead."
else
echo "stopped."
fi
;;
DEBIAN|*)
printf "Checking status of $SERVICE_NAME: "
pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"`
rc=$?
if [ $rc -eq 0 -a -n "$pids" ]; then
echo "running."
STATUS=0 # LSB: program is running
elif [ $rc -ne 0 -a -s "$PIDFILE" ]; then
echo "dead."
STATUS=1 # LSB: program is dead & pidfile exists
elif [ $rc -ne 0 ]; then
echo "stopped."
STATUS=3 # LSB: program is not running
else
echo "unknown."
STATUS=4 # LSB: program status unknown
fi
;;
esac
}
query_pids ()
{
# Writes the matching PIDs to stdout.
# Returns 0 on success (ie, pids found).
##
PROCNAME="$1"
PIDFILE="$2"
if type pgrep >/dev/null 2>&1; then
pids=`pgrep -d ' ' -x "\`basename \"$PROCNAME\"\`" 2>/dev/null`
rc=$?
elif type pidof >/dev/null 2>&1; then
pids=`pidof -o $$ -x "$PROCNAME" 2>/dev/null`
rc=$?
else
pids=`(ps awx -o pid -o command || ps -e -f -o pid -o args) 2>/dev/null \
| tail +2 | egrep "( |/)$PROCNAME( |$)" | grep -v egrep \
| sed 's/ *\([0-9]*\).*/\1/' | sort -n | tr '\012' ' '`
[ -n "$pids" ] && rc=0 || rc=1
fi
unset pids_running
if [ -n "$pids" -a -r "$PIDFILE" ]; then
read pid_line < "$PIDFILE"
for pid in $pid_line; do
expr -- "$pid" : '[0-9]*$' >/dev/null 2>&1 \
&& expr -- " $pids " : ".* $pid .*" >/dev/null 2>&1 \
&& pids_running="$pids_running $pid"
done
[ -n "$pids_running" ] && pids=$pids_running
fi
echo $pids
return $rc
}
signal_process ()
{
# Returns 0 on success, 1 if kill failed, 2 if PROCNAME is not running.
##
PROCNAME="$1"
SIGNUM="$2"
pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"`
[ $? -ne 0 -o -z "$pids" ] && return 2
kill ${SIGNUM:+"-$SIGNUM"} $pids >/dev/null 2>&1
[ $? -ne 0 ] && return 1
[ -n "$SIGNUM" ] && return 0
sleep 1
pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"`
[ $? -ne 0 -o -z "$pids" ] && return 0
[ -z "$SIGTERM_TIMEOUT" ] && return 1
sleep "$SIGTERM_TIMEOUT"
kill -KILL $pids >/dev/null 2>&1
pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"`
[ $? -ne 0 -o -z "$pids" ] && return 0
return 1
}
log_init ()
{
# Output informational message at beginning of action.
##
MESSAGE="$1"
PROCNAME="$2"
case $SYSTEM in
DEBIAN)
if [ "$VERBOSE" != no ]; then
if type log_daemon_msg >/dev/null 2>&1; then
log_daemon_msg "$MESSAGE" "$PROCNAME"
else
printf "$MESSAGE: $PROCNAME"
fi
fi
;;
REDHAT|SUSE|LSB|*)
printf "$MESSAGE: $PROCNAME"
;;
esac
}
log_fini ()
{
# Output informational/error message at end of action.
##
STATUS="$1"
ERRMSG="$2"
case $SYSTEM in
DEBIAN)
if [ "$VERBOSE" != no ]; then
if ( type log_end_msg && type log_failure_msg ) >/dev/null 2>&1; then
log_end_msg "$STATUS"
[ $STATUS -eq 0 -o -z "$ERRMSG" ] || log_failure_msg "$ERRMSG"
else
[ $STATUS -eq 0 ] && echo "." || echo " (failed)."
[ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2
fi
fi
;;
REDHAT)
echo
;;
SUSE)
[ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2
;;
LSB|*)
[ $STATUS -eq 0 ] && echo "." || echo " (failed)."
[ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2
;;
esac
}
###############################################################################
service_init "$@"
case "$1" in
start)
service_start
;;
stop)
service_stop
;;
restart)
service_restart
;;
try-restart|condrestart)
service_try_restart
;;
reload)
service_reload
;;
force-reload)
service_force_reload
;;
status)
service_status
;;
*)
echo "Usage: `basename \"$0\"`" \
"(start|stop|restart|try-restart|reload|force-reload|status)" >&2
exit 2 # LSB: invalid or excess argument(s)
;;
esac
service_fini
[Unit]
Description=MUNGE authentication service
Documentation=man:munged(8)
After=network.target
After=syslog.target
After=time-sync.target
[Service]
Type=forking
ExecStart={{ munge_dir }}/sbin/munged
PIDFile={{ munge_dir }}/var/run/munge/munged.pid
User=munge
Group=munge
Restart=on-abort
[Install]
WantedBy=multi-user.target
{{ mungekey }}
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=CIAB
ControlMachine={{ slurmctrl }}
ControlAddr={{ slurmctrl }}-vpn
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/tmp
SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
MpiParams=ports=12000-12999
# COMPUTE NODES
{% set nodelist = [] %}
{% for queue in slurmqueues %}
{% for node in groups[queue.group] %}
{% if nodelist.append(node) %}
{% endif %}
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ node }} NodeAddr={{ node }}-vpn Procs={{ hostvars[node]['ansible_processor_cores'] }} State=UNKNOWN
{% endfor %}
{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }}
{% endfor %}
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmstatedir }}
SlurmdSpoolDir={{ slurmdatadir }}
SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
{% if slurm_lua is defined %}
JobSubmitPlugins=lua
{% endif %}
OverTimeLimit=1
CompleteWait=10
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=3000 #added due to network failures causing jobs to be killed
#SlurmctldTimeout=300
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType={{ slurmschedulertype }}
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType={{ slurmselecttype }}
{% if slurmselecttype.find("cons_res") > 0 %}
SelectTypeParameters=CR_Core_Memory
{% endif %}
FastSchedule={{ slurmfastschedule }}
#PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
##PriorityWeightFairshare=10000
#PriorityWeightAge=10000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=10000
#PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
SlurmctldDebug={{ slurmctlddebug.level }}
SlurmctldLogFile={{ slurmctlddebug.log }}
{% else %}
#SlurmctldDebug=
#SlurmctldLogFile=
{% endif %}
{% if slurmddebug %}
SlurmdDebug={{ slurmddebug.level }}
SlurmdLogFile={{ slurmddebug.log }}
{% else %}
#SlurmdDebug=
#SlurmdLogFile=
{% endif %}
{% if slurmschedlog %}
SlurmSchedlogLevel={{ slurmschedlog.level }}
SlurmSchedLogFile={{ slurmschedlog.log }}
{% else %}
#SlurmSchedlogLevel=
#SlurmSchedLogFile=
{% endif %}
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% endif %}
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmctrl }}
#AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
# Fair share
{% if slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
DisableRootJobs=YES
MpiParams=ports=12000-12999
# COMPUTE NODES
{% set nodelist = [] %}
{% for queue in slurmqueues %}
{% for node in groups[queue.group] %}
{% if nodelist.append(node) %}
{% endif %}
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
{% endfor %}
{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP
{% endfor %}
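{# Example of the slurmqueues structure the node/partition loops above assume
   (hypothetical values; groups[queue.group] is the matching Ansible inventory group):
   slurmqueues:
     - { name: batch, group: ComputeNodes, default: true }
     - { name: vis,   group: DesktopNodes, default: false }
#}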
#!/bin/sh
# specific files to be deleted to clean up after a Strudel session
find /tmp -user ${SLURM_JOB_USER} -name "pulse*" | xargs rm -rf
find /tmp -user ${SLURM_JOB_USER} -name ".esd-*" | xargs rm -rf
find /tmp -user ${SLURM_JOB_USER} -name ".X*-lock" | xargs rm -rf
find /tmp/.X11-unix -user ${SLURM_JOB_USER} -name "X*" | xargs rm -rf
# NOTE: 20180316 The jobs above clean up the VNC session
# further clean up for Strudel Session
# X lock files are owned by root, so we need to find the right files to delete
# New Strudel session will create a file under user's home folder called xorg-jobid
# This file contains the display number that is assigned to xterm when it starts
# Assigning variable and trimming the : from the display number
XSESSION=`cat /home/${SLURM_JOB_USER}/.vnc/xorg-${SLURM_JOB_ID} | tr -d :`
# Formatting the filenames for the two files that we need to clean: /tmp/.X*-lock and /tmp/.X11/X*
XLOCKFILENAME=".X"$XSESSION"-lock"
XUNIXFILENAME="X"$XSESSION
# Find the files and delete them
find /tmp/ -name $XLOCKFILENAME -exec rm -rf {} \;
find /tmp/.X11-unix -name $XUNIXFILENAME -exec rm -rf {} \;
# Now we clean up
rm -rf /home/${SLURM_JOB_USER}/.vnc/xorg-${SLURM_JOB_ID}
# echo 1 to drop page cache
/bin/sync
/bin/echo 1 > /proc/sys/vm/drop_caches
exit 0
#!/bin/sh
# echo 1 to drop page cache
/bin/sync
/bin/echo 1 > /proc/sys/vm/drop_caches
exit 0