diff --git a/roles/apt-get-upgrade/tasks/main.yml b/roles/apt-get-upgrade/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..49e0217a1a170d17bee5ef958f7886903e0c1256
--- /dev/null
+++ b/roles/apt-get-upgrade/tasks/main.yml
@@ -0,0 +1,10 @@
+---
+- name: apt-get update
+  apt: update_cache=True
+  sudo: true
+  when: ansible_os_family=="Debian"
+
+- name: apt-get upgrade
+  apt: upgrade=safe
+  sudo: true
+  when: ansible_os_family=="Debian"
diff --git a/roles/nfs-client/tasks/mountFileSystem.yml b/roles/nfs-client/tasks/mountFileSystem.yml
index 555e12b5014dd5c37617903c589a60d090d73d70..0dc48290c5dfa850ef847654c5d9c4708ce61e2c 100644
--- a/roles/nfs-client/tasks/mountFileSystem.yml
+++ b/roles/nfs-client/tasks/mountFileSystem.yml
@@ -4,16 +4,18 @@
   sudo: true
+
+
 - name: "Mounting NFS mounts"
-  mount: name={{ item.name }} src={{ item.ipv4 }}:{{ item.src }} fstype={{ item.fstype }} opts={{ item.opts }} state=mounted
-  with_items: exportList
+  mount: name={{ item.mntpt }} src={{ item.src }} fstype={{ item.fstype }} opts={{ item.opts }} state=mounted
+  with_items: nfsMounts
   notify: "restart authentication"
   notify: "restart rpcbind"
   notify: "restart idmap"
   sudo: true
   ignore_errors: true
   register: firstMount
-  when: exportList is defined
+  when: nfsMounts is defined

 - name: "Wait for nfs to stabailse"
   command: sleep 60
@@ -21,13 +23,13 @@
   when: firstMount | failed

 - name: "Mounting NFS mounts"
-  mount: name={{ item.name }} src={{ item.ipv4 }}:{{ item.src }} fstype={{ item.fstype }} opts={{ item.opts }} state=mounted
-  with_items: exportList
+  mount: name={{ item.name }} src=" {{ item.ipv4 }}:{{ item.src }} " fstype={{ item.fstype }} opts={{ item.opts }} state=mounted
+  with_items: nfsMounts
   notify: "restart authentication"
   notify: "restart idmap"
   notify: "restart rpcbind"
   sudo: true
-  when: exportList is defined and firstMount | failed
+  when: nfsMounts is defined and firstMount | failed

 - name: "restart fail2ban"
   service: name=fail2ban state=started
diff --git a/roles/nfs-common/tasks/aptPackages.yml b/roles/nfs-common/tasks/aptPackages.yml
new file mode 100644
index 0000000000000000000000000000000000000000..364b2b73093be8d9ac7b80c63bd53025d3694713
--- /dev/null
+++ b/roles/nfs-common/tasks/aptPackages.yml
@@ -0,0 +1,9 @@
+---
+-
+  name: "Install nfs-utils"
+  with_items:
+    - nfs-common
+    - nfs-kernel-server
+  apt: "name={{ item }} state=latest"
+  sudo: true
+
diff --git a/roles/nfs-server/tasks/startServer.yml b/roles/nfs-server/tasks/startServer.yml
index e8338d56c7265b4fb04a9f5626198d1f3bfe6846..606b143bda4081e031d4944f63125f0d8036fae2 100644
--- a/roles/nfs-server/tasks/startServer.yml
+++ b/roles/nfs-server/tasks/startServer.yml
@@ -2,6 +2,14 @@
 - name: "Starting rpcbind"
   service: "name=rpcbind state=restarted"
   sudo: true
+  when: ansible_os_family == "RedHat"
+
 - name: "Start the Server"
   service: "name=nfs state=restarted"
   sudo: true
+  when: ansible_os_family == "RedHat"
+
+- name: "Start the Server"
+  service: "name=nfs-kernel-server state=restarted"
+  sudo: true
+  when: ansible_os_family == "Debian"
diff --git a/roles/slurm-from-source/defaults/main.yml b/roles/slurm-from-source/defaults/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..dc0836cd6b6ca61a8223423f7d4f8072e185abdd
--- /dev/null
+++ b/roles/slurm-from-source/defaults/main.yml
@@ -0,0 +1,14 @@
+---
+munge_dir: /opt/munge
+slurm_dir: /opt/slurm
+slurm_use_vpn: False
+slurmctlddebug: {level: 9, log: '/var/log/slurm/slurmctld.log'}
+slurmddebug: {level: 9, log: '/var/log/slurm/slurmd.log'}
+slurmschedlog: {level: 9, log: '/var/log/slurm/slurmsched.log'}
+slurmdbdlog: {level: 9, log: '/var/log/slurm/slurmdbd.log'}
+slurmfairshare: {def: false, val: 10000}
+slurmdatadir: "/var/spool/slurm"
+slurmselecttype: "select/linear"
+slurmfastschedule: "1"
+slurmschedulertype: "sched/backfill"
+
diff --git a/roles/slurm-from-source/handlers/main.yml b/roles/slurm-from-source/handlers/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..57f0cb12be532609f9884170cf7aeadf7be037f9
--- /dev/null
+++ b/roles/slurm-from-source/handlers/main.yml
@@ -0,0 +1,18 @@
+---
+ - name: restart munge
+   service: name=munge state=restarted
+   sudo: true
+
+ - name: restart slurm
+   service: name=slurm state=restarted
+   sudo: true
+
+ - name: restart slurmdbd
+   service: name=slurmdbd state=restarted
+   sudo: true
+
+ - name: scontrol reconfigure
+   shell: sleep 10 ; scontrol reconfigure
+   sudo: true
+   delegate_to: "{{ slurmctrl }}"
+   run_once: true
diff --git a/roles/slurm-from-source/tasks/installMungeFromSource.yml b/roles/slurm-from-source/tasks/installMungeFromSource.yml
new file mode 100644
index 0000000000000000000000000000000000000000..62a716e3eda3d4f1ae5f56d66a468ab12d19a726
--- /dev/null
+++ b/roles/slurm-from-source/tasks/installMungeFromSource.yml
@@ -0,0 +1,34 @@
+- name: get munge
+  shell: wget https://munge.googlecode.com/files/munge-{{ munge_version }}.tar.bz2
+  args:
+    chdir: /tmp
+    creates: /tmp/munge-{{ munge_version }}.tar.bz2
+
+- name: remove old
+  shell: rm -rf /tmp/munge-{{ munge_version }} && rm -rf /tmp/slurm-{{ slurm_version }}
+  sudo: true
+
+- name: untar munge
+  shell: tar jxf munge-{{ munge_version }}.tar.bz2
+  args:
+    chdir: /tmp
+
+- name: build munge
+  shell: ./configure --prefix={{ munge_dir }} && make
+  args:
+    chdir: /tmp/munge-{{ munge_version }}
+
+- name: install munge
+  shell: make install
+  sudo: true
+  args:
+    chdir: /tmp/munge-{{ munge_version }}
+
+- name: copy init script
+  template: dest=/etc/init.d/munge src=munge.initd.j2 mode=755
+  sudo: true
+
+- name: start on boot
+  shell: update-rc.d munge defaults
+  sudo: true
+  when: ansible_distribution == "Ubuntu"
diff --git a/roles/slurm-from-source/tasks/installSlurmFromSource.yml b/roles/slurm-from-source/tasks/installSlurmFromSource.yml
new file mode 100644
index 0000000000000000000000000000000000000000..945bf80409b908e1ec337f24e71ff5cbeb493330
--- /dev/null
+++ b/roles/slurm-from-source/tasks/installSlurmFromSource.yml
@@ -0,0 +1,26 @@
+- name: get slurm
+  shell: wget http://www.schedmd.com/download/archive/slurm-{{ slurm_version }}.tar.bz2
+  args:
+    chdir: /tmp
+    creates: /tmp/slurm-{{ slurm_version }}.tar.bz2
+
+- name: untar slurm
+  shell: tar jxf /tmp/slurm-{{ slurm_version }}.tar.bz2
+  args:
+    chdir: /tmp
+
+- name: build slurm
+  shell: ./configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} && make
+  args:
+    chdir: /tmp/slurm-{{ slurm_version }}
+
+- name: install slurm
+  shell: make install
+  sudo: true
+  args:
+    chdir: /tmp/slurm-{{ slurm_version }}
+
+- name: copy init script
+  template: dest=/etc/init.d/slurm src=slurm.initd.j2 mode=755
+  sudo: true
+
diff --git a/roles/slurm-from-source/tasks/main.yml b/roles/slurm-from-source/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1fa1d007c4b9177cbdce067781edae6ce3a70f2c
--- /dev/null
+++ b/roles/slurm-from-source/tasks/main.yml
@@ -0,0 +1,84 @@
+---
+- name: create munge group
+  group: name=munge system=yes gid=498
+  sudo: true
+
+- name: create munge user
+  user: name=munge group=munge system=yes createhome=no uid=498
+  sudo: true
+
+- name: create slurm group
+  group: name=slurm system=yes gid=497
+  sudo: true
+
+- name: create slurm user
+  user: name=slurm group=slurm system=yes createhome=no uid=497
+  sudo: true
+
+- name: make sure slurm conf dir exists
+  file: dest={{ slurm_dir }}/etc state=directory
+  sudo: true
+
+- name: make sure slurm lock dir exists
+  file: dest=/var/lock/subsys state=directory owner=root group=root mode=755
+  sudo: true
+
+- name: create data directory
+  file: path={{ slurmdatadir }} state=directory owner=slurm group=slurm mode=755
+  sudo: true
+
+- name: create log directory
+  shell: mkdir -p {{ slurmddebug.log | dirname }}; chown slurm:slurm {{ slurmddebug.log | dirname }}
+  args:
+    creates: "{{ slurmddebug.log | dirname }}"
+  sudo: true
+  when: slurmddebug is defined and slurmddebug.log
+
+- name: install deps
+  yum: name={{ item }} state=latest
+  with_items:
+    - perl
+    - perl-DBI
+  sudo: true
+  when: ansible_os_family == "RedHat"
+
+- name: install deps
+  apt: name={{ item }} state=installed update_cache=yes
+  sudo: true
+  with_items:
+    - gcc
+    - wget
+    - libssl-dev
+    - libpam0g-dev
+    - libbz2-dev
+    - make
+    - perl
+    - libdbi-perl
+  when: ansible_os_family == "Debian"
+
+- include: installMungeFromSource.yml
+
+- name: chown mungedir
+  file: path={{ munge_dir }} state=directory owner=munge recurse=yes
+  sudo: true
+
+- name: make munge logdir
+  file: path={{ munge_dir }}/var/log/munge state=directory owner=munge mode=700
+  sudo: true
+
+- name: install munge key
+  template: src=munge_key.j2 dest={{ munge_dir }}/etc/munge/munge.key owner=munge mode=600
+  sudo: true
+  notify: restart munge
+
+- include: installSlurmFromSource.yml
+
+- name: install slurm.conf
+  template: src=slurm.conf.j2 dest={{ slurm_dir }}/etc/slurm.conf
+  sudo: true
+  when: slurm_use_vpn==False
+
+- name: install slurm.conf
+  template: src=slurm-vpn.conf.j2 dest={{ slurm_dir }}/etc/slurm.conf
+  sudo: true
+  when: slurm_use_vpn==True
diff --git a/roles/slurm-from-source/templates/munge.initd.j2 b/roles/slurm-from-source/templates/munge.initd.j2
new file mode 100755
index 0000000000000000000000000000000000000000..5443aba6b2405bc65ace2895d87c38100b79b011
--- /dev/null
+++ b/roles/slurm-from-source/templates/munge.initd.j2
@@ -0,0 +1,567 @@
+#!/bin/sh
+###############################################################################
+# Written by Chris Dunlap <cdunlap@llnl.gov>.
+# Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC.
+# Copyright (C) 2002-2007 The Regents of the University of California.
+# UCRL-CODE-155910.
+###############################################################################
+# chkconfig: - 66 33
+# description: MUNGE Uid 'N' Gid Emporium authentication service
+###############################################################################
+### BEGIN INIT INFO
+# Provides: munge
+# Required-Start: $local_fs $remote_fs $network $time
+# Required-Stop: $local_fs $remote_fs
+# Should-Start: $named $syslog
+# Should-Stop: $named $syslog
+# Default-Start:
+# Default-Stop:
+# Short-Description: MUNGE Uid 'N' Gid Emporium authentication service
+# Description: MUNGE (MUNGE Uid 'N' Gid Emporium) is a highly scalable
+#              authentication service for creating and validating
+#              credentials.
+### END INIT INFO +############################################################################### + +unset SERVICE_NAME DAEMON_EXEC DAEMON_ARGS CONFIG PIDFILE NICE USER GROUP \ + SIGHUP_RELOAD VARRUNDIR + +prefix="/opt/munge" +exec_prefix="${prefix}" +sbindir="${exec_prefix}/sbin" +sysconfdir="${prefix}/etc" +localstatedir="${prefix}/var" + +SERVICE_NAME="MUNGE" +DAEMON_EXEC="$sbindir/munged" +#DAEMON_ARGS= +#CONFIG=#_NOT_SUPPORTED_# +PIDFILE="$localstatedir/run/munge/munged.pid" +#NICE= +USER="munge" +GROUP="munge" +#SIGHUP_RELOAD=#_NOT_SUPPORTED_# +VARRUNDIR="$localstatedir/run/munge" + +############################################################################### + +service_init () +{ +# Determine the system type and initialize the environment. +# +# Note that the shell positional parameters must be preserved when calling +# this function in order for SuSE to initialize its environment properly. +## + PATH=/sbin:/usr/sbin:/bin:/usr/bin + INIT_NAME="`basename \"$0\" .init | sed 's/^[SK][0-9][0-9]*//'`" + DAEMON_NAME="`basename \"$DAEMON_EXEC\"`" + SIGTERM_TIMEOUT="3" + STATUS=0 + + # Read configuration defaults to override variables: + # $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD + ## + for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do + [ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME" + done + [ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS" + [ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER + expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE" + [ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \ + && RELOAD=1 || unset RELOAD + + if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then + SYSTEM="DEBIAN" + [ -x "$DAEMON_EXEC" ] || exit 0 # pkg removed but not purged + [ -r /etc/default/rcS ] && . /etc/default/rcS + [ -r /lib/init/vars.sh ] && . /lib/init/vars.sh + [ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions + elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then + SYSTEM="REDHAT" + . /etc/rc.d/init.d/functions + RH_LOCK="/var/lock/subsys/$INIT_NAME" + elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then + SYSTEM="SUSE" + . /etc/rc.status + rc_reset + elif [ -r /lib/lsb/init-functions ]; then + SYSTEM="LSB" + . /lib/lsb/init-functions + else + SYSTEM="OTHER" + fi + + # Exit if the package has been removed. + ## + [ -x "$DAEMON_EXEC" ] || exit 5 # LSB: program not installed + + # Exit if the configuration has been removed. + ## + [ -z "$CONFIG" -o -r "$CONFIG" ] || exit 6 # LSB: program not configured +} + +service_fini () +{ +# Return the exit status. +## + case $SYSTEM in + SUSE) + rc_exit + ;; + DEBIAN|REDHAT|LSB|*) + exit $STATUS + ;; + esac +} + +service_start () +{ +# Start the service. +# +# Required by LSB, where running "start" on a service already running should be +# considered successful. +## + log_init "Starting $SERVICE_NAME" "$DAEMON_NAME" + + if [ -n "$VARRUNDIR" -a ! -d "$VARRUNDIR" ]; then + mkdir -m 755 -p "$VARRUNDIR" + [ -n "$USER" ] && chown "$USER" "$VARRUNDIR" + [ -n "$GROUP" ] && chgrp "$GROUP" "$VARRUNDIR" + fi + + case $SYSTEM in + DEBIAN) + if $0 status >/dev/null 2>&1; then + STATUS=0 + else + ERRMSG=`start-stop-daemon --start --quiet \ + ${NICE:+"--nicelevel"} ${NICE:+"$NICE"} \ + ${USER:+"--chuid"} ${USER:+"$USER"} \ + ${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \ + --exec "$DAEMON_EXEC" -- $DAEMON_ARGS 2>&1` + STATUS=$? 
+ echo $ERRMSG + fi + ;; + REDHAT) + if $0 status >/dev/null 2>&1; then + STATUS=0 + else + daemon ${NICE:+"$NICE"} ${USER:+"--user"} ${USER:+"$USER"} \ + "$DAEMON_EXEC" $DAEMON_ARGS + STATUS=$? + fi + [ $STATUS -eq 0 ] && touch "$RH_LOCK" >/dev/null 2>&1 + ;; + SUSE) + ERRMSG=`startproc ${NICE:+"-n"} ${NICE:+"$NICE"} \ + ${USER:+"-u"} ${USER:+"$USER"} \ + ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ + "$DAEMON_EXEC" $DAEMON_ARGS 2>&1` + rc_status -v + STATUS=$? + ;; + LSB) + if [ -n "$USER" ]; then + ERRMSG=`su "$USER" -c "/sbin/start_daemon \ + ${NICE:+\"-n\"} ${NICE:+\"$NICE\"} \ + ${PIDFILE:+\"-p\"} ${PIDFILE:+\"$PIDFILE\"} \ + \"$DAEMON_EXEC\" $DAEMON_ARGS" 2>&1` + else + ERRMSG=`start_daemon ${NICE:+"-n"} ${NICE:+"$NICE"} \ + ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ + "$DAEMON_EXEC" $DAEMON_ARGS 2>&1` + fi + STATUS=$? + ;; + *) + if $0 status >/dev/null 2>&1; then + STATUS=0 + else + [ -n "$NICE" ] && nice="nice -n $NICE" + if [ -n "$USER" ]; then + ERRMSG=`su "$USER" -c "$nice \"$DAEMON_EXEC\" $DAEMON_ARGS" 2>&1` + else + ERRMSG=`$nice "$DAEMON_EXEC" $DAEMON_ARGS 2>&1` + fi + STATUS=$? + fi + ;; + esac + log_fini "$STATUS" "$ERRMSG" +} + +service_stop () +{ +# Stop the service. +# +# Required by LSB, where running "stop" on a service already stopped or not +# running should be considered successful. +## + log_init "Stopping $SERVICE_NAME" "$DAEMON_NAME" + case $SYSTEM in + DEBIAN) + if ! $0 status >/dev/null 2>&1; then + STATUS=0 + else + start-stop-daemon --stop --quiet \ + ${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \ + --name "$DAEMON_NAME" ${SIGTERM_TIMEOUT:+"--retry"} \ + ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} >/dev/null 2>&1 + STATUS=$? + fi + ;; + REDHAT) + if ! $0 status >/dev/null 2>&1; then + STATUS=0 + else + killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ + ${SIGTERM_TIMEOUT:+"-d"} ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} \ + "$DAEMON_EXEC" + STATUS=$? + fi + [ $STATUS -eq 0 ] && rm -f "$RH_LOCK" >/dev/null 2>&1 + ;; + SUSE) + killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ + ${SIGTERM_TIMEOUT:+"-t"} ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} \ + "$DAEMON_EXEC" + rc_status -v + ;; + LSB) + killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" + STATUS=$? + ;; + *) + signal_process "$DAEMON_EXEC" + rc=$? + [ $rc -eq 0 -o $rc -eq 2 ] && STATUS=0 || STATUS=1 + ;; + esac + log_fini "$STATUS" + [ -f "$PIDFILE" ] && rm -f "$PIDFILE" +} + +service_restart () +{ +# Stop and restart the service if it is already running; +# otherwise, start the service. +# +# Required by LSB, where running "restart" on a service already stopped or not +# running should be considered successful. +## + if $0 status >/dev/null 2>&1; then + $0 stop && $0 start + else + $0 start + fi + + case $SYSTEM in + SUSE) + rc_status + ;; + DEBIAN|REDHAT|LSB|*) + STATUS=$? + ;; + esac +} + +service_try_restart () +{ +# Restart the service if it is already running. +# +# Optional for LSB, where running "try-restart" on a service already stopped or +# not running should be considered successful. +# Also known as "condrestart" by RedHat. +## + case $SYSTEM in + REDHAT) + [ -f "$RH_LOCK" ] && $0 restart || : + STATUS=$? + ;; + SUSE) + $0 status >/dev/null 2>&1 && $0 restart || rc_reset + rc_status + ;; + DEBIAN|LSB|*) + $0 status >/dev/null 2>&1 && $0 restart || : + STATUS=$? + ;; + esac +} + +service_reload () +{ +# Reload the configuration without stopping and restarting the service. +# +# Optional for LSB. 
+## + [ -z "$RELOAD" ] && STATUS=3 # LSB: unimplemented feature + + log_init "Reloading $SERVICE_NAME" "$DAEMON_NAME" + case $SYSTEM in + DEBIAN) + if [ -n "$RELOAD" ]; then + start-stop-daemon --stop --quiet --signal HUP \ + ${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \ + --name "$DAEMON_NAME" >/dev/null 2>&1 + STATUS=$? + fi + ;; + REDHAT) + if [ -n "$RELOAD" ]; then + killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" -HUP + STATUS=$? + else + echo_failure + fi + ;; + SUSE) + if [ -n "$RELOAD" ]; then + killproc -HUP ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" + else + rc_failed $STATUS + fi + rc_status -v + ;; + LSB) + if [ -n "$RELOAD" ]; then + killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" -HUP + STATUS=$? + fi + ;; + *) + if [ -n "$RELOAD" ]; then + signal_process "$DAEMON_EXEC" "HUP" + STATUS=$? + fi + ;; + esac + log_fini "$STATUS" +} + +service_force_reload () +{ +# Reload the configuration if the service supports this; +# otherwise, restart the service if it is already running. +# +# Required by LSB, where running "force-reload" on a service already stopped or +# not running should be considered successful. +## + if [ -n "$RELOAD" ]; then + $0 reload + else + $0 try-restart + fi + + case $SYSTEM in + SUSE) + rc_status + ;; + DEBIAN|REDHAT|LSB|*) + STATUS=$? + ;; + esac +} + +service_status () +{ +# Print the current status of the service. +# +# Required by LSB. +## + case $SYSTEM in + REDHAT) + status ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" + STATUS=$? + ;; + SUSE) + printf "Checking for service $SERVICE_NAME: " + checkproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" + rc_status -v + ;; + LSB) + printf "Checking status of $SERVICE_NAME: " + pids=`pidofproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ + "$DAEMON_EXEC" 2>/dev/null` + STATUS=$? + if [ $STATUS -eq 0 -a -n "$pids" ]; then + echo "running." + elif [ $STATUS -ne 0 -a -s "$PIDFILE" ]; then + echo "dead." + else + echo "stopped." + fi + ;; + DEBIAN|*) + printf "Checking status of $SERVICE_NAME: " + pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` + rc=$? + if [ $rc -eq 0 -a -n "$pids" ]; then + echo "running." + STATUS=0 # LSB: program is running + elif [ $rc -ne 0 -a -s "$PIDFILE" ]; then + echo "dead." + STATUS=1 # LSB: program is dead & pidfile exists + elif [ $rc -ne 0 ]; then + echo "stopped." + STATUS=3 # LSB: program is not running + else + echo "unknown." + STATUS=4 # LSB: program status unknown + fi + ;; + esac +} + +query_pids () +{ +# Writes the matching PIDs to stdout. +# Returns 0 on success (ie, pids found). +## + PROCNAME="$1" + PIDFILE="$2" + + if type pgrep >/dev/null 2>&1; then + pids=`pgrep -d ' ' -x "\`basename \"$PROCNAME\"\`" 2>/dev/null` + rc=$? + elif type pidof >/dev/null 2>&1; then + pids=`pidof -o $$ -x "$PROCNAME" 2>/dev/null` + rc=$? + else + pids=`(ps awx -o pid -o command || ps -e -f -o pid -o args) 2>/dev/null \ + | tail +2 | egrep "( |/)$PROCNAME( |$)" | grep -v egrep \ + | sed 's/ *\([0-9]*\).*/\1/' | sort -n | tr '\012' ' '` + [ -n "$pids" ] && rc=0 || rc=1 + fi + + unset pids_running + if [ -n "$pids" -a -r "$PIDFILE" ]; then + read pid_line < "$PIDFILE" + for pid in $pid_line; do + expr -- "$pid" : '[0-9]*$' >/dev/null 2>&1 \ + && expr -- " $pids " : ".* $pid .*" >/dev/null 2>&1 \ + && pids_running="$pids_running $pid" + done + [ -n "$pids_running" ] && pids=$pids_running + fi + + echo $pids + return $rc +} + +signal_process () +{ +# Returns 0 on success, 1 if kill failed, 2 if PROCNAME is not running. 
+## + PROCNAME="$1" + SIGNUM="$2" + + pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` + [ $? -ne 0 -o -z "$pids" ] && return 2 + + kill ${SIGNUM:+"-$SIGNUM"} $pids >/dev/null 2>&1 + [ $? -ne 0 ] && return 1 + [ -n "$SIGNUM" ] && return 0 + + sleep 1 + pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` + [ $? -ne 0 -o -z "$pids" ] && return 0 + [ -z "$SIGTERM_TIMEOUT" ] && return 1 + + sleep "$SIGTERM_TIMEOUT" + kill -KILL $pids >/dev/null 2>&1 + pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` + [ $? -ne 0 -o -z "$pids" ] && return 0 + return 1 +} + +log_init () +{ +# Output informational message at beginning of action. +## + MESSAGE="$1" + PROCNAME="$2" + + case $SYSTEM in + DEBIAN) + if [ "$VERBOSE" != no ]; then + if type log_daemon_msg >/dev/null 2>&1; then + log_daemon_msg "$MESSAGE" "$PROCNAME" + else + printf "$MESSAGE: $PROCNAME" + fi + fi + ;; + REDHAT|SUSE|LSB|*) + printf "$MESSAGE: $PROCNAME" + ;; + esac +} + +log_fini () +{ +# Output informational/error message at end of action. +## + STATUS="$1" + ERRMSG="$2" + + case $SYSTEM in + DEBIAN) + if [ "$VERBOSE" != no ]; then + if ( type log_end_msg && type log_failure_msg ) >/dev/null 2>&1; then + log_end_msg "$STATUS" + [ $STATUS -eq 0 -o -z "$ERRMSG" ] || log_failure_msg "$ERRMSG" + else + [ $STATUS -eq 0 ] && echo "." || echo " (failed)." + [ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2 + fi + fi + ;; + REDHAT) + echo + ;; + SUSE) + [ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2 + ;; + LSB|*) + [ $STATUS -eq 0 ] && echo "." || echo " (failed)." + [ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2 + ;; + esac +} + +############################################################################### + +service_init "$@" + +case "$1" in + start) + service_start + ;; + stop) + service_stop + ;; + restart) + service_restart + ;; + try-restart|condrestart) + service_try_restart + ;; + reload) + service_reload + ;; + force-reload) + service_force_reload + ;; + status) + service_status + ;; + *) + echo "Usage: `basename \"$0\"`" \ + "(start|stop|restart|try-restart|reload|force-reload|status)" >&2 + exit 2 # LSB: invalid or excess argument(s) + ;; +esac + +service_fini diff --git a/roles/slurm-from-source/templates/munge_key.j2 b/roles/slurm-from-source/templates/munge_key.j2 new file mode 100644 index 0000000000000000000000000000000000000000..83d3483ee198fffce76dd82dee5cbe1fb8c0ab8f --- /dev/null +++ b/roles/slurm-from-source/templates/munge_key.j2 @@ -0,0 +1 @@ +{{ mungekey }} diff --git a/roles/slurm-from-source/templates/slurm-vpn.conf.j2 b/roles/slurm-from-source/templates/slurm-vpn.conf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..671840e07831bac8de9b440c3c9e90ca65ff3fd3 --- /dev/null +++ b/roles/slurm-from-source/templates/slurm-vpn.conf.j2 @@ -0,0 +1,109 @@ +# +# Example slurm.conf file. Please run configurator.html +# (in doc/html) to build a configuration file customized +# for your environment. +# +# +# slurm.conf file generated by configurator.html. +# +# See the slurm.conf man page for more information. 
+# +ClusterName=CIAB +ControlMachine={{ slurmctrl }} +ControlAddr={{ slurmctrl }}-vpn +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/tmp +SlurmdSpoolDir=/tmp/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +ProctrackType=proctrack/pgid +#PluginDir= +CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin= +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/linear +FastSchedule=1 +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 +#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +SlurmctldDebug=3 +#SlurmctldLogFile= +SlurmdDebug=3 +#SlurmdLogFile= +JobCompType=jobcomp/none +#JobCompLoc= +# +# ACCOUNTING +#JobAcctGatherType=jobacct_gather/linux +#JobAcctGatherFrequency=30 +# +#AccountingStorageType=accounting_storage/slurmdbd +#AccountingStorageHost= +#AccountingStorageLoc= +#AccountingStoragePass= +#AccountingStorageUser= +# +MpiParams=ports=12000-12999 +# COMPUTE NODES +{% set nodelist = [] %} +{% for queue in slurmqueues %} +{% for node in groups[queue.group] %} +{% if nodelist.append(node) %} +{% endif %} +{% endfor %} +{% endfor %} +{% for node in nodelist|unique %} +NodeName={{ node }} NodeAddr={{ node }}-vpn Procs={{ hostvars[node]['ansible_processor_cores'] }} State=UNKNOWN +{% endfor %} + +{% for queue in slurmqueues %} +PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} +{% endfor %} diff --git a/roles/slurm-from-source/templates/slurm.conf.j2 b/roles/slurm-from-source/templates/slurm.conf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..7832ac9a5be7ec84409c754f6eeed9f84c2d3143 --- /dev/null +++ b/roles/slurm-from-source/templates/slurm.conf.j2 @@ -0,0 +1,136 @@ +# +# Example slurm.conf file. Please run configurator.html +# (in doc/html) to build a configuration file customized +# for your environment. +# +# +# slurm.conf file generated by configurator.html. +# +# See the slurm.conf man page for more information. 
+# +ClusterName={{ clustername }} +ControlMachine={{ slurmctrl }} +#ControlAddr= +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation={{ slurmdatadir }} +SlurmdSpoolDir={{ slurmdatadir }} +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +ProctrackType=proctrack/pgid +#PluginDir= +CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin= +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType={{ slurmschedulertype }} +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType={{ slurmselecttype }} +FastSchedule={{ slurmfastschedule }} +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 +#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +{% if slurmctlddebug %} +SlurmctldDebug={{ slurmctlddebug.level }} +SlurmctldLogFile={{ slurmctlddebug.log }} +{% else %} +#SlurmctldDebug= +#SlurmctldLogFile= +{% endif %} +{% if slurmddebug %} +SlurmdDebug={{ slurmddebug.level }} +SlurmdLogFile={{ slurmddebug.log }} +{% else %} +#SlurmdDebug= +#SlurmdLogFile= +{% endif %} +{% if slurmschedlog %} +SlurmSchedlogLevel={{ slurmschedlog.level }} +SlurmSchedLogFile={{ slurmschedlog.log }} +{% else %} +#SlurmSchedlogLevel= +#SlurmSchedLogFile= +{% endif %} +JobCompType=jobcomp/none +#JobCompLoc= +# +# ACCOUNTING +#JobAcctGatherType=jobacct_gather/linux +#JobAcctGatherFrequency=30 +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={{ slurmctrl }} +#AccountingStorageEnforce=limits,safe +#AccountingStorageLoc= +#AccountingStoragePass= +#AccountingStorageUser= +# +#GRES +GresTypes=gpu + +# Fair share +{% if slurmfairshare.def %} +PriorityWeightFairshare={{ slurmfairshare.val }} +{% endif %} + +DisableRootJobs=YES +MpiParams=ports=12000-12999 +# COMPUTE NODES +{% set nodelist = [] %} +{% for queue in slurmqueues %} +{% for node in groups[queue.group] %} +{% if nodelist.append(node) %} +{% endif %} +{% endfor %} +{% endfor %} +{% for node in nodelist|unique %} +NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} State=UNKNOWN +{% endfor %} + +{% for queue in slurmqueues %} +PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} +{% endfor %} diff --git a/roles/slurm-from-source/templates/slurm.initd.j2 b/roles/slurm-from-source/templates/slurm.initd.j2 new file mode 100644 index 0000000000000000000000000000000000000000..8e0b0ce083689dfdee92efeb2b8c55142b48830a --- /dev/null +++ b/roles/slurm-from-source/templates/slurm.initd.j2 @@ -0,0 +1,338 @@ +#!/bin/bash +# +# chkconfig: 345 90 10 +# description: SLURM is a simple resource management system which \ +# manages exclusive access to a set of compute \ +# resources and distributes work to those resources. 
+# +# processname: ${exec_prefix}/sbin/slurmd +# pidfile: /var/run/slurmd.pid +# +# processname: ${exec_prefix}/sbin/slurmctld +# pidfile: /var/run/slurmctld.pid +# +# config: /etc/sysconfig/slurm +# +### BEGIN INIT INFO +# Provides: slurm +# Required-Start: $remote_fs $syslog $network munge +# Required-Stop: $remote_fs $syslog $network munge +# Should-Start: $named +# Should-Stop: $named +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: slurm daemon management +# Description: Start slurm to provide resource management +### END INIT INFO + +exec_prefix="{{ slurm_dir }}" +prefix="{{ slurm_dir }}" +BINDIR="${exec_prefix}/bin" +CONFDIR="${prefix}/etc" +LIBDIR="${exec_prefix}/lib" +SBINDIR="${exec_prefix}/sbin" + +# Source function library. +if [ -f /etc/rc.status ]; then + . /etc/rc.status + SUSE=1 + STARTPROC=startproc + + rc_reset +else + + # Read configuration defaults to override variables: + # $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD + ## + for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do + [ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME" + done + [ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS" + [ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER + expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE" + [ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \ + && RELOAD=1 || unset RELOAD + + if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then + SYSTEM="DEBIAN" + [ -r /etc/default/rcS ] && . /etc/default/rcS + [ -r /lib/init/vars.sh ] && . /lib/init/vars.sh + [ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions + STARTPROC="start_daemon" + elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then + SYSTEM="REDHAT" + . /etc/rc.d/init.d/functions + RH_LOCK="/var/lock/subsys/$INIT_NAME" + SUSE=0 + STARTPROC=daemon + elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then + SYSTEM="SUSE" + . /etc/rc.status + rc_reset + elif [ -r /lib/lsb/init-functions ]; then + SYSTEM="LSB" + . /lib/lsb/init-functions + else + SYSTEM="OTHER" + fi + + function rc_status() { + RETVAL=$? + } + function rc_exit () { + exit $RETVAL + } + RETVAL=0 +fi + +# We can not use a starter program without losing environment +# variables that are critical on Blue Gene systems +if [ -d /bgl/BlueLight/ppcfloor ]; then + STARTPROC="" +fi + +# Source slurm specific configuration +# This can be used to alter limits for users jobs or set daemon options. +# For example, the limits for user jobs could be higher or lower than the +# default limits for user root (e.g. "ulimit -t unlimited" sets an unlimited +# CPU time limit for spawned user jobs). +# SLURMCTLD_OPTIONS defines slurmctld command line options. See "man slurmctld" +# SLURMD_OPTIONS defines slurmd command line options. See "man slurmd" +if [ -f /etc/sysconfig/slurm ] ; then + . /etc/sysconfig/slurm +else + SLURMCTLD_OPTIONS="" + SLURMD_OPTIONS="" +fi + +if [ ! -x $BINDIR/scontrol ]; then + echo "Could not find $BINDIR/scontrol. Bad path?" + exit 1 +fi + +if [ ! -f $CONFDIR/slurm.conf ]; then + echo "Could not find $CONFDIR/slurm.conf. Bad path?" 
+ exit 1 +fi + +# setup library paths for slurm and munge support +export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + +start() { + prog=$1 + shift + echo -n "starting $prog: " + unset HOME MAIL USER USERNAME + $STARTPROC $SBINDIR/$prog $* + rc_status -v + echo + touch /var/lock/subsys/slurm +} + +stop() { + echo -n "stopping $1: " + killproc $1 -TERM + rc_status -v + echo + rm -f /var/lock/subsys/slurm +} + +startall() { + for prog in `$BINDIR/scontrol show daemons`; do + optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"` + if [[ ${MULTIPLE_SLURMD} == yes ]] && [[ ${prog} == slurmd ]] + then + for node in $($BINDIR/scontrol show aliases) + do + start $prog -N ${node} ${!optvar} + done + else + start $prog ${!optvar} + fi + done +} + +# +# status() with slight modifications to take into account +# instantiations of job manager slurmd's, which should not be +# counted as "running" +# +slurmstatus() { + local base=${1##*/} + local pid + local rpid + local pidfile + local pidfiles + local rc + + pidfile=`grep -i ${base}pid $CONFDIR/slurm.conf | grep -v '^ *#'` + if [ $? = 0 ]; then + pidfile=${pidfile##*=} + pidfile=${pidfile%#*} + pidfile=${pidfile//\"/} + else + pidfile=/var/run/${base}.pid + fi + + pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \ + pidof -o $$ -o $$PPID -o %PPID -x ${base}` + + if [ "$base" == "slurmd" ] ; then + echo ${pidfile} | grep -q %n + if [[ $? -eq 0 ]] + then + for n in $($BINDIR/scontrol show aliases) + do + pidfiles="${pidfiles} $(echo ${pidfile} | sed "s/%n/$n/g")" + done + else + pidfiles=${pidfile} + fi + else + pidfiles=${pidfile} + fi + + RETVAL=0 + for pidfile in ${pidfiles} + do + rc=1 + if [ -f $pidfile ]; then + read rpid < $pidfile + if [ "$rpid" != "" -a "$pid" != "" ]; then + for i in $pid ; do + if [ "$i" = "$rpid" ]; then + echo $"${base} (pid $rpid) is running..." + rc=0 + break + fi + done + elif [ "$rpid" != "" -a "$pid" = "" ]; then +# Due to change in user id, pid file may persist +# after slurmctld terminates + if [ "$base" != "slurmctld" ] ; then + echo $"${base} dead but pid file exists" + else + echo $"${base} is stopped" + fi + RETVAL=1 + fi + fi + + if [[ $rc -eq 0 ]] + then + continue + fi + + if [ "$base" = "slurmctld" -a "$pid" != "" ] ; then + echo $"${base} (pid $pid) is running..." + continue + fi + + echo $"${base} is stopped" + if [ "$RETVAL" == "0" ]; then + RETVAL=3 + fi + done + + return $RETVAL +} + +# +# stop slurm daemons, +# wait for termination to complete (up to 10 seconds) before returning +# +slurmstop() { + for prog in `$BINDIR/scontrol show daemons`; do + stop $prog + + for i in 1 2 3 4 + do + sleep $i + slurmstatus $prog + if [ $? != 0 ]; then + break + fi + done + done + + # slurmstatus return 1 in case of stopped daemon + # and that is what we are looking for here + if [[ ${RETVAL} == "1" ]] + then + RETVAL=0 + else + RETVAL=1 + fi +} + +# +# The pathname substitution in daemon command assumes prefix and +# exec_prefix are same. This is the default, unless the user requests +# otherwise. +# +# Any node can be a slurm controller and/or server. +# +case "$1" in + start) + startall + ;; + startclean) + SLURMCTLD_OPTIONS="-c $SLURMCTLD_OPTIONS" + SLURMD_OPTIONS="-c $SLURMD_OPTIONS" + startall + ;; + stop) + slurmstop + ;; + status) + anystop=0 + for prog in `$BINDIR/scontrol show daemons`; do + slurmstatus $prog + rc=$? 
+ if [ $rc != 0 ] ; then + anystop=$rc + fi + done + RETVAL=$anystop + ;; + restart) + $0 stop + $0 start + ;; + condrestart) + if [ -f /var/lock/subsys/slurm ]; then + for prog in `$BINDIR/scontrol show daemons`; do + stop $prog + sleep 1 + optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"` + if [[ ${MULTIPLE_SLURMD} == yes ]] && [[ ${prog} == slurmd ]] + then + for node in $($BINDIR/scontrol show aliases) + do + start $prog -N ${node} + done + else + start $prog ${!optvar} + fi + done + fi + ;; + reconfig|reload) + for prog in `$BINDIR/scontrol show daemons`; do + echo -n $"Reloading $prog daemon configuration: " + killproc $prog -HUP + echo + done + ;; + test) + for prog in `$BINDIR/scontrol show daemons`; do + echo "$prog runs here" + done + ;; + *) + echo "Usage: $0 {start|startclean|stop|status|restart|reconfig|condrestart|test}" + exit 1 + ;; +esac + +rc_exit diff --git a/roles/slurm-start/tasks/main.yml b/roles/slurm-start/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..29f3bc2e623188acbb4fdb843076bd7a675c2c97 --- /dev/null +++ b/roles/slurm-start/tasks/main.yml @@ -0,0 +1,13 @@ +--- +- name: start munge + service: name=munge state=started + sudo: true + +- name: start slurmdbd + service: name=slurmdbd state=started + sudo: true + ignore_errors: true + +- name: start slurm + service: name=slurm state=started + sudo: true diff --git a/roles/slurmdb/handlers/main.yml b/roles/slurmdb/handlers/main.yml index cbbaae73ee7bef6df63023906addb9f499ecd1f2..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/roles/slurmdb/handlers/main.yml +++ b/roles/slurmdb/handlers/main.yml @@ -1,8 +0,0 @@ ---- - - name: restart slurmdbd - service: name={{ item }} state=restarted - with_items: - - slurmdbd - - slurm - sudo: true - diff --git a/roles/slurmdb/tasks/main.yml b/roles/slurmdb/tasks/main.yml index f00f19cc6ceb3176603f3efff50fd7da98a9a183..14fbda28113a39d049684311656b7aa0fdca9790 100644 --- a/roles/slurmdb/tasks/main.yml +++ b/roles/slurmdb/tasks/main.yml @@ -13,12 +13,23 @@ sudo: true - name: install mysql local root password - mysql_user: check_implicit_admin=True login_user=root login_password="{{ sqlrootPasswd }}" name=root password="{{ sqlrootPasswd }}" state=present + mysql_user: check_implicit_admin=True login_user=root login_password={{ sqlrootPasswd }} name=root password={{ sqlrootPasswd }} state=present sudo: true +- name: install slurmdbd init + template: src=slurmdbd.initd.j2 dest=/etc/init.d/slurmdbd mode=755 + sudo: true + +- name: install slurmdb.conf init + template: src=slurmdbd.conf.j2 dest={{ slurm_dir }}/etc/slurmdbd.conf + sudo: true + when: slurm_dir is defined + + - name: install slurmdbd.conf template: src=slurmdbd.conf.j2 dest=/etc/slurm/slurmdbd.conf sudo: true + when: slurm_dir is not defined - name: configure database slurmdb localhost mysql_user: login_user=root login_password="{{ sqlrootPasswd }}" name=slurmdb password="{{ slurmdb_passwd }}" host=localhost priv=*.*:ALL,GRANT state=present @@ -27,7 +38,6 @@ - name: configure database slurmdb domain mysql_user: login_user=root login_password="{{ sqlrootPasswd }}" name=slurmdb password="{{ slurmdb_passwd }}" host="{{ ansible_hostname }}"."{{ ansible_domain }}" priv=*.*:ALL,GRANT state=present sudo: true -# notify: restart slurmdb - name: sanity check slrumdbd service service: "name=slurmdbd enabled=yes state=started" diff --git a/roles/slurmdb/templates/slurmdbd.initd.j2 b/roles/slurmdb/templates/slurmdbd.initd.j2 new file mode 100644 index 
0000000000000000000000000000000000000000..afeb6cab36371c3a773c206a1d57848cbe5b05cc --- /dev/null +++ b/roles/slurmdb/templates/slurmdbd.initd.j2 @@ -0,0 +1,223 @@ +#!/bin/bash +# +# chkconfig: 345 90 10 +# description: SLURMDBD is a database server interface for \ +# SLURM (Simple Linux Utility for Resource Management). +# +# processname: ${exec_prefix}/sbin/slurmdbd +# pidfile: /var/run/slurmdbd.pid +# +# config: /etc/sysconfig/slurm +# +### BEGIN INIT INFO +# Provides: slurmbd +# Required-Start: $remote_fs $syslog $network munge +# Required-Stop: $remote_fs $syslog $network munge +# Should-Start: $named +# Should-Stop: $named +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: SLURM database daemon +# Description: Start slurm to provide database server for SLURM +### END INIT INFO + +exec_prefix="{{ slurm_dir }}" +prefix="{{ slurm_dir }}" +CONFDIR="${prefix}/etc" +LIBDIR="${exec_prefix}/lib" +SBINDIR="${exec_prefix}/sbin" + +#Source function library. +if [ -f /etc/rc.status ]; then + . /etc/rc.status + SUSE=1 + STARTPROC=startproc + + rc_reset +else + # Read configuration defaults to override variables: + # $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD + ## + for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do + [ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME" + done + [ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS" + [ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER + expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE" + [ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \ + && RELOAD=1 || unset RELOAD + + if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then + SYSTEM="DEBIAN" + [ -r /etc/default/rcS ] && . /etc/default/rcS + [ -r /lib/init/vars.sh ] && . /lib/init/vars.sh + [ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions + STARTPROC="start_daemon" + elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then + SYSTEM="REDHAT" + . /etc/rc.d/init.d/functions + RH_LOCK="/var/lock/subsys/$INIT_NAME" + SUSE=0 + STARTPROC=daemon + elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then + SYSTEM="SUSE" + . /etc/rc.status + rc_reset + elif [ -r /lib/lsb/init-functions ]; then + SYSTEM="LSB" + . /lib/lsb/init-functions + else + SYSTEM="OTHER" + fi + + + function rc_status() { + RETVAL=$? + } + function rc_exit () { + exit $RETVAL + } + RETVAL=0 +fi + +# We can not use a starter program without losing environment +# variables that are critical on Blue Gene systems +if [ -d /bgl/BlueLight/ppcfloor ]; then + STARTPROC="" +fi + +# Source slurm specific configuration +# SLURMDBD_OPTIONS defines slurmdbd command line options. See "man slurmdbd" +if [ -f /etc/sysconfig/slurm ] ; then + . /etc/sysconfig/slurm +else + SLURMDBD_OPTIONS="" +fi + +if [ ! -f $CONFDIR/slurmdbd.conf ]; then + echo "Could not find $CONFDIR/slurmdbd.conf. Bad path?" + exit 1 +fi + +# setup library paths for slurm and munge support +export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + +start() { + prog=$1 + shift + echo -n "starting $prog: " + unset HOME MAIL USER USERNAME + $STARTPROC $SBINDIR/$prog $SLURMDBD_OPTIONS + rc_status -v + echo + touch /var/lock/subsys/slurmdbd +} + +stop() { + echo -n "stopping $1: " + killproc $1 -TERM + rc_status -v + echo + rm -f /var/lock/subsys/slurmdbd +} + +slurmstatus() { + local base=${1##*/} + local pid + local rpid + local pidfile + + pidfile=`grep -i pidfile $CONFDIR/slurmdbd.conf | grep -v '^ *#'` + if [ $? 
= 0 ]; then + pidfile=${pidfile##*=} + pidfile=${pidfile%#*} + pidfile=${pidfile//\"/} + else + pidfile=/var/run/slurmdbd.pid + fi + + pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \ + pidof -o $$ -o $$PPID -o %PPID -x ${base}` + + if [ -f $pidfile ]; then + read rpid < $pidfile + if [ "$rpid" != "" -a "$pid" != "" ]; then + for i in $pid ; do + if [ "$i" = "$rpid" ]; then + echo $"${base} (pid $pid) is running..." + return 0 + fi + done + elif [ "$rpid" != "" -a "$pid" = "" ]; then + echo $"${base} dead but pid file exists" + return 1 + fi + + fi + + if [ "$base" = "slurmdbd" -a "$pid" != "" ] ; then + echo $"${base} (pid $pid) is running..." + return 0 + fi + + echo $"${base} is stopped" + + return 3 +} + +# +# stop slurm daemons, +# wait for termination to complete (up to 10 seconds) before returning +# +slurmstop() { + stop $1 + + for i in 1 2 3 4 + do + sleep $i + slurmstatus $1 + if [ $? != 0 ]; then + break + fi + done +} +# +# The pathname substitution in daemon command assumes prefix and +# exec_prefix are same. This is the default, unless the user requests +# otherwise. +# +# Any node can be a slurm controller and/or server. +# +case "$1" in + start) + start slurmdbd + ;; + stop) + slurmstop slurmdbd + ;; + status) + slurmstatus slurmdbd + rc_status -v + ;; + restart) + $0 stop + $0 start + ;; + condrestart) + if [ -f /var/lock/subsys/slurm ]; then + stop slurmdbd + start slurmdbd + fi + ;; + reconfig|reload) + echo -n $"Reloading slurmdbd daemon configuration: " + killproc slurmdbd -HUP + echo + ;; + *) + echo "Usage: $0 {start|stop|status|restart|condrestart|reconfig}" + exit 1 + ;; +esac + +rc_exit diff --git a/roles/syncExports/tasks/addExports.yml b/roles/syncExports/tasks/addExports.yml index f91dd861f324e07105e2ad587c0ce2fe2ed8a526..8853541bcd81ebbe805949044d2d030b89dc5bdf 100644 --- a/roles/syncExports/tasks/addExports.yml +++ b/roles/syncExports/tasks/addExports.yml @@ -2,26 +2,18 @@ - name: "Create exports if necessary" file: dest={{ item.src }} state=directory mode=755 owner=root group=root sudo: true - delegate_to: "{{ nfs_server }}" - run_once: true with_items: exportList - name: "Templating /etc/exports" template: src=exports.j2 dest=/etc/exports owner=root group=root mode=644 - delegate_to: "{{ nfs_server }}" - run_once: true sudo: true # Do not do this as a handler, instead do this here as a task so that it happens imediatly after the exports file is created before any clients # attempt a mount - name : "Reload exports" command: exportfs -ra - delegate_to: "{{ nfs_server }}" - run_once: true sudo: true - name : "Pause ... 
clients sometimes have errors"
   command: sleep 60
-  delegate_to: "{{ nfs_server }}"
-  run_once: true
   sudo: true
diff --git a/roles/syncExports/templates/exports.j2 b/roles/syncExports/templates/exports.j2
index 21c4f552610f7c382d3159c85cc6788786c8df85..b480290d68f0b775b0a7f3ef9b67b58e1de20325 100644
--- a/roles/syncExports/templates/exports.j2
+++ b/roles/syncExports/templates/exports.j2
@@ -1,13 +1,10 @@
 {% set iplist = [] %}
 {% for export in exportList %}
-{% for group in groupList %}
-{% for node in groups[group.name] %}
-{% if hostvars[node]['ansible_'+group.interface] is defined %}
-{% if iplist.append(hostvars[node]['ansible_'+group.interface]['ipv4']['address']) %}
+{% for node in groups[export.group] %}
+{% if hostvars[node]['ansible_'+export.interface] is defined %}
+{% if iplist.append(hostvars[node]['ansible_'+export.interface]['ipv4']['address']) %}
 {% endif %}
 {% endif %}
 {% endfor %}
-{% endfor %}
 {{ export.src }} {% for ip in iplist|unique %}{{ ip }}({{ export.srvopts }}) {% endfor %}
-
 {% endfor %}