diff --git a/roles/slurm-common.bak/defaults/main.yml b/roles/slurm-common.bak/defaults/main.yml deleted file mode 100644 index 283c06273b4c10af672e815f0fe0062f986814fd..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/defaults/main.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -slurm_use_vpn: False -slurmddebug: {level: 5, log: '/var/log/slurm/slurmd.log'} -slurmctlddebug: {level: 5, log: '/mnt/slurm-logs/slurmctld.log'} -slurmdbdlog: {level: 5, log: '/mnt/slurm-logs/slurmdbd.log'} -slurmfairshare: {def: false, val: 10000} -slurmdatadir: "/var/spool/slurm" -slurmselecttype: "select/linear" -slurmfastschedule: "1" -slurmschedulertype: "sched/backfill" - diff --git a/roles/slurm-common.bak/files/scripts/nvidia-probe.py b/roles/slurm-common.bak/files/scripts/nvidia-probe.py deleted file mode 100755 index 7fd743ef41b91c85842973e623e1cbfd9f3c6535..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/files/scripts/nvidia-probe.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/env python -# prints a list of NIDIA devices and their type in json format for -# parsing by ansible program; -# fields are 'name':'gpu' (fixed) -# 'file': devicePath, (i.e. /dev/nvidia0) -# 'type':typeOfDevice (i.e. 80 parsed from nvidia-smi outout) -# program returns nothing upon error (i.e. no error messages) -# Also checks for existance of /dev/nvidia? where ? is number from nvidia-smi GPU count -# nvidia-smi -L produces output like -#GPU 0: Tesla K80 (UUID: GPU-8bdb2956-4c10-7bd0-80d4-46da054663b4) -#GPU 1: Tesla K80 (UUID: GPU-19ed5f7c-435a-036e-54f0-f64209c3cede) -#GPU 2: Tesla K80 (UUID: GPU-a2f8cfe2-5bbc-de2a-8adc-4038f3379b5e) -#GPU 3: Tesla K80 (UUID: GPU-1c9c0d02-4590-c915-18d2-d709efb56d8d) -#GPU 4: Tesla K80 (UUID: GPU-b0f290c8-3b69-a518-ac77-22718f43e946) -#GPU 5: Tesla K80 (UUID: GPU-565ebca2-6b37-3bc0-a355-72330049a349) -#GPU 6: Tesla K80 (UUID: GPU-d8096845-d8a1-e3ef-ad00-c1d069c1b685) -#GPU 7: Tesla K80 (UUID: GPU-20ee0841-22b5-9974-66c0-b49e5be3e469) - -import subprocess -import sys -import re -import os -import json - -try: - #run nvidia-smi -L to parse output - p = subprocess.Popen(['nvidia-smi', '-L'], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - out, err = p.communicate() - lines=out.strip().split('\n') - numberOfDevices=len(lines) - typeofDevice="" - deviceList=[] #return list - for line in lines: - if not line : - break - #print "Line is ",line - pe=re.compile('GPU\s+(\d*):\s+\S+\s+(\S*)') - m=pe.search(line) - if not m: - #print "No match found" - break - numberOfDevice=m.group(1) - typeOfDevice=m.group(2) - #print "Number of Devics is "+numberOfDevice+" Type of device is "+typeOfDevice - #check device file existance - devicePath="/dev/nvidia"+numberOfDevice - if os.path.exists(devicePath): - #print "OK" - deviceList.append( { 'name':'gpu' , 'file': devicePath, 'type':typeOfDevice } ) - else: - print("error looking for nvidia device") - sys.exit(1) - #now convert list to json - output=json.dumps(deviceList) - print output -except OSError: - output=json.dumps([]) - print output -#if nvidia-smi is not installed on computer then this error is thrown by subprocess.Popen - sys.exit(0) diff --git a/roles/slurm-common.bak/handlers/main.yml b/roles/slurm-common.bak/handlers/main.yml deleted file mode 100644 index b57d5bf21738f9ab035743b94a66e2225c56f3e4..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/handlers/main.yml +++ /dev/null @@ -1,18 +0,0 @@ ---- - - name: restart munge - service: name=munge state=restarted - become: true - - - name: restart 
slurm - service: name=slurm state=restarted - become: true - - - name: restart slurmdbd - service: name=slurmdbd state=restarted - become: true - - - name: scontrol reconfigure - shell: sleep 10 ; scontrol reconfigure - become: true - delegate_to: "{{ slurmctrl }}" - run_once: true diff --git a/roles/slurm-common.bak/tasks/createSlurmDirectories.yml b/roles/slurm-common.bak/tasks/createSlurmDirectories.yml deleted file mode 100644 index 738956823167ca062efe85940774a45c9a547423..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/tasks/createSlurmDirectories.yml +++ /dev/null @@ -1,77 +0,0 @@ ---- -- name: make sure slurmctld and slurmdb log dir exists - file: dest=/mnt/slurm-logs state=directory owner=root group=root mode=755 - become: true - -- name: make sure slurm conf dir exists - file: dest={{ slurm_dir }}/etc state=directory - become: true - -- name: make sure slurm lock dir exists - file: dest=/var/lock/subsys state=directory owner=root group=root mode=755 - become: true - -- name: stat run directory - stat: path={{ slurmdatadir }} - become_user: root - become: True - register: runstat - when: slurmdatadir is defined - -- name: create data directory - file: path={{ slurmdatadir }} state=directory owner=slurm group=slurm mode=755 - become: true - when: slurmdatadir is defined and not runstat.stat.exists - -- name: stat pid directory - stat: path={{ slurmpiddir }} - become_user: root - become: True - register: pidstat - when: slurmpiddir is defined - -- name: create pid directory - file: path={{ slurmpiddir }} state=directory owner=slurm group=slurm mode=755 - become: true - when: slurmpiddir is defined and not pidstat.stat.exists - -- name: create slurmdbdpiddir directory - file: path={{ slurmdbdpiddir }} state=directory owner=slurm group=slurm mode=755 - become: true - -- name: create shared state directory - file: path={{slurmsharedstatedir }} state=directory owner=slurm group=slurm mode=750 - become: true - run_once: true - when: usesharedstatedir is defined and usesharedstatedir - -- name: symlink shared state dir - file: path={{ slurmstatedir }} src={{ slurmsharedstatedir }} state=link - become: true - when: usesharedstatedir is defined and usesharedstatedir - -- name: create state directory - file: path={{ slurmstatedir }} state=directory owner=slurm group=slurm mode=750 - become: true - when: slurmstatedir is defined and not usesharedstatedir - -- name: stat log directory - stat: path={{ slurmlogdir }} - become_user: root - become: True - register: logstat - when: slurmlogdir is defined - -- name: create log directory - file: path={{ slurmlogdir }} state=directory owner=slurm group=slurm mode=750 - become: true - when: slurmlogdir is defined and not logstat.stat.exists - -- name: make sure slurm conf dir exists - file: dest={{ slurm_dir }}/etc state=directory - become: true - -- name: create greps directory - file: path={{ slurm_dir }}/etc/gres state=directory owner=slurm group=slurm mode=755 - become: true - diff --git a/roles/slurm-common.bak/tasks/installCgroup.yml b/roles/slurm-common.bak/tasks/installCgroup.yml deleted file mode 100644 index c7f4253d3dfcb0540421c27249d7aee0a4920118..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/tasks/installCgroup.yml +++ /dev/null @@ -1,27 +0,0 @@ -- name: yum install cgroup - yum: name={{ item }} state=installed - with_items: - - libcgroup - become: True - become_method: sudo - when: ansible_os_family == "RedHat" - -- name: apt install cgroup - apt: name={{ item }} state=installed 
update_cache=yes - with_items: - - cgmanager - - cgmanager-utils - - libcgmanager0 - when: ansible_os_family == "Debian" - become: True - become_method: sudo - -- name: config cgroup.conf file - template: dest={{ slurm_dir }}/etc/cgroup.conf src=cgroup.conf.j2 mode=644 - become: True - become_method: sudo - -- name: config cgroup_allowed_devices.conf file - template: dest={{ slurm_dir }}/etc/cgroup_allowed_devices.conf src=cgroup_allowed_devices.conf.j2 mode=644 - become: True - become_method: sudo diff --git a/roles/slurm-common.bak/tasks/installMungeFromSource.yml b/roles/slurm-common.bak/tasks/installMungeFromSource.yml deleted file mode 100644 index 656d35c9ff04a253224e44c9031e2c37c67c777e..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/tasks/installMungeFromSource.yml +++ /dev/null @@ -1,49 +0,0 @@ -- name: test if munge is already isntalled - stat: path="{{ munge_dir }}/bin/munge" - register: munge_binary - -- name: unarchive munge - unarchive: - args: - src: "http://consistency0/src/munge-{{ munge_version }}.tar.bz2" - copy: no - dest: /tmp - creates: /tmp/munge-{{ munge_version }}/configure - when: not munge_binary.stat.exists - - -- name: build munge - shell: ./configure --prefix={{ munge_dir }} && make - args: - chdir: /tmp/munge-{{ munge_version }} - creates: /tmp/munge-{{ munge_version }}/src/munge/munge - when: not munge_binary.stat.exists - -- name: install munge - shell: make install - become: true - args: - chdir: /tmp/munge-{{ munge_version }} - creates: "{{ munge_dir }}/bin/munge" - when: not munge_binary.stat.exists - -- name: set use_systemd - set_fact: - use_systemd: True - when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and ( ansible_distribution_major_version == "7") - -- name: copy init script - template: dest=/etc/init.d/munge src=munge.initd.j2 mode=755 - become: true - register: systemd_script_installed - when: use_systemd is not defined - -- name: copy slurm init script if OS contains systemd - template: dest=/etc/systemd/system/munge.service src=munge.service.j2 mode=644 - become: true - when: use_systemd is defined - -- name: reload systemd - shell: systemctl daemon-reload - become: true - when: use_systemd is defined and systemd_script_installed.changed diff --git a/roles/slurm-common.bak/tasks/installSlurmFromSource.yml b/roles/slurm-common.bak/tasks/installSlurmFromSource.yml deleted file mode 100644 index 9d1a326c634ede300ccbe6571b6123b88903cf50..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/tasks/installSlurmFromSource.yml +++ /dev/null @@ -1,117 +0,0 @@ -- name: remove all install - file: - path: "/tmp/slurm-{{ slurm_version }}" - state: absent - become: true - when: force_slurm_recompile is defined - -- name: remove all install - file: - path: "{{ slurm_dir }}" - state: absent - become: true - when: force_slurm_recompile is defined - -- name: unarchive slurm - unarchive: - src: "http://consistency0/src/slurm-{{ slurm_version }}.tar.bz2" - dest: /tmp - remote_src: yes - creates: "{{ slurm_dir }}/bin/srun" - -- name: stat srun - stat: path="{{ slurm_dir }}/bin/srun" - register: stat_srun - - -- name: configure slurm - command: /tmp/slurm-{{ slurm_version }}/configure --prefix={{ slurm_dir }} --with-munge={{ munge_dir }} --enable-pam --with-pmix=/usr/local/pmix/latest - args: - creates: "{{ slurm_dir }}/bin/srun" - chdir: /tmp/slurm-{{ slurm_version }} - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: build slurm - command: make - args: - creates: "{{ 
slurm_dir }}/bin/srun" - chdir: /tmp/slurm-{{ slurm_version }} - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: install slurm - shell: make install - become: true - args: - chdir: /tmp/slurm-{{ slurm_version }} - creates: "{{ slurm_dir }}/bin/srun" - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: build pmi - command: make - args: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: install pmi - shell: make install - become: true - args: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: build pmi2 - command: make - args: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi2 - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: install pmi2 - shell: make install - become: true - args: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pmi2 - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: build pam_slurm - command: make - args: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: install pam_slurm - shell: make install - become: true - args: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: build pam_slurm_adopt - make: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam_slurm_adopt - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: install pam_slurm_adopt - make: - chdir: /tmp/slurm-{{ slurm_version }}/contribs/pam_slurm_adopt - target: install - when: force_slurm_recompile is defined or not stat_srun.stat.exists - become: true - -- name: remove exist-slurm-latest-link - file: - path: /opt/slurm-latest - state: absent - become: true - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: put slurm-latest-link - file: - src: "{{ slurm_dir }}" - dest: /opt/slurm-latest - state: link - become: true - when: force_slurm_recompile is defined or not stat_srun.stat.exists - -- name: add slurm log rotate config - template: src=slurmlog.j2 dest=/etc/logrotate.d/slurm mode=644 - become: true diff --git a/roles/slurm-common.bak/tasks/main.yml b/roles/slurm-common.bak/tasks/main.yml deleted file mode 100644 index d2351af627d7d6b32aa7d720d236c3a5139d84d5..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/tasks/main.yml +++ /dev/null @@ -1,148 +0,0 @@ ---- -- name: create munge group - group: name=munge system=yes gid=498 - become: true - -- name: create munge user - user: name=munge group=munge system=yes createhome=no uid=498 - become: true - -- name: create slurm group - group: name=slurm system=yes gid=497 - become: true - -- name: create slurm user - user: name=slurm group=slurm system=yes createhome=no uid=497 - become: true - -- include: createSlurmDirectories.yml - -- name: install deps - yum: name={{ item }} state=present - with_items: - - perl - - perl-DBI - - openssl-devel - - gcc - - rpm-build - - wget - - openssl-devel - - readline-devel - - pam-devel - - perl-ExtUtils-MakeMaker - - bzip2-devel - - hwloc - - hwloc-devel - - lua - - lua-devel - become: true - when: ansible_os_family == "RedHat" - -- name: install deps - apt: name={{ item }} state=installed update_cache=yes - become: true - with_items: - - gcc - - wget - - libssl-dev - - libpam0g-dev - - libbz2-dev - - make - - 
perl - - libdbi-perl - - lua5.2 - - hwloc - - libhwloc-dev - when: ansible_os_family == "Debian" - -- include: installMungeFromSource.yml - -- name: chown mungedir - file: path={{ munge_dir }} state=directory owner=munge recurse=yes - become: true - -- name: make munge logdir - file: path={{ munge_dir }}/var/log/munge state=directory owner=munge mode=700 - become: true - -- name: install munge key - template: src=munge_key.j2 dest={{ munge_dir }}/etc/munge/munge.key owner=munge mode=600 - become: true - -- name: enable munge on boot - service: name=munge enabled=yes - become: true - - -- include: installSlurmFromSource.yml - -- include: createSlurmDirectories.yml - -- name: check slurm generic resource - shell: "{{ slurm_gres_check }}" - register: slurm_generic_resource - ignore_errors: true - when: slurm_gres_check is defined - check_mode: no - changed_when: False - -- name: Gres - Test for Nvidia devices - script: scripts/nvidia-probe.py - register: probeOutput - check_mode: no - changed_when: False - -- name: get cpu count - shell: 'lscpu | grep "On-line CPU" | cut -f 2 -d ":" | sed "s/\ *//g"' - register: cpucount - check_mode: no - changed_when: False - -- name: "set nvidiaprobe slurm_gres_list" - set_fact: "slurm_gres_list={{ probeOutput.stdout }}" - -- name: template gres.conf file - template: src="gres.conf.j2" dest={{ slurm_dir }}/etc/gres.conf mode=644 - become: true - -- name: make slurm prolog dir - file: path=/opt/slurm/etc state=directory mode=755 - become: true - become_user: root - -- name: install slurm prolog - template: src=slurm.prolog.j2 dest=/opt/slurm/etc/slurm.prolog mode=755 - become: true - -- name: install slurm epilog - template: src=slurm.epilog.j2 dest=/opt/slurm/etc/slurm.epilog mode=755 - become: true - -- name: install slurm.conf - copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf - become: true - when: slurm_use_vpn==False - -- name: install slurm.conf - template: src=slurm-vpn.conf.j2 dest={{ slurm_dir }}/etc/slurm.conf - become: true - when: slurm_use_vpn==True - -#- name: install job_submit.lua -# copy: src=files/job_submit.lua dest={{ slurm_dir }}/etc/job_submit.lua -# become: true -# when: slurm_use_vpn==False - -- name: setup envirnment variables - template: src=slurm_setup.sh.j2 dest=/etc/profile.d/slurm_setup.sh - become: true - -- name: setup plugin - template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=755 - #delegate_to: "{{ slurmctrl }}" - #run_once: true - become: true - when: slurm_lua==True - -- include: installCgroup.yml - diff --git a/roles/slurm-common.bak/templates/cgroup.conf.j2 b/roles/slurm-common.bak/templates/cgroup.conf.j2 deleted file mode 100644 index a1995cc380f887986a2666af836d190d6f525bb9..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/cgroup.conf.j2 +++ /dev/null @@ -1,8 +0,0 @@ -CgroupAutomount=yes -ConstrainDevices=yes -TaskAffinity=yes -ConstrainCores=yes -ConstrainRAMSpace=yes -ConstrainKmemSpace=no -AllowedDevicesFile={{ slurm_dir }}/etc/cgroup_allowed_devices.conf - diff --git a/roles/slurm-common.bak/templates/cgroup_allowed_devices.conf.j2 b/roles/slurm-common.bak/templates/cgroup_allowed_devices.conf.j2 deleted file mode 100644 index ed829ec80e683f56e733e9a287be395acf2a6e18..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/cgroup_allowed_devices.conf.j2 +++ /dev/null @@ -1,6 +0,0 @@ -/dev/vd* -/dev/null -/dev/zero -/dev/urandom -/dev/cpu/*/* - diff --git a/roles/slurm-common.bak/templates/gres.conf.j2 
b/roles/slurm-common.bak/templates/gres.conf.j2 deleted file mode 100644 index e6e50117af835ee4023c6d5c22366bda9f9d21cd..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/gres.conf.j2 +++ /dev/null @@ -1,5 +0,0 @@ -#slurm gres file for {{ ansible_hostname }} -#No Of Devices={{ slurm_gres_list | length }} -{% for gr in slurm_gres_list %} -Name={{ gr.name }} Type={{ gpumap[gr.type] }} File={{ gr.file }} CPUs={{ cpucount.stdout }} -{% endfor %} diff --git a/roles/slurm-common.bak/templates/gres_sub.conf.j2 b/roles/slurm-common.bak/templates/gres_sub.conf.j2 deleted file mode 100644 index a3bbf7199baffd6c7e154e898d36cf2857afdc57..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/gres_sub.conf.j2 +++ /dev/null @@ -1,3 +0,0 @@ -{% for gr in slurm_gres_list %} -Name={{ gr.name }} File={{ gr.file }} -{% endfor %} diff --git a/roles/slurm-common.bak/templates/job_submit.lua.j2 b/roles/slurm-common.bak/templates/job_submit.lua.j2 deleted file mode 100644 index 22b05df79c76d4e33a0aae386ac6f5102454ee32..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/job_submit.lua.j2 +++ /dev/null @@ -1,70 +0,0 @@ ---[[ - - Example lua script demonstrating the SLURM job_submit/lua interface. - This is only an example, not meant for use in its current form. - - Leave the function names, arguments, local varialbes and setmetatable - set up logic in each function unchanged. Change only the logic after - the lSUCCESSine containing "*** YOUR LOGIC GOES BELOW ***". - - For use, this script should be copied into a file name "job_submit.lua" - in the same directory as the SLURM configuration file, slurm.conf. - - ---]] - -function slurm_job_submit(job_desc, part_list, submit_uid) - - --- Check no default account - -if job_desc.account == "default" then - slurm.log_user("You have to specify your project ID as part of your job submission. 
The account=default is now deprecated on M3 job scheduler.") - return slurm.ERROR -end - - --- Check Desktop requests with more than one node - -if ((job_desc.name == "desktop") and (job_desc.min_nodes > 1 )) then - slurm.log_user("The current M3 Desktop applications are unable to utilise more than one node, please select one node instead") - return slurm.ERROR -end - - - --- Check for gres.gpu requirements in m3c, m3h and m3g, else move job to comp - -if ((job_desc.partition == "m3c" ) or (job_desc.partition == "m3h" ) or (job_desc.partition == "m3g" )) then - local partition = "" - if (job_desc.gres == nil) then - partition = "comp" - slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition) - job_desc.partition = partition - end - return slurm.SUCCESS -end - - --- Check for QOS rtq in m3c, m3h , m3g and partition=nil, then forward job to rtqp,comp,m3g - -if ((job_desc.qos == "rtq") and (job_desc.partition == nil)) then - local partition = "" - partition = "rtqp,comp,m3g" - slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition) - job_desc.partition = partition - return slurm.SUCCESS -end - - - -end - - - -function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) - return slurm.SUCCESS -end - -slurm.log_info("initialized") -return slurm.SUCCESS diff --git a/roles/slurm-common.bak/templates/munge.initd.j2 b/roles/slurm-common.bak/templates/munge.initd.j2 deleted file mode 100755 index 10d63a1974bbcd1fc02ecca938900515ea79404c..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/munge.initd.j2 +++ /dev/null @@ -1,567 +0,0 @@ -#!/bin/sh -############################################################################### -# Written by Chris Dunlap <cdunlap@llnl.gov>. -# Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. -# Copyright (C) 2002-2007 The Regents of the University of California. -# UCRL-CODE-155910. -############################################################################### -# chkconfig: - 66 33 -# description: MUNGE Uid 'N' Gid Emporium authentication service -############################################################################### -### BEGIN INIT INFO -# Provides: munge -# Required-Start: $local_fs $remote_fs $network $time -# Required-Stop: $local_fs $remote_fs -# Should-Start: $named $syslog -# Should-Stop: $named $syslog -# Default-Start: -# Default-Stop: -# Short-Description: MUNGE Uid 'N' Gid Emporium authentication service -# Description: MUNGE (MUNGE Uid 'N' Gid Emporium) is a highly scalable -# authentication service for creating and validating -# credentials. -### END INIT INFO -############################################################################### - -unset SERVICE_NAME DAEMON_EXEC DAEMON_ARGS CONFIG PIDFILE NICE USER GROUP \ - SIGHUP_RELOAD VARRUNDIR - -prefix="{{ munge_dir }}" -exec_prefix="${prefix}" -sbindir="${exec_prefix}/sbin" -sysconfdir="${prefix}/etc" -localstatedir="${prefix}/var" - -SERVICE_NAME="MUNGE" -DAEMON_EXEC="$sbindir/munged" -DAEMON_ARGS="-S ${localstatedir}/run/munge/munge.socket.2" -#CONFIG=#_NOT_SUPPORTED_# -PIDFILE="$localstatedir/run/munge/munged.pid" -#NICE= -USER="munge" -GROUP="munge" -#SIGHUP_RELOAD=#_NOT_SUPPORTED_# -VARRUNDIR="$localstatedir/run/munge" - -############################################################################### - -service_init () -{ -# Determine the system type and initialize the environment. 
-# -# Note that the shell positional parameters must be preserved when calling -# this function in order for SuSE to initialize its environment properly. -## - PATH=/sbin:/usr/sbin:/bin:/usr/bin - INIT_NAME="`basename \"$0\" .init | sed 's/^[SK][0-9][0-9]*//'`" - DAEMON_NAME="`basename \"$DAEMON_EXEC\"`" - SIGTERM_TIMEOUT="3" - STATUS=0 - - # Read configuration defaults to override variables: - # $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD - ## - for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do - [ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME" - done - [ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS" - [ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER - expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE" - [ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \ - && RELOAD=1 || unset RELOAD - - if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then - SYSTEM="DEBIAN" - [ -x "$DAEMON_EXEC" ] || exit 0 # pkg removed but not purged - [ -r /etc/default/rcS ] && . /etc/default/rcS - [ -r /lib/init/vars.sh ] && . /lib/init/vars.sh - [ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions - elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then - SYSTEM="REDHAT" - . /etc/rc.d/init.d/functions - RH_LOCK="/var/lock/subsys/$INIT_NAME" - elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then - SYSTEM="SUSE" - . /etc/rc.status - rc_reset - elif [ -r /lib/lsb/init-functions ]; then - SYSTEM="LSB" - . /lib/lsb/init-functions - else - SYSTEM="OTHER" - fi - - # Exit if the package has been removed. - ## - [ -x "$DAEMON_EXEC" ] || exit 5 # LSB: program not installed - - # Exit if the configuration has been removed. - ## - [ -z "$CONFIG" -o -r "$CONFIG" ] || exit 6 # LSB: program not configured -} - -service_fini () -{ -# Return the exit status. -## - case $SYSTEM in - SUSE) - rc_exit - ;; - DEBIAN|REDHAT|LSB|*) - exit $STATUS - ;; - esac -} - -service_start () -{ -# Start the service. -# -# Required by LSB, where running "start" on a service already running should be -# considered successful. -## - log_init "Starting $SERVICE_NAME" "$DAEMON_NAME" - - if [ -n "$VARRUNDIR" -a ! -d "$VARRUNDIR" ]; then - mkdir -m 755 -p "$VARRUNDIR" - [ -n "$USER" ] && chown "$USER" "$VARRUNDIR" - [ -n "$GROUP" ] && chgrp "$GROUP" "$VARRUNDIR" - fi - - case $SYSTEM in - DEBIAN) - if $0 status >/dev/null 2>&1; then - STATUS=0 - else - ERRMSG=`start-stop-daemon --start --quiet \ - ${NICE:+"--nicelevel"} ${NICE:+"$NICE"} \ - ${USER:+"--chuid"} ${USER:+"$USER"} \ - ${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \ - --exec "$DAEMON_EXEC" -- $DAEMON_ARGS 2>&1` - STATUS=$? - echo $ERRMSG - fi - ;; - REDHAT) - if $0 status >/dev/null 2>&1; then - STATUS=0 - else - daemon ${NICE:+"$NICE"} ${USER:+"--user"} ${USER:+"$USER"} \ - "$DAEMON_EXEC" $DAEMON_ARGS - STATUS=$? - fi - [ $STATUS -eq 0 ] && touch "$RH_LOCK" >/dev/null 2>&1 - ;; - SUSE) - ERRMSG=`startproc ${NICE:+"-n"} ${NICE:+"$NICE"} \ - ${USER:+"-u"} ${USER:+"$USER"} \ - ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ - "$DAEMON_EXEC" $DAEMON_ARGS 2>&1` - rc_status -v - STATUS=$? - ;; - LSB) - if [ -n "$USER" ]; then - ERRMSG=`su "$USER" -c "/sbin/start_daemon \ - ${NICE:+\"-n\"} ${NICE:+\"$NICE\"} \ - ${PIDFILE:+\"-p\"} ${PIDFILE:+\"$PIDFILE\"} \ - \"$DAEMON_EXEC\" $DAEMON_ARGS" 2>&1` - else - ERRMSG=`start_daemon ${NICE:+"-n"} ${NICE:+"$NICE"} \ - ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ - "$DAEMON_EXEC" $DAEMON_ARGS 2>&1` - fi - STATUS=$? 
- ;; - *) - if $0 status >/dev/null 2>&1; then - STATUS=0 - else - [ -n "$NICE" ] && nice="nice -n $NICE" - if [ -n "$USER" ]; then - ERRMSG=`su "$USER" -c "$nice \"$DAEMON_EXEC\" $DAEMON_ARGS" 2>&1` - else - ERRMSG=`$nice "$DAEMON_EXEC" $DAEMON_ARGS 2>&1` - fi - STATUS=$? - fi - ;; - esac - log_fini "$STATUS" "$ERRMSG" -} - -service_stop () -{ -# Stop the service. -# -# Required by LSB, where running "stop" on a service already stopped or not -# running should be considered successful. -## - log_init "Stopping $SERVICE_NAME" "$DAEMON_NAME" - case $SYSTEM in - DEBIAN) - if ! $0 status >/dev/null 2>&1; then - STATUS=0 - else - start-stop-daemon --stop --quiet \ - ${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \ - --name "$DAEMON_NAME" ${SIGTERM_TIMEOUT:+"--retry"} \ - ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} >/dev/null 2>&1 - STATUS=$? - fi - ;; - REDHAT) - if ! $0 status >/dev/null 2>&1; then - STATUS=0 - else - killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ - ${SIGTERM_TIMEOUT:+"-d"} ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} \ - "$DAEMON_EXEC" - STATUS=$? - fi - [ $STATUS -eq 0 ] && rm -f "$RH_LOCK" >/dev/null 2>&1 - ;; - SUSE) - killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ - ${SIGTERM_TIMEOUT:+"-t"} ${SIGTERM_TIMEOUT:+"$SIGTERM_TIMEOUT"} \ - "$DAEMON_EXEC" - rc_status -v - ;; - LSB) - killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" - STATUS=$? - ;; - *) - signal_process "$DAEMON_EXEC" - rc=$? - [ $rc -eq 0 -o $rc -eq 2 ] && STATUS=0 || STATUS=1 - ;; - esac - log_fini "$STATUS" - [ -f "$PIDFILE" ] && rm -f "$PIDFILE" -} - -service_restart () -{ -# Stop and restart the service if it is already running; -# otherwise, start the service. -# -# Required by LSB, where running "restart" on a service already stopped or not -# running should be considered successful. -## - if $0 status >/dev/null 2>&1; then - $0 stop && $0 start - else - $0 start - fi - - case $SYSTEM in - SUSE) - rc_status - ;; - DEBIAN|REDHAT|LSB|*) - STATUS=$? - ;; - esac -} - -service_try_restart () -{ -# Restart the service if it is already running. -# -# Optional for LSB, where running "try-restart" on a service already stopped or -# not running should be considered successful. -# Also known as "condrestart" by RedHat. -## - case $SYSTEM in - REDHAT) - [ -f "$RH_LOCK" ] && $0 restart || : - STATUS=$? - ;; - SUSE) - $0 status >/dev/null 2>&1 && $0 restart || rc_reset - rc_status - ;; - DEBIAN|LSB|*) - $0 status >/dev/null 2>&1 && $0 restart || : - STATUS=$? - ;; - esac -} - -service_reload () -{ -# Reload the configuration without stopping and restarting the service. -# -# Optional for LSB. -## - [ -z "$RELOAD" ] && STATUS=3 # LSB: unimplemented feature - - log_init "Reloading $SERVICE_NAME" "$DAEMON_NAME" - case $SYSTEM in - DEBIAN) - if [ -n "$RELOAD" ]; then - start-stop-daemon --stop --quiet --signal HUP \ - ${PIDFILE:+"--pidfile"} ${PIDFILE:+"$PIDFILE"} \ - --name "$DAEMON_NAME" >/dev/null 2>&1 - STATUS=$? - fi - ;; - REDHAT) - if [ -n "$RELOAD" ]; then - killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" -HUP - STATUS=$? - else - echo_failure - fi - ;; - SUSE) - if [ -n "$RELOAD" ]; then - killproc -HUP ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" - else - rc_failed $STATUS - fi - rc_status -v - ;; - LSB) - if [ -n "$RELOAD" ]; then - killproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" -HUP - STATUS=$? - fi - ;; - *) - if [ -n "$RELOAD" ]; then - signal_process "$DAEMON_EXEC" "HUP" - STATUS=$? 
- fi - ;; - esac - log_fini "$STATUS" -} - -service_force_reload () -{ -# Reload the configuration if the service supports this; -# otherwise, restart the service if it is already running. -# -# Required by LSB, where running "force-reload" on a service already stopped or -# not running should be considered successful. -## - if [ -n "$RELOAD" ]; then - $0 reload - else - $0 try-restart - fi - - case $SYSTEM in - SUSE) - rc_status - ;; - DEBIAN|REDHAT|LSB|*) - STATUS=$? - ;; - esac -} - -service_status () -{ -# Print the current status of the service. -# -# Required by LSB. -## - case $SYSTEM in - REDHAT) - status ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" - STATUS=$? - ;; - SUSE) - printf "Checking for service $SERVICE_NAME: " - checkproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} "$DAEMON_EXEC" - rc_status -v - ;; - LSB) - printf "Checking status of $SERVICE_NAME: " - pids=`pidofproc ${PIDFILE:+"-p"} ${PIDFILE:+"$PIDFILE"} \ - "$DAEMON_EXEC" 2>/dev/null` - STATUS=$? - if [ $STATUS -eq 0 -a -n "$pids" ]; then - echo "running." - elif [ $STATUS -ne 0 -a -s "$PIDFILE" ]; then - echo "dead." - else - echo "stopped." - fi - ;; - DEBIAN|*) - printf "Checking status of $SERVICE_NAME: " - pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` - rc=$? - if [ $rc -eq 0 -a -n "$pids" ]; then - echo "running." - STATUS=0 # LSB: program is running - elif [ $rc -ne 0 -a -s "$PIDFILE" ]; then - echo "dead." - STATUS=1 # LSB: program is dead & pidfile exists - elif [ $rc -ne 0 ]; then - echo "stopped." - STATUS=3 # LSB: program is not running - else - echo "unknown." - STATUS=4 # LSB: program status unknown - fi - ;; - esac -} - -query_pids () -{ -# Writes the matching PIDs to stdout. -# Returns 0 on success (ie, pids found). -## - PROCNAME="$1" - PIDFILE="$2" - - if type pgrep >/dev/null 2>&1; then - pids=`pgrep -d ' ' -x "\`basename \"$PROCNAME\"\`" 2>/dev/null` - rc=$? - elif type pidof >/dev/null 2>&1; then - pids=`pidof -o $$ -x "$PROCNAME" 2>/dev/null` - rc=$? - else - pids=`(ps awx -o pid -o command || ps -e -f -o pid -o args) 2>/dev/null \ - | tail +2 | egrep "( |/)$PROCNAME( |$)" | grep -v egrep \ - | sed 's/ *\([0-9]*\).*/\1/' | sort -n | tr '\012' ' '` - [ -n "$pids" ] && rc=0 || rc=1 - fi - - unset pids_running - if [ -n "$pids" -a -r "$PIDFILE" ]; then - read pid_line < "$PIDFILE" - for pid in $pid_line; do - expr -- "$pid" : '[0-9]*$' >/dev/null 2>&1 \ - && expr -- " $pids " : ".* $pid .*" >/dev/null 2>&1 \ - && pids_running="$pids_running $pid" - done - [ -n "$pids_running" ] && pids=$pids_running - fi - - echo $pids - return $rc -} - -signal_process () -{ -# Returns 0 on success, 1 if kill failed, 2 if PROCNAME is not running. -## - PROCNAME="$1" - SIGNUM="$2" - - pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` - [ $? -ne 0 -o -z "$pids" ] && return 2 - - kill ${SIGNUM:+"-$SIGNUM"} $pids >/dev/null 2>&1 - [ $? -ne 0 ] && return 1 - [ -n "$SIGNUM" ] && return 0 - - sleep 1 - pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` - [ $? -ne 0 -o -z "$pids" ] && return 0 - [ -z "$SIGTERM_TIMEOUT" ] && return 1 - - sleep "$SIGTERM_TIMEOUT" - kill -KILL $pids >/dev/null 2>&1 - pids=`query_pids "$DAEMON_EXEC" "$PIDFILE"` - [ $? -ne 0 -o -z "$pids" ] && return 0 - return 1 -} - -log_init () -{ -# Output informational message at beginning of action. 
-## - MESSAGE="$1" - PROCNAME="$2" - - case $SYSTEM in - DEBIAN) - if [ "$VERBOSE" != no ]; then - if type log_daemon_msg >/dev/null 2>&1; then - log_daemon_msg "$MESSAGE" "$PROCNAME" - else - printf "$MESSAGE: $PROCNAME" - fi - fi - ;; - REDHAT|SUSE|LSB|*) - printf "$MESSAGE: $PROCNAME" - ;; - esac -} - -log_fini () -{ -# Output informational/error message at end of action. -## - STATUS="$1" - ERRMSG="$2" - - case $SYSTEM in - DEBIAN) - if [ "$VERBOSE" != no ]; then - if ( type log_end_msg && type log_failure_msg ) >/dev/null 2>&1; then - log_end_msg "$STATUS" - [ $STATUS -eq 0 -o -z "$ERRMSG" ] || log_failure_msg "$ERRMSG" - else - [ $STATUS -eq 0 ] && echo "." || echo " (failed)." - [ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2 - fi - fi - ;; - REDHAT) - echo - ;; - SUSE) - [ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2 - ;; - LSB|*) - [ $STATUS -eq 0 ] && echo "." || echo " (failed)." - [ $STATUS -eq 0 -o -z "$ERRMSG" ] || echo "$ERRMSG" >&2 - ;; - esac -} - -############################################################################### - -service_init "$@" - -case "$1" in - start) - service_start - ;; - stop) - service_stop - ;; - restart) - service_restart - ;; - try-restart|condrestart) - service_try_restart - ;; - reload) - service_reload - ;; - force-reload) - service_force_reload - ;; - status) - service_status - ;; - *) - echo "Usage: `basename \"$0\"`" \ - "(start|stop|restart|try-restart|reload|force-reload|status)" >&2 - exit 2 # LSB: invalid or excess argument(s) - ;; -esac - -service_fini diff --git a/roles/slurm-common.bak/templates/munge.service.j2 b/roles/slurm-common.bak/templates/munge.service.j2 deleted file mode 100644 index 2e73014c8c6075e71193b46abde387cac3bee4c2..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/munge.service.j2 +++ /dev/null @@ -1,17 +0,0 @@ -[Unit] -Description=MUNGE authentication service -Documentation=man:munged(8) -After=network.target -After=syslog.target -After=time-sync.target - -[Service] -Type=forking -ExecStart={{ munge_dir }}/sbin/munged -PIDFile={{ munge_dir }}/var/run/munge/munged.pid -User=munge -Group=munge -Restart=on-abort - -[Install] -WantedBy=multi-user.target diff --git a/roles/slurm-common.bak/templates/munge_key.j2 b/roles/slurm-common.bak/templates/munge_key.j2 deleted file mode 100644 index 83d3483ee198fffce76dd82dee5cbe1fb8c0ab8f..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/munge_key.j2 +++ /dev/null @@ -1 +0,0 @@ -{{ mungekey }} diff --git a/roles/slurm-common.bak/templates/slurm-vpn.conf.j2 b/roles/slurm-common.bak/templates/slurm-vpn.conf.j2 deleted file mode 100644 index 671840e07831bac8de9b440c3c9e90ca65ff3fd3..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/slurm-vpn.conf.j2 +++ /dev/null @@ -1,109 +0,0 @@ -# -# Example slurm.conf file. Please run configurator.html -# (in doc/html) to build a configuration file customized -# for your environment. -# -# -# slurm.conf file generated by configurator.html. -# -# See the slurm.conf man page for more information. 
-# -ClusterName=CIAB -ControlMachine={{ slurmctrl }} -ControlAddr={{ slurmctrl }}-vpn -#BackupController= -#BackupAddr= -# -SlurmUser=slurm -#SlurmdUser=root -SlurmctldPort=6817 -SlurmdPort=6818 -AuthType=auth/munge -#JobCredentialPrivateKey= -#JobCredentialPublicCertificate= -StateSaveLocation=/tmp -SlurmdSpoolDir=/tmp/slurmd -SwitchType=switch/none -MpiDefault=none -SlurmctldPidFile=/var/run/slurmctld.pid -SlurmdPidFile=/var/run/slurmd.pid -ProctrackType=proctrack/pgid -#PluginDir= -CacheGroups=0 -#FirstJobId= -ReturnToService=0 -#MaxJobCount= -#PlugStackConfig= -#PropagatePrioProcess= -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -#Prolog= -#Epilog= -#SrunProlog= -#SrunEpilog= -#TaskProlog= -#TaskEpilog= -#TaskPlugin= -#TrackWCKey=no -#TreeWidth=50 -#TmpFS= -#UsePAM= -# -# TIMERS -SlurmctldTimeout=300 -SlurmdTimeout=300 -InactiveLimit=0 -MinJobAge=300 -KillWait=30 -Waittime=0 -# -# SCHEDULING -SchedulerType=sched/backfill -#SchedulerAuth= -#SchedulerPort= -#SchedulerRootFilter= -SelectType=select/linear -FastSchedule=1 -#PriorityType=priority/multifactor -#PriorityDecayHalfLife=14-0 -#PriorityUsageResetPeriod=14-0 -#PriorityWeightFairshare=100000 -#PriorityWeightAge=1000 -#PriorityWeightPartition=10000 -#PriorityWeightJobSize=1000 -#PriorityMaxAge=1-0 -# -# LOGGING -SlurmctldDebug=3 -#SlurmctldLogFile= -SlurmdDebug=3 -#SlurmdLogFile= -JobCompType=jobcomp/none -#JobCompLoc= -# -# ACCOUNTING -#JobAcctGatherType=jobacct_gather/linux -#JobAcctGatherFrequency=30 -# -#AccountingStorageType=accounting_storage/slurmdbd -#AccountingStorageHost= -#AccountingStorageLoc= -#AccountingStoragePass= -#AccountingStorageUser= -# -MpiParams=ports=12000-12999 -# COMPUTE NODES -{% set nodelist = [] %} -{% for queue in slurmqueues %} -{% for node in groups[queue.group] %} -{% if nodelist.append(node) %} -{% endif %} -{% endfor %} -{% endfor %} -{% for node in nodelist|unique %} -NodeName={{ node }} NodeAddr={{ node }}-vpn Procs={{ hostvars[node]['ansible_processor_cores'] }} State=UNKNOWN -{% endfor %} - -{% for queue in slurmqueues %} -PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} -{% endfor %} diff --git a/roles/slurm-common.bak/templates/slurm.conf.j2 b/roles/slurm-common.bak/templates/slurm.conf.j2 deleted file mode 100644 index 4d868b18af4d1f62074380c95a438b7f707f8858..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/slurm.conf.j2 +++ /dev/null @@ -1,157 +0,0 @@ -# -# Example slurm.conf file. Please run configurator.html -# (in doc/html) to build a configuration file customized -# for your environment. -# -# -# slurm.conf file generated by configurator.html. -# -# See the slurm.conf man page for more information. 
-# -ClusterName={{ clustername }} -ControlMachine={{ slurmctrl }} -#ControlAddr= -#BackupController= -#BackupAddr= -# -SlurmUser=slurm -SlurmdUser=root -SlurmctldPort=6817 -SlurmdPort=6818 -AuthType=auth/munge -#JobCredentialPrivateKey= -#JobCredentialPublicCertificate= -StateSaveLocation={{ slurmstatedir }} -SlurmdSpoolDir={{ slurmdatadir }} -SwitchType=switch/none -MpiDefault=pmi2 -SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid -SlurmdPidFile={{ slurmpiddir }}/slurmd.pid -ProctrackType=proctrack/linuxproc -#PluginDir= -CacheGroups=0 -#FirstJobId= -ReturnToService=1 -#MaxJobCount= -#PlugStackConfig= -#PropagatePrioProcess= -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -#Prolog= -#Epilog= -#SrunProlog= -#SrunEpilog= -#TaskProlog= -#TaskEpilog= -TaskPlugin=task/cgroup -#TaskPlugin=task/affinity -#TaskPlugin=task/affinity,task/cgroup -{% if slurm_lua is defined %} -JobSubmitPlugins=lua -{% endif %} -OverTimeLimit=1 -CompleteWait=10 - -#TrackWCKey=no -#TreeWidth=50 -#TmpFS= -#UsePAM= -# -# TIMERS - -SlurmctldTimeout=3000 #added due to network failures causing jobs to be killed - -#SlurmctldTimeout=300 -#SlurmdTimeout=300 -#InactiveLimit=0 -#MinJobAge=300 -KillWait=10 -#Waittime=0 -# -# SCHEDULING -SchedulerType={{ slurmschedulertype }} -#SchedulerAuth= -#SchedulerPort= -#SchedulerRootFilter= -SelectType={{ slurmselecttype }} -{% if slurmselecttype.find("cons_res") > 0 %} -SelectTypeParameters=CR_Core_Memory -{% endif %} -FastSchedule={{ slurmfastschedule }} -#PriorityType=priority/multifactor -#PriorityFlags=Ticket_Based -#PriorityCalcPeriod=5 -#PriorityDecayHalfLife=0 -#PriorityUsageResetPeriod=14-0 -##PriorityWeightFairshare=10000 -#PriorityWeightAge=10000 -#PriorityWeightPartition=10000 -#PriorityWeightJobSize=10000 -#PriorityMaxAge=14-0 -# -# LOGGING -{% if slurmctlddebug %} -SlurmctldDebug={{ slurmctlddebug.level }} -SlurmctldLogFile={{ slurmctlddebug.log }} -{% else %} -#SlurmctldDebug= -#SlurmctldLogFile= -{% endif %} -{% if slurmddebug %} -SlurmdDebug={{ slurmddebug.level }} -SlurmdLogFile={{ slurmddebug.log }} -{% else %} -#SlurmdDebug= -#SlurmdLogFile= -{% endif %} -{% if slurmschedlog %} -SlurmSchedlogLevel={{ slurmschedlog.level }} -SlurmSchedLogFile={{ slurmschedlog.log }} -{% else %} -#SlurmSchedlogLevel= -#SlurmSchedLogFile= -{% endif %} -JobCompType=jobcomp/none -#JobCompLoc= -# -{% if slurmjob is defined %} -Prolog={{ slurmjob.prolog }} -Epilog={{ slurmjob.epilog }} -{% endif %} -# -# ACCOUNTING -#JobAcctGatherType=jobacct_gather/linux -#JobAcctGatherFrequency=30 -# -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageHost={{ slurmctrl }} -#AccountingStorageEnforce=limits,safe -#AccountingStorageLoc= -#AccountingStoragePass= -#AccountingStorageUser= -# -#GRES -GresTypes=gpu - -# Fair share -{% if slurmfairshare.def %} -PriorityWeightFairshare={{ slurmfairshare.val }} -{% endif %} - -DisableRootJobs=YES -MpiParams=ports=12000-12999 -# COMPUTE NODES -{% set nodelist = [] %} -{% for queue in slurmqueues %} -{% for node in groups[queue.group] %} -{% if nodelist.append(node) %} -{% endif %} -{% endfor %} -{% endfor %} -{% for node in nodelist|unique %} -NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if 
hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN -{% endfor %} - -{% for queue in slurmqueues %} -PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP -{% endfor %} diff --git a/roles/slurm-common.bak/templates/slurm.epilog.j2 b/roles/slurm-common.bak/templates/slurm.epilog.j2 deleted file mode 100644 index 0c6cbce7974ac875cdb6cbeeb772a7a63b07c75f..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/slurm.epilog.j2 +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh - -# specific files to be deleted to clean up after a Strudel session -find /tmp -user ${SLURM_JOB_USER} -name "pulse*" | xargs rm -rf -find /tmp -user ${SLURM_JOB_USER} -name ".esd-*" | xargs rm -rf -find /tmp -user ${SLURM_JOB_USER} -name ".X*-lock" | xargs rm -rf -find /tmp/.X11-unix -user ${SLURM_JOB_USER} -name "X*" | xargs rm -rf - -# NOTE: 20180316 The jobs above clean up the VNC session -# further clean up for Strudel Session -# X lock files are owned by root, so we need to find the right files to delete - - -# New Strudel session will create a file under user's home folder called xorg-jobid -# This file contains the display number that is assigned to xterm when it starts - -# Assigning variable and trimming the : from the display number -XSESSION=`cat /home/${SLURM_JOB_USER}/.vnc/xorg-${SLURM_JOB_ID} | tr -d :` - -# Formatting the filenames for the two files that we need to clean: /tmp/.X*-lock and /tmp/.X11/X* -XLOCKFILENAME=".X"$XSESSION"-lock" -XUNIXFILENAME="X"$XSESSION - -# Find the files and delete them -find /tmp/ -name $XLOCKFILENAME -exec rm -rf {} \; -find /tmp/.X11-unix -name $XUNIXFILENAME -exec rm -rf {} \; - -# Now we clean up -rm -rf /home/${SLURM_JOB_USER}/.vnc/xorg-${SLURM_JOB_ID} - -# echo 1 to drop page cache -/bin/sync -/bin/echo 1 > /proc/sys/vm/drop_caches - -exit 0 diff --git a/roles/slurm-common.bak/templates/slurm.prolog.j2 b/roles/slurm-common.bak/templates/slurm.prolog.j2 deleted file mode 100644 index 187206c6e5851e2113c4dd81d4ec0f0581eb5b58..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/slurm.prolog.j2 +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -# echo 1 to drop page cache -/bin/sync -/bin/echo 1 > /proc/sys/vm/drop_caches - -exit 0 diff --git a/roles/slurm-common.bak/templates/slurm_setup.sh.j2 b/roles/slurm-common.bak/templates/slurm_setup.sh.j2 deleted file mode 100644 index 2a80aa3e23242ecf452a8d0ea68c86b32535615f..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/slurm_setup.sh.j2 +++ /dev/null @@ -1,8 +0,0 @@ - -export PATH={{ munge_dir }}/bin:{{ slurm_dir }}/bin:{{ slurm_dir }}/sbin:{{ nhc_dir }}/sbin:$PATH - -export LD_LIBRARY_PATH={{ munge_dir }}/lib:{{ slurm_dir 
}}/lib:{{ slurm_dir }}/lib/slurm:$LD_LIBRARY_PATH - -export SLURM_SERVER_HOME={{ slurm_dir }} - -export MANPATH={{ slurm_dir }}/share/man:$MANPATH diff --git a/roles/slurm-common.bak/templates/slurmlog.j2 b/roles/slurm-common.bak/templates/slurmlog.j2 deleted file mode 100644 index 3e3d3e68e112f6124161654f5122a17195ed1c08..0000000000000000000000000000000000000000 --- a/roles/slurm-common.bak/templates/slurmlog.j2 +++ /dev/null @@ -1,32 +0,0 @@ -{% if slurmctrl == inventory_hostname %} -{{ slurmctlddebug.log }} -{{ slurmschedlog.log }} -{% else %} -{{ slurmddebug.log }} -{% endif %} -{ - compress - missingok - nocopytruncate - nocreate - nodelaycompress - nomail - notifempty - noolddir - rotate 5 - sharedscripts - size=5M - create 640 slurm root -{% if ansible_os_family == 'RedHat' and ansible_distribution_version >= '7' %} - postrotate -{% if slurmctrl == inventory_hostname %} - systemctl kill -s HUP --kill-who=main slurmctld -{% else %} - systemctl kill -s HUP --kill-who=main slurmd -{% endif %} -{% else %} - postrotate /etc/init.d/slurm reconfig -{% endif %} - endscript -} - diff --git a/roles/slurm-start.bak/tasks/main.yml b/roles/slurm-start.bak/tasks/main.yml deleted file mode 100644 index df0ff262a08d5c63e85f3c0efb4e19082b4be8c2..0000000000000000000000000000000000000000 --- a/roles/slurm-start.bak/tasks/main.yml +++ /dev/null @@ -1,86 +0,0 @@ ---- -- name: set use_systemd - set_fact: - use_systemd: True - when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and - ( ansible_distribution_major_version == "7") - -- name: set slurmd_enabled (default enabled) - set_fact: - slurmd_enabled: True - when: slurmd_enabled is not defined - -- name: install slurmdbd init - template: src=slurmdbd.initd.j2 dest=/etc/init.d/slurmdbd mode=755 - become: true - when: use_systemd is not defined and start_slurmdbd is defined - -- name: copy slurmdbd init script if OS contains systemd - template: dest=/etc/systemd/system/slurmdbd.service src=slurmdbd.service.j2 mode=644 - become: true - when: use_systemd is defined and start_slurmdbd is defined - register: slurmdbd_service_installed - -- name: copy slurm init script - template: dest=/etc/init.d/slurm src=slurm.initd.j2 mode=755 - become: true - when: use_systemd is not defined - -- name: copy slurmd.service - template: dest=/etc/systemd/system/slurmd.service src=slurmd.service.j2 mode=644 - become: true - when: use_systemd is defined and start_slurmd is defined - register: slurmd_service_installed - -- name: slurmctld.service - template: dest=/etc/systemd/system/slurmctld.service src=slurmctld.service.j2 mode=644 - become: true - when: use_systemd is defined and start_slurmctld is defined - register: slurmctld_service_installed - -- name: reload systemd after slurmd install - systemd: - daemon_reload: yes - become: true - when: use_systemd is defined and start_slurmd is defined and slurmd_service_installed.changed - -- name: reload systemd after slurmctld _service _installed - systemd: - daemon_reload: yes - become: true - when: use_systemd is defined and start_slurmctld is defined and slurmctld_service_installed.changed - -- name: reload systemd slurmdbd_ service _installed - systemd: - daemon_reload: yes - become: true - when: use_systemd is defined and start_slurmdbd is defined and slurmdbd_service_installed.changed - -- name: start munge - service: name=munge state=restarted enabled=yes - become: true - -- name: start slurmdbd - service: name=slurmdbd state=restarted enabled=no - become: true - when: start_slurmdbd is defined - 
-- name: "create cluster in slurm db" - shell: "{{slurm_dir}}/bin/sacctmgr -i create cluster {{ clustername }}" - become: true - ignore_errors: true - -- name: start slurmctl - service: name=slurmctld state=restarted enabled=no - become: true - when: use_systemd is defined and start_slurmctld is defined - -- name: start slurmd - service: name=slurmd state=restarted enabled={{ slurmd_enabled }} - become: true - when: use_systemd is defined and start_slurmd is defined - -- name: start slurm - service: name=slurm state=restarted enabled={{ slurmd_enabled }} - become: true - when: use_systemd is not defined and ( start_slurmd is defined or start_slurmctld is defined ) diff --git a/roles/slurm-start.bak/templates/slurm.initd.j2 b/roles/slurm-start.bak/templates/slurm.initd.j2 deleted file mode 100644 index a667fce9716f2ef220e7f2bcd4a565eccc8ad0b1..0000000000000000000000000000000000000000 --- a/roles/slurm-start.bak/templates/slurm.initd.j2 +++ /dev/null @@ -1,338 +0,0 @@ -#!/bin/bash -# -# chkconfig: 345 90 10 -# description: SLURM is a simple resource management system which \ -# manages exclusive access to a set of compute \ -# resources and distributes work to those resources. -# -# processname: ${exec_prefix}/sbin/slurmd -# pidfile: /var/run/slurmd.pid -# -# processname: ${exec_prefix}/sbin/slurmctld -# pidfile: /var/run/slurmctld.pid -# -# config: /etc/sysconfig/slurm -# -### BEGIN INIT INFO -# Provides: slurm -# Required-Start: $remote_fs $syslog $network munge -# Required-Stop: $remote_fs $syslog $network munge -# Should-Start: $named -# Should-Stop: $named -# Default-Start: 2 3 4 5 -# Default-Stop: 0 1 6 -# Short-Description: slurm daemon management -# Description: Start slurm to provide resource management -### END INIT INFO -munge_lib="{{ munge_dir }}/lib" -exec_prefix="{{ slurm_dir }}" -prefix="{{ slurm_dir }}" -BINDIR="${exec_prefix}/bin" -CONFDIR="${prefix}/etc" -LIBDIR="${exec_prefix}/lib:${munge_lib}" -SBINDIR="${exec_prefix}/sbin" - -# Source function library. -if [ -f /etc/rc.status ]; then - . /etc/rc.status - SUSE=1 - STARTPROC=startproc - - rc_reset -else - - # Read configuration defaults to override variables: - # $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD - ## - for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do - [ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME" - done - [ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS" - [ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER - expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE" - [ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \ - && RELOAD=1 || unset RELOAD - - if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then - SYSTEM="DEBIAN" - [ -r /etc/default/rcS ] && . /etc/default/rcS - [ -r /lib/init/vars.sh ] && . /lib/init/vars.sh - [ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions - STARTPROC="start_daemon" - elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then - SYSTEM="REDHAT" - . /etc/rc.d/init.d/functions - RH_LOCK="/var/lock/subsys/$INIT_NAME" - SUSE=0 - STARTPROC=daemon - elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then - SYSTEM="SUSE" - . /etc/rc.status - rc_reset - elif [ -r /lib/lsb/init-functions ]; then - SYSTEM="LSB" - . /lib/lsb/init-functions - else - SYSTEM="OTHER" - fi - - function rc_status() { - RETVAL=$? 
- } - function rc_exit () { - exit $RETVAL - } - RETVAL=0 -fi - -# We can not use a starter program without losing environment -# variables that are critical on Blue Gene systems -if [ -d /bgl/BlueLight/ppcfloor ]; then - STARTPROC="" -fi - -# Source slurm specific configuration -# This can be used to alter limits for users jobs or set daemon options. -# For example, the limits for user jobs could be higher or lower than the -# default limits for user root (e.g. "ulimit -t unlimited" sets an unlimited -# CPU time limit for spawned user jobs). -# SLURMCTLD_OPTIONS defines slurmctld command line options. See "man slurmctld" -# SLURMD_OPTIONS defines slurmd command line options. See "man slurmd" -if [ -f /etc/sysconfig/slurm ] ; then - . /etc/sysconfig/slurm -else - SLURMCTLD_OPTIONS="" - SLURMD_OPTIONS="" -fi - -if [ ! -x $BINDIR/scontrol ]; then - echo "Could not find $BINDIR/scontrol. Bad path?" - exit 1 -fi - -if [ ! -f $CONFDIR/slurm.conf ]; then - echo "Could not find $CONFDIR/slurm.conf. Bad path?" - exit 1 -fi - -# setup library paths for slurm and munge support -export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} - -start() { - prog=$1 - shift - echo -n "starting $prog: " - unset HOME MAIL USER USERNAME - $STARTPROC $SBINDIR/$prog $* - rc_status -v - echo - touch /var/lock/subsys/slurm -} - -stop() { - echo -n "stopping $1: " - killproc $1 -TERM - rc_status -v - echo - rm -f /var/lock/subsys/slurm -} - -startall() { - for prog in `$BINDIR/scontrol show daemons`; do - optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"` - if [[ ${MULTIPLE_SLURMD} == yes ]] && [[ ${prog} == slurmd ]] - then - for node in $($BINDIR/scontrol show aliases) - do - start $prog -N ${node} ${!optvar} - done - else - start $prog ${!optvar} - fi - done -} - -# -# status() with slight modifications to take into account -# instantiations of job manager slurmd's, which should not be -# counted as "running" -# -slurmstatus() { - local base=${1##*/} - local pid - local rpid - local pidfile - local pidfiles - local rc - - pidfile=`grep -i ${base}pid $CONFDIR/slurm.conf | grep -v '^ *#'` - if [ $? = 0 ]; then - pidfile=${pidfile##*=} - pidfile=${pidfile%#*} - pidfile=${pidfile//\"/} - else - pidfile=/var/run/${base}.pid - fi - - pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \ - pidof -o $$ -o $$PPID -o %PPID -x ${base}` - - if [ "$base" == "slurmd" ] ; then - echo ${pidfile} | grep -q %n - if [[ $? -eq 0 ]] - then - for n in $($BINDIR/scontrol show aliases) - do - pidfiles="${pidfiles} $(echo ${pidfile} | sed "s/%n/$n/g")" - done - else - pidfiles=${pidfile} - fi - else - pidfiles=${pidfile} - fi - - RETVAL=0 - for pidfile in ${pidfiles} - do - rc=1 - if [ -f $pidfile ]; then - read rpid < $pidfile - if [ "$rpid" != "" -a "$pid" != "" ]; then - for i in $pid ; do - if [ "$i" = "$rpid" ]; then - echo $"${base} (pid $rpid) is running..." - rc=0 - break - fi - done - elif [ "$rpid" != "" -a "$pid" = "" ]; then -# Due to change in user id, pid file may persist -# after slurmctld terminates - if [ "$base" != "slurmctld" ] ; then - echo $"${base} dead but pid file exists" - else - echo $"${base} is stopped" - fi - RETVAL=1 - fi - fi - - if [[ $rc -eq 0 ]] - then - continue - fi - - if [ "$base" = "slurmctld" -a "$pid" != "" ] ; then - echo $"${base} (pid $pid) is running..." 
- continue - fi - - echo $"${base} is stopped" - if [ "$RETVAL" == "0" ]; then - RETVAL=3 - fi - done - - return $RETVAL -} - -# -# stop slurm daemons, -# wait for termination to complete (up to 10 seconds) before returning -# -slurmstop() { - for prog in `$BINDIR/scontrol show daemons`; do - stop $prog - - for i in 1 2 3 4 - do - sleep $i - slurmstatus $prog - if [ $? != 0 ]; then - break - fi - done - done - - # slurmstatus return 1 in case of stopped daemon - # and that is what we are looking for here - if [[ ${RETVAL} == "1" ]] - then - RETVAL=0 - else - RETVAL=1 - fi -} - -# -# The pathname substitution in daemon command assumes prefix and -# exec_prefix are same. This is the default, unless the user requests -# otherwise. -# -# Any node can be a slurm controller and/or server. -# -case "$1" in - start) - startall - ;; - startclean) - SLURMCTLD_OPTIONS="-c $SLURMCTLD_OPTIONS" - SLURMD_OPTIONS="-c $SLURMD_OPTIONS" - startall - ;; - stop) - slurmstop - ;; - status) - anystop=0 - for prog in `$BINDIR/scontrol show daemons`; do - slurmstatus $prog - rc=$? - if [ $rc != 0 ] ; then - anystop=$rc - fi - done - RETVAL=$anystop - ;; - restart) - $0 stop - $0 start - ;; - condrestart) - if [ -f /var/lock/subsys/slurm ]; then - for prog in `$BINDIR/scontrol show daemons`; do - stop $prog - sleep 1 - optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"` - if [[ ${MULTIPLE_SLURMD} == yes ]] && [[ ${prog} == slurmd ]] - then - for node in $($BINDIR/scontrol show aliases) - do - start $prog -N ${node} - done - else - start $prog ${!optvar} - fi - done - fi - ;; - reconfig|reload) - for prog in `$BINDIR/scontrol show daemons`; do - echo -n $"Reloading $prog daemon configuration: " - killproc $prog -HUP - echo - done - ;; - test) - for prog in `$BINDIR/scontrol show daemons`; do - echo "$prog runs here" - done - ;; - *) - echo "Usage: $0 {start|startclean|stop|status|restart|reconfig|condrestart|test}" - exit 1 - ;; -esac - -rc_exit diff --git a/roles/slurm-start.bak/templates/slurmctld.service.j2 b/roles/slurm-start.bak/templates/slurmctld.service.j2 deleted file mode 100644 index 325468aa45600fe813e87827b256bdb66254ca1d..0000000000000000000000000000000000000000 --- a/roles/slurm-start.bak/templates/slurmctld.service.j2 +++ /dev/null @@ -1,12 +0,0 @@ -[Unit] -Description=Slurm controller daemon -After=network.target glusterVolume.mount -ConditionPathExists={{ slurm_dir }}/etc/slurm.conf - -[Service] -Type=forking -#EnvironmentFile=/etc/default/slurmctld -ExecStart={{ slurm_dir }}/sbin/slurmctld $SLURMCTLD_OPTIONS -PIDFile={{ slurmpiddir }}/slurmctld.pid -[Install] -WantedBy=multi-user.target diff --git a/roles/slurm-start.bak/templates/slurmd.service.j2 b/roles/slurm-start.bak/templates/slurmd.service.j2 deleted file mode 100644 index 60d051dbaf8addbcae4747f43e78207281b28689..0000000000000000000000000000000000000000 --- a/roles/slurm-start.bak/templates/slurmd.service.j2 +++ /dev/null @@ -1,15 +0,0 @@ -[Unit] -Description=Slurm node daemon -After=network.target -ConditionPathExists={{ slurm_dir }}/etc/slurm.conf - -[Service] -Type=forking -KillMode=process -LimitMEMLOCK=infinity -#EnvironmentFile=/etc/default/slurmd -ExecStart={{ slurm_dir }}/sbin/slurmd $SLURMD_OPTIONS -PIDFile={{ slurmpiddir }}/slurmd.pid - -[Install] -WantedBy=multi-user.target diff --git a/roles/slurm-start.bak/templates/slurmdbd.initd.j2 b/roles/slurm-start.bak/templates/slurmdbd.initd.j2 deleted file mode 100644 index d9a7b2084f4340f4099f9bedafafa49fbb396968..0000000000000000000000000000000000000000 --- 
a/roles/slurm-start.bak/templates/slurmdbd.initd.j2 +++ /dev/null @@ -1,224 +0,0 @@ -#!/bin/bash -# -# chkconfig: 345 90 10 -# description: SLURMDBD is a database server interface for \ -# SLURM (Simple Linux Utility for Resource Management). -# -# processname: ${exec_prefix}/sbin/slurmdbd -# pidfile: /var/run/slurmdbd.pid -# -# config: /etc/sysconfig/slurm -# -### BEGIN INIT INFO -# Provides: slurmbd -# Required-Start: $remote_fs $syslog $network munge -# Required-Stop: $remote_fs $syslog $network munge -# Should-Start: $named -# Should-Stop: $named -# Default-Start: 2 3 4 5 -# Default-Stop: 0 1 6 -# Short-Description: SLURM database daemon -# Description: Start slurm to provide database server for SLURM -### END INIT INFO - -munge_lib="{{ munge_dir }}/lib" -exec_prefix="{{ slurm_dir }}" -prefix="{{ slurm_dir }}" -CONFDIR="${prefix}/etc" -LIBDIR="${exec_prefix}/lib:${munge_lib}" -SBINDIR="${exec_prefix}/sbin" - -#Source function library. -if [ -f /etc/rc.status ]; then - . /etc/rc.status - SUSE=1 - STARTPROC=startproc - - rc_reset -else - # Read configuration defaults to override variables: - # $CONFIG, $DAEMON_ARGS, $PIDFILE, $USER, $NICE, $SIGHUP_RELOAD - ## - for dir in "$sysconfdir/default" "$sysconfdir/sysconfig"; do - [ -r "$dir/$INIT_NAME" ] && . "$dir/$INIT_NAME" - done - [ -z "$DAEMON_ARGS" -a -n "$OPTIONS" ] && DAEMON_ARGS="$OPTIONS" - [ "`id | sed 's/^uid=\([0-9]*\).*/\1/'`" -ne 0 ] && unset USER - expr -- "$NICE" : '[0-9]*$' >/dev/null 2>&1 && NICE="+$NICE" - [ -n "$SIGHUP_RELOAD" -a "$SIGHUP_RELOAD" != 0 ] \ - && RELOAD=1 || unset RELOAD - - if [ -f /etc/debian_version -a -x /sbin/start-stop-daemon ]; then - SYSTEM="DEBIAN" - [ -r /etc/default/rcS ] && . /etc/default/rcS - [ -r /lib/init/vars.sh ] && . /lib/init/vars.sh - [ -r /lib/lsb/init-functions ] && . /lib/lsb/init-functions - STARTPROC="start_daemon" - elif [ -f /etc/redhat-release -a -r /etc/rc.d/init.d/functions ]; then - SYSTEM="REDHAT" - . /etc/rc.d/init.d/functions - RH_LOCK="/var/lock/subsys/$INIT_NAME" - SUSE=0 - STARTPROC=daemon - elif [ -f /etc/SuSE-release -a -r /etc/rc.status ]; then - SYSTEM="SUSE" - . /etc/rc.status - rc_reset - elif [ -r /lib/lsb/init-functions ]; then - SYSTEM="LSB" - . /lib/lsb/init-functions - else - SYSTEM="OTHER" - fi - - - function rc_status() { - RETVAL=$? - } - function rc_exit () { - exit $RETVAL - } - RETVAL=0 -fi - -# We can not use a starter program without losing environment -# variables that are critical on Blue Gene systems -if [ -d /bgl/BlueLight/ppcfloor ]; then - STARTPROC="" -fi - -# Source slurm specific configuration -# SLURMDBD_OPTIONS defines slurmdbd command line options. See "man slurmdbd" -if [ -f /etc/sysconfig/slurm ] ; then - . /etc/sysconfig/slurm -else - SLURMDBD_OPTIONS="" -fi - -if [ ! -f $CONFDIR/slurmdbd.conf ]; then - echo "Could not find $CONFDIR/slurmdbd.conf. Bad path?" - exit 1 -fi - -# setup library paths for slurm and munge support -export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} - -start() { - prog=$1 - shift - echo -n "starting $prog: " - unset HOME MAIL USER USERNAME - $STARTPROC $SBINDIR/$prog $SLURMDBD_OPTIONS - rc_status -v - echo - touch /var/lock/subsys/slurmdbd -} - -stop() { - echo -n "stopping $1: " - killproc $1 -TERM - rc_status -v - echo - rm -f /var/lock/subsys/slurmdbd -} - -slurmstatus() { - local base=${1##*/} - local pid - local rpid - local pidfile - - pidfile=`grep -i pidfile $CONFDIR/slurmdbd.conf | grep -v '^ *#'` - if [ $? 
= 0 ]; then
-        pidfile=${pidfile##*=}
-        pidfile=${pidfile%#*}
-        pidfile=${pidfile//\"/}
-    else
-        pidfile=/var/run/slurmdbd.pid
-    fi
-
-    pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \
-         pidof -o $$ -o $$PPID -o %PPID -x ${base}`
-
-    if [ -f $pidfile ]; then
-        read rpid < $pidfile
-        if [ "$rpid" != "" -a "$pid" != "" ]; then
-            for i in $pid ; do
-                if [ "$i" = "$rpid" ]; then
-                    echo $"${base} (pid $rpid) is running..."
-                    return 0
-                fi
-            done
-        elif [ "$rpid" != "" -a "$pid" = "" ]; then
-            echo $"${base} dead but pid file exists"
-            return 1
-        fi
-
-    fi
-
-    if [ "$base" = "slurmdbd" -a "$pid" != "" ] ; then
-        echo $"${base} (pid $pid) is running..."
-        return 0
-    fi
-
-    echo $"${base} is stopped"
-
-    return 3
-}
-
-#
-# stop slurm daemons,
-# wait for termination to complete (up to 10 seconds) before returning
-#
-slurmstop() {
-    stop $1
-
-    for i in 1 2 3 4
-    do
-        sleep $i
-        slurmstatus $1
-        if [ $? != 0 ]; then
-            break
-        fi
-    done
-}
-#
-# The pathname substitution in the daemon command assumes prefix and
-# exec_prefix are the same. This is the default, unless the user requests
-# otherwise.
-#
-# Any node can be a slurm controller and/or server.
-#
-case "$1" in
-    start)
-        start slurmdbd
-        ;;
-    stop)
-        slurmstop slurmdbd
-        ;;
-    status)
-        slurmstatus slurmdbd
-        rc_status -v
-        ;;
-    restart)
-        $0 stop
-        $0 start
-        ;;
-    condrestart)
-        if [ -f /var/lock/subsys/slurm ]; then
-            stop slurmdbd
-            start slurmdbd
-        fi
-        ;;
-    reconfig|reload)
-        echo -n $"Reloading slurmdbd daemon configuration: "
-        killproc slurmdbd -HUP
-        echo
-        ;;
-    *)
-        echo "Usage: $0 {start|stop|status|restart|condrestart|reconfig}"
-        exit 1
-        ;;
-esac
-
-rc_exit
diff --git a/roles/slurm-start.bak/templates/slurmdbd.service.j2 b/roles/slurm-start.bak/templates/slurmdbd.service.j2
deleted file mode 100644
index cc48193f09d95e8d2886855b9a68002793410950..0000000000000000000000000000000000000000
--- a/roles/slurm-start.bak/templates/slurmdbd.service.j2
+++ /dev/null
@@ -1,13 +0,0 @@
-
-[Unit]
-Description=Slurm DBD accounting daemon
-After=network.target
-ConditionPathExists={{ slurm_dir }}/etc/slurmdbd.conf
-
-[Service]
-Type=forking
-ExecStart={{ slurm_dir }}/sbin/slurmdbd
-PIDFile={{ slurmdbdpiddir }}/slurmdbd.pid
-
-[Install]
-WantedBy=multi-user.target
diff --git a/roles/slurm-trigger.bak/README.rst b/roles/slurm-trigger.bak/README.rst
deleted file mode 100644
index 61779b1815da7ec036ee5352fe029a5fb3c55fc6..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/README.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-This role sets up trigger events on your Slurm cluster.
-What the triggers actually do is up to you, so you will probably modify the templated shell files.
-Copy the role to a local role directory if you need to customise it.
-
-Triggers used in this role as it stands:
-- primary_slurmctld_failure
-- primary_slurmctld_resumed_operation
-- node down
-
-USAGE:
-- hosts: 'ManagementNodes'
-  tasks:
-  - include_vars: vars/slurm.yml
-
-- hosts: 'ManagementNodes'
-  roles:
-  - { role: slurm-trigger, slurm_dir: "/opt/slurm-18.08.6", admin_email: "hpc-alerts-warning-l@monash.edu", tags: [slurm, slurm-trigger] }
-
-
-
-
-The role uses several variables that need to be defined:
-{{ slurm_dir }}    The Slurm installation directory; the trigger shell scripts are copied into its sbin directory.
-{{ admin_email }}  The e-mail address (defined in vars/slurm.yml, or in some other way) that alerts are sent to.
-
-Each trigger has two files: one that responds to the trigger event and one that (re)sets the trigger. The role runs the latter to arm the triggers initially.
-
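The README stops short of showing how to confirm that the triggers were actually registered. A minimal check, as a sketch only, assuming the role's default paths and that the triggers are owned by the slurm user the role uses, could look like this after the role has run:

    # List the currently registered triggers; the exact output format depends on the Slurm version.
    sudo -u slurm {{ slurm_dir }}/bin/strigger --get
    # Expect one entry each for the node-down, primary_slurmctld_failure and
    # primary_slurmctld_resumed_operation events, each pointing at a script in {{ slurm_dir }}/sbin.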
diff --git a/roles/slurm-trigger.bak/tasks/main.yml b/roles/slurm-trigger.bak/tasks/main.yml
deleted file mode 100644
index 0e65185c9bd6e7b560db9a434199de6ba992006b..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/tasks/main.yml
+++ /dev/null
@@ -1,56 +0,0 @@
----
-############################
-- name: template primary_slurmctld_failure
-  template: dest="{{ slurm_dir }}/sbin/primary_slurmctld_failure.sh" src=primary_slurmctld_failure.sh.j2 mode="0755"
-  become: true
-  become_user: root
-
-- name: template set primary_slurmctld_failure trigger
-  template: dest="{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh" src=set_primary_slurmctld_failure_trigger.sh.j2 mode="0755"
-  become: true
-  become_user: root
-
-- name: Execute set_primary_slurmctld_failure_trigger.sh
-  command: "{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
-  become: true
-  become_user: slurm
-  run_once: true
-  ignore_errors: true
-
-- name: template primary_slurmctld_resumed_operation
-  template: dest="{{ slurm_dir }}/sbin/primary_slurmctld_resumed_operation.sh" src=primary_slurmctld_resumed_operation.sh.j2 mode="0755"
-  become: true
-  become_user: root
-
-- name: template set primary_slurmctld_resumed trigger
-  template: dest="{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh" src=set_primary_slurmctld_resumed_operation_trigger.sh.j2 mode="0755"
-  become: true
-  become_user: root
-
-- name: Execute set_primary_slurmctld_resumed_operation_trigger.sh
-  command: "{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
-  become: true
-  become_user: slurm
-  run_once: true
-  ignore_errors: true
-
-- name: template node_down
-  template: dest="{{ slurm_dir }}/sbin/node_down.sh" src=node_down.sh.j2 mode="0755"
-  become: true
-  become_user: root
-
-- name: template node_down trigger command
-  template: dest="{{ slurm_dir }}/sbin/set_node_trigger.sh" src=set_node_trigger.sh.j2 mode="0755"
-  become: true
-  become_user: root
-
-
-- name: Execute set_node_trigger.sh
-  command: "{{ slurm_dir }}/sbin/set_node_trigger.sh"
-  become: true
-  become_user: slurm
-  run_once: true
-  ignore_errors: true
-
-
diff --git a/roles/slurm-trigger.bak/templates/node_down.sh.j2 b/roles/slurm-trigger.bak/templates/node_down.sh.j2
deleted file mode 100644
index 679e18c7daa2b2f2e9dff282daf9d0a1f2967802..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/templates/node_down.sh.j2
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-# Notify the administrator of the failure by e-mail
-echo "On `hostname`:`date`:`whoami`: slurm-trigger event for NODE_FAILURE: $*" | `which mail` -s "NODE FAILURE $*" {{ admin_email }}
-# Re-arm the trigger for the next node-down event
-TRIGGER_CMD="{{ slurm_dir }}/sbin/set_node_trigger.sh"
-
-FILE=/tmp/node_down.txt
-#COMMAND="su slurm -c $TRIGGER_CMD"
-echo "node_down.sh: `date`: `whoami`: $TRIGGER_CMD" >> $FILE
-$TRIGGER_CMD >> $FILE 2>&1
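For reference, strigger runs the node-down handler with the names of the affected nodes passed as arguments, which is why the template embeds "$*" in the mail subject. A rendered copy can be exercised by hand roughly as below; this is illustrative only, the node name "node01" is a placeholder and the paths assume the role's defaults:

    # Simulate a node-down event for a hypothetical node "node01".
    sudo -u slurm {{ slurm_dir }}/sbin/node_down.sh node01
    # This mails {{ admin_email }} with a "NODE FAILURE node01" subject, then re-arms
    # the trigger via set_node_trigger.sh and logs the attempt to /tmp/node_down.txt.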
diff --git a/roles/slurm-trigger.bak/templates/primary_slurmctld_failure.sh.j2 b/roles/slurm-trigger.bak/templates/primary_slurmctld_failure.sh.j2
deleted file mode 100644
index 61747379a436a63f89f8583d05ec5d8aa87cd9cb..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/templates/primary_slurmctld_failure.sh.j2
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-# Notify the administrator of the failure by e-mail
-echo "On `hostname`:`date`:`whoami`: slurm-trigger event for Primary_SLURMCTLD_FAILURE" | `which mail` -s Primary_SLURMCTLD_FAILURE {{ admin_email }}
-# Re-arm the trigger for the next primary slurmctld failure event
-TRIGGER_CMD="{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
-
-FILE=/tmp/primary_down.txt
-#COMMAND="su slurm -c $TRIGGER_CMD"
-echo "primary_slurmctld_failure.sh:`date`:`whoami`: $TRIGGER_CMD" >> $FILE
-$TRIGGER_CMD >> $FILE 2>&1
diff --git a/roles/slurm-trigger.bak/templates/primary_slurmctld_resumed_operation.sh.j2 b/roles/slurm-trigger.bak/templates/primary_slurmctld_resumed_operation.sh.j2
deleted file mode 100644
index b8e6788bccef5b36f65575c12b507ef9c624be09..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/templates/primary_slurmctld_resumed_operation.sh.j2
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-# Notify the administrator by e-mail that the primary slurmctld has resumed operation
-echo "On `hostname`:`date`:`whoami`: slurm-trigger event for Primary_SLURMCTLD_RESUMED" | `which mail` -s Primary_SLURMCTLD_RESUMED {{ admin_email }}
-# Re-arm the trigger for the next primary slurmctld resumed-operation event
-
-FILE=/tmp/primary_up.txt
-#COMMAND="su slurm -c {{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
-COMMAND="{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
-echo "primary_slurmctld_resumed_operation.sh:`date`:`whoami`: $COMMAND" >> $FILE
-$COMMAND >> $FILE 2>&1
diff --git a/roles/slurm-trigger.bak/templates/set_node_trigger.sh.j2 b/roles/slurm-trigger.bak/templates/set_node_trigger.sh.j2
deleted file mode 100644
index 87e2938bfec5c1184a7f9e949d2fc16d0cfe571c..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/templates/set_node_trigger.sh.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --down --program={{ slurm_dir }}/sbin/node_down.sh"
-echo "set_node_trigger.sh: `date`: $TRIGGER_CMD"
-$TRIGGER_CMD
diff --git a/roles/slurm-trigger.bak/templates/set_primary_slurmctld_failure_trigger.sh.j2 b/roles/slurm-trigger.bak/templates/set_primary_slurmctld_failure_trigger.sh.j2
deleted file mode 100644
index 30b0e9ea3200e0305c5c052d3a0c282d66fc4b2f..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/templates/set_primary_slurmctld_failure_trigger.sh.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --primary_slurmctld_failure --program={{ slurm_dir }}/sbin/primary_slurmctld_failure.sh"
-echo "set_primary_slurmctld_failure_trigger.sh: `date`: $TRIGGER_CMD"
-$TRIGGER_CMD
diff --git a/roles/slurm-trigger.bak/templates/set_primary_slurmctld_resumed_operation_trigger.sh.j2 b/roles/slurm-trigger.bak/templates/set_primary_slurmctld_resumed_operation_trigger.sh.j2
deleted file mode 100644
index ea3ad3f7cead736a0a64cc1f9b11add14d1ad610..0000000000000000000000000000000000000000
--- a/roles/slurm-trigger.bak/templates/set_primary_slurmctld_resumed_operation_trigger.sh.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --primary_slurmctld_resumed_operation --program={{ slurm_dir }}/sbin/primary_slurmctld_resumed_operation.sh"
-echo "set_primary_slurmctld_resumed_operation_trigger.sh: `date`: $TRIGGER_CMD"
-$TRIGGER_CMD
diff --git a/roles/slurm_config.bak/tasks/main.yml b/roles/slurm_config.bak/tasks/main.yml
deleted file mode 100644
index 93912a851dda2ccb18c18cb26b6c84b2f684c481..0000000000000000000000000000000000000000
--- a/roles/slurm_config.bak/tasks/main.yml
+++ /dev/null
@@ -1,13 +0,0 @@
----
-- name: install slurm.conf
-  copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf
-  become: true
-  become_user: root
-
-- name: setup plugin
-  template: src=job_submit.lua.j2 dest={{ slurm_dir }}/etc/job_submit.lua mode=0755
-  run_once: true
-  become: true
-  become_user: root
-  when: slurm_lua is defined
-
diff --git a/roles/slurm_config.bak/templates/job_submit.lua.j2 b/roles/slurm_config.bak/templates/job_submit.lua.j2
deleted file mode 100644
index 22b05df79c76d4e33a0aae386ac6f5102454ee32..0000000000000000000000000000000000000000
--- a/roles/slurm_config.bak/templates/job_submit.lua.j2
+++ /dev/null
@@ -1,70 +0,0 @@
---[[
-
- Example lua script demonstrating the SLURM job_submit/lua interface.
- This is only an example, not meant for use in its current form.
-
- Leave the function names, arguments, local variables and setmetatable
- set up logic in each function unchanged. Change only the logic after
- the line containing "*** YOUR LOGIC GOES BELOW ***".
-
- For use, this script should be copied into a file named "job_submit.lua"
- in the same directory as the SLURM configuration file, slurm.conf.
-
-
---]]
-
-function slurm_job_submit(job_desc, part_list, submit_uid)
-
--- Check no default account
-
-if job_desc.account == "default" then
-    slurm.log_user("You have to specify your project ID as part of your job submission. The account=default is now deprecated on the M3 job scheduler.")
-    return slurm.ERROR
-end
-
--- Check Desktop requests with more than one node
-
-if ((job_desc.name == "desktop") and (job_desc.min_nodes > 1 )) then
-    slurm.log_user("The current M3 Desktop applications are unable to utilise more than one node; please select one node instead")
-    return slurm.ERROR
-end
-
-
--- Check for gres.gpu requirements in m3c, m3h and m3g, else move the job to comp
-
-if ((job_desc.partition == "m3c" ) or (job_desc.partition == "m3h" ) or (job_desc.partition == "m3g" )) then
-    local partition = ""
-    if (job_desc.gres == nil) then
-        partition = "comp"
-        slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition)
-        job_desc.partition = partition
-    end
-    return slurm.SUCCESS
-end
-
--- Check for QOS rtq in m3c, m3h, m3g and partition=nil, then forward the job to rtqp,comp,m3g
-
-if ((job_desc.qos == "rtq") and (job_desc.partition == nil)) then
-    local partition = ""
-    partition = "rtqp,comp,m3g"
-    slurm.log_info("slurm_job_submit: for user: %u, partition: %s", submit_uid, partition)
-    job_desc.partition = partition
-    return slurm.SUCCESS
-end
-
--- Default: accept the job unchanged
-return slurm.SUCCESS
-
-end
-
-
-
-function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
-    return slurm.SUCCESS
-end
-
-slurm.log_info("initialized")
-return slurm.SUCCESS
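Note that job_submit.lua is only consulted when the Lua job-submit plugin is enabled in slurm.conf; the role above installs the script, and the configuration line is assumed to live in the slurm.conf that the slurm_config role copies in. A quick smoke test of the account guard, as a sketch only, with the expected error text taken from the script above:

    # slurm.conf (deployed alongside job_submit.lua in {{ slurm_dir }}/etc) needs the line:
    #   JobSubmitPlugins=lua
    # A submission against the rejected default account should then be refused:
    sbatch --account=default --wrap="hostname"
    # Expected: "You have to specify your project ID as part of your job submission. ..." and a non-zero exit code.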