Commit 9a1451a6 authored by Andreas Hamacher

Merge remote-tracking branch 'origin/master' into upcomingMaintenance

Parents: 1c365c65, 8dbc2567
Merge request !460: Upcoming maintenance
@@ -73,6 +73,7 @@
- hwloc-devel
- lua
- lua-devel
- python3
become: true
when: ansible_os_family == "RedHat"
@@ -92,6 +93,7 @@
- liblua5.2-dev
- hwloc
- libhwloc-dev
- python3
become: true
when: ansible_os_family == "Debian"
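Both hunks above add python3 to the dependency lists for their respective OS families. For orientation, a minimal sketch of the kind of task such a list sits in; the task name and the use of the package module are assumptions for illustration, not this repo's actual code:

- name: install slurm build dependencies (illustrative sketch)
  package:
    name:
      - lua
      - lua-devel
      - python3
    state: present
  become: true
  when: ansible_os_family == "RedHat"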
@@ -118,32 +120,12 @@
- include: createSlurmDirectories.yml
- name: check slurm generic resource
shell: "{{ slurm_gres_check }}"
register: slurm_generic_resource
ignore_errors: true
when: slurm_gres_check is defined
check_mode: no
changed_when: False
- name: Gres - Test for Nvidia devices
script: scripts/nvidia-probe.py
register: probeOutput
check_mode: no
changed_when: False
- name: get cpu count
shell: 'lscpu | grep "On-line CPU" | cut -f 2 -d ":" | sed "s/\ *//g"'
register: cpucount
check_mode: no
changed_when: False
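Despite the registered name cpucount, the pipeline above captures the on-line CPU list rather than a count. On a hypothetical 16-vCPU host, lscpu prints a line like

On-line CPU(s) list:   0-15

so after the cut and sed steps the registered stdout is 0-15.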
- name: "set nvidiaprobe slurm_gres_list"
set_fact: "slurm_gres_list={{ probeOutput.stdout }}"
- name: template gres.conf file
template: src="gres.conf.j2" dest={{ slurm_dir }}/etc/gres.conf mode=644
become: true
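The gres.conf.j2 template itself is not part of this hunk. For a node where the probe reports a single NVIDIA GPU, the rendered file would typically contain one line in Slurm's gres.conf syntax (illustrative only):

Name=gpu File=/dev/nvidia0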
- name: make slurm prolog dir
file: path=/opt/slurm/etc state=directory mode=755
@@ -158,15 +140,6 @@
template: src=slurm.epilog.j2 dest=/opt/slurm/etc/slurm.epilog mode=755
become: true
- name: install slurm.conf
copy: src=files/slurm.conf dest={{ slurm_dir }}/etc/slurm.conf
become: true
when: slurm_use_vpn==False
- name: install slurm.conf
template: src=slurm-vpn.conf.j2 dest={{ slurm_dir }}/etc/slurm.conf
become: true
when: slurm_use_vpn==True
- name: setup environment variables
template: src=slurm_setup.sh.j2 dest=/etc/profile.d/slurm_setup.sh
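The slurm_setup.sh.j2 contents are not shown in this diff; a profile.d script for Slurm usually just puts the install tree on the search paths. An illustrative rendering, assuming only the slurm_dir variable used elsewhere in this role:

export PATH={{ slurm_dir }}/bin:$PATH
export MANPATH={{ slurm_dir }}/share/man:$MANPATH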
......
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmstatedir }}
SlurmdSpoolDir={{ slurmdatadir }}
SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
{% if slurm_lua is defined %}
JobSubmitPlugins=lua
{% endif %}
OverTimeLimit=1
CompleteWait=10
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=3000  # raised because network failures were causing jobs to be killed
#SlurmctldTimeout=300
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType={{ slurmschedulertype }}
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType={{ slurmselecttype }}
{% if slurmselecttype.find("cons_res") > 0 %}
SelectTypeParameters=CR_Core_Memory
{% endif %}
FastSchedule={{ slurmfastschedule }}
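SchedulerType, SelectType and FastSchedule are filled from inventory variables not shown in this diff. A plausible set of values, using Slurm's standard option names (illustrative, not taken from this commit):

slurmschedulertype: sched/backfill
slurmselecttype: select/cons_res
slurmfastschedule: 1

With slurmselecttype set to select/cons_res, .find("cons_res") returns 7, so the > 0 test above holds and SelectTypeParameters=CR_Core_Memory is emitted.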
#PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
##PriorityWeightFairshare=10000
#PriorityWeightAge=10000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=10000
#PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
SlurmctldDebug={{ slurmctlddebug.level }}
SlurmctldLogFile={{ slurmctlddebug.log }}
{% else %}
#SlurmctldDebug=
#SlurmctldLogFile=
{% endif %}
{% if slurmddebug %}
SlurmdDebug={{ slurmddebug.level }}
SlurmdLogFile={{ slurmddebug.log }}
{% else %}
#SlurmdDebug=
#SlurmdLogFile=
{% endif %}
{% if slurmschedlog %}
SlurmSchedlogLevel={{ slurmschedlog.level }}
SlurmSchedLogFile={{ slurmschedlog.log }}
{% else %}
#SlurmSchedlogLevel=
#SlurmSchedLogFile=
{% endif %}
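Each logging block keys off a dict variable with level and log entries. A hypothetical group_vars definition matching the names the template expects (paths illustrative):

slurmctlddebug: { level: 5, log: /var/log/slurm/slurmctld.log }
slurmddebug: { level: 5, log: /var/log/slurm/slurmd.log }
slurmschedlog: { level: 1, log: /var/log/slurm/slurmsched.log }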
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% endif %}
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmctrl }}
#AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
# Fair share
{% if slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
DisableRootJobs=YES
MpiParams=ports=12000-12999
# COMPUTE NODES
{% set nodelist = [] %}
{% for queue in slurmqueues %}
{% for node in groups[queue.group] %}
{% if nodelist.append(node) %}
{% endif %}
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
{% endfor %}
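For a hypothetical 16-vCPU, 64 GB node named node01 with one thread per core, the loop above renders a line like:

NodeName=node01 Procs=16 RealMemory=64000 Sockets=16 CoresPerSocket=1 ThreadsPerCore=1 Weight=3 Feature=stage1 State=UNKNOWN

16 vCPUs falls in the >1 and <=16 bracket, hence Weight=3; Gres=gpu:1 would appear only if the hostname contained 'vis'.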
{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP
{% endfor %}
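And for a hypothetical default queue named batch backed by nodes node01 and node02:

PartitionName=batch Default=yes Nodes=node01,node02 DefaultTime=72:00:00 State=UP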
......
[Unit]
Description=Slurm node daemon
After=network.target
# After Slurm-20.02 (configless), this is not needed
# ConditionPathExists={{ slurm_dir }}/etc/slurm.conf
[Service]
Type=forking
KillMode=process
LimitMEMLOCK=infinity
#EnvironmentFile=/etc/default/slurmd
# Pre-configless invocation, superseded by --conf-server below:
# ExecStart={{ slurm_dir }}/sbin/slurmd $SLURMD_OPTIONS
ExecStart={{ slurm_dir }}/sbin/slurmd --conf-server {{ slurmctrl }}:6817
PIDFile={{ slurmpiddir }}/slurmd.pid
[Install]
......