#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
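{#
  Ansible variables consumed by this template. The values shown are
  illustrative placeholders, not authoritative defaults:
    clustername:        "mycluster"
    slurmctrl:          "mgmt0"                 (controller hostname)
    slurmstatedir:      "/var/spool/slurmctld"
    slurmdatadir:       "/var/spool/slurmd"
    slurmpiddir:        "/var/run/slurm"
    slurmschedulertype: "sched/backfill"
    slurmselecttype:    "select/cons_res"
    slurmfastschedule:  1
    slurm_lua, slurmjob: optional (see below)
    slurmctlddebug, slurmddebug, slurmschedlog, slurmfairshare, slurmqueues:
      dicts/lists sketched in comments next to the sections that use them
#}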
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmstatedir }}
SlurmdSpoolDir={{ slurmdatadir }}
SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
ProctrackType=proctrack/linuxproc
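{# proctrack/linuxproc tracks job processes via /proc; sites that use
   task/cgroup (set below) commonly pair it with proctrack/cgroup instead. #}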
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
{% if slurm_lua is defined %}
JobSubmitPlugins=lua
{% endif %}
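{# With JobSubmitPlugins=lua, slurmctld loads job_submit.lua from the same
   directory as slurm.conf; deploying that script is assumed to happen
   elsewhere in the role. #}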
OverTimeLimit=1
CompleteWait=10

#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS

SlurmctldTimeout=3000  # raised so transient network failures do not get jobs killed

#SlurmctldTimeout=300
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType={{ slurmschedulertype }}
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType={{ slurmselecttype }}
{% if "cons_res" in slurmselecttype %}
SelectTypeParameters=CR_Core_Memory
{% endif %}
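{# CR_Core_Memory makes both cores and memory consumable resources, so the
   scheduler tracks per-job memory; plain CR_Core would leave memory
   unmanaged. #}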
FastSchedule={{ slurmfastschedule }}
#PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=10000
#PriorityWeightAge=10000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=10000
#PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
SlurmctldDebug={{ slurmctlddebug.level }}
SlurmctldLogFile={{ slurmctlddebug.log }}
{% else %}
#SlurmctldDebug=
#SlurmctldLogFile=
{% endif %}
{% if slurmddebug %}
SlurmdDebug={{ slurmddebug.level }}
SlurmdLogFile={{ slurmddebug.log }}
{% else %}
#SlurmdDebug=
#SlurmdLogFile=
{% endif %}
{% if slurmschedlog %}
SlurmSchedLogLevel={{ slurmschedlog.level }}
SlurmSchedLogFile={{ slurmschedlog.log }}
{% else %}
#SlurmSchedLogLevel=
#SlurmSchedLogFile=
{% endif %}
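{# Illustrative shape of the three logging dicts (example values only):
     slurmctlddebug: { level: 5, log: "/var/log/slurm/slurmctld.log" }
     slurmddebug:    { level: 5, log: "/var/log/slurm/slurmd.log" }
     slurmschedlog:  { level: 1, log: "/var/log/slurm/slurmsched.log" }
#}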
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% endif %}
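{# Illustrative slurmjob value (paths are hypothetical):
     slurmjob: { prolog: "/opt/slurm/etc/prolog.sh", epilog: "/opt/slurm/etc/epilog.sh" }
#}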
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmctrl }}
#AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
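{# Nodes that advertise Gres=gpu below also need a gres.conf on the node,
   e.g. (device path illustrative):
     Name=gpu File=/dev/nvidia0
#}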

# Fair share
{% if slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
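{# Illustrative slurmfairshare value: { def: true, val: 10000 };
   "def" gates whether the weight is emitted, "val" is the weight itself. #}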

DisableRootJobs=YES
MpiParams=ports=12000-12999
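{# MpiParams reserves a port range for MPI communication (used by OpenMPI);
   the 12000-12999 range is this role's choice. #}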
# COMPUTE NODES
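{# Assumed slurmqueues structure (illustrative):
     slurmqueues:
       - { name: "batch", group: "computenodes", default: true }
       - { name: "vis",   group: "visnodes",    default: false }
   Each "group" must name an Ansible inventory group; the loops below collect
   the union of those groups' hosts into the node list. #}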
{% set nodelist = [] %}
{% for queue in slurmqueues %}
{% for node in groups[queue.group] %}
{% if nodelist.append(node) %}
{% endif %}
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
{% set vcpus = hostvars[node]['ansible_processor_vcpus'] %}
{# Scheduling weight grows with core count so smaller nodes are filled first #}
{% if vcpus <= 1 %}{% set weight = 1 %}
{% elif vcpus <= 16 %}{% set weight = 3 %}
{% elif vcpus <= 20 %}{% set weight = 5 %}
{% elif vcpus <= 40 %}{% set weight = 7 %}
{% elif vcpus <= 64 %}{% set weight = 8 %}
{% elif vcpus <= 128 %}{% set weight = 9 %}
{% else %}{% set weight = 10 %}{% endif %}
{# CPU topology comes from Ansible facts so Sockets x CoresPerSocket x ThreadsPerCore matches Procs #}
NodeName={{ node }} Procs={{ vcpus }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node].ansible_processor_count }} CoresPerSocket={{ hostvars[node].ansible_processor_cores }} ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1 {% endif %}Weight={{ weight }} Feature=stage1 State=UNKNOWN
{% endfor %}

{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=YES {% endif %}Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP
{% endfor %}
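{# Example rendered output for a 16-vCPU, 64 GB node in a "batch" queue
   (names and sizes illustrative):
     NodeName=node01 Procs=16 RealMemory=64000 Sockets=2 CoresPerSocket=8 ThreadsPerCore=1 Weight=3 Feature=stage1 State=UNKNOWN
     PartitionName=batch Default=YES Nodes=node01,node02 DefaultTime=72:00:00 State=UP
#}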