#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
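#
# This template expects at least the following Ansible variables; the
# values shown are illustrative only (hypothetical hostnames and paths):
#
#   clustername: mycluster
#   slurmctrl: mgmt01
#   slurmstatedir: /var/spool/slurm/state
#   slurmdatadir: /var/spool/slurm/d
#   slurmpiddir: /var/run/slurm
#   slurmschedulertype: sched/backfill
#   slurmselecttype: select/cons_res
#   slurmfastschedule: 1
#   slurmqueues:
#     - { name: batch, group: ComputeNodes, default: true }
#
# Optional: slurm_lua, slurmjob, slurmctlddebug, slurmddebug,
# slurmschedlog, slurmfairshare (shapes are sketched where they are used).
#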
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmstatedir }}
SlurmdSpoolDir={{ slurmdatadir }}
SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
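# task/cgroup also requires a cgroup.conf next to slurm.conf. A minimal
# sketch (illustrative settings; tune for your site):
#
#   CgroupAutomount=yes
#   ConstrainCores=yes
#   ConstrainRAMSpace=yes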
{% if slurm_lua is defined %}
JobSubmitPlugins=lua
{% endif %}
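# With JobSubmitPlugins=lua, slurmctld loads job_submit.lua from the same
# directory as slurm.conf. A minimal no-op sketch (illustrative):
#
#   function slurm_job_submit(job_desc, part_list, submit_uid)
#       return slurm.SUCCESS
#   end
#   function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
#       return slurm.SUCCESS
#   end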
OverTimeLimit=1
CompleteWait=10

#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS

# Raised because transient network failures were causing jobs to be killed.
SlurmctldTimeout=3000

#SlurmctldTimeout=300
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType={{ slurmschedulertype }}
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType={{ slurmselecttype }}
{% if "cons_res" in slurmselecttype %}
SelectTypeParameters=CR_Core_Memory
{% endif %}
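# Example rendering, assuming slurmschedulertype=sched/backfill and
# slurmselecttype=select/cons_res in the inventory:
#
#   SchedulerType=sched/backfill
#   SelectType=select/cons_res
#   SelectTypeParameters=CR_Core_Memory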
FastSchedule={{ slurmfastschedule }}
#PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=10000
#PriorityWeightAge=10000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=10000
#PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug | default(false) %}
SlurmctldDebug={{ slurmctlddebug.level }}
SlurmctldLogFile={{ slurmctlddebug.log }}
{% else %}
#SlurmctldDebug=
#SlurmctldLogFile=
{% endif %}
{% if slurmddebug | default(false) %}
SlurmdDebug={{ slurmddebug.level }}
SlurmdLogFile={{ slurmddebug.log }}
{% else %}
#SlurmdDebug=
#SlurmdLogFile=
{% endif %}
{% if slurmschedlog | default(false) %}
SlurmSchedLogLevel={{ slurmschedlog.level }}
SlurmSchedLogFile={{ slurmschedlog.log }}
{% else %}
#SlurmSchedLogLevel=
#SlurmSchedLogFile=
{% endif %}
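# The three logging variables above are mappings with a "level" and a "log"
# key, e.g. in group_vars (illustrative paths and levels):
#
#   slurmctlddebug: { level: 5, log: /var/log/slurm/slurmctld.log }
#   slurmddebug:    { level: 5, log: /var/log/slurm/slurmd.log }
#   slurmschedlog:  { level: 1, log: /var/log/slurm/slurmsched.log }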
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% endif %}
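# slurmjob, when defined, supplies site prolog/epilog script paths, e.g.
# (illustrative):
#
#   slurmjob: { prolog: /opt/slurm/etc/prolog.sh, epilog: /opt/slurm/etc/epilog.sh }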
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmctrl }}
#AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
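# Accounting is delegated to slurmdbd on the controller; the cluster still
# has to be registered there once, e.g. (run on the controller):
#
#   sacctmgr add cluster {{ clustername }}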
#
#GRES
GresTypes=gpu
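# GresTypes=gpu means every node that advertises Gres=gpu:N below also needs
# a gres.conf on that node. A one-GPU sketch (illustrative device path):
#
#   Name=gpu File=/dev/nvidia0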

# Fair share
{% if slurmfairshare is defined and slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
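# slurmfairshare is a mapping with a "def" switch and a "val" weight, e.g.
# (illustrative):
#
#   slurmfairshare: { def: true, val: 10000 }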

DisableRootJobs=YES
MpiParams=ports=12000-12999
# COMPUTE NODES
{# Collect every node referenced by any queue; duplicates are stripped below. #}
{% set nodelist = [] %}
{% for queue in slurmqueues %}
{% for node in groups[queue.group] %}
{% set _ = nodelist.append(node) %}
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
{% set vcpus = hostvars[node]['ansible_processor_vcpus'] %}
{% if vcpus == 1 %}{% set weight = 1 %}
{% elif vcpus <= 16 %}{% set weight = 3 %}
{% elif vcpus <= 20 %}{% set weight = 5 %}
{% elif vcpus <= 40 %}{% set weight = 7 %}
{% elif vcpus <= 64 %}{% set weight = 8 %}
{% elif vcpus <= 128 %}{% set weight = 9 %}
{% else %}{% set weight = 10 %}{% endif %}
NodeName={{ node }} Procs={{ vcpus }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ vcpus }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if 'vis' in hostvars[node].ansible_hostname %}Gres=gpu:1 {% endif %}Weight={{ weight }} Feature=stage1 State=UNKNOWN
{% endfor %}
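# Example rendered node line (hypothetical 16-vCPU, single-threaded node
# named node01 with 64 GB of RAM):
#
#   NodeName=node01 Procs=16 RealMemory=64000 Sockets=16 CoresPerSocket=1 ThreadsPerCore=1 Weight=3 Feature=stage1 State=UNKNOWN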

{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default | default(false) %}Default=YES {% endif %}Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP
{% endfor %}
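# Example rendered partition line (hypothetical "batch" queue whose group
# contains node01 and node02):
#
#   PartitionName=batch Default=YES Nodes=node01,node02 DefaultTime=72:00:00 State=UP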