Commit 5ab8b75d authored by Simon Michnowicz (Monash University)

Modified the gres.conf template so that the correct Nvidia devices are now placed in gres.conf,
depending upon the node. This is done by a script, nvidia-probe.py, which probes the node for the
correct information.


Former-commit-id: 9842e3f5
parent 910efb7c
#!/usr/bin/env python
# Prints a list of NVIDIA devices and their types in JSON format for
# parsing by an ansible playbook.
# Fields are 'name': 'gpu' (fixed)
#            'file': devicePath (e.g. /dev/nvidia0)
#            'type': typeOfDevice (e.g. K80, parsed from nvidia-smi output)
# The program prints nothing upon error (i.e. no error messages).
# It also checks for the existence of /dev/nvidia? where ? is the device
# number reported by nvidia-smi.
# nvidia-smi -L produces output like:
# GPU 0: Tesla K80 (UUID: GPU-8bdb2956-4c10-7bd0-80d4-46da054663b4)
# GPU 1: Tesla K80 (UUID: GPU-19ed5f7c-435a-036e-54f0-f64209c3cede)
# GPU 2: Tesla K80 (UUID: GPU-a2f8cfe2-5bbc-de2a-8adc-4038f3379b5e)
# GPU 3: Tesla K80 (UUID: GPU-1c9c0d02-4590-c915-18d2-d709efb56d8d)
# GPU 4: Tesla K80 (UUID: GPU-b0f290c8-3b69-a518-ac77-22718f43e946)
# GPU 5: Tesla K80 (UUID: GPU-565ebca2-6b37-3bc0-a355-72330049a349)
# GPU 6: Tesla K80 (UUID: GPU-d8096845-d8a1-e3ef-ad00-c1d069c1b685)
# GPU 7: Tesla K80 (UUID: GPU-20ee0841-22b5-9974-66c0-b49e5be3e469)
import subprocess
import sys
import re
import os
import json

try:
    # run nvidia-smi -L and parse its output
    p = subprocess.Popen(['nvidia-smi', '-L'], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    lines = out.strip().split('\n')
    deviceList = []  # list returned as JSON
    pe = re.compile(r'GPU\s*(\d*).*Tesla\s*(\S*)')
    for line in lines:
        if not line:
            break
        m = pe.search(line)
        if not m:
            # line does not describe a Tesla GPU; stop parsing
            break
        numberOfDevice = m.group(1)
        typeOfDevice = m.group(2)
        # check that the corresponding device file exists
        devicePath = "/dev/nvidia" + numberOfDevice
        if os.path.exists(devicePath):
            deviceList.append({'name': 'gpu', 'file': devicePath, 'type': typeOfDevice})
        else:
            # device file not found: exit silently so ansible sees empty output
            sys.exit(0)
    # convert the list to JSON for ansible to consume
    output = json.dumps(deviceList)
    print(output)
except OSError:
    # nvidia-smi is not installed on this node; subprocess.Popen raises OSError
    sys.exit(0)
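
For the eight-GPU K80 node shown in the header comment, and assuming /dev/nvidia0 through /dev/nvidia7 all exist, the script would print a single JSON list along these lines (device count and order depend on the node):

[{"name": "gpu", "file": "/dev/nvidia0", "type": "K80"}, {"name": "gpu", "file": "/dev/nvidia1", "type": "K80"}, ..., {"name": "gpu", "file": "/dev/nvidia7", "type": "K80"}]

If nvidia-smi is not installed, or a matching /dev/nvidia? device file is missing, it prints nothing and exits with status 0, which the tasks below rely on.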
@@ -124,21 +124,19 @@
 - include: installSlurmFromSource.yml

-- name: check slurm generic resource
-  shell: "{{ slurm_gres_check }}"
-  register: slurm_generic_resource
-  ignore_errors: true
-  when: slurm_gres_check is defined
+- name: Gres - Test for Nvidia devices
+  script: ./nvidia-probe.py
+  register: probeOutput

-- name: install gres config file
-  template: src=gres.conf.j2 dest={{ slurm_dir }}/etc/gres.conf mode=644
-  sudo: true
-  when: slurm_generic_resource is defined and slurm_generic_resource.stdout
+- set_fact: slurm_gres_list="[ ]"

-- name: install gres sub config file
-  template: src=gres_sub.conf.j2 dest={{ slurm_dir }}/etc/gres/gres.conf mode=644
+- name: "set nvidiaprobe slurm_gres_list"
+  set_fact: slurm_gres_list={{ probeOutput.stdout | from_json }}
+  when: probeOutput.stdout is defined and ( probeOutput.stdout|length !=0 )
+
+- name: template gres.conf file
+  template: src="gres.conf.j2" dest={{ slurm_dir }}/etc/gres/gres.conf mode=644
   sudo: true
+  when: slurm_gres_list is defined

 - name: install slurm prolog
   template: src=slurm.prolog.j2 dest={{ slurm_dir }}/bin/slurm.prolog mode=755
...
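
With the probe output above, the "set nvidiaprobe slurm_gres_list" task parses the script's stdout with the from_json filter, so slurm_gres_list becomes a list of dicts roughly equivalent to:

  - { name: gpu, file: /dev/nvidia0, type: K80 }
  - { name: gpu, file: /dev/nvidia1, type: K80 }
  ...

On a node where the probe prints nothing, the when: condition is false and slurm_gres_list keeps the placeholder value set by the preceding set_fact, so it is still defined when the gres.conf template task runs.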
+#slurm gres file for {{ ansible_hostname }}
+#No Of Devices={{ slurm_gres_list | length }}
 {% for gr in slurm_gres_list %}
-Name={{ gr.name }} File={{ gr.file }}
+Name={{ gr.name }} Type={{ gr.type }} File={{ gr.file }}
 {% endfor %}
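
Rendered on the example K80 node from the probe output above (the hostname here is illustrative), the resulting gres.conf would look something like:

#slurm gres file for gpu-node01
#No Of Devices=8
Name=gpu Type=K80 File=/dev/nvidia0
Name=gpu Type=K80 File=/dev/nvidia1
...
Name=gpu Type=K80 File=/dev/nvidia7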