Commit 5dc83d73 authored by Chris Hines

Merge branch 'grestemplate1' into 'master'

Grestemplate1

Added a script to probe nvidia-smi and return a list of device info (one entry per device file) used to template out gres.conf.
No nvidia-smi or no devices means an empty gres file.
Tested on mc300.

See merge request !76

Former-commit-id: e1897712
parents aae7ba50 26888daa
#!/usr/bin/env python
# Prints a list of NVIDIA devices and their types in JSON format for
# parsing by an ansible play.
# Fields are 'name': 'gpu' (fixed)
#            'file': devicePath (e.g. /dev/nvidia0)
#            'type': typeOfDevice (e.g. K80, parsed from nvidia-smi output)
# The program prints nothing upon error (i.e. no error messages).
# It also checks for the existence of /dev/nvidiaN, where N is the device
# number reported by nvidia-smi.
# nvidia-smi -L produces output like
# GPU 0: Tesla K80 (UUID: GPU-8bdb2956-4c10-7bd0-80d4-46da054663b4)
# GPU 1: Tesla K80 (UUID: GPU-19ed5f7c-435a-036e-54f0-f64209c3cede)
# GPU 2: Tesla K80 (UUID: GPU-a2f8cfe2-5bbc-de2a-8adc-4038f3379b5e)
# GPU 3: Tesla K80 (UUID: GPU-1c9c0d02-4590-c915-18d2-d709efb56d8d)
# GPU 4: Tesla K80 (UUID: GPU-b0f290c8-3b69-a518-ac77-22718f43e946)
# GPU 5: Tesla K80 (UUID: GPU-565ebca2-6b37-3bc0-a355-72330049a349)
# GPU 6: Tesla K80 (UUID: GPU-d8096845-d8a1-e3ef-ad00-c1d069c1b685)
# GPU 7: Tesla K80 (UUID: GPU-20ee0841-22b5-9974-66c0-b49e5be3e469)
import subprocess
import sys
import re
import os
import json

try:
    # run nvidia-smi -L and parse its output
    p = subprocess.Popen(['nvidia-smi', '-L'], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    lines = out.strip().split('\n')
    deviceList = []  # list of devices returned as JSON
    for line in lines:
        if not line:
            break
        pe = re.compile(r'GPU\s*(\d*).*Tesla\s*(\S*)')
        m = pe.search(line)
        if not m:
            # line does not describe a Tesla GPU; stop parsing
            break
        numberOfDevice = m.group(1)
        typeOfDevice = m.group(2)
        # check that the corresponding device file exists
        devicePath = "/dev/nvidia" + numberOfDevice
        if os.path.exists(devicePath):
            deviceList.append({'name': 'gpu', 'file': devicePath, 'type': typeOfDevice})
        else:
            # device file not found; exit quietly so the gres file stays empty
            sys.exit(0)
    # convert the list to JSON and print it for ansible to register
    output = json.dumps(deviceList)
    print output
except OSError:
    # subprocess.Popen raises OSError if nvidia-smi is not installed
    sys.exit(0)
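For illustration only (not part of the commit): on a node with the Tesla K80 cards shown in the comment above, and with /dev/nvidia0 and /dev/nvidia1 present, the script would print JSON roughly like the line below. The ansible tasks that follow register this output and parse it with from_json.

[{"name": "gpu", "file": "/dev/nvidia0", "type": "K80"}, {"name": "gpu", "file": "/dev/nvidia1", "type": "K80"}]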
@@ -124,21 +124,19 @@
 - include: installSlurmFromSource.yml
 
-- name: check slurm generic resource
-  shell: "{{ slurm_gres_check }}"
-  register: slurm_generic_resource
-  ignore_errors: true
-  when: slurm_gres_check is defined
+- name: Gres - Test for Nvidia devices
+  script: ./nvidia-probe.py
+  register: probeOutput
 
-- name: install gres config file
-  template: src=gres.conf.j2 dest={{ slurm_dir }}/etc/gres.conf mode=644
-  sudo: true
-  when: slurm_generic_resource is defined and slurm_generic_resource.stdout
+- set_fact: slurm_gres_list= "[ ]"
+
+- name: "set nvidiaprobe slurm_gres_list"
+  set_fact: slurm_gres_list={{ probeOutput.stdout | from_json }}
+  when: probeOutput.stdout is defined and ( probeOutput.stdout|length !=0 )
 
-- name: install gres sub config file
-  template: src=gres_sub.conf.j2 dest={{ slurm_dir }}/etc/gres/gres.conf mode=644
-  sudo: true
+- name: template gres.conf file
+  template: src="gres.conf.j2" dest={{ slurm_dir }}/etc/gres/gres.conf mode=644
+  sudo: true
+  when: slurm_gres_list is defined
 
 - name: install slurm prolog
   template: src=slurm.prolog.j2 dest={{ slurm_dir }}/bin/slurm.prolog mode=755
gres.conf.j2:

+#slurm gres file for {{ ansible_hostname }}
+#No Of Devices={{ slurm_gres_list | length }}
 {% for gr in slurm_gres_list %}
-Name={{ gr.name }} File={{ gr.file }}
+Name={{ gr.name }} Type={{ gr.type }} File={{ gr.file }}
 {% endfor %}
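For illustration only: with the two-device example output above, the templated gres.conf would render roughly as follows (mc300 stands in for the hostname, as in the commit message).

#slurm gres file for mc300
#No Of Devices=2
Name=gpu Type=K80 File=/dev/nvidia0
Name=gpu Type=K80 File=/dev/nvidia1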