Commit 5dc83d73 authored by Chris Hines

Merge branch 'grestemplate1' into 'master'

Grestemplate1

Added a script to probe nvidia-smi and return a list of device info (one entry per device file) used to template out gres.conf.
No nvidia-smi or no devices means an empty gres file.
Tested on mc300.

See merge request !76

Former-commit-id: e1897712
parents aae7ba50 26888daa
#!/usr/bin/env python
# Prints a list of NVIDIA devices and their types in JSON format for
# parsing by an ansible play.
# Fields are 'name': 'gpu' (fixed)
#            'file': devicePath (e.g. /dev/nvidia0)
#            'type': typeOfDevice (e.g. K80, parsed from nvidia-smi output)
# The program prints nothing upon error (i.e. no error messages).
# It also checks for the existence of /dev/nvidiaN, where N is the device
# number reported by nvidia-smi.
# nvidia-smi -L produces output like
# GPU 0: Tesla K80 (UUID: GPU-8bdb2956-4c10-7bd0-80d4-46da054663b4)
# GPU 1: Tesla K80 (UUID: GPU-19ed5f7c-435a-036e-54f0-f64209c3cede)
# GPU 2: Tesla K80 (UUID: GPU-a2f8cfe2-5bbc-de2a-8adc-4038f3379b5e)
# GPU 3: Tesla K80 (UUID: GPU-1c9c0d02-4590-c915-18d2-d709efb56d8d)
# GPU 4: Tesla K80 (UUID: GPU-b0f290c8-3b69-a518-ac77-22718f43e946)
# GPU 5: Tesla K80 (UUID: GPU-565ebca2-6b37-3bc0-a355-72330049a349)
# GPU 6: Tesla K80 (UUID: GPU-d8096845-d8a1-e3ef-ad00-c1d069c1b685)
# GPU 7: Tesla K80 (UUID: GPU-20ee0841-22b5-9974-66c0-b49e5be3e469)
import subprocess
import sys
import re
import os
import json

try:
    # run nvidia-smi -L and parse its output
    p = subprocess.Popen(['nvidia-smi', '-L'], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    lines = out.strip().split('\n')
    deviceList = []  # list of devices returned as JSON
    for line in lines:
        if not line:
            break
        pe = re.compile(r'GPU\s*(\d*).*Tesla\s*(\S*)')
        m = pe.search(line)
        if not m:
            # line does not describe a Tesla GPU; stop parsing
            break
        numberOfDevice = m.group(1)
        typeOfDevice = m.group(2)
        # check that the corresponding device file exists
        devicePath = "/dev/nvidia" + numberOfDevice
        if os.path.exists(devicePath):
            deviceList.append({'name': 'gpu', 'file': devicePath, 'type': typeOfDevice})
        else:
            # device file not found; exit quietly so the gres file stays empty
            sys.exit(0)
    # convert the list to JSON and print it for ansible to register
    output = json.dumps(deviceList)
    print output
except OSError:
    # subprocess.Popen raises OSError if nvidia-smi is not installed
    sys.exit(0)
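For illustration only (not part of the commit): on a node with the Tesla K80 cards shown in the comment above, and with /dev/nvidia0 and /dev/nvidia1 present, the script would print JSON roughly like the line below. The ansible tasks that follow register this output and parse it with from_json.

[{"name": "gpu", "file": "/dev/nvidia0", "type": "K80"}, {"name": "gpu", "file": "/dev/nvidia1", "type": "K80"}]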
@@ -124,21 +124,19 @@
 - include: installSlurmFromSource.yml
 
-- name: check slurm generic resource
-  shell: "{{ slurm_gres_check }}"
-  register: slurm_generic_resource
-  ignore_errors: true
-  when: slurm_gres_check is defined
+- name: Gres - Test for Nvidia devices
+  script: ./nvidia-probe.py
+  register: probeOutput
 
-- name: install gres config file
-  template: src=gres.conf.j2 dest={{ slurm_dir }}/etc/gres.conf mode=644
-  sudo: true
-  when: slurm_generic_resource is defined and slurm_generic_resource.stdout
+- set_fact: slurm_gres_list= "[ ]"
+
+- name: "set nvidiaprobe slurm_gres_list"
+  set_fact: slurm_gres_list={{ probeOutput.stdout | from_json }}
+  when: probeOutput.stdout is defined and ( probeOutput.stdout|length !=0 )
 
-- name: install gres sub config file
-  template: src=gres_sub.conf.j2 dest={{ slurm_dir }}/etc/gres/gres.conf mode=644
-  sudo: true
+- name: template gres.conf file
+  template: src="gres.conf.j2" dest={{ slurm_dir }}/etc/gres/gres.conf mode=644
+  sudo: true
+  when: slurm_gres_list is defined
 
 - name: install slurm prolog
   template: src=slurm.prolog.j2 dest={{ slurm_dir }}/bin/slurm.prolog mode=755
gres.conf.j2:

+#slurm gres file for {{ ansible_hostname }}
+#No Of Devices={{ slurm_gres_list | length }}
 {% for gr in slurm_gres_list %}
-Name={{ gr.name }} File={{ gr.file }}
+Name={{ gr.name }} Type={{ gr.type }} File={{ gr.file }}
 {% endfor %}
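For illustration only: with the two-device example output above, the templated gres.conf would render roughly as follows (mc300 stands in for the hostname, as in the commit message).

#slurm gres file for mc300
#No Of Devices=2
Name=gpu Type=K80 File=/dev/nvidia0
Name=gpu Type=K80 File=/dev/nvidia1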