Commit 2fa0b3ef authored by Chris Hines's avatar Chris Hines
Browse files

Merge branch 'master' into 'master'

Master

The reboot after the Mellanox driver install did not work; converted "command:" to "shell:".
New drivers are needed for the new kernel version.

See merge request !6
parents e1d8dc2b 0a676a01
--- ---
lustre_pkgs: lustre_pkgs:
- lustre-client-modules-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm # old rpms for older kernel
- lustre-client-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm #- lustre-client-modules-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm
#- lustre-client-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm
#simon comment out
#- lustre-client-modules-2.7.0-3.10.0_229.20.1.el7.x86_64.x86_64.rpm
#- lustre-client-2.7.0-3.10.0_229.20.1.el7.x86_64.x86_64.rpm
# shahahh mods
- lustre-client-modules-2.7.65-3.10.0_327.4.4.el7.x86_64_gab38c3a.x86_64.rpm
- lustre-client-2.7.65-3.10.0_327.4.4.el7.x86_64_gab38c3a.x86_64.rpm
--- ---
- include_vars: mellanoxVars.yml - include_vars: mellanoxVars.yml
- name: yum install dependencies - name: yum install dependencies
yum: name=perl,pciutils,gtk2,atk,cairo,gcc-gfortran,libxml2-python,tcsh,libnl,lsof,tcl,tk yum: name=perl,pciutils,gtk2,atk,cairo,gcc-gfortran,libxml2-python,tcsh,libnl,lsof,tcl,tk
sudo: true sudo: true
...@@ -105,9 +104,9 @@ ...@@ -105,9 +104,9 @@
# A REBOOT IS NEEDED AFTER SUCCESSFUL INSTALL # A REBOOT IS NEEDED AFTER SUCCESSFUL INSTALL
# #
- name: restart machine - name: restart machine
command: "sleep 5; sudo shutdown -r now" shell: "sleep 5; sudo shutdown -r now"
async: 2 async: 2
poll: 0 poll: 1
ignore_errors: true ignore_errors: true
sudo: true sudo: true
when: ansible_os_family=="RedHat" and drivers_installed|failed when: ansible_os_family=="RedHat" and drivers_installed|failed
......
--- ---
#note. do not add '.tgz' to driver src. done in playbook #note. do not add '.tgz' to driver src. done in playbook
MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-3.1-1.0.3-rhel7.1-x86_64-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" #MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-3.1-1.0.3-rhel7.1-x86_64-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}"
MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-3.1-1.0.3-rhel7.2-x86_64-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}"
MELLANOX_DEVICE_NAME: "{% if ansible_os_family == 'RedHat' %}ens6{% elif ansible_os_family == 'Debian' %}eth1{% endif %}" MELLANOX_DEVICE_NAME: "{% if ansible_os_family == 'RedHat' %}ens6{% elif ansible_os_family == 'Debian' %}eth1{% endif %}"
...@@ -58,6 +58,9 @@ CompleteWait=10 ...@@ -58,6 +58,9 @@ CompleteWait=10
#UsePAM= #UsePAM=
# #
# TIMERS # TIMERS
SlurmctldTimeout=3000 #added due to network failures causing jobs to be killed
#SlurmctldTimeout=300 #SlurmctldTimeout=300
#SlurmdTimeout=300 #SlurmdTimeout=300
#InactiveLimit=0 #InactiveLimit=0
...@@ -150,5 +153,5 @@ NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMe ...@@ -150,5 +153,5 @@ NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMe
{% endfor %} {% endfor %}
{% for queue in slurmqueues %} {% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=24:00:00 State=UP PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP
{% endfor %} {% endfor %}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment