# Patch summary (four files changed):
#   roles/lustre-client/vars/CentOS_7.yaml
#     lustre_pkgs now lists the 2.7.65 / 3.10.0_327.4.4 client RPMs; the 2.7.0 /
#     3.10.0_229.14.1 entries and a 3.10.0_229.20.1 pair remain only as comments.
#   roles/mellanox_drivers/tasks/main.yml
#     Removes what appears to be a blank line after include_vars; the
#     "restart machine" task switches from the command module to shell and
#     from poll: 0 to poll: 1.
#   roles/mellanox_drivers/vars/mellanoxVars.yml
#     RedHat MELLANOX_DRIVER_SRC changes from ...rhel7.1-x86_64-ext to
#     ...rhel7.2-x86_64-ext; the previous value is kept as a comment.
#   roles/slurm-common/templates/slurm.conf.j2
#     Adds SlurmctldTimeout=3000 and raises the partition DefaultTime from
#     24:00:00 to 72:00:00.
# NOTE(review): the line structure of this diff has been lost -- records are run
#   together and the mellanoxVars.yml hunk header is split across two lines.
#   Leading whitespace cannot be recovered from this text; regenerate the patch
#   from the commit before trying to apply it.
# NOTE(review): "restart machine" sleeps 5 s before shutdown but runs with
#   async: 2 and poll: 1 -- confirm the reboot is meant to be polled rather than
#   fire-and-forget (poll: 0), and that the async limit is long enough.
# NOTE(review): the added SlurmctldTimeout comment cites jobs being killed by
#   network failures; presumably SlurmdTimeout is the setting that governs
#   unresponsive nodes -- verify against slurm.conf(5).
diff --git a/roles/lustre-client/vars/CentOS_7.yaml b/roles/lustre-client/vars/CentOS_7.yaml index 762d8c74619a6eeb819d18bdefd9d379c705a9a4..2d3cc205004a9f61ef3859aa90bfb29c0c6f4036 100644 --- a/roles/lustre-client/vars/CentOS_7.yaml +++ b/roles/lustre-client/vars/CentOS_7.yaml @@ -1,5 +1,12 @@ --- lustre_pkgs: - - lustre-client-modules-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm - - lustre-client-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm + # old rpms for older kernel + #- lustre-client-modules-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm + #- lustre-client-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm + #simon comment out + #- lustre-client-modules-2.7.0-3.10.0_229.20.1.el7.x86_64.x86_64.rpm + #- lustre-client-2.7.0-3.10.0_229.20.1.el7.x86_64.x86_64.rpm + # shahahh mods + - lustre-client-modules-2.7.65-3.10.0_327.4.4.el7.x86_64_gab38c3a.x86_64.rpm + - lustre-client-2.7.65-3.10.0_327.4.4.el7.x86_64_gab38c3a.x86_64.rpm diff --git a/roles/mellanox_drivers/tasks/main.yml b/roles/mellanox_drivers/tasks/main.yml index a662dc369342369848fe56b0d4a1bf1590463901..d2541f9f40d2cd8dd2506357443a0b045c9d56be 100644 --- a/roles/mellanox_drivers/tasks/main.yml +++ b/roles/mellanox_drivers/tasks/main.yml @@ -1,7 +1,6 @@ --- - include_vars: mellanoxVars.yml - - name: yum install dependencies yum: name=perl,pciutils,gtk2,atk,cairo,gcc-gfortran,libxml2-python,tcsh,libnl,lsof,tcl,tk sudo: true @@ -105,9 +104,9 @@ # A REBOOT IS NEEDED AFTER SUCCESSFUL INSTALL # - name: restart machine - command: "sleep 5; sudo shutdown -r now" + shell: "sleep 5; sudo shutdown -r now" async: 2 - poll: 0 + poll: 1 ignore_errors: true sudo: true when: ansible_os_family=="RedHat" and drivers_installed|failed diff --git a/roles/mellanox_drivers/vars/mellanoxVars.yml b/roles/mellanox_drivers/vars/mellanoxVars.yml index e2277dd40ad203653226ea0a356c0806689d7822..77f83630d62fb6c7317a61386adce9fca60775c7 100644 --- a/roles/mellanox_drivers/vars/mellanoxVars.yml +++ b/roles/mellanox_drivers/vars/mellanoxVars.yml @@ 
-1,4 +1,5 @@ --- #note. do not add '.tgz' to driver src. done in playbook - MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-3.1-1.0.3-rhel7.1-x86_64-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" + #MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-3.1-1.0.3-rhel7.1-x86_64-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" + MELLANOX_DRIVER_SRC: "{% if ansible_os_family == 'RedHat' %}MLNX_OFED_LINUX-3.1-1.0.3-rhel7.2-x86_64-ext{% elif ansible_os_family == 'Debian' %}MLNX_OFED_LINUX-3.1-1.0.3-ubuntu14.04-x86_64{% endif %}" MELLANOX_DEVICE_NAME: "{% if ansible_os_family == 'RedHat' %}ens6{% elif ansible_os_family == 'Debian' %}eth1{% endif %}" diff --git a/roles/slurm-common/templates/slurm.conf.j2 b/roles/slurm-common/templates/slurm.conf.j2 index b3770b09ec71e4128d0819113dc14982a256a52d..4d868b18af4d1f62074380c95a438b7f707f8858 100644 --- a/roles/slurm-common/templates/slurm.conf.j2 +++ b/roles/slurm-common/templates/slurm.conf.j2 @@ -58,6 +58,9 @@ CompleteWait=10 #UsePAM= # # TIMERS + +SlurmctldTimeout=3000 #added due to network failures causing jobs to be killed + #SlurmctldTimeout=300 #SlurmdTimeout=300 #InactiveLimit=0 @@ -150,5 +153,5 @@ NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMe {% endfor %} {% for queue in slurmqueues %} -PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=24:00:00 State=UP +PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }} DefaultTime=72:00:00 State=UP {% endfor %}