From 6bf2f7d598af0bc4dcc8bf11d1abe0017ea0596c Mon Sep 17 00:00:00 2001 From: handreas <andreas.hamacher@monash.edu> Date: Tue, 7 Dec 2021 19:51:05 +1100 Subject: [PATCH] service reliability improved. still not super satisfied --- maintenance.yml | 4 ++-- roles/mellanox_config/templates/pfc-ecn.sh.j2 | 3 +++ .../mellanox_config/templates/roce_mode.service.j2 | 14 +++++--------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/maintenance.yml b/maintenance.yml index 2436b711..87fe2ad0 100644 --- a/maintenance.yml +++ b/maintenance.yml @@ -68,7 +68,7 @@ become: true tags: [never,sqlverify] -- hosts: 'LoginNodes:!perfsonar01:!GlobusNodes' +- hosts: 'LoginNodes:!perfsonar01' gather_facts: false tasks: - name: set nologin @@ -87,7 +87,7 @@ - name: terminate user ssh processes block: - { name: kill shells, shell: 'ps aux | grep -i bash | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root } - - { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root } + - { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp|sftp-server|sshd" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root } - { name: kill vscode, shell: 'pgrep -f vscode | xargs -I{} kill -09 {}', become: true, become_user: root, ignore_errors: true } become: true tags: [never,terminateusersshscprsync] diff --git a/roles/mellanox_config/templates/pfc-ecn.sh.j2 b/roles/mellanox_config/templates/pfc-ecn.sh.j2 index 82bffe29..be5f4291 100644 --- a/roles/mellanox_config/templates/pfc-ecn.sh.j2 +++ b/roles/mellanox_config/templates/pfc-ecn.sh.j2 @@ -66,6 +66,9 @@ for int in "${mellanox_interface_array[@]}"; do `which mlnx_qos` -i $int --pfc 0,0,0,1,0,0,0,0 done +/sbin/ethtool -A p1p1 rx off tx off || /bin/true +/sbin/ethtool -A p1p2 rx off tx off || /bin/true + {% endif %} {% if "OpenStack Nova" in ansible_product_name %} diff --git a/roles/mellanox_config/templates/roce_mode.service.j2 b/roles/mellanox_config/templates/roce_mode.service.j2 index 3c36753a..5cfc0b6b 100644 --- a/roles/mellanox_config/templates/roce_mode.service.j2 +++ b/roles/mellanox_config/templates/roce_mode.service.j2 @@ -10,20 +10,16 @@ RemainAfterExit=yes ExecStart=/usr/local/bin/flush_neigh_devs.sh {% endif %} +{% if rocemode == "2" %} +ExecStartPre=/bin/sleep 30 +{% endif %} + {% for device in qibdevicenames.stdout_lines %} ExecStart=/usr/sbin/cma_roce_mode -d {{ device }} -p 1 -m {{ rocemode }} {% endfor %} {% if rocemode == "2" %} -ExecStartPost=/usr/sbin/pfc-ecn.sh -{% endif %} - -{% if "OpenStack Nova" not in ansible_product_name %} -ExecStartPost=/sbin/ethtool -A p1p1 rx off tx off -ExecStartPost=/sbin/ethtool -A p1p2 rx off tx off -{% else %} -# disable global pause counters. Query via ethtool -a mlx0. Commented because they are always off anyway and turning them of returns code 78 -#ExecStartPost=/sbin/ethtool -A mlx0 rx off tx off +ExecStart=/usr/sbin/pfc-ecn.sh {% endif %} -- GitLab