From 6bf2f7d598af0bc4dcc8bf11d1abe0017ea0596c Mon Sep 17 00:00:00 2001
From: handreas <andreas.hamacher@monash.edu>
Date: Tue, 7 Dec 2021 19:51:05 +1100
Subject: [PATCH] Improve service reliability; still not fully satisfactory

---
 maintenance.yml                                    |  4 ++--
 roles/mellanox_config/templates/pfc-ecn.sh.j2      |  3 +++
 .../mellanox_config/templates/roce_mode.service.j2 | 14 +++++---------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/maintenance.yml b/maintenance.yml
index 2436b711..87fe2ad0 100644
--- a/maintenance.yml
+++ b/maintenance.yml
@@ -68,7 +68,7 @@
     become: true
     tags: [never,sqlverify]
 
-- hosts: 'LoginNodes:!perfsonar01:!GlobusNodes'
+- hosts: 'LoginNodes:!perfsonar01'
   gather_facts: false
   tasks:
   - name: set nologin
@@ -87,7 +87,7 @@
   - name: terminate user ssh processes
     block:
       - { name: kill shells, shell: 'ps aux | grep -i bash | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root }
-      - { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root }
+      - { name: kill rsync sftp scp, shell: 'ps aux | egrep "sleep|sh|rsync|sftp|scp|sftp-server|sshd" | grep -v "ec2-user" | grep -v "root" | sed "s/\ \ */\ /g" | cut -f 2 -d " " | xargs -I{} kill -09 {}', become: true, become_user: root }
       - { name: kill vscode, shell: 'pgrep -f vscode | xargs -I{} kill -09 {}', become: true, become_user: root, ignore_errors: true }
     become: true
     tags: [never,terminateusersshscprsync]
diff --git a/roles/mellanox_config/templates/pfc-ecn.sh.j2 b/roles/mellanox_config/templates/pfc-ecn.sh.j2
index 82bffe29..be5f4291 100644
--- a/roles/mellanox_config/templates/pfc-ecn.sh.j2
+++ b/roles/mellanox_config/templates/pfc-ecn.sh.j2
@@ -66,6 +66,9 @@ for int in "${mellanox_interface_array[@]}"; do
    `which mlnx_qos` -i $int --pfc 0,0,0,1,0,0,0,0
 done
 
+/sbin/ethtool -A p1p1 rx off tx off || /bin/true
+/sbin/ethtool -A p1p2 rx off tx off || /bin/true
+
 {% endif %}
 
 {% if "OpenStack Nova" in ansible_product_name %}
diff --git a/roles/mellanox_config/templates/roce_mode.service.j2 b/roles/mellanox_config/templates/roce_mode.service.j2
index 3c36753a..5cfc0b6b 100644
--- a/roles/mellanox_config/templates/roce_mode.service.j2
+++ b/roles/mellanox_config/templates/roce_mode.service.j2
@@ -10,20 +10,16 @@ RemainAfterExit=yes
 ExecStart=/usr/local/bin/flush_neigh_devs.sh
 {% endif %}
 
+{% if rocemode == "2" %}
+ExecStartPre=/bin/sleep 30
+{% endif %}
+
 {% for device in qibdevicenames.stdout_lines %}
 ExecStart=/usr/sbin/cma_roce_mode -d {{ device }} -p 1 -m {{ rocemode }}
 {% endfor %}
 
 {% if rocemode == "2" %}
-ExecStartPost=/usr/sbin/pfc-ecn.sh
-{% endif %}
-
-{% if "OpenStack Nova" not in ansible_product_name %}
-ExecStartPost=/sbin/ethtool -A p1p1 rx off tx off
-ExecStartPost=/sbin/ethtool -A p1p2 rx off tx off
-{% else %}
-# disable global pause counters. Query via ethtool -a mlx0. Commented because they are always off anyway and turning them of returns code 78
-#ExecStartPost=/sbin/ethtool -A mlx0 rx off tx off
+ExecStart=/usr/sbin/pfc-ecn.sh
 {% endif %}
 
 
-- 
GitLab