From 3b125ca5ad5c83fcd3b23ca13b7f67023634f472 Mon Sep 17 00:00:00 2001
From: Chris Hines <chris.hines@monash.edu>
Date: Fri, 18 Sep 2020 10:30:54 +1000
Subject: [PATCH] still trying to get the ctrl_socket right

---
 tes/sshwrapper/__init__.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tes/sshwrapper/__init__.py b/tes/sshwrapper/__init__.py
index 68d6e3a..b73be84 100644
--- a/tes/sshwrapper/__init__.py
+++ b/tes/sshwrapper/__init__.py
@@ -105,8 +105,8 @@ class Ssh:
         import stat
         import logging
         logger = logging.getLogger()
-        ctrlsocket = "/tmp/cm-{}-{}".format(user,host)
         sess.lock.acquire()
+        ctrlsocket = "/tmp/cm-{}-{}".format(user,host)
         try:
             mode = os.stat(ctrlsocket).st_mode
             # If the control process died (docker restarted, or otherwise exited) but the socket was not removed:
@@ -148,6 +148,7 @@ class Ssh:
                       '-p', sshport, '-N','-l', user, host])
             env = os.environ.copy()
             if sess.socket is None:
+                sess.lock.release()
                 raise SshAgentException("No ssh-agent yet")
             env['SSH_AUTH_SOCK'] = sess.socket
             logger.debug("creating master socket")
@@ -161,6 +162,15 @@ class Ssh:
             #logger.debug('communicate on the control port complete')
             logger.debug("spanwed ssh mux with pid {}".format(ctrl_p.pid))
             #sess.pids.append(ctrl_p.pid)
+            if ctrlsocket in sess.ctrl_processes:
+                logger.error('existing control process!!!')
+                old_ctrl_p = sess.ctrl_processes[ctrlsocket]
+                # poll() returns None only while the old process is still alive; reap it before it is replaced
+                if old_ctrl_p.poll() is None:
+                    logger.error('old ctrl_p is still running?')
+                    old_ctrl_p.kill()
+                    (stdout,stderr) = old_ctrl_p.communicate()
+                    logger.error('{} {}'.format(stdout,stderr))
             sess.ctrl_processes[ctrlsocket] = ctrl_p
             notstarted = True
             notdead = True
@@ -172,6 +182,7 @@ class Ssh:
                     notdead = False
                     (stdout,stderr) = ctrl_p.communicate()
                     logger.error('ctrl_p died {} {} {}'.format(ctrl_p.returncode,stdout,stderr))
+                    sess.lock.release()
                     raise SshCtrlException(stderr.decode())
                 try:
                     mode = os.stat(ctrlsocket).st_mode
@@ -182,6 +193,10 @@ class Ssh:
                     wait=wait+1
                 if wait>60:
                     ctrl_p.kill()
+                    (stdout,stderr) = ctrl_p.communicate()
+                    logger.error('ctrl_p was killed due to timeout {} {} {}'.format(ctrl_p.returncode,stdout,stderr))
+                    sess.lock.release()
+                    raise SshCtrlException(stderr.decode())
 
         sess.lock.release()
         return ctrlsocket
-- 
GitLab