From 4662d60752383d398a1cf4de77772b15265513d2 Mon Sep 17 00:00:00 2001 From: Chris Hines <chris.hines@monash.edu> Date: Fri, 30 Jun 2023 13:16:38 +1000 Subject: [PATCH] make sure we will processes on TimeoutExpired ... also use start_new_session which may help with killing processes in an uninteruptable state --- tes/sshwrapper/__init__.py | 42 +++++++++++--------------------------- tes/tunnelstat/__init__.py | 7 ++++--- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/tes/sshwrapper/__init__.py b/tes/sshwrapper/__init__.py index e8ecbf6..45424f3 100644 --- a/tes/sshwrapper/__init__.py +++ b/tes/sshwrapper/__init__.py @@ -98,10 +98,14 @@ class Ssh: @staticmethod def check_ctrlsocket(sess, ctrlsocket): cmd = ['ssh','-S',ctrlsocket,'-O','check','-l','noone','nohost'] + check_p = None try: - check_p = subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE,stdin=None) + check_p = subprocess.Popen(cmd,start_new_session=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE,stdin=None) (stderr,stdout) = check_p.communicate(timeout=Ssh.TIMEOUT) except subprocess.TimeoutExpired as e: + if check_p is not None: + check_p.kill() + (stderr, stdout) = check_p.communicate() raise e return check_p.returncode == 0 @@ -115,7 +119,6 @@ class Ssh: import stat import logging logger = logging.getLogger() - logger.debug("enter get_ctrl_master_socket") ctrlsocket = "/tmp/cm-{}-{}-{}".format(user,host,sess.sshsessid) sess.lock.acquire() try: @@ -128,7 +131,7 @@ class Ssh: try: mode = os.stat(ctrlsocket).st_mode - logger.debug('removing stale control socket') + logger.debug(f'removing stale control socket for {user}') os.unlink(ctrlsocket) # This might occur if a container was restarted and the file wasn't removed # This is expected: It indicates there is not yet a control master for this user on this login node except FileNotFoundError as e: @@ -146,12 +149,12 @@ class Ssh: logger.debug("Exception get_ctrl_master_socket") raise SshAgentException("No ssh-agent yet") env['SSH_AUTH_SOCK'] = sess.socket - logger.debug("creating master socket") - logger.debug("ssh_auth_sock {}".format(sess.socket)) + logger.debug(f'creating master socket for {user}') logger.debug(" ".join(sshcmd)) ctrl_p = subprocess.Popen(sshcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=None, + start_new_session=True, env=env) (stderr,stdout) = ctrl_p.communicate() # The process should exit, while the mux persists except Exception as e: @@ -304,7 +307,6 @@ class Ssh: import os import logging logger = logging.getLogger() - logger.debug("enter execute") if cmd is None and stdin is None: return {'stdout': b'', 'stderr': b'No command given to execute'} env = os.environ.copy() @@ -341,18 +343,17 @@ class Ssh: sshcmd.extend([ '-o', "ProxyCommand={}".format(proxycmd), '-p', str(bastionsshport), '-l', user, bastion, cmd]) - exec_p = subprocess.Popen(sshcmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE,stdin=subprocess.PIPE,env=env) + exec_p = subprocess.Popen(sshcmd,start_new_session=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE,stdin=subprocess.PIPE,env=env) try: if stdin is not None: - logger.debug('begining comms with timeout') (stdout, stderr) = exec_p.communicate(stdin.encode(),timeout=Ssh.TIMEOUT) else: - logger.debug('begining comms with timeout') (stdout, stderr) = exec_p.communicate(timeout=Ssh.TIMEOUT) except subprocess.TimeoutExpired as e: logger.debug('cmd timed out') - logger.debug("exception execute") + exec_p.kill() + (stdout,stderr) = exec_p.communicate() raise e #raise SshExecException(message="A program timed out on the backend server") if exec_p.returncode == 255: @@ -393,7 +394,6 @@ class Ssh: else: msg = "The program {} on {} failed. The error message was {}".format(cmd,host,stderr.decode()) raise SshExecException(message=msg) - logger.debug("leaving execute") # Because people often put stuff in their bashrc which casuses stderr to be non-empty, we will swallow errors in the bashrc without comment if b'bashrc' in stderr or b'locale' in stderr: return {'stdout':stdout,'stderr':b''} @@ -414,7 +414,6 @@ class Ssh: import os import logging logger = logging.getLogger() - logger.debug("enter tunnel") env = os.environ.copy() env['SSH_AUTH_SOCK'] = sess.socket if ":" in host: @@ -452,9 +451,7 @@ class Ssh: '-N', '-p', sshport, '-l', user, batchhost]) - logger.debug('starting tunnel with {}'.format(" ".join(sshcmd))) - tunnel_p = subprocess.Popen(sshcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - logger.debug('tunnel created with pid {}'.format(tunnel_p.pid)) + tunnel_p = subprocess.Popen(sshcmd, start_new_session=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) try: if not Ssh.wait_for_tunnel(localport): @@ -476,18 +473,6 @@ class Ssh: sess.port.update({authtok: localport}) sess.pids.append(tunnel_p.pid) sess.tunnels.append(tunnel_p) -# try: -# #tunnel_p.wait(timeout=3) -# import time -# time.sleep(3) -# tunnel_p.poll() -# if tunnel_p.returncode != None: -# (stdout,stderr) = tunnel_p.communicate() -# logger.error('tunnel process exited {} {} {}'.format(tunnel_p.returncode, stdout, stderr)) -# sess.tunnels.remove(tunnel_p) -# except subprocess.TimeoutExpired: -# pass - logger.debug("leaving tunnel") return localport, [tunnel_p.pid] # @staticmethod @@ -505,14 +490,11 @@ class Ssh: while notopen: ssock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) ssock.setblocking(True) - logger.debug('testing port {}'.format(localport)) try: ssock.connect(('127.0.0.1', localport)) - logger.debug('port is open') notopen = False ssock.close() except socket.error: - logger.debug('port is not open yet') time.sleep(0.1) ssock.close() return True diff --git a/tes/tunnelstat/__init__.py b/tes/tunnelstat/__init__.py index bddc41b..569ab9b 100644 --- a/tes/tunnelstat/__init__.py +++ b/tes/tunnelstat/__init__.py @@ -37,7 +37,7 @@ class SSHSession: if app.config['ENABLELAUNCH'] and 'SSH_AUTH_SOCK' in os.environ and os.environ['SSH_AUTH_SOCK']: self.socket = os.environ['SSH_AUTH_SOCK'] return - p = subprocess.Popen([self.sshagent],stdout=subprocess.PIPE,stderr=subprocess.PIPE) + p = subprocess.Popen([self.sshagent],start_new_session=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE) (stdout,stderr) = p.communicate() for l in stdout.decode().split(';'): if 'SSH_AUTH_SOCK=' in l: @@ -187,13 +187,14 @@ class SSHSession: for pid in self.pids: logger.debug("killing pid {}".format(pid)) try: - os.killpg(int(pid), signal.SIGTERM) # Sometimes this fails and I don't know why + os.killpg(os.getpgid(int(pid)), signal.SIGTERM) # Sometimes this fails and I don't know why try: os.kill(int(pid), 0) # If the first kill worked, this will raise a ProcessLookupError time.sleep(2) - os.killpg(int(pid),signal.SIGKILL) + os.killpg(os.getpgid(int(pid)),signal.SIGKILL) logger.error('resorting to sigkill for pid {}'.format(pid)) except ProcessLookupError: + logger.debug(f'sigterm succesfull for {pid}') pass logger.debug("killed {}".format(pid)) except ProcessLookupError as e: -- GitLab