""" A script to generate and transfer md5 checksum files from a remote/source server to a local/destination computer. This runs on the local Linux machine, on which the tape archive system is mounted; in our case, this is a machine at Monash. Prior to running this an ssh key pair must be shared between the systems. See https://confluence.apps.monash.edu/display/XI/Australian+Synchrotron for details on how to do this between a Monash Linux machine and ASCI (Australian Synchrotron Compute Infrastructure). Requires Python 3.7 or higher and uses the fabric module. Authors: gary.ruben@monash.edu michelle.croughan@monash.edu Running this creates two files in the same directory as this script file: 1. A .log file named based on the start-time timestamp which is a capture of all stdout activity. 2. A Python pickle file named md5_state.pickle that contains the transfer state from which failed transfers can be restarted by setting the READ_PICKLE_FILE file to True. Known issues ------------ Note: The current version of fabric generates harmless warnings. This issue is discussed here: https://github.com/paramiko/paramiko/issues/1369 """ import os import sys import warnings from dataclasses import dataclass import pathlib import subprocess import pickle import pprint import time from fabric import Connection READ_PICKLE_FILE = False EXPERIMENT_NAME = "13660a" PICKLE_FILENAME = os.path.join(os.path.dirname(__file__), "md5_state.pickle") timestamp = time.strftime("%Y-%m-%d-%H%M%S") LOG_FILENAME = os.path.join( os.path.dirname(__file__), f"md5-{EXPERIMENT_NAME}-{timestamp}.log" ) REMOTE_LOGIN = "gary.ruben@monash.edu@sftp2.synchrotron.org.au" SRC_PATH = f"/data/{EXPERIMENT_NAME}/asci/output" DEST_PATH = "/home/grub0002/bapcxi/vault/IMBL_2018_Oct_McGillick/output" @dataclass class Node: """A directory tree node""" src: str # source tree node path dest: str # destination tree node path count: int = None # number of files at the node processed: bool = False # True iff a node transfer completes class Logger(object): def __init__(self): self.terminal = sys.stdout self.log = open(LOG_FILENAME, "a") def write(self, message): self.terminal.write(message) self.log.write(message) def flush(self): self.terminal.flush() self.log.flush() def send_checksum(node): """Checksums all files in the node.src directory and sends these to the node.dest directory across an ssh connection. The checksum file is named after the directories trailing the SRC_PATH. Permissions are set to r_x for group and owner. Args: node: Node object Contains source and destination directories as follows: src: full path to a remote node e.g. /data/13660a/asci/input dest: full path to a destination node e.g. /home/grub0002/bapcxi/vault/imbl2018 """ # Check if there are any files in the node with Connection(REMOTE_LOGIN) as c: files = c.run( rf"cd {node.src}; find -maxdepth 1 -type f -printf '%f\n'", echo=True ) files = files.stdout.strip() node.count = len(files.splitlines()) print(f'Node:{node.src}, file count:{node.count}') if node.count == 0: # No files at this node, just return print('No files at node') else: # Checksum files. if node.src == SRC_PATH: filename = os.path.basename(node.src) else: filename = node.src.replace(SRC_PATH+'/', '').replace('/', '_') output = subprocess.run( f"ssh {REMOTE_LOGIN} 'cd {node.src};" "md5sum $(find -maxdepth 1 -type f | cut -c3-)'" f"| cat > {node.dest}/{filename}.md5", shell=True, check=True ) print('stdout:', output.stdout) print('stderr:', output.stderr) os.chmod(f'{node.dest}/{filename}.md5', 0o550) print(f'Checksummed {node.count} files {node.src} -> {node.dest}') node.processed = True if __name__ == "__main__": sys.stdout = Logger() # Log all stdout to a log file # A hacky way to restart an interrupted transfer is to set # READ_PICKLE_FILE = True above so that the transfer state is retrieved. By # default the tree is built from scratch from the remote file system. if READ_PICKLE_FILE: # Read the saved transfer state from the locally pickled tree object. with open(PICKLE_FILENAME, 'rb') as f: tree = pickle.load(f) print('tree:') pprint.pprint(tree) # Reset nodes at the end of the list with count==0 to unprocessed # This is done because we observed a failure that mistakenly reported # source tree nodes to have 0 files, so force a recheck of those. for node in reversed(tree): if node.count == 0: node.processed = False else: break else: # Get the directory tree from remote server as a list with Connection(REMOTE_LOGIN) as c: result = c.run(f'find {SRC_PATH} -type d') remote_dirs = result.stdout.strip().split('\n') # Create a tree data structure that represents both source and # destination tree paths. tree = [] for src in remote_dirs: dest = src.replace(SRC_PATH, DEST_PATH) tree.append(Node(src, dest)) # Transfer all directory tree nodes for i, node in enumerate(tree): if not node.processed: pathlib.Path(node.dest).mkdir(parents=True, exist_ok=True) os.chmod(node.dest, 0o770) send_checksum(node) # pickle the tree to keep a record of the processed state with open(PICKLE_FILENAME, 'wb') as f: pickle.dump(tree, f) print(f"Processed {i + 1} of {len(tree)} directory tree nodes")