From 0e20caa0ecadc92093f73fb21a15eb88088fcd95 Mon Sep 17 00:00:00 2001 From: "Gary Ruben (Monash University)" <gruben@m3-login2.massive.org.au> Date: Mon, 27 Apr 2020 14:16:05 +1000 Subject: [PATCH] Changed to use commandline instead of hardcoded options --- asci_to_vault.py | 184 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 134 insertions(+), 50 deletions(-) diff --git a/asci_to_vault.py b/asci_to_vault.py index d1b863d..feb726f 100644 --- a/asci_to_vault.py +++ b/asci_to_vault.py @@ -1,8 +1,8 @@ """ A script to transfer a tree of data files from a remote/source server to a -local/destination computer. This runs on the local Linux machine, on which the -tape archive system is mounted; in our case, this is a machine at Monash. Prior -to running this an ssh key pair must be shared between the systems. See +local/destination computer. This runs on a local Linux machine or the eResearch dtn, on +which the tape archive system is mounted; in our case, this is a machine at Monash. +Prior to running this an ssh key pair must be shared between the systems. See https://confluence.apps.monash.edu/display/XI/Australian+Synchrotron for details on how to do this between a Monash Linux machine and ASCI (Australian Synchrotron Compute Infrastructure). Requires Python 3.7 or higher @@ -11,25 +11,25 @@ and uses the fabric module. Authors: gary.ruben@monash.edu michelle.croughan@monash.edu +linda.croton@monash.edu Note that current version creates two files in the same directory as this script 1. A .log file named based on the start-time timestamp which is a capture of all stdout activity. 2. A Python pickle file named tree_state.pickle that contains the transfer state - from which failed transfers can be restarted by setting the READ_PICKLE_FILE + from which failed transfers can be restarted by setting the read_pickle_file file to True. Known issues ------------ Note: The current version of fabric generates harmless warnings. 
This issue is - discussed - here: https://github.com/paramiko/paramiko/issues/1369 + discussed here: https://github.com/paramiko/paramiko/issues/1369 Notes ----- This is a possible option for checksumming: https://stackoverflow.com/q/45819356/ -KERNEL_CHECKSUM=$(cpio --to-stdout -i kernel.fat16 < archive.cpio | sha256sum | awk '{print $1}') +KERNEL_CHECKSUM=$(cpio --to-stdout -i kernel.fat16 < archive.cpio | sha256sum | awk '{print $1}') We used the following command to check whether a transfer was successful immediately prior to a failure of the ASCI filesystem. @@ -42,6 +42,7 @@ http://docs.pyinvoke.org/en/1.2/api/runners.html#invoke.runners.Runner.run """ import os +import re import sys import warnings from dataclasses import dataclass @@ -50,22 +51,11 @@ import subprocess import pickle import pprint import time +import click +import textwrap from fabric import Connection -READ_PICKLE_FILE = False -EXPERIMENT_NAME = "13660a" -PICKLE_FILENAME = os.path.join(os.path.dirname(__file__), "tree_state.pickle") -timestamp = time.strftime("%Y-%m-%d-%H%M%S") -LOG_FILENAME = os.path.join( - os.path.dirname(__file__), - f"{EXPERIMENT_NAME}-{timestamp}.log" -) -REMOTE_LOGIN = "gary.ruben@monash.edu@sftp1.synchrotron.org.au" -SRC_PATH = f"/data/{EXPERIMENT_NAME}/asci/output" -DEST_PATH = "/home/grub0002/bapcxi/vault/IMBL_2018_Oct_McGillick/output" - - @dataclass class Node: """A directory tree node""" @@ -76,9 +66,9 @@ class Node: class Logger(object): - def __init__(self): + def __init__(self, log_filename): self.terminal = sys.stdout - self.log = open(LOG_FILENAME, "a") + self.log = open(log_filename, "a") def write(self, message): self.terminal.write(message) @@ -96,7 +86,7 @@ def send_directory(node): Different methods are used for single versus multiple files. For single files, scp is used. For multiple files cpio is used to tar the files into a single tarball. The destination tarball is named after the directories - trailing SRC_PATH. 
Permissions are set to r_x for group and owner. + trailing src_path. Permissions are set to r_x for group and owner. Args: node: Node object @@ -109,7 +99,7 @@ def send_directory(node): """ # Check if there are any files in the node. - with Connection(REMOTE_LOGIN) as c: + with Connection(remote_login) as c: files = c.run( rf"cd {node.src}; find -maxdepth 1 -type f -printf '%f\n'", echo=True @@ -117,53 +107,141 @@ def send_directory(node): files = files.stdout.strip() node.count = len(files.splitlines()) - print(f'Node:{node.src}, file count:{node.count}') + print(f"Node:{node.src}, file count:{node.count}") if node.count == 0: # No files at this node, just return - print('No files to transfer') + print("No files to transfer") elif node.count == 1: # Only one file. Just copy unchanged. output = subprocess.run( - f"scp -q {REMOTE_LOGIN}:{node.src}/{files} {node.dest}", + f"scp -q {remote_login}:{node.src}/{files} {node.dest}", shell=True, check=True ) - print('stdout:', output.stdout) - print('stderr:', output.stderr) - os.chmod(f'{node.dest}/{files}', 0o550) - print(f'Transferred single file {node.src} -> {node.dest}') + print("stdout:", output.stdout) + print("stderr:", output.stderr) + os.chmod(f"{node.dest}/{files}", 0o550) + print(f"Transferred single file {node.src} -> {node.dest}") else: # More than one file. Transfer all files to a tarball. 
- if node.src == SRC_PATH: + if node.src == src_path: filename = os.path.basename(node.src) else: - filename = node.src.replace(SRC_PATH+'/', '').replace('/', '_') + filename = node.src.replace(src_path+"/", "").replace("/", "_") output = subprocess.run( - f"ssh {REMOTE_LOGIN} 'cd {node.src};" + f"ssh {remote_login} 'cd {node.src};" f"find -maxdepth 1 -type f -print0 |" f"cpio -o -H ustar -0' | cat > {node.dest}/{filename}.tar", shell=True, check=True ) - print('stdout:', output.stdout) - print('stderr:', output.stderr) - os.chmod(f'{node.dest}/{filename}.tar', 0o550) - print(f'Transferred {node.count} files {node.src} -> {node.dest}') + print("stdout:", output.stdout) + print("stderr:", output.stderr) + os.chmod(f"{node.dest}/{filename}.tar", 0o550) + print(f"Transferred {node.count} files {node.src} -> {node.dest}") node.processed = True -if __name__ == "__main__": - sys.stdout = Logger() # Log all stdout to a log file +@click.command() +@click.argument("remote_login") +@click.argument("experiment_name") +@click.argument("src_path", type=click.Path()) +@click.argument("dest_path", type=click.Path()) +@click.option("-p","pickle_filename", help="Pickle filename, e.g. 'foo.pickle' (default = experiment_name.pickle)") +@click.option("-r","read_pickle_file",is_flag=True, help="If True, continue from current pickle state") +def main( + remote_login, + experiment_name, + src_path, + dest_path, + pickle_filename, + read_pickle_file +): + """ + \b + Example + ------- + $ python asci_to_vault.py gary.ruben@monash.edu@sftp1.synchrotron.org.au 15223 /data/15223/asci/input /home/gruben/vault/vault/IMBL/IMBL_2019_Nov_Croton/input + + A script to transfer a tree of data files from a remote/source server to a + local/destination computer. This runs on a local Linux machine or the eResearch dtn, on + which the tape archive system is mounted; in our case, this is a machine at Monash. + Prior to running this an ssh key pair must be shared between the systems. 
See + https://confluence.apps.monash.edu/display/XI/Australian+Synchrotron + for details on how to do this between a Monash Linux machine and ASCI + (Australian Synchrotron Compute Infrastructure). Requires Python 3.7 or higher + and uses the fabric module. + + Note that current version creates two files in the same directory as this script + 1. A .log file named based on the start-time timestamp which is a capture of all + stdout activity. + 2. A Python pickle file named tree_state.pickle that contains the transfer state + from which failed transfers can be restarted by setting the read_pickle_file + file to True. + + """ + assert 5 <= len(experiment_name) <= 6 + if pickle_filename is None: + pickle_filename = experiment_name+".pickle" + + path, base = os.path.split(pickle_filename) + + if path == "": + pickle_filename = os.path.join(os.path.dirname(__file__), pickle_filename) + + timestamp = time.strftime("%Y-%m-%d-%H%M%S") + + log_filename = os.path.join( + os.path.dirname(__file__), + f"{experiment_name}-{timestamp}.log" + ) + + if re.fullmatch(r"[a-zA-Z0-9_\-\.@]+@[a-zA-Z0-9_\-\.]+", remote_login) is None: + raise Exception("Invalid form for login address") - # A hacky way to restart an interrupted transfer is to set - # READ_PICKLE_FILE = True above so that the transfer state is retrieved. 
- if READ_PICKLE_FILE: + """ + Possible file name formats: + /data/<experiment number>/asci/input + /data/<experiment number>/asci/output + input + output + output/username/working/ + output/username/working + """ + + src_file_path = src_path.split("/")[:5] + + if src_file_path[0] == "": + # Assume full path specified + assert "/".join(src_file_path[:4]) == f"/data/{experiment_name}/asci" + assert src_file_path[4] == "input" or src_file_path[4] == "output" + else: + assert src_file_path[0] == "input" or src_file_path[0] == "output" + src_path = os.path.join(f"/data/{experiment_name}/asci/", *src_file_path) + + # remote_login = "gary.ruben@monash.edu@sftp1.synchrotron.org.au" + # src_path = f"/data/{experiment_name}/asci/output" + # dest_path = "/home/grub0002/bapcxi/vault/IMBL_2018_Oct_McGillick/output" + + sys.stdout = Logger(log_filename) # Log all stdout to a log file + + print(textwrap.dedent(f""" + remote_login = {remote_login} + experiment_name = {experiment_name} + src_path = {src_path} + dest_path = {dest_path} + pickle_filename = {pickle_filename} + read_pickle_file = {read_pickle_file} + """)) + + # If the read_pickle_file flag is set, resume the transfer. + if read_pickle_file: # Read the saved transfer state from the locally pickled tree object. - with open(PICKLE_FILENAME, 'rb') as f: + with open(pickle_filename, "rb") as f: tree = pickle.load(f) - print('tree:') + print("tree:") pprint.pprint(tree) # Reset nodes at the end of the list with count==0 to unprocessed @@ -176,26 +254,30 @@ if __name__ == "__main__": break else: # Get the directory tree from the remote server as a list. - with Connection(REMOTE_LOGIN) as c: - result = c.run(f'find {SRC_PATH} -type d') + with Connection(remote_login) as c: + result = c.run(f"find {src_path} -type d") remote_dirs = result.stdout.strip().splitlines() # Create a tree data structure that represents both source and # destination tree paths. 
tree = [] for src in remote_dirs: - dest = src.replace(SRC_PATH, DEST_PATH) + dest = src.replace(src_path, dest_path) tree.append(Node(src, dest)) # Transfer all directory tree nodes. for i, node in enumerate(tree): if not node.processed: - pathlib.Path(node.dest).mkdir(parents=True, exist_ok=True) + pathlib.Path(node.dest).mkdir(mode=0o770, parents=True, exist_ok=True) os.chmod(node.dest, 0o770) send_directory(node) # pickle the tree to keep a record of the processed state. - with open(PICKLE_FILENAME, 'wb') as f: + with open(pickle_filename, "wb") as f: pickle.dump(tree, f) print(f"Processed {i + 1} of {len(tree)} directory tree nodes") + + +if __name__ == "__main__": + main() -- GitLab