Commit 0acb6a6d authored by Gary Ruben's avatar Gary Ruben
Browse files

A script to generate and transfer md5 checksum files from a remote/source server

parent cb9de9f3
A script to generate and transfer md5 checksum files from a remote/source server
to a local/destination computer. This runs on the local Linux machine, on which
the tape archive system is mounted; in our case, this is a machine at Monash.
Prior to running this, an ssh key pair must be shared between the systems; in
our case, between a Monash Linux machine and ASCI (Australian Synchrotron
Compute Infrastructure). The link to the instructions for doing this is
missing from this copy. Requires Python 3.7 or higher and uses the fabric
module.
Running this creates two files in the same directory as this script file:
1. A .log file named based on the start-time timestamp which is a capture of all
stdout activity.
2. A Python pickle file named md5_state.pickle that contains the transfer state,
from which failed transfers can be restarted by setting the READ_PICKLE_FILE
flag to True.
Known issues:
Note: The current version of fabric generates harmless warnings. (The link to
the discussion of this issue is missing from this copy.)
import os
import sys
import warnings
from dataclasses import dataclass
import pathlib
import subprocess
import pickle
import pprint
import time
from fabric import Connection
# Set to True to restart an interrupted run from the pickled transfer state.
# By default the directory tree is built from scratch from the remote file
# system.
READ_PICKLE_FILE = False

# NOTE(review): EXPERIMENT_NAME (used in SRC_PATH below) and REMOTE_LOGIN (used
# for the ssh connections) are referenced by this script, but their
# definitions are not visible in this copy of the file and their values
# cannot be recovered from it. They must be defined before this point.

# Both output files are written to the directory containing this script.
PICKLE_FILENAME = os.path.join(os.path.dirname(__file__), "md5_state.pickle")

# Start-time timestamp used to name the log file.
timestamp = time.strftime("%Y-%m-%d-%H%M%S")
# NOTE(review): the original file-name argument was lost; the header only says
# the .log file is named after the start-time timestamp - confirm the pattern.
LOG_FILENAME = os.path.join(os.path.dirname(__file__), f"{timestamp}.log")

# Root of the remote (source) tree and of the local (destination) tree.
SRC_PATH = f"/data/{EXPERIMENT_NAME}/asci/input"
DEST_PATH = "/home/grub0002/bapcxi/vault/IMBL_2018_Oct_McGillick"
@dataclass
class Node:
    """A directory tree node.

    Pairs one directory of the source tree with the matching directory of the
    destination tree and records how far its transfer has progressed.
    Instances are created positionally as Node(src, dest).
    """
    src: str                    # source tree node path
    dest: str                   # destination tree node path
    count: int = None           # number of files at the node; None until counted
    processed: bool = False     # True iff a node transfer completes
class Logger(object):
    """File-like object that duplicates everything written to it.

    Intended to be assigned to sys.stdout: each message is passed on to the
    stream that was sys.stdout when the Logger was created and is also
    appended to the log file LOG_FILENAME.
    """

    def __init__(self):
        # Keep the current stdout so output still reaches the terminal.
        self.terminal = sys.stdout
        # Append mode; the file stays open for the life of the process.
        self.log = open(LOG_FILENAME, "a")

    def write(self, message):
        """Write message to both the terminal and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams (required of a stdout replacement)."""
        self.terminal.flush()
        self.log.flush()
def send_checksum(node):
    """Checksum the files in a remote directory and store the result locally.

    Runs md5sum over every regular file directly inside the remote directory
    node.src (sub-directories are not descended into) and writes the output
    to a .md5 file in the local directory node.dest. The checksum file is
    named after the directories trailing the SRC_PATH, joined by underscores;
    for the SRC_PATH node itself the base name of the directory is used.
    Permissions of the checksum file are set to r_x for group and owner.

    Args:
        node: Node object
            Contains source and destination directories as follows:
            src: full path to a remote node
                e.g. /data/13660a/asci/input
            dest: full path to a destination node
                e.g. /home/grub0002/bapcxi/vault/imbl2018

    Side effects:
        Sets node.count to the number of files found at the node and sets
        node.processed to True once the node has been dealt with.
    """
    # Check if there are any files in the node
    with Connection(REMOTE_LOGIN) as c:
        listing = c.run(
            rf"cd {node.src}; find -maxdepth 1 -type f -printf '%f\n'",
        )
    # One file name per line. Counting lines rather than newline characters
    # matters: the stripped output of n files holds only n-1 newlines, so
    # counting '\n' would report a single-file directory as empty.
    node.count = len(listing.stdout.strip().splitlines())
    print(f'Node:{node.src}, file count:{node.count}')
    if node.count == 0:
        # No files at this node, just return. Running md5sum with an empty
        # file list would make it read from stdin instead.
        print('No files at node')
        node.processed = True
        return

    # Checksum files.
    if node.src == SRC_PATH:
        filename = os.path.basename(node.src)
    else:
        filename = node.src.replace(SRC_PATH+'/', '').replace('/', '_')

    # NOTE(review): paths are interpolated into a shell command unquoted, so
    # directory names containing spaces or quotes will break this command.
    # NOTE(review): the keyword arguments of the original subprocess call are
    # not visible; shell=True is required by the pipe and redirection, and
    # output is captured because stdout/stderr are printed below - confirm.
    output = subprocess.run(
        f"ssh {REMOTE_LOGIN} 'cd {node.src};"
        "md5sum $(find -maxdepth 1 -type f | cut -c3-)'"
        f"| cat > {node.dest}/{filename}.md5",
        shell=True,
        capture_output=True,
        text=True,
    )
    print('stdout:', output.stdout)
    print('stderr:', output.stderr)

    # Read and execute only, for owner and group.
    os.chmod(f'{node.dest}/{filename}.md5', 0o550)
    print(f'Checksummed {node.count} files {node.src} -> {node.dest}')
    node.processed = True
if __name__ == "__main__":
    sys.stdout = Logger()       # Log all stdout to a log file

    # A hacky way to restart an interrupted transfer is to set
    # READ_PICKLE_FILE = True above so that the transfer state is retrieved. By
    # default the tree is built from scratch from the remote file system.
    if READ_PICKLE_FILE:
        # Read the saved transfer state from the locally pickled tree object.
        # The pickle is one this script wrote itself; never point this at a
        # file from an untrusted source, since unpickling can execute code.
        with open(PICKLE_FILENAME, 'rb') as f:
            tree = pickle.load(f)

        # Reset nodes at the end of the list with count==0 to unprocessed
        # This is done because we observed a failure that mistakenly reported
        # source tree nodes to have 0 files, so force a recheck of those.
        for node in reversed(tree):
            if node.count == 0:
                node.processed = False
            else:
                # NOTE(review): stopping at the first non-empty node is
                # inferred from "at the end of the list" above - confirm.
                break
    else:
        # Get the directory tree from remote server as a list
        with Connection(REMOTE_LOGIN) as c:
            result = c.run(f'find {SRC_PATH} -type d')
        remote_dirs = result.stdout.strip().split('\n')

        # Create a tree data structure that represents both source and
        # destination tree paths.
        tree = []
        for src in remote_dirs:
            dest = src.replace(SRC_PATH, DEST_PATH)
            tree.append(Node(src, dest))

    # Transfer all directory tree nodes
    for i, node in enumerate(tree):
        if not node.processed:
            pathlib.Path(node.dest).mkdir(parents=True, exist_ok=True)
            os.chmod(node.dest, 0o770)
            send_checksum(node)

            # pickle the tree to keep a record of the processed state
            with open(PICKLE_FILENAME, 'wb') as f:
                pickle.dump(tree, f)

        print(f"Processed {i + 1} of {len(tree)} directory tree nodes")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment