diff --git a/veritar_walk.py b/veritar_walk.py new file mode 100644 index 0000000000000000000000000000000000000000..96ce6cf9b79cfd25b671eeef5bb1a67264315770 --- /dev/null +++ b/veritar_walk.py @@ -0,0 +1,112 @@ +""" +A script to verify files in our destination tree structure that have been +created using the asci_to_vault.py and md5_to_vault.py scripts. +This runs on a local Linux machine, on which the tape archive system is mounted. +This script uses veritar https://github.com/gazzar/veritar which is installed +with the system Python on the Linux machine. Because veritar is quite old, this +currently requires Python 2.7. It could alternatively be installed into a +Python 2.7 virtual environent or conda environment. + +Following generation of the archive tree with the asci_to_vault.py and +md5_to_vault.py scripts, each directory node in the tree should contain either +0 or 2 files. In the case of 2 files, one of which is a .md5, if the other file +is not a tar file, the node is currently skipped. If it is a tar file, veritar +is used to compare the tar file contents against the md5sum-generated .md5 file. + +Todo: +This script needs to be rewritten so that non-tars are checked by +running md5sum and using diff to check against the .md5 file. + +Authors: +gary.ruben@monash.edu + +""" + +from __future__ import print_function +import veritar.veritar as vv +import os +import time + + +BASE = '/home/grub0002/bapcxi/vault/IMBL_2018_Oct_McGillick' +SKIP_TO_DIR = '/home/grub0002/bapcxi/vault/IMBL_2018_Oct_McGillick/output/Iodine_CT/R1820/K1/Scan2_FRC' +SKIP_FORWARD = False +JUST_SHOW_NON_TARS = False + + +class optobj: + def __init__(self): + self.verbose = False + self.quiet = True + + +def check_oneof_tar_and_md5(files): + """True iff files contains two entries in the order .md5 then .tar""" + exts = [os.path.splitext(f)[1] for f in files] + if len(files) != 2: + return False + if exts[0] != '.md5' or exts[1] != '.tar': + return False + return True + + +def check_two_files_one_md5(files): + """True iff files contains two entries and exactly one is a .md5""" + exts = [os.path.splitext(f)[1] for f in files] + if len(files) != 2: + return False + if '.md5' not in exts: + return False + if exts[0] == exts[1]: + return False + return True + + +def lines_in(filename): + """I'm using this https://stackoverflow.com/a/15074925 + It actually gets it wrong, but I don't care; it's just to get an estimate + + """ + lines = 0 + buffer = bytearray(2048) + with open(filename) as f: + while f.readinto(buffer) > 0: + lines += buffer.count('\n') + return lines + + +opts = optobj() + +failures_of_check_oneof_tar_and_md5 = [] +failures_of_check_two_files_one_md5 = [] +tar_checksum_failures = [] + +for root, dirs, files in os.walk(BASE): + print('Directory: %s' % root) + if SKIP_FORWARD and root != SKIP_TO_DIR: + continue + SKIP_FORWARD = False + sorted_files = [os.path.join(root, f) for f in sorted(files)] + if len(sorted_files) != 0: + oneof_tar_and_md5 = check_oneof_tar_and_md5(sorted_files) + two_files_one_md5 = check_two_files_one_md5(sorted_files) + if not JUST_SHOW_NON_TARS and (oneof_tar_and_md5 and two_files_one_md5): + md5, tar = sorted_files + print(time.strftime("%H:%M:%S:")) + print(lines_in(md5), ' lines in ', md5) + v = vv.TarVerification(tar, md5, opts) + v.run() + if v.s.Processed != v.s.Good: + tar_checksum_failures.append(root) + continue + if not oneof_tar_and_md5: + failures_of_check_oneof_tar_and_md5.append(root) + if not two_files_one_md5: + failures_of_check_two_files_one_md5.append(root) + +print('Failures of check_oneof_tar_and_md5:') +print(failures_of_check_oneof_tar_and_md5) +print('Failures of check_two_files_one_md5:') +print(failures_of_check_two_files_one_md5) +print('Checksum failures:') +print(tar_checksum_failures)