-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuuid_util.py
executable file
·122 lines (100 loc) · 4.32 KB
/
uuid_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
# This script verifies, and if necessary replaces, UUIDs generated by file-copier.
# A UUID is generated from the project's domain name, the node type 'file', and a signature:
# the file's file_location (new) or its MD5 (legacy, used before UBC01 and UBC02).
import argparse
import csv
import os
from bento.common.utils import LOG_PREFIX, APP_NAME, get_logger, get_uuid
# Logging setup: keep a LOG_PREFIX that is already present in the environment,
# otherwise fall back to this tool's name; APP_NAME is always set to the tool's name.
# NOTE(review): the original indentation was lost; this assumes only the LOG_PREFIX
# assignment was guarded by the `if` — confirm against the upstream file.
os.environ.setdefault(LOG_PREFIX, 'UUID_util')
os.environ[APP_NAME] = 'UUID_util'

# Module-wide logger used by every function in this script.
log = get_logger('UUID_util')
def get_new_manifest_name(manifest):
    """Return the path of the corrected copy of *manifest*.

    The result lives in the same folder as *manifest* and has ``_corrected``
    inserted before the final extension, e.g. ``data/files.tsv`` becomes
    ``data/files_corrected.tsv``.

    Only the last dot is treated as the extension separator, so names with
    several dots (``batch.v2.tsv``) or with no dot at all (``manifest``) are
    handled instead of raising ``ValueError``.

    :param manifest: path of the original manifest file
    :return: path of the corrected manifest file
    """
    folder = os.path.dirname(manifest)
    base_name = os.path.basename(manifest)
    # Split on the LAST dot only; `dot` is '' when the name has no extension.
    stem, dot, ext = base_name.rpartition('.')
    if not dot:
        # rpartition puts the whole name in the last slot when no dot is found.
        stem, ext = base_name, ''
    new_name = f"{stem}_corrected{dot}{ext}"
    return os.path.join(folder, new_name)
def process_file(file_obj, signature_column, uuid_column, domain, indexd_mode):
    """Verify the UUID of every row in a tab-separated manifest.

    For each row the UUID is recomputed with ``get_uuid(domain, 'file', signature)``
    and compared with the value stored in the row. Rows that do not match are
    updated in memory; if at least one row did not match, all rows are written
    to a new manifest whose path comes from ``get_new_manifest_name``.

    :param file_obj: open text file object of the manifest; its ``name`` is used
        for logging and to derive the output path
    :param signature_column: name of the column whose value is the signature
        passed to ``get_uuid``
    :param uuid_column: name of the column holding the UUID to verify
    :param domain: domain name passed to ``get_uuid``
    :param indexd_mode: when true, the displayed file name is the basename of the
        ``url`` column and the UUID column is read and written as ``<prefix>/<uuid>``
    :return: tuple ``(total, succeeded, failed)`` of row counts
    :raises ValueError: in IndexD mode, when a GUID value does not split into
        exactly two parts on ``/``
    """
    file_name = file_obj.name
    log.info(f"Processing {file_name}")
    data = []
    reader = csv.DictReader(file_obj, delimiter='\t')
    total = 0
    succeeded = 0
    failed = 0
    for obj in reader:
        total += 1
        fname = os.path.basename(obj['url']) if indexd_mode else obj['file_name']
        log.info(f"Comparing {fname}")
        signature = obj.get(signature_column)
        current_uuid = obj.get(uuid_column)
        if indexd_mode:
            # IndexD GUIDs carry a prefix; only the part after '/' is compared.
            guid_prefix, current_uuid = current_uuid.split('/')
        new_uuid = get_uuid(domain, 'file', signature)
        if current_uuid != new_uuid:
            log.error(f"UUIDs don't match! current: {current_uuid}, new: {new_uuid}")
            failed += 1
            # Replace the stored value, keeping the GUID prefix in IndexD mode.
            if indexd_mode:
                obj[uuid_column] = f'{guid_prefix}/{new_uuid}'
            else:
                obj[uuid_column] = new_uuid
        else:
            log.info("UUID match!")
            succeeded += 1
        # Every row is kept, matching or not, so the corrected manifest is complete.
        data.append(obj)

    log.info(f'{total} rows compared')
    log.info(f"{succeeded} match, {failed} don't match")
    if failed > 0:
        new_manifest = get_new_manifest_name(file_name)
        log.info(f"Saving corrected manifest to: {new_manifest}")
        # newline='' lets the csv module control line endings itself; without it
        # the writer's '\r\n' is translated again on platforms such as Windows.
        with open(new_manifest, 'w', newline='') as of:
            writer = csv.DictWriter(of, fieldnames=reader.fieldnames, delimiter='\t')
            writer.writeheader()
            writer.writerows(data)
    log.info("Done")
    log.info("=" * 70)
    return total, succeeded, failed
def main():
    """Command-line entry point: check every given manifest and log overall totals.

    Parses the command line, resolves the project's domain name and the column
    names to use, runs ``process_file`` on each manifest, and logs the summed
    row counts. The manifest files opened by argparse are closed before
    returning, even when processing fails.
    """
    import sys  # local import: only needed to avoid closing stdin below

    valid_projects = {
        'ICDC': 'caninecommons.cancer.gov',
        'CTDC': 'trialcommons.cancer.gov',
        'BENTO': 'bento-tools.org'
    }
    parser = argparse.ArgumentParser(description='Script to validate or replace UUID generated by file-copier')
    parser.add_argument('manifests', type=argparse.FileType('r'),
                        help='Neo4j manifest file generated by file-copier',
                        nargs='+'
                        )
    parser.add_argument('-p', '--project', help='Project name', choices=valid_projects.keys(), default='ICDC')
    parser.add_argument('-u', '--uuid-column', help='column that contains UUID', default='uuid')
    parser.add_argument('-s', '--signature-column', help='column that contains file signature new files should use '
                                                         'file_location, legacy files (before UBC01) should use '
                                                         'md5sum', default='file_location')
    parser.add_argument('-i', '--indexd-mode', help='IndexD Mode', action='store_true')
    args = parser.parse_args()

    domain = valid_projects[args.project]
    uuid_column = args.uuid_column
    signature_column = args.signature_column
    if args.indexd_mode:
        # In IndexD mode, swap in the IndexD column names, but only where the
        # value still equals the regular default (an explicit different value wins).
        if uuid_column == 'uuid':
            uuid_column = 'GUID'
        if signature_column == 'file_location':
            signature_column = 'url'
        log.info('IndexD mode on, will process IndexD GUID properly')
    log.info(f'Domain name: {domain}')
    log.info(f'UUID column name: {uuid_column}, Signature column name: {signature_column}')

    total = 0
    succeeded = 0
    failed = 0
    try:
        for file_obj in args.manifests:
            tot, suc, fai = process_file(file_obj, signature_column, uuid_column, domain, args.indexd_mode)
            total += tot
            succeeded += suc
            failed += fai
    finally:
        # argparse.FileType opened every manifest during parsing; release them all.
        for file_obj in args.manifests:
            if file_obj is not sys.stdin:  # '-' maps to stdin, which is not ours to close
                file_obj.close()

    log.info(f'Total: {total} rows compared')
    log.info(f"{succeeded} match, {failed} don't match")
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()