forked from adsabs/ADSReferencePipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
361 lines (320 loc) · 16.9 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
import sys
import os, fnmatch
from adsputils import setup_logging, load_config, get_date
from datetime import timedelta
import argparse
from adsrefpipe import tasks
from adsrefpipe.refparsers.handler import verify
from adsrefpipe.utils import get_date_modified_struct_time, ReprocessQueryType
# directory that contains this script; handed to load_config as the project home
proj_home = os.path.realpath(os.path.dirname(__file__))
# project configuration, read by the functions in this script through config[...]
config = load_config(proj_home=proj_home)
# application object exposed by the tasks module; every app.* call in this script goes through it
app = tasks.app
# logger used for the error messages emitted by this script
logger = setup_logging('run.py')
def run_diagnostics(bibcodes, source_filenames):
    """
    Query storage for the given bibcodes and/or source filenames and print each result.

    Both lists are capped at config['MAX_ENTRIES_DIAGNOSTICS'] entries before the
    query is issued; a falsy list (None or empty) is passed through unchanged.

    :param bibcodes: list of bibcodes
    :param source_filenames: list of source filenames
    :return: None
    """
    limit = config['MAX_ENTRIES_DIAGNOSTICS']

    # never send more than the configured number of entries to the query
    capped_bibcodes = bibcodes[:limit] if bibcodes else bibcodes
    capped_filenames = source_filenames[:limit] if source_filenames else source_filenames

    for entry in app.diagnostic_query(capped_bibcodes, capped_filenames):
        print(entry)
def get_source_filenames(source_file_path, file_extension, date_cutoff):
    """
    Walk a directory tree and collect the files that match a pattern and are recent enough.

    :param source_file_path: root directory to walk (sub directories included)
    :param file_extension: fnmatch-style pattern that the file basename has to match
    :param date_cutoff: a file is kept only if its modified date, as returned by
        get_date_modified_struct_time, is on or after this value
    :return: list of full paths of the matching files, empty if there are none
    """
    matched = []
    for dirpath, _subdirs, basenames in os.walk(source_file_path):
        # the pattern is checked first, so the modified date is only read for matching files
        for basename in fnmatch.filter(basenames, file_extension):
            candidate = os.path.join(dirpath, basename)
            if get_date_modified_struct_time(candidate) >= date_cutoff:
                matched.append(candidate)
    return matched
def queue_references(references, source_filename, source_bibcode, parsername):
    """
    Queue one task_process_reference task per reference.

    The resolver service url is config['REFERENCE_PIPELINE_SERVICE_URL'] followed by
    the endpoint that the app returns for the parser; it is the same for every task.

    :param references: iterable of references to queue
    :param source_filename: name of the source file the references came from
    :param source_bibcode: bibcode of the source the references belong to
    :param parsername: name of the parser, used to look up the service endpoint
    :return: None
    """
    base_url = config['REFERENCE_PIPELINE_SERVICE_URL']
    service_url = base_url + app.get_reference_service_endpoint(parsername)

    for single_reference in references:
        tasks.task_process_reference.delay({
            'reference': single_reference,
            'source_bibcode': source_bibcode,
            'source_filename': source_filename,
            'resolver_service_url': service_url,
        })
def process_files(filenames):
    """
    Read each source reference file, parse it and queue its references for resolving.

    There are two ways to queue references: one is to read source files, the other is
    to query the database. This function reads the source reference files.
    Problems with a file are logged and that file (or that block of references)
    is skipped; processing continues with the next one.

    :param filenames: list of source reference filenames
    :return: None
    """
    for filename in filenames:
        # from filename get the parser info
        # file extension, and bibstem and volume directories are used to query database and return the parser info
        # ie for filename `adsrefpipe/tests/unittests/stubdata/txt/ARA+A/0/0000ADSTEST.0.....Z.ref.raw`
        # extension ref.raw, bibstem directory ARA+A and volume directory 0 is used and the
        # parser info is {'name': 'ThreeBibsTxt',
        #                 'extension_pattern': '.ref.raw',
        #                 'reference_service_endpoint': '/text',
        #                 'matches': [[{'journal': 'AnRFM', 'volume_end': 37, 'volume_begin': 34},
        #                              {'journal': 'ARA+A', 'volume_end': 43, 'volume_begin': 40},
        #                              {'journal': 'ARNPS', 'volume_end': 56, 'volume_begin': 52}]]}
        parser_dict = app.get_parser(filename)
        parsername = parser_dict.get('name')

        # now map parser name to the class (see adsrefpipe/refparsers/handler.py)
        # ie parser name ThreeBibsTxt is mapped to ThreeBibstemsTXTtoREFs
        #    'ThreeBibsTxt': ThreeBibstemsTXTtoREFs,
        # note that from the class name it is clear which type of parser this is
        # (ie, this is a TXT parser implemented in module adsrefpipe/refparsers/ADStxt.py)
        parser = verify(parsername)
        if not parser:
            logger.error("Unable to detect which parser to use for the file %s." % filename)
            continue

        # now read the source file
        toREFs = parser(filename=filename, buffer=None)
        if not toREFs:
            # toREFs is falsy here (it may even be None), so report the filename
            # we were given rather than reading an attribute off of toREFs
            logger.error("Unable to process %s. Skipped!" % filename)
            continue

        # next parse the references
        parsed_references = toREFs.process_and_dispatch()
        if not parsed_references:
            logger.error("Unable to parse %s." % toREFs.filename)
            continue

        for block_references in parsed_references:
            # save the initial records in the database,
            # this is going to be useful since it allows us to be able to tell if
            # anything went wrong with the service that we did not get back the
            # resolved reference
            references = app.populate_tables_pre_resolved_initial_status(source_bibcode=block_references['bibcode'],
                                                                         source_filename=filename,
                                                                         parsername=parsername,
                                                                         references=block_references['references'])
            if not references:
                logger.error("Unable to insert records from %s to db." % toREFs.filename)
                continue

            queue_references(references, filename, block_references['bibcode'], parsername)
def reprocess_references(reprocess_type, score_cutoff=0, match_bibcode='', date_cutoff=None):
    """
    Query the database for records to reprocess and queue their references for resolving.

    There are two ways to queue references: one is to read source files, the other is
    to query the database. This function queries the database.
    Problems with a record are logged and that record (or that block of references)
    is skipped; processing continues with the next one.

    :param reprocess_type: kind of reprocess query to run, passed to app.get_reprocess_records
    :param score_cutoff: passed to app.get_reprocess_records
    :param match_bibcode: passed to app.get_reprocess_records
    :param date_cutoff: passed to app.get_reprocess_records, None for no cutoff
    :return: None
    """
    records = app.get_reprocess_records(reprocess_type, score_cutoff, match_bibcode, date_cutoff)
    for record in records:
        source_filename = record['source_filename']

        # from filename get the parser info
        # file extension, and bibstem and volume directories are used to query database and return the parser info
        # ie for filename `adsrefpipe/tests/unittests/stubdata/txt/ARA+A/0/0000ADSTEST.0.....Z.ref.raw`
        # extension ref.raw, bibstem directory ARA+A and volume directory 0 is used and the
        # parser info is {'name': 'ThreeBibsTxt',
        #                 'extension_pattern': '.ref.raw',
        #                 'reference_service_endpoint': '/text',
        #                 'matches': [[{'journal': 'AnRFM', 'volume_end': 37, 'volume_begin': 34},
        #                              {'journal': 'ARA+A', 'volume_end': 43, 'volume_begin': 40},
        #                              {'journal': 'ARNPS', 'volume_end': 56, 'volume_begin': 52}]]}
        parser_dict = app.get_parser(source_filename)
        parsername = parser_dict.get('name')

        # now map parser name to the class (see adsrefpipe/refparsers/handler.py)
        # ie parser name ThreeBibsTxt is mapped to ThreeBibstemsTXTtoREFs
        #    'ThreeBibsTxt': ThreeBibstemsTXTtoREFs,
        # note that from the class name it is clear which type of parser this is
        # (ie, this is a TXT parser implemented in module adsrefpipe/refparsers/ADStxt.py)
        parser = verify(parsername)
        if not parser:
            logger.error("Unable to detect which parser to use for the file %s." % source_filename)
            continue

        # now pass the result records from query to the parser object
        toREFs = parser(filename=None, buffer=record)
        if not toREFs:
            # toREFs is falsy here (it may even be None), so report the filename
            # from the record rather than reading an attribute off of toREFs
            logger.error("Unable to process %s. Skipped!" % source_filename)
            continue

        # next parse the references
        parsed_references = toREFs.dispatch()
        if not parsed_references:
            logger.error("Unable to parse %s." % toREFs.filename)
            continue

        for block_references in parsed_references:
            # save the retry records in the database,
            references = app.populate_tables_pre_resolved_retry_status(source_bibcode=block_references['bibcode'],
                                                                       source_filename=source_filename,
                                                                       source_modified=record['source_modified'],
                                                                       retry_records=block_references['references'])
            if not references:
                logger.error("Unable to reprocess records from file %s." % toREFs.filename)
                continue

            queue_references(references, toREFs.filename, block_references['bibcode'], parsername)
def _build_arg_parser():
    """Build the command line parser with the DIAGNOSTICS, RESOLVE and STATS commands."""
    parser = argparse.ArgumentParser(description='Process user input.')
    subparsers = parser.add_subparsers(help='commands', dest="action", required=True)

    diagnostics = subparsers.add_parser('DIAGNOSTICS', help='Show diagnostic message')
    diagnostics.add_argument('-b',
                             '--bibcodes',
                             dest='bibcodes',
                             action='store',
                             nargs='+',
                             default=[],
                             help='List of bibcodes separated by spaces')
    diagnostics.add_argument('-s',
                             '--source_filenames',
                             dest='source_filenames',
                             action='store',
                             nargs='+',
                             default=[],
                             help='List of source_filenames separated by spaces')
    diagnostics.add_argument('-p',
                             '--parse_filename',
                             dest='parse_filename',
                             action='store',
                             default=None,
                             help='Verify that the file can be parsed and references resolved')

    resolve = subparsers.add_parser('RESOLVE', help='Resolve references')
    resolve.add_argument('-s',
                         '--source_filenames',
                         dest='source_filenames',
                         action='store',
                         nargs='+',
                         default=[],
                         help='List of source file names (either xml or raw) separated by spaces')
    resolve.add_argument('-p',
                         '--path',
                         dest='path',
                         action='store',
                         default=None,
                         help='Path of source files for resolving')
    resolve.add_argument('-e',
                         '--extension',
                         dest='extension',
                         action='store',
                         default=None,
                         help='Extension of files to locate in the path directory')
    resolve.add_argument('-d',
                         '--days',
                         dest='days',
                         action='store',
                         default=None,
                         help='Resolve only those that are this many days old')
    resolve.add_argument('-c',
                         '--confidence',
                         dest='confidence',
                         action='store',
                         default=None,
                         help='Reprocess resolved records confidence score lower than this value')
    resolve.add_argument('-b',
                         '--bibstem',
                         dest='bibstem',
                         action='store',
                         default=None,
                         help='Reprocess resolved records having this bibstem')
    resolve.add_argument('-y',
                         '--year',
                         dest='year',
                         action='store',
                         default=None,
                         help='Reprocess resolved records having this year')
    resolve.add_argument('-f',
                         '--fail',
                         dest='fail',
                         action='store_true',
                         help='Reprocess records that failed to get resolved')

    stats = subparsers.add_parser('STATS', help='Print out statistics of the reference source file')
    stats.add_argument('-b',
                       '--bibcode',
                       dest='bibcode',
                       action='store',
                       default=None,
                       help='Statistics of source reference, comparing classic and service reference resolvering if available')
    stats.add_argument('-s',
                       '--source_filename',
                       dest='source_filename',
                       action='store',
                       default=None,
                       help='Statistics of source reference, comparing classic and service reference resolvering if available')
    stats.add_argument('-p',
                       '--parser',
                       dest='parser',
                       action='store',
                       default=None,
                       help='To list all source filenames resolved from a specific parser')
    stats.add_argument('-c',
                       '--count',
                       dest='count',
                       action='store_true',
                       help='Print out the count of records in the four main tables')

    return parser


def _reprocess_date_cutoff(days):
    """Return the date `days` days before now, or None when `days` was not specified."""
    return get_date() - timedelta(days=int(days)) if days else None


if __name__ == '__main__':

    args = _build_arg_parser().parse_args()

    if args.action == 'DIAGNOSTICS':
        if args.parse_filename:
            # get_parser returns the parser info, the message only needs the parser name
            parser_info = app.get_parser(args.parse_filename)
            name = parser_info.get('name') if parser_info else None
            if name:
                print('Source file `%s` shall be parsed using `%s` parser.' % (args.parse_filename, name))
            else:
                print('No parser yet to parse source file `%s`.' % args.parse_filename)
        # either pass in the list of bibcodes, or list of filenames to query db on
        # if neither bibcode nor filenames are supplied, number of records for the tables are displayed
        else:
            run_diagnostics(args.bibcodes, args.source_filenames)

    elif args.action == 'RESOLVE':
        if args.source_filenames:
            process_files(args.source_filenames)
        elif args.path or args.extension:
            if not args.extension:
                print('Both path and extension are required params. Provide extention by -e <extension of files to locate in the path directory>.')
            elif not args.path:
                print('Both path and extension are required params. Provide path by -p <path of source files for resolving>.')
            else:
                # if days has been specified, read it and only consider files with date from today-days,
                # otherwise we are going with everything
                if args.days:
                    date_cutoff = get_date() - timedelta(days=int(args.days))
                else:
                    date_cutoff = get_date('1972')
                source_filenames = get_source_filenames(args.path, args.extension, date_cutoff.timetuple())
                if source_filenames:
                    process_files(source_filenames)
        elif args.confidence:
            reprocess_references(ReprocessQueryType.score,
                                 score_cutoff=float(args.confidence),
                                 date_cutoff=_reprocess_date_cutoff(args.days))
        elif args.bibstem:
            reprocess_references(ReprocessQueryType.bibstem,
                                 match_bibcode=args.bibstem,
                                 date_cutoff=_reprocess_date_cutoff(args.days))
        elif args.year:
            # match on the requested year; args.bibstem is always empty in this branch
            reprocess_references(ReprocessQueryType.year,
                                 match_bibcode=args.year,
                                 date_cutoff=_reprocess_date_cutoff(args.days))
        elif args.fail:
            reprocess_references(ReprocessQueryType.failed,
                                 date_cutoff=_reprocess_date_cutoff(args.days))

    # TODO: do we need more command for querying db

    elif args.action == 'STATS':
        if args.bibcode or args.source_filename:
            table, num_references, num_resolved = app.get_service_classic_compare_stats_grid(args.bibcode, args.source_filename)
            print('\n',table,'\n')
            print('Num References:', num_references)
            print('Num References Resolved:', num_resolved)
            print('\n')
        elif args.parser:
            records = app.query_reference_source_tbl(parsername=args.parser)
            if not records:
                print('No records found for parser %s.'%args.parser)
            else:
                for record in records:
                    print(record['source_filename'])
        elif args.count:
            results = app.get_count_records()
            print('\n')
            for result in results:
                print('Currently there are %d records in `%s` table, which holds %s.'%(result['count'], result['name'], result['description']))
            print('\n')

    sys.exit(0)