-
Notifications
You must be signed in to change notification settings - Fork 1
/
harvest_solr.py
165 lines (125 loc) · 5.14 KB
/
harvest_solr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import time
import os
import json
import requests
from adsputils import setup_logging, load_config
# from tqdm import tqdm
# import ptree
#from ..utilities import remove_control_chars
# import remove_control_chars as rcc
# load API config
#solr_config = load_config(proj_home=os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
solr_config = load_config(proj_home=os.path.realpath(os.path.join(os.path.dirname(__file__), '.')))
def harvest_solr(bibcodes_list, start_index=0, fields='bibcode, title, abstract'):
''' Harvests citations for an input list of bibcodes using the ADS API.
It will perform minor cleaning of utf-8 control characters.
Log in output_dir/logs/harvest_clean.log -> tail -f logs/harvest_clean.log .
bibcodes_list: a list of bibcodes to harvest citations for.
paths_list:_list: a list of paths to save the output
start_index (optional): starting index for harvesting
fields (optional): fields to harvest from the API. Default is 'bibcode, title, abstract'.
'''
# save the log next to the bibcode files
logger = setup_logging('harvest_clean', proj_home=os.path.dirname('harvest_log.txt'))
# COnvert list of bibcodes to a set for comparison later
out_path = 'data/'
# starting index of harvesting
idx=start_index
# params for stats
# number of bibcodes to harvest at once
step_size = 2000
# step_size = 20
# json dict to dump
dataset = {}
logger.info('Start of harvest')
#start progress bar
# pbar = tqdm(total=len(bibcodes_list), initial=start_index)
# print('checkpoint harvest_solr.py')
# import pdb;pdb.set_trace()
# loop through list of bibcodes and query solr
while idx<len(bibcodes_list):
start_time = time.perf_counter()
# string to log
to_log = ''
# limit attempts to 10
attempts = 0
successful_req = False
# extract next step_size list
input_bibcodes = bibcodes_list[idx:idx+step_size]
bibcodes = 'bibcode\n' + '\n'.join(input_bibcodes)
# start attempts
while (not successful_req) and (attempts<10):
r_json = None
r = requests.post(solr_config['API_URL']+'/search/bigquery',
params={'q':'*:*', 'wt':'json', 'fq':'{!bitset}', 'fl':fields, 'rows':len(input_bibcodes)},
headers={'Authorization': 'Bearer ' + solr_config['API_TOKEN'], "Content-Type": "big-query/csv"},
data=bibcodes)
# check that request worked
# proceed if r.status_code == 200
# if fails, log r.text, then repeat for x tries
if r.status_code==200:
successful_req=True
else:
to_log += 'REQUEST {} FAILED: CODE {}\n'.format(attempts, r.status_code)
to_log += str(r.text)+'\n'
# inc count
attempts+=1
# after request
if successful_req:
#extract json
r_json = r.json()
# add to stat counts
# astronomy_count += len(r_json['response']['docs'])
# info to log
to_log += 'Harvested links up to {}\n'.format(idx)
# to_log += 'Running astronomy count={}, body count={}, ack count={}\n'.format(astronomy_count,body_count,ack_count)
# if not successful_req
else:
# add to log
to_log += 'FAILING BIBCODES: {}\n'.format(input_bibcodes)
# # raise error
# r.raise_for_status()
print()
print('index')
print(idx)
# import pdb;pdb.set_trace()
end_time = time.perf_counter()
total_time = end_time - start_time
print()
print()
print('index')
print(idx)
print()
print('Time for loop segment')
print(total_time)
# with open('data/abstracts/test.txt','w') as f:
# f.write("test")
# import pdb;pdb.set_trace()
# pause to not go over API rate limit
if len(bibcodes_list)>step_size:
time.sleep(45)
# ALWAYS DO:
# increment counter for next batch
idx+=step_size
# update progress bar
# pbar.update(step_size)
#update log
logger.info(to_log)
# print('checkpoint harvest_solr.py')
# import pdb;pdb.set_trace()
return transform_r_json(r_json)
def transform_r_json(r_json):
"""
Extract the needed information from the json response from the solr query.
"""
# extract the needed information
bibcodes = [doc['bibcode'] for doc in r_json['response']['docs']]
titles = [doc['title'][0] for doc in r_json['response']['docs']] # without [0] it returns a list
abstracts = [doc['abstract'] for doc in r_json['response']['docs']]
# list of dictionaries with the bibcode, title, and abstract for each record
record_list = [{'bibcode': bibcodes[i],
'title' : titles[i],
'abstract' : abstracts[i],
'text': f'{titles[i]} {abstracts[i]}'} for i in range(len(bibcodes))]
# return bibcodes, titles, abstracts
return record_list