-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscrape-data.py
110 lines (88 loc) · 3.44 KB
/
scrape-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os, json, grequests, requests, functools
# --- Request tuning ---------------------------------------------------------
# Sent as the "take" value of every search request.
SEARCH_LIMIT = 10000
# Passed as ``size`` to grequests.map() for both search and case requests.
REQUEST_BATCH_SIZE = 50
# A progress line is printed each time this many case responses have arrived.
REQUEST_FEEDBACK_INTERVAL = 50
# Sent as the User-Agent header on every request.
USER_AGENT = "NamUs Scraper / github.com/prepager/namus-scraper"

# --- API endpoints ----------------------------------------------------------
# "{type}" and "{case}" are str.format placeholders filled in at request time.
API_ENDPOINT = "https://www.namus.gov/api"
STATE_ENDPOINT = API_ENDPOINT + "/CaseSets/NamUs/States"
CASE_ENDPOINT = API_ENDPOINT + "/CaseSets/NamUs/{type}/Cases/{case}"
SEARCH_ENDPOINT = API_ENDPOINT + "/CaseSets/NamUs/{type}/Search"

# --- Output -----------------------------------------------------------------
# One JSON file per case type; "{type}" is replaced with the case type name.
DATA_OUTPUT = "./output/{type}/{type}.json"

# Case types to scrape, in processing order. "stateField" names the search
# field that is matched against a state name when listing case identifiers.
CASE_TYPES = {
    caseTypeName: {"stateField": stateFieldName}
    for caseTypeName, stateFieldName in (
        ("MissingPersons", "stateOfLastContact"),
        ("UnidentifiedPersons", "stateOfRecovery"),
        ("UnclaimedPersons", "stateFound"),
    )
}

# Number of case responses received so far for the case type being scraped;
# reset by main() and incremented by requestFeedback().
completedCases = 0
def main():
    """Scrape every configured NamUs case type into its own JSON file.

    Fetches the state list once, then for each key of ``CASE_TYPES``:
    collects case identifiers with one search request per state, requests
    every case, and writes the raw case payloads as a single JSON array to
    the path produced by ``DATA_OUTPUT``.

    Searches and cases whose request failed are reported on stdout and
    skipped, so one bad response does not abort the run or corrupt the file.

    Raises:
        requests.HTTPError: if the state listing returns an error status.
    """
    global completedCases

    print("Fetching states\n")
    stateResponse = requests.get(STATE_ENDPOINT, headers={"User-Agent": USER_AGENT})
    # Surface the HTTP status instead of an opaque JSON decoding error.
    stateResponse.raise_for_status()
    states = stateResponse.json()

    for caseType in CASE_TYPES:
        print("Collecting: {type}".format(type=caseType))
        # The progress counter is per case type.
        completedCases = 0

        print(" > Fetching case identifiers")
        cases = _fetchCaseIdentifiers(caseType, states)
        print(" > Found %d cases" % len(cases))

        print(" > Creating output file")
        filePath = DATA_OUTPUT.format(type=caseType)
        os.makedirs(os.path.dirname(filePath), exist_ok=True)

        # ``with`` closes the file even if a request or write raises; the
        # explicit encoding keeps non-ASCII case text writable on any locale.
        with open(filePath, "w", encoding="utf-8") as outputFile:
            outputFile.write("[")

            print(" > Starting case processing")
            caseResponses = _fetchCaseResponses(caseType, cases)
            _writeCaseResponses(outputFile, cases, caseResponses)

            print(" > Closing output file")
            outputFile.write("]")

        print()

    print("Scraping completed")


def _fetchCaseIdentifiers(caseType, states):
    """Return the combined search results for ``caseType`` across ``states``."""
    stateList = list(states)
    searchRequests = [
        grequests.post(
            SEARCH_ENDPOINT.format(type=caseType),
            headers={"User-Agent": USER_AGENT, "Content-Type": "application/json"},
            data=json.dumps(
                {
                    "take": SEARCH_LIMIT,
                    "projections": ["namus2Number"],
                    "predicates": [
                        {
                            "field": CASE_TYPES[caseType]["stateField"],
                            "operator": "IsIn",
                            "values": [state["name"]],
                        }
                    ],
                }
            ),
        )
        for state in stateList
    ]
    searchResponses = grequests.map(searchRequests, size=REQUEST_BATCH_SIZE)

    cases = []
    for state, response in zip(stateList, searchResponses):
        # grequests.map() yields None for a request that raised, and a
        # requests.Response is falsy for 4xx/5xx statuses.
        if not response:
            print(
                " > Failed fetching case identifiers for state: {state}".format(
                    state=state["name"]
                )
            )
            continue
        # extend() keeps accumulation linear in the number of cases.
        cases.extend(response.json()["results"])
    return cases


def _fetchCaseResponses(caseType, cases):
    """Request every case in ``cases``; the result list is index-aligned with it."""
    caseRequests = (
        grequests.get(
            CASE_ENDPOINT.format(type=caseType, case=case["namus2Number"]),
            hooks={"response": requestFeedback},
            headers={"User-Agent": USER_AGENT},
        )
        for case in cases
    )
    return grequests.map(caseRequests, size=REQUEST_BATCH_SIZE)


def _writeCaseResponses(outputFile, cases, caseResponses):
    """Write the successful case payloads to ``outputFile``, comma-separated."""
    wroteAny = False
    for index, response in enumerate(caseResponses):
        if not response:
            print(
                " > Failed parsing case: {case} index {index}".format(
                    case=cases[index], index=index
                )
            )
            continue
        # Emit the separator before every element except the first one
        # actually written, so skipped cases never leave a dangling comma.
        if wroteAny:
            outputFile.write(",")
        outputFile.write(response.text)
        wroteAny = True
def requestFeedback(response, **kwargs):
    """Response hook that counts received case responses and reports progress.

    Installed as the ``response`` hook on each case request. Neither
    ``response`` nor ``kwargs`` is inspected; they are accepted only so the
    function can be used as a hook. Bumps the module-level ``completedCases``
    counter and prints a progress line on every
    ``REQUEST_FEEDBACK_INTERVAL``-th call.
    """
    global completedCases
    completedCases += 1

    # A non-zero remainder means we are between reporting boundaries.
    if completedCases % REQUEST_FEEDBACK_INTERVAL:
        return

    print(" > Completed {count} cases".format(count=completedCases))
# Run the scraper only when this file is executed as a script, so importing
# the module (e.g. to reuse its helpers) does not start a full scrape.
if __name__ == "__main__":
    main()