-
Notifications
You must be signed in to change notification settings - Fork 1
/
page_parser.py
49 lines (37 loc) · 1.18 KB
/
page_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import parser_util
import db_util
import parser_util
import os
import os.path
# Suffix appended to a record name (after db_util.SOURCE_DIRECTORY) to build
# the path of the page file handed to parser_util.parse_data.
PAGE_EXTENSION = '.html'
def get_record_list(path='record_list.txt', expected_count=32841):
    """Return the record names listed in a text file, one per line.

    Each line is stripped of leading/trailing whitespace (including the
    newline). Blank lines are kept as empty strings, exactly as before.

    Args:
        path: file to read; defaults to 'record_list.txt' in the current
            working directory.
        expected_count: number of lines the file must contain. Pass None to
            skip the sanity check. Defaults to 32841.

    Returns:
        A list of stripped lines, in file order.

    Raises:
        AssertionError: if expected_count is given and the number of lines
            read differs from it.
        IOError/OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails; the
    # original left the file object open until garbage collection.
    with open(path, 'r') as record_file:
        record_list = [line.strip() for line in record_file]

    # Raised explicitly (instead of a bare assert statement) so the sanity
    # check is not silently dropped when Python runs with -O.
    if expected_count is not None and len(record_list) != expected_count:
        raise AssertionError(
            'expected %d records in %s, found %d'
            % (expected_count, path, len(record_list)))
    return record_list
def parse_all_records(overwrite=True):
    """Parse and save every record returned by get_record_list().

    For each record name, the page path is built as
    db_util.SOURCE_DIRECTORY + record + PAGE_EXTENSION, a progress line
    "<index> : <path>" is printed, the page is parsed with
    parser_util.parse_data and the result's save() method is called.

    Args:
        overwrite: when False, a record is skipped if a file named
            db_util.SOURCE_DIRECTORY + record + db_util.JSON_EXTENSION
            already exists. When True (default) every record is processed.
    """
    record_list = get_record_list()
    for idx, record in enumerate(record_list):
        json_path = db_util.SOURCE_DIRECTORY + record + db_util.JSON_EXTENSION
        if not overwrite and os.path.isfile(json_path):
            # TODO: this skip check does not work well because the name
            # format of the HTML file names differs from that of the record
            # inside the HTML file.
            continue

        fname = db_util.SOURCE_DIRECTORY + record + PAGE_EXTENSION
        # Single pre-formatted argument: prints the same "idx : fname" text
        # under both the Python 2 print statement and Python 3 print().
        print('%s : %s' % (idx, fname))
        # Existence of the page file is not verified here:
        #assert(os.path.exists(fname))
        parsed = parser_util.parse_data(fname)
        parsed.save()
if __name__ == "__main__":
    # Script entry point: parse and save every record in the record list.
    #
    # Single-page invocation, kept for reference:
    #     fname = 'iss011e12835.html'
    #     x = parser_util.parse_data(fname)
    #     x.save()
    #     print x.description
    # Other page names that appeared here as alternatives:
    #     'jsc2001e07997.html', 's111e5224.html', 'iss005-366-029.html'
    # Listing the pages directly was also sketched:
    #     pages = os.listdir('nasa_gallery/')
    parse_all_records()