-
Notifications
You must be signed in to change notification settings - Fork 1
/
tag_json.py
79 lines (70 loc) · 2.49 KB
/
tag_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python2.7
"""
Tags JSON files ccording to valid_tags.txt and saves new tags
onto the JSON files.
"""
import db_util
import tag_analyzer
from progressbar import ProgressBar, SimpleProgress
TAGS_FILE = 'valid_tags.txt'
canonical_tags = {}
canonical_set = None # helps make set intersections easier
tag_index = {}
def get_tag_list():
with (open(TAGS_FILE,'r')) as f:
# strip each element in the resulting list
# ignore comments - lines which start with #
tag_list = [[y.strip() for y in x.split(',')] for x in f.readlines() if not x.startswith('#') and len(x.strip()) > 0]
return tag_list
def tag_list_to_json():
# TODO: write a function to parse the tag list file
pass
def read_tags():
global canonical_tags
global canonical_set
global tag_index
tags_list = get_tag_list()
for idx,tags in enumerate(tags_list):
base = tags[0].strip()
tag_index[base] = idx
for tag in tags:
tag = tag.strip()
# key will be the lower case, "cleaned" version for easy hashing
# value will be the canonical value it should translate to
canonical_tags[tag_analyzer.clean_words(tag)] = base
canonical_set = frozenset(canonical_tags.keys())
#print canonical_tags
def tag_json():
"""
Tags all the files in the JSON directory
"""
records = db_util.get_all_records()
pbar = ProgressBar(widgets=[SimpleProgress()], maxval=len(records)).start()
for idx, record in enumerate(records):
record = db_util.open_NASARecord(record)
# it's OK to concatenate all the text since we're using a
# bag of words approach
record_words = set() # set of all the words & phrases in the record
for i in range(1, tag_analyzer.MAX_PHRASE_LENGTH+1):
word_hash = {}
tag_analyzer.hash_string(record, i, word_hash)
record_words = record_words.union(set(word_hash.keys()))
# now, you have a hash of all the word combinations
#print "Description:"
#print record.description
#print record_words
found_tags = list(canonical_set.intersection(record_words))
#print "\n------------------------------------------------"
#print "Tags found:"
indicies = [tag_index[canonical_tags[x]] for x in found_tags]
record.tags = indicies
record.save()
# TODO: save back into the file
# TODO: create a tag_list.txt file which has all the possible tags
# in a list, so that you don't have to write the whole string into
# the file
pbar.update(idx + 1)
pbar.finish()
if __name__ == "__main__":
read_tags()
tag_json()