-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebpage_process.py
110 lines (88 loc) · 2.75 KB
/
webpage_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
from bs4 import BeautifulSoup
from nltk.tag.stanford import StanfordNERTagger
import pydash as _
# Filesystem path to the serialized Stanford NER CRF classifier model.
# Overridable via the STANFORD_NER_MODEL environment variable; defaults
# to a relative ./stanford-ner checkout resolved to an absolute path.
path_sner_model = os.getenv(
    'STANFORD_NER_MODEL',
    os.path.realpath('./stanford-ner/models/english.all.3class.distsim.crf.ser.gz')
)
# Filesystem path to the Stanford NER jar, overridable via STANFORD_NER_JAR.
path_sner_jar = os.getenv(
    'STANFORD_NER_JAR',
    os.path.realpath('./stanford-ner/tagger/stanford-ner.jar')
)
# Module-level tagger shared by ner_tagging(). Constructed at import time,
# so the model and jar paths must be valid when this module is imported.
stanford_tagger = StanfordNERTagger(path_sner_model, path_sner_jar)
def cleanse_tags(webpage_data):
    """Strip out HTML tags, returning only the text content.

    @param webpage_data: string of raw HTML markup
    @returns string of plain text with all tags removed
    """
    # Fix: the doc text used to sit ABOVE the def as a no-op module-level
    # string expression, so help()/__doc__ never saw it; it is now a real
    # docstring. "html.parser" is the stdlib backend — no extra parser dep.
    return BeautifulSoup(webpage_data, "html.parser").get_text()
def ner_tagging(webpage_data):
    """Tag each whitespace-separated token with a Named Entity Recognition tag.

    Uses the module-level Stanford NER tagger (``stanford_tagger``).

    @param webpage_data: string of plain text (already stripped of HTML)
    @returns list of (word, tag) tuples, one per whitespace-split token
    """
    # Fix: doc text was a no-op string expression above the def; it is now
    # a proper docstring. Behavior is unchanged.
    return stanford_tagger.tag(webpage_data.split())
def reduce_neighbors(ner_tuple_list=None):
    """Group neighboring words that share an NER category.

    Drops tokens tagged 'O' (uncategorized) and merges each run of
    consecutive same-tagged words into a single space-joined phrase,
    e.g. [("John","PERSON"), ("Smith","PERSON")] -> [("John Smith","PERSON")].

    @param ner_tuple_list: list of (word, tag) tuples (defaults to empty)
    @returns list of (phrase, tag) tuples with 'O' entries removed
    """
    # Fix 1: mutable default argument ([]) replaced with None sentinel —
    # a shared default list would persist across calls.
    # Fix 2: pydash reduce_ + closure replaced with a plain loop; same
    # fold, stdlib-only and easier to read.
    if ner_tuple_list is None:
        ner_tuple_list = []
    grouped = []
    for word, tag in ner_tuple_list:
        if tag == 'O':
            # Uncategorized token — contributes nothing to the output.
            continue
        if grouped and grouped[-1][1] == tag:
            # Same tag as the previous kept entry: extend that phrase.
            prev_word, prev_tag = grouped[-1]
            grouped[-1] = ("%s %s" % (prev_word, word), prev_tag)
        else:
            grouped.append((word, tag))
    return grouped
def pick_most_important(ner_tuple_list=None, count=0):
    """Pick the ``count`` most frequent unique entities from ner_tuple_list.

    Entities are ranked by how often their word appears; each word is
    returned once, tagged with the tag from its most recent occurrence.

    @param ner_tuple_list: list of (word, tag) tuples (defaults to empty)
    @param count: maximum number of entries to return (default 0 -> empty)
    @returns list of (word, tag) tuples, at most ``count`` long,
             sorted by descending frequency
    """
    # Fix 1: `lambda (key, value): ...` used Python 2 tuple parameter
    # unpacking, removed in Python 3 (PEP 3113) — this was a SyntaxError.
    # Fix 2: mutable default argument ([]) replaced with None sentinel.
    # Fix 3: pydash reduce_/map_/drop_right replaced with a plain loop,
    # a comprehension, and slicing — stdlib-only, same results.
    if ner_tuple_list is None:
        ner_tuple_list = []
    # word -> {"tag": most-recent tag, "count": occurrences}
    frequencies = {}
    for word, tag in ner_tuple_list:
        entry = frequencies.get(word)
        if entry is None:
            frequencies[word] = {"tag": tag, "count": 1}
        else:
            # Latest tag wins, matching the original reducer's overwrite.
            frequencies[word] = {"tag": tag, "count": entry["count"] + 1}
    ranked = sorted(
        frequencies.items(),
        key=lambda item: item[1]["count"],
        reverse=True
    )
    ranked_tuples = [(word, info["tag"]) for word, info in ranked]
    # Only truncate when the list exceeds count (mirrors the original
    # drop_right guard, including the count=0 -> empty-result behavior).
    if len(ranked_tuples) > count:
        return ranked_tuples[:count]
    return ranked_tuples