-
Notifications
You must be signed in to change notification settings - Fork 0
/
show_extractor.py
89 lines (72 loc) · 2.63 KB
/
show_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from requests_html import HTML
def get_shows_html(doc):
html = HTML(html=doc)
# deduplicate movie names as they could be repeated across the text
content = set()
for elem in html.find('font[style*="italic"]'):
title = elem.text.strip()
if title and title != ".":
if ',' in title:
names = title.split(',')
for name in names:
name = name.strip()
if name:
content.add(name)
else:
content.add(title.strip())
return content
def get_shows_nlp(doc):
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp(doc)
content = set()
for ent in doc.ents:
# Uncomment to see additional information about each detected
# entity
# print(ent.text, ent.start_char, ent.end_char, ent.label_)
content.add(ent.text)
return content
def extract_content_section(filepath):
content_html = []
content_txt = []
with open(filepath, 'r') as fp:
doc = fp.read()
html = HTML(html=doc)
is_content_div = False
for elem in html.find('div'):
if elem.text == 'Content':
is_content_div = True
continue
if is_content_div:
style = elem.attrs.get('style', '')
if 'font-size:18pt' in style:
is_content_div = False
break
if 'line-height:120%' and 'font-size:11pt' in style\
and 'padding-bottom:6px' not in style:
content_html.append(elem.html)
content_txt.append(elem.text.strip())
return ("\n".join(content_html),
"\n".join(content_txt).replace("\n"," "))
def get_movie_sentences(movies, text):
sentences = text.split(". ")
movie_sentences = {}
for movie in sorted(list(movies)):
if not movie in movie_sentences:
movie_sentences[movie] = []
for sentence in sentences:
if movie in sentence:
movie_sentences[movie].append(sentence)
return movie_sentences
def emit_html(movie_sentences):
for movie in sorted(movie_sentences.keys()):
print(f"<li title='{'. '.join(movie_sentences[movie]).strip()}'>{movie}</li>")
if __name__ == '__main__':
(html, text) = extract_content_section('data/8K-Exhibit-91-2019-Q1.html')
print("Results using html parsing:")
movies_html = get_shows_html(html)
print(movies_html)
print("")
print("Results using nlp named entity detection:")
movies_nlp = get_shows_nlp(text)
print(movies_nlp)