# text_cleaner.py
import json  # used only by the optional json.dump call further down
import math

import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords


def tokenize_remove_stopwords(text):
    '''
    :param text: text string.
    :return: list of word tokens from the text with English stopwords removed.
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = stopwords.words('english')
    word_tokens = tokenizer.tokenize(text)
    words = [word for word in word_tokens if len(word) > 0 and word not in stop_words]
    return words
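
# Illustrative usage (assumes the NLTK stopwords corpus is available, e.g. fetched
# once with nltk.download('stopwords'); the exact token list depends on the corpus
# version):
#   tokenize_remove_stopwords("the quick brown fox jumps over the lazy dog")
#   -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']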


def find_bigrams(input_list):
    '''
    :param input_list: list of words.
    :return: list of bigrams of the provided list.
    '''
    bigram_list = []
    for i in range(len(input_list) - 1):
        bigram_list.append((input_list[i], input_list[i + 1]))
    return bigram_list
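
# Illustrative usage:
#   find_bigrams(['quick', 'brown', 'fox'])
#   -> [('quick', 'brown'), ('brown', 'fox')]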


def get_word_count(words):
    '''
    :param words: list of words.
    :return: list of (word, count) tuples, one per distinct word.
    '''
    distinct_words = list(set(words))
    words_count = [words.count(word) for word in distinct_words]
    return list(zip(distinct_words, words_count))
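
# Illustrative usage (pair order is arbitrary because the distinct words come from a set):
#   get_word_count(['cat', 'dog', 'cat'])
#   -> [('cat', 2), ('dog', 1)]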


def get_top_topic_keywords(word_counts, threshold=10, global_topic_words=None):
    '''
    :param word_counts: list of (word, count) tuples for a paragraph.
    :param threshold: integer percentage; the top threshold% of word counts are returned.
    :param global_topic_words: list of topic words from the whole article (text).
    :return: list of (word, count) tuples covering the top threshold% of words,
             preferring words that also appear in global_topic_words.
    '''
    if global_topic_words is None:
        global_topic_words = []
    total_words = len(word_counts)
    top = math.ceil((total_words * threshold) / 100)
    # Sort paragraph word counts from most to least frequent (in place).
    word_counts.sort(
        key=lambda word: word[1],
        reverse=True
    )
    topic_words = []
    paragraph_topic_words = [paragraph_topic_word[0] for paragraph_topic_word in word_counts]
    # First take words that are topic words for the whole text and also occur in
    # this paragraph, until the quota of top words is used up.
    for g_topic_keyword in global_topic_words:
        if g_topic_keyword in paragraph_topic_words:
            if top < 1:
                break
            for topic_keyword in word_counts:
                if topic_keyword[0] == g_topic_keyword:
                    topic_words.append(topic_keyword)
                    top = top - 1
    # for paragraph_topic_word in word_counts:
    #     if paragraph_topic_word[0] in global_topic_words:
    #         topic_words.append(paragraph_topic_word)
    #         top = top - 1
    # Then fill any remaining quota with the paragraph's own most frequent words.
    if top > 0:
        for paragraph_topic_word in word_counts:
            if paragraph_topic_word[0] not in global_topic_words:
                topic_words.append(paragraph_topic_word)
                top = top - 1
                if top < 1:
                    break
    return topic_words
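
# Illustrative usage: with a 50% threshold over three distinct words the quota is
# ceil(3 * 50 / 100) = 2; the global topic word 'dog' is taken first, then the most
# frequent remaining paragraph word:
#   get_top_topic_keywords([('cat', 4), ('dog', 2), ('bird', 1)],
#                          threshold=50, global_topic_words=['dog'])
#   -> [('dog', 2), ('cat', 4)]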


def word_count_with_most_used(words, threshold=10, global_topic_keywords=None):
    '''
    :param words: list of keywords.
    :param threshold: percentage of words to keep as topic words, default 10.
    :param global_topic_keywords: list of topic keywords from the whole text.
    :return: dict {'word_count': [(word, count), ...], 'global_words': [(word, count), ...]}
             where 'global_words' holds the top topic keywords.
    '''
    if global_topic_keywords is None:
        global_topic_keywords = []
    word_count = get_word_count(words)
    global_most_used_words = get_top_topic_keywords(word_counts=word_count, threshold=threshold,
                                                    global_topic_words=global_topic_keywords)
    return {'word_count': list(word_count), 'global_words': global_most_used_words}
# json.dump(word_count_and_most_used_global, open('global-word-counts.json', 'w'), indent=4)
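
# Illustrative usage: counts every word and keeps roughly the top 10% as topic words
# ('word_count' comes back sorted by count because get_top_topic_keywords sorts the
# list in place):
#   word_count_with_most_used(['cat', 'dog', 'cat'])
#   -> {'word_count': [('cat', 2), ('dog', 1)], 'global_words': [('cat', 2)]}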


def process_text(allstr, word_count_and_most_used_global, min_paragraph_length=50):
    '''
    :param allstr: list of all lines of the text.
    :param word_count_and_most_used_global: word count and global topic keywords for the whole text.
    :param min_paragraph_length: minimum number of words before a paragraph is emitted.
    :return: None
    '''
    # Merge consecutive lines into paragraphs of at least min_paragraph_length words.
    paragraph_list = []
    paragraph_text = ""
    for line in allstr:
        paragraph_text = "%s %s" % (paragraph_text, line)
        if len(paragraph_text.strip().split(' ')) > min_paragraph_length:
            paragraph_list.append(paragraph_text.replace('\n', ' ').replace('\r', '').rstrip())
            paragraph_text = ""
    # Report word, bigram, and topic-word statistics for each paragraph.
    for paragraph in paragraph_list:
        if len(paragraph.strip()) > 0:
            paragraph_words = tokenize_remove_stopwords(paragraph.lower())
            bigrams = nltk.bigrams(paragraph_words)
            bigram_strs = [' '.join(bigram) for bigram in bigrams]
            global_topic_keywords = [global_topic_keyword[0] for global_topic_keyword in
                                     word_count_and_most_used_global['global_words']]
            word_count_and_most_used = word_count_with_most_used(paragraph_words, threshold=10,
                                                                 global_topic_keywords=global_topic_keywords)
            global_topic_keywords_str = ",".join(
                ["%s : %s " % (g_count[0], g_count[1]) for g_count in
                 word_count_and_most_used_global['global_words']])
            words_str = ",".join(paragraph_words)
            bigrams_str = ",".join(bigram_strs)
            word_count_str = ",".join(
                ["%s : %s " % (w_count[0], w_count[1]) for w_count in word_count_and_most_used['word_count']])
            paragraph_topic_word_counts = ",".join(
                ["%s : %s " % (g_count[0], g_count[1]) for g_count in word_count_and_most_used['global_words']])
            paragraph_query_str = " ".join(
                ["%s " % (g_count[0],) for g_count in word_count_and_most_used['global_words']])
            print(
                "\n--------- Paragraph Start -------------",
                "\nTEXT : %s" % paragraph.rstrip(),
                "\nLENGTH : %s" % len(paragraph.rstrip().split(' ')),
                "\nWORDS : { %s }" % words_str,
                "\nBI-GRAMS : { %s }" % bigrams_str,
                "\nWORD-COUNT : { %s }" % word_count_str,
                "\nPARAGRAPH-TOPIC-WORD-COUNT : { %s }" % paragraph_topic_word_counts,
                "\nGLOBAL-TOPIC-WORD-COUNT : { %s }" % global_topic_keywords_str,
                "\nPARAGRAPH-QUERY : { %s }" % paragraph_query_str,
                "\n--------- Paragraph End ---------------"
            )


if __name__ == '__main__':
    # Build word counts and global topic keywords for the whole text, then
    # report per-paragraph statistics against them.
    with open('test4.txt') as f:
        allstr = f.readlines()
    words = tokenize_remove_stopwords(" ".join(allstr).lower())
    word_count_and_most_used_global = word_count_with_most_used(words)
    process_text(allstr=allstr, word_count_and_most_used_global=word_count_and_most_used_global)