-
Notifications
You must be signed in to change notification settings - Fork 0
/
counter.py
137 lines (123 loc) · 5.07 KB
/
counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import string
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
# Read the file line by line and analyse the word frequency
def count_frequency(file_location):
    """Count how often each non-stop word occurs in a text file.

    Every line is lower-cased, stripped of ASCII punctuation and digits, and
    split on whitespace. Words returned by ``get_stop_word_list()`` are
    ignored.

    Args:
        file_location: Path of the text file to analyse.

    Returns:
        A dict mapping each word to its count, ordered from the highest count
        to the lowest (words with equal counts keep first-seen order), or
        ``None`` if an ``IOError`` occurs while reading the input file or the
        stop word file.
    """
    # One table removes punctuation and digits in a single pass per line,
    # instead of building two tables for every line.
    strip_table = str.maketrans("", "", string.punctuation + string.digits)
    counts = Counter()
    try:
        with open(file_location, "r") as file:
            # A set gives constant-time membership tests; scanning a list for
            # every word in the file is needlessly slow.
            stop_words = set(get_stop_word_list())
            for line in file:
                cleaned = line.lower().translate(strip_table)
                # split() with no argument splits on any run of whitespace
                # (tabs included) and never yields empty strings.
                counts.update(
                    word for word in cleaned.split() if word not in stop_words
                )
    except IOError:
        print("Invalid file location")
        return None
    # most_common() sorts by count, descending, keeping first-seen order for
    # ties; dict() turns the pairs back into a plain ordered dict.
    return dict(counts.most_common())
# Load stop word list from kaggle's englishST.txt
def get_stop_word_list(file_location='englishST.txt'):
    """Load stop words from a whitespace-separated text file.

    Args:
        file_location: Path of the stop word file. Defaults to
            ``englishST.txt``, resolved against the current working
            directory.

    Returns:
        A list of the whitespace-separated tokens in the file, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_location) as f:
        # str.split() already returns a new list; no copy loop is needed.
        return f.read().split()
# feature to write the word and frequency into file
def write_to_file(sortedDict, file_location):
    """Write every word and its count to a text file, one entry per line.

    Args:
        sortedDict: Mapping of word to count; entries are written in the
            mapping's iteration order as ``word ----> count``.
        file_location: Path of the file to create or overwrite.
    """
    with open(file_location, 'w') as out:
        out.writelines(
            f"{word} ----> {count}\n" for word, count in sortedDict.items()
        )
# make a new text file, then write user input to that
def create_user_input_file(lines):
    """Save console input to ``userText.txt``, one item per line.

    Args:
        lines: Iterable of strings; each one is written followed by a
            newline.

    An ``IOError`` raised while opening or writing the file is reported on
    the console rather than propagated.
    """
    try:
        with open("userText.txt", 'w') as out:
            out.writelines(entry + '\n' for entry in lines)
    except IOError:
        print("Error writing into the file from user input")
# Print each word and its frequency to the console
def print_the_words_with_frequency(sortedDict):
    """Print a two-column table of words and their counts to the console.

    Args:
        sortedDict: Mapping of word to count; rows are printed in the
            mapping's iteration order.
    """
    header = f"Word{' ':14} Frequency{' ':15}"
    print(header)
    for word, count in sortedDict.items():
        # The word is padded to 15 columns; the count is formatted 8 wide.
        row = f"{word:15} {count:8}"
        print(row)
# Plot the most frequent words as a bar chart with matplotlib
def plot_graph(sortedDict, top_n=25):
    """Show a bar chart of the most frequent words.

    Args:
        sortedDict: Mapping of word to count.
        top_n: Number of highest-count words to plot. Defaults to 25.
    """
    # Sort here as well so the chart is correct even when the mapping passed
    # in is not already ordered by count.
    top_items = sorted(sortedDict.items(), key=itemgetter(1), reverse=True)[:top_n]
    # Hand matplotlib plain lists rather than dict views.
    data_words = [word for word, _ in top_items]
    words_counts = [count for _, count in top_items]
    indexes = np.arange(len(data_words))
    width = 0.4
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.bar(indexes, words_counts, width)
    # Tick labels are placed half a bar width to the right of each bar position.
    plt.xticks(indexes + width * 0.5, data_words, rotation=30)
    # The title is derived from top_n so it always matches what is plotted
    # (it previously read "Top 20" while 25 words were shown).
    plt.title(label=f"Top {top_n} Word Frequency counter",
              loc="left",
              fontstyle='italic')
    mng = plt.get_current_fig_manager()
    mng.full_screen_toggle()
    mng.set_window_title("Word Frequency Counter project by - Jyoti Narang")
    plt.show()
def main():
    """Run the interactive word-frequency counter.

    Asks whether the text comes from the console (choice 1) or from a file
    (choice 2), counts word frequencies, prints a summary followed by the
    full word table, and finally plots the most frequent words.
    """
    user_input = input("Read text from console or local file location. Choose 1 for to input through console and 2 for input through file :").strip()
    result = None
    if user_input == "1":
        print("Enter Text. To continue entering more press enter or else press q to quit: ")
        lines = []
        while True:
            try:
                line = input()
            except EOFError:
                # End of input (closed or piped stdin) finishes entry the
                # same way as typing "q".
                break
            if line.rstrip().lower() == 'q':
                break
            lines.append(line)
        create_user_input_file(lines)
        result = count_frequency("userText.txt")
    elif user_input == "2":
        file_name = input("Enter relative path of file : ")
        result = count_frequency(file_name)
    else:
        print("Incorrect choice!")

    if result is None:
        print("Result is unavailable")
        return
    if not result:
        # An empty mapping would make max() below raise ValueError.
        print("No words to report: the input was empty or contained only stop words")
        return

    print("\n\n######## Word Frequency Occurrence Details ########")
    # len(result) is the number of distinct words, not the total word count.
    print("Total number of unique words:", len(result))
    print("Most common word used is: ", max(result, key=result.get))
    print("######################################################")
    print_the_words_with_frequency(result)
    print("#####################################################")
    print("########### Plotting the graph please wait.... ########")
    plot_graph(result)
    print("########### Plotting finished ########")


if __name__ == "__main__":
    main()