# zipf_curve.py (forked from x4nth055/pythoncode-tutorials)
# Imports
import os
import string
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt
from scipy.interpolate import make_interp_spline
# Module-level containers filled in by the rest of the script.
# Each dictionary is keyed by the name of a text.
texts = {}            # text name -> text content
textwordamounts = {}  # text name -> {word: number of occurrences}
textlengths = {}      # text name -> number of words in the text
# Punctuation characters that are stripped from every text
unwantedCharacters = [*string.punctuation]
# How many ranks we'll show
depth = 10
# Tick labels for the ranks, '1' up to str(depth)
xAxis = list(map(str, range(1, depth + 1)))
# Getting all files in the text folder.
# os.listdir returns the names in arbitrary order, so they are sorted to keep
# the order of the plotted lines and legend entries stable between runs.
filePaths = sorted(os.listdir('texts'))
# Getting text from .txt files in folder
for path in filePaths:
    fullPath = os.path.join('texts', path)
    name, extension = os.path.splitext(path)
    # Skip sub-directories and anything that is not a .txt file: opening
    # those would either raise or feed non-text data into the word count.
    if extension.lower() != '.txt' or not os.path.isfile(fullPath):
        continue
    with open(fullPath, 'r', encoding='UTF-8') as f:
        # Key by the file name without its extension. splitext only strips
        # the last suffix, so other dots in the name are preserved
        # (e.g. 'vol.1.txt' -> 'vol.1') and such names no longer collide.
        texts[name] = f.read()
# Cleaning and counting the Text
# Translation table that deletes every unwanted character; built once and
# reused for all texts.
punctuationTable = str.maketrans('', '', ''.join(unwantedCharacters))
for text in texts:
    # Remove unwanted characters and lower-case the text in a single pass
    texts[text] = texts[text].translate(punctuationTable).lower()
    # split() without an argument splits on any run of whitespace, so words
    # separated by newlines or tabs are told apart and repeated spaces do
    # not produce empty strings that would be counted as a "word".
    splittedText = texts[text].split()
    # Saving the text length to show in the label of the line later
    textlengths[text] = len(splittedText)
    # Count the occurrences of each word and keep only the `depth` most
    # frequent ones, ordered from most to least frequent.
    textwordamounts[text] = dict(Counter(splittedText).most_common(depth))
# Get the percentage value of a given max value
def percentify(value, max):
    """Return *value* as a whole-number percentage of *max*.

    Args:
        value: The quantity to express as a percentage.
        max: The quantity that corresponds to 100 percent. The name shadows
            the builtin ``max`` inside this function; it is kept so that
            existing keyword callers continue to work.

    Returns:
        ``value / max * 100`` rounded with the builtin ``round``, which
        rounds exact halves to the nearest even integer.

    Raises:
        ZeroDivisionError: If *max* is zero.
    """
    return round(value / max * 100)
# Generate smooth curves
def smoothify(yInput):
    """Interpolate a sequence of values into a smooth curve.

    The values are placed at x positions 0, 1, 2, ... and a cubic
    interpolating spline is fitted through them.

    Args:
        yInput: Sequence of numeric y values, one per x position. A cubic
            spline is fitted, so at least four values are expected.

    Returns:
        A tuple ``(x_smooth, y_smooth)`` of NumPy arrays with 600 points
        each: evenly spaced x positions from the first to the last input
        position, and the spline evaluated at those positions.
    """
    y = np.array(yInput)
    # One x position per input value, so the curve always matches the input
    # length instead of depending on the module-level `depth`.
    x = np.arange(len(y))
    # define x as 600 equally spaced values between the min and max of original x
    x_smooth = np.linspace(x.min(), x.max(), 600)
    # define spline with degree k=3, which determines the amount of wiggle
    spl = make_interp_spline(x, y, k=3)
    # Return the two arrays for the x and y axis
    return x_smooth, spl(x_smooth)
# Make the perfect Curve: rank r gets 100 / r percent of the top frequency
ziffianCurveValues = [100/i for i in range(1, depth+1)]
x, y = smoothify(ziffianCurveValues)
plt.plot(x, y, label='Ziffian Curve', ls=':', color='grey')
# Plot the texts
for i in textwordamounts:
    # Occurrence counts of the ranked words, most frequent first
    counts = list(textwordamounts[i].values())
    # A text with fewer than `depth` distinct words cannot fill every rank:
    # an empty one has no top count to scale by, and a short one does not
    # match the number of plotted ranks. Skip it instead of crashing.
    if len(counts) < depth:
        print(f'Skipping {i}: fewer than {depth} distinct words')
        continue
    # Scale every count relative to the most frequent word (= 100 percent)
    maxValue = counts[0]
    yAxis = [percentify(value, maxValue) for value in counts]
    x, y = smoothify(yAxis)
    # The legend entry shows the text name and its word count
    plt.plot(x, y, label=f'{i} [{textlengths[i]}]', lw=1, alpha=0.5)
# Label the x positions 0..depth-1 with the ranks 1..depth
plt.xticks(range(0, depth), xAxis)
plt.legend()
# Save before show(): the figure is written to disk, then displayed
plt.savefig('wordamounts.png', dpi=300)
plt.show()