Skip to content

Commit

Permalink
Experiment with sorting through sets of texts
Browse files Browse the repository at this point in the history
  • Loading branch information
MatMoore committed Aug 19, 2023
1 parent 086ece7 commit 8c9f2af
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 0 deletions.
41 changes: 41 additions & 0 deletions guess_readability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import sys
from pathlib import Path
from dictionary import load_vocab
from konlpy.tag import Kkma

# Morphological analyzer (KoNLPy's Kkma tagger) shared by the whole module.
analyzer = Kkma()

# Vocabulary entries loaded from the project's dictionary module.
# NOTE(review): each entry is assumed to expose the term text as `.value`.
dictionary = load_vocab()

# Every noun the analyzer extracts from any vocabulary entry; used as the
# "known" reference set when scoring texts.
known_nouns = {
    noun
    for entry in dictionary
    for noun in analyzer.nouns(entry.value)
}


def percent_known_nouns(text):
    """
    Return the fraction of nouns in *text* that appear in the dictionary.

    Nouns are extracted with the module-level ``analyzer`` and each
    occurrence is checked against the module-level ``known_nouns`` set.

    The result is a ratio between 0.0 and 1.0 (not scaled to 0-100).
    If no nouns are found in *text* (e.g. empty input), 0.0 is returned
    instead of dividing by zero.
    """
    nouns = analyzer.nouns(text)
    total = len(nouns)
    if total == 0:
        # Nothing to measure; avoid ZeroDivisionError and keep the result
        # a plain float so callers can still sort/compare it.
        return 0.0

    known_count = sum(1 for noun in nouns if noun in known_nouns)
    return known_count / total


if __name__ == '__main__':
    # Usage: python guess_readability.py DIRECTORY
    # Scores every *.txt file directly inside DIRECTORY and prints the
    # lowest-scoring file, the highest-scoring file, and the top ten.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} DIRECTORY")

    path = Path(sys.argv[1])

    results = []
    for filename in path.glob("*.txt"):
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default encoding, which may not handle Korean text.
        # NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
        text = filename.read_text(encoding="utf-8")
        results.append((percent_known_nouns(text), filename))

    if not results:
        # min()/max() would raise ValueError on an empty list.
        sys.exit(f"no .txt files found in {path}")

    # Tuples compare by metric first, then by path as a tie-breaker.
    print(min(results))
    print(max(results))

    # Ten highest-scoring files, best first.
    for metric, filename in sorted(results, reverse=True)[:10]:
        print(f"{metric} {filename}")
9 changes: 9 additions & 0 deletions requirements-nlp.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
konlpy~=v0.5.2
JPype1~=1.4.1
beautifulsoup4~=4.6.0
colorama~=0.4.6
lxml~=4.9.3
numpy~=1.25.2
oauthlib~=3.2.2
requests-oauthlib~=1.3.1
tweepy==3.10.0

0 comments on commit 8c9f2af

Please sign in to comment.