Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Automated Readability Index, Closes #20 ; new PR; Waiting until Checking for NaNs is implemented. #74

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions tests/test_visualization.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import string

import pandas as pd
import numpy as np
import doctest

from texthero import visualization
Expand Down Expand Up @@ -59,3 +60,29 @@ def test_top_words_digits_punctuation(self):
def test_wordcloud(self):
    """Check that wordcloud() returns None for a one-item Series."""
    text = pd.Series("one two three")
    result = visualization.wordcloud(text)
    self.assertEqual(result, None)

"""
Test automated readability index
"""

def test_automated_readability_index(self):
    """Check the ARI scores of three short reference texts."""
    s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
    s_true = pd.Series([3.0, 6.0, 0.0])
    # unittest's assertEqual evaluates ``not (first == second)``; for two
    # Series that raises ValueError (ambiguous truth value) instead of
    # comparing them, so use the pandas helper for element-wise comparison.
    pd.testing.assert_series_equal(
        visualization.automated_readability_index(s), s_true
    )

def test_automated_readability_index_index(self):
    """Check that the returned scores carry the same index as the input."""
    texts = ["New York is a beautiful city.", "Look: New York!", "Wow"]
    s = pd.Series(texts, index=[5, 6, 7])
    scores = visualization.automated_readability_index(s)
    self.assertTrue(scores.index.equals(s.index))

def test_automated_readability_index_numeric(self):
    """Check that a Series of floats is rejected with TypeError."""
    numeric = pd.Series([1.0, 2.0])
    with self.assertRaises(TypeError):
        visualization.automated_readability_index(numeric)

def test_automated_readability_index_nan(self):
    """Check that a Series containing NaN is rejected with TypeError."""
    with_missing = pd.Series(["Test", np.nan])
    with self.assertRaises(TypeError):
        visualization.automated_readability_index(with_missing)
47 changes: 45 additions & 2 deletions texthero/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
"""

import pandas as pd
import numpy as np
import plotly.express as px

from wordcloud import WordCloud

from texthero import preprocessing
from texthero import preprocessing, nlp
import string

from matplotlib.colors import LinearSegmentedColormap as lsg
Expand Down Expand Up @@ -158,7 +159,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
Return a pandas series with index the top words and as value the count.

Tokenization: split by space and remove all punctuations that are not between characters.

Parameters
----------
normalize :
Expand All @@ -185,3 +186,45 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
.explode() # one word for each line
.value_counts(normalize=normalize)
)


def automated_readability_index(s: pd.Series) -> pd.Series:
    """
    Calculate the automated readability index (ARI).

    Calculate ARI for each item in the given Pandas Series. Return a Pandas
    Series with the ARI scores, rounded up to the next integer.

    The score of each item is
    ``4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43``
    where ``characters`` is the number of letters and digits, ``words`` is
    the number of whitespace-separated tokens minus one (i.e. the number of
    gaps between tokens) and ``sentences`` is the value returned by
    ``nlp.count_sentences``. Items whose score is not finite (for example a
    single-token text, which has zero gaps) get a score of 0.

    Parameters
    ----------
    s : pd.Series
        Series whose values are all strings.

    Returns
    -------
    pd.Series
        One ARI score per item of ``s``.

    Raises
    ------
    TypeError
        If ``s`` contains any non-string value (including NaN).

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
    >>> hero.automated_readability_index(s)
    0    3.0
    1    6.0
    2    0.0
    dtype: float64

    References
    ----------
    `Automated Readability Index <https://en.wikipedia.org/wiki/Automated_readability_index>`_

    """
    # Check if type is strings only. isinstance (rather than exact type
    # equality) also accepts str subclasses.
    if not s.map(lambda item: isinstance(item, str)).all():
        raise TypeError(
            "Non-string values in series. Use hero.drop_no_content(s) to drop those values."
        )

    # Number of gaps between whitespace-separated tokens.
    words_s = s.str.split().str.len() - 1
    # Letters and digits. ``[^\W_]`` is "word character except underscore",
    # which for str patterns is Unicode-aware, so accented and non-Latin
    # letters are counted as well (identical to [a-zA-Z0-9] on ASCII text).
    characters_s = s.str.count(r"[^\W_]")
    sentences_s = nlp.count_sentences(s)

    score_s = 4.71 * (characters_s / words_s) + 0.5 * (words_s / sentences_s) - 21.43
    score_s = np.ceil(score_s)

    # Pandas does not raise an Error when dividing by zero (it yields
    # inf/NaN instead) -> replace those values by ourselves.
    score_s.loc[~np.isfinite(score_s)] = 0

    return score_s