jbesomi · henrifroese · Jul 12, 2020 · Jul 12, 2020 · Jul 13, 2020
diff --git a/tests/test_visualization.py b/tests/test_visualization.py
@@ -1,6 +1,7 @@
 import string
 
 import pandas as pd
+import numpy as np
 import doctest
 
 from texthero import visualization
@@ -59,3 +60,38 @@ def test_top_words_digits_punctuation(self):
     def test_wordcloud(self):
         s = pd.Series("one two three")
         self.assertEqual(visualization.wordcloud(s), None)
+
+    """
+    Test automated readability index
+    """
+
+    def test_automated_readability_index(self):
+        # Expected scores are the rounded-up ARI values, with 0.0 for the
+        # single-word text whose score is not finite.
+        s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
+        s_true = pd.Series([3.0, 6.0, 0.0])
+        # assertEqual would evaluate `not (a == b)` on an element-wise boolean
+        # Series, which raises ValueError; compare with the pandas helper.
+        pd.testing.assert_series_equal(
+            visualization.automated_readability_index(s), s_true
+        )
+
+    def test_automated_readability_index_index(self):
+        # The result must keep the index of the input Series.
+        s = pd.Series(
+            ["New York is a beautiful city.", "Look: New York!", "Wow"],
+            index=[5, 6, 7],
+        )
+        self.assertTrue(
+            visualization.automated_readability_index(s).index.equals(s.index)
+        )
+
+    def test_automated_readability_index_numeric(self):
+        # Non-string values are rejected.
+        s = pd.Series([1.0, 2.0])
+        self.assertRaises(TypeError, visualization.automated_readability_index, s)
+
+    def test_automated_readability_index_nan(self):
+        # NaN is a float, so a Series containing it is rejected as well.
+        s = pd.Series(["Test", np.nan])
+        self.assertRaises(TypeError, visualization.automated_readability_index, s)
diff --git a/texthero/visualization.py b/texthero/visualization.py
@@ -3,11 +3,12 @@
 """
 
 import pandas as pd
+import numpy as np
 import plotly.express as px
 
 from wordcloud import WordCloud
 
-from texthero import preprocessing
+from texthero import preprocessing, nlp
 import string
 
 from matplotlib.colors import LinearSegmentedColormap as lsg
@@ -158,7 +159,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
     Return a pandas series with index the top words and as value the count.
 
     Tokenization: split by space and remove all punctuations that are not between characters.
-    
+
     Parameters
     ----------
     normalize :
@@ -185,3 +186,67 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
         .explode()  # one word for each line
         .value_counts(normalize=normalize)
     )
+
+
+def automated_readability_index(s: pd.Series) -> pd.Series:
+    """
+    Compute the automated readability index (ARI) of every text.
+
+    For each text, 4.71 * (characters / words) + 0.5 * (words / sentences)
+    - 21.43 is rounded up to a whole number; a score that is not finite
+    (e.g. after a division by zero) is replaced by 0.
+
+    Parameters
+    ----------
+    s : pd.Series
+        Series containing only strings.
+
+    Returns
+    -------
+    pd.Series
+        The ARI score of each item of ``s``.
+
+    Raises
+    ------
+    TypeError
+        If ``s`` contains a value that is not a string.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
+    >>> hero.automated_readability_index(s)
+    0    3.0
+    1    6.0
+    2    0.0
+    dtype: float64
+
+    References
+    ----------
+    `Automated Readability Index <https://en.wikipedia.org/wiki/Automated_readability_index>`_
+
+    """
+    # Reject the whole Series as soon as one value is not exactly a str.
+    only_strings = s.map(type).eq(str).all()
+    if not only_strings:
+        raise TypeError(
+            "Non-string values in series. Use hero.drop_no_content(s) to drop those values."
+        )
+
+    # NOTE(review): the number of whitespace-separated tokens is reduced by
+    # one before it is used as the word count - confirm this is intended.
+    n_words = s.str.split().str.len() - 1
+    n_characters = s.str.count(r"[a-zA-Z0-9]")  # ASCII letters and digits only.
+    n_sentences = nlp.count_sentences(s)
+
+    characters_per_word = n_characters / n_words
+    words_per_sentence = n_words / n_sentences
+    ari = np.ceil(4.71 * characters_per_word + 0.5 * words_per_sentence - 21.43)
+
+    # Dividing by zero does not raise in pandas; it leaves inf/NaN behind,
+    # so those scores are replaced by 0 here.
+    not_finite = ~np.isfinite(ari)
+    ari.loc[not_finite] = 0
+
+    return ari