From 23042fd43b7678ee351df93bf06eb136e36cff8d Mon Sep 17 00:00:00 2001
From: Henri Froese <henri.froese@yahoo.com>
Date: Sun, 9 Aug 2020 19:40:03 +0200
Subject: [PATCH 1/3] First draft for `most_similar`, closes #45

- function `most_similar` in `representation.py`

Co-authored-by: Maximilian Krahn <maximilian.krahn@icloud.com>
---
 texthero/representation.py | 82 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index ba6ebddb..8ef205c1 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -10,10 +10,11 @@
 from sklearn.decomposition import PCA, NMF
 from sklearn.cluster import KMeans, DBSCAN, MeanShift
 from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.metrics import pairwise_distances
 from sklearn.preprocessing import normalize as sklearn_normalize
 from scipy.sparse import coo_matrix
 
-from typing import Optional, Union, Any
+from typing import Optional, Union, Any, List
 
 from texthero import preprocessing
 
@@ -1019,3 +1020,82 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
     s_result.index = s.index
 
     return s_result
+
+
+"""
+Most similar
+"""
+
+
+def most_similar(s, vector: List[float], max_number=None):
+    """
+    Return the most similar vectors in s to the given vector.
+
+    To find the most similar documents to a document, first represent
+    the Pandas Series with the documents, e.g. with
+    :meth:`hero.representation.tfidf`. Then use this function
+    to find the most similar documents according to the representation.
+    Similar vectors are returned sorted by similarity descending.
+
+    Internally, Euclidean distance is used to judge similarity.
+
+    Series s can either be a :class:`texthero._types.RepresentationSeries`
+    or a :class:`texthero._types.VectorSeries`.
+
+    Parameters
+    ----------
+    s : :class:`texthero._types.RepresentationSeries` or 
+        :class:`texthero._types.VectorSeries`
+        The Series in which to find the most similar vectors.
+
+    vector : List[float]
+        The vector for which we want to find the most similar documents.
+
+    max_number: int or None, default 100
+        Maximum amount of indexes of similar documents to return.
+        If None, returns all.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"])
+    >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series
+    >>> # want to find the two most similar to "I like football", which has index 0
+    >>> s_most_similar = hero.most_similar(s_pca, s_pca[0], max_number=2)
+    >>> s_most_similar
+    0                       [0.0]
+    2    [2.1073424255447017e-08]
+    dtype: object
+    >>> # get text of the most similar ones (of course, the text is most similar to itself)
+    >>> s[s_most_similar.index]
+    0    I like football
+    2      I like sports
+    dtype: object
+
+    """
+    if _check_is_valid_representation(s):
+        if pd.api.types.is_sparse(s):
+            s_coo_matrix = s.sparse.to_coo()[0]
+        else:
+            s = s.astype("Sparse")
+            s_coo_matrix = s.sparse.to_coo()[0]
+
+        s_for_vectorization = s_coo_matrix
+        s_flat_index = s.index.levels[0]
+
+    else:
+        s_for_vectorization = list(s)
+        s_flat_index = s.index
+
+    s_distances = pd.Series(
+        pairwise_distances(s_for_vectorization, np.array([vector])).tolist(),
+        index=s_flat_index,
+    )
+
+    s_distances = s_distances.sort_values()
+
+    if max_number is not None:
+        return s_distances[:max_number]
+    else:
+        return s_distances

From c77ff230367192eabc8cff5cb6437f67ef5919be Mon Sep 17 00:00:00 2001
From: Henri Froese <henri.froese@yahoo.com>
Date: Mon, 10 Aug 2020 18:34:49 +0200
Subject: [PATCH 2/3] Skip doctest (float error)

---
 texthero/representation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index 8ef205c1..a12074cb 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -1063,7 +1063,7 @@ def most_similar(s, vector: List[float], max_number=None):
     >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series
     >>> # want to find the two most similar to "I like football", which has index 0
     >>> s_most_similar = hero.most_similar(s_pca, s_pca[0], max_number=2)
-    >>> s_most_similar
+    >>> s_most_similar  # doctest: +SKIP
     0                       [0.0]
     2    [2.1073424255447017e-08]
     dtype: object

From ece3a5a4dfb6eb12d2c8ff34b47bb855d7673cff Mon Sep 17 00:00:00 2001
From: Henri Froese <henri.froese@yahoo.com>
Date: Mon, 10 Aug 2020 20:39:16 +0200
Subject: [PATCH 3/3] Change `most_similar` to return the documents immediately

---
 texthero/representation.py | 57 +++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index a12074cb..1403d90d 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -17,6 +17,7 @@
 from typing import Optional, Union, Any, List
 
 from texthero import preprocessing
+from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries
 
 import logging
 import warnings
@@ -1027,7 +1028,13 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
 """
 
 
-def most_similar(s, vector: List[float], max_number=None):
+@InputSeries(TextSeries)
+def most_similar(
+    s: TextSeries,
+    s_represented: Union[VectorSeries, RepresentationSeries],
+    vector: List[float],
+    max_number=None,
+) -> TextSeries:
     """
     Return the most similar vectors in s to the given vector.
 
@@ -1044,12 +1051,15 @@ def most_similar(s, vector: List[float], max_number=None):
 
     Parameters
     ----------
-    s : :class:`texthero._types.RepresentationSeries` or 
-        :class:`texthero._types.VectorSeries`
-        The Series in which to find the most similar vectors.
+    s : :class:`texthero._types.TextSeries` 
+        The Series in which we want to find similar documents.
+
+    s_represented : :class:`texthero._types.RepresentationSeries` or 
+                    :class:`texthero._types.VectorSeries`
+        The Series by which the similarity is calculated.
 
     vector : List[float]
-        The vector for which we want to find the most similar documents.
+        The vector to which we want to find the most similar documents.
 
     max_number: int or None, default 100
         Maximum amount of indexes of similar documents to return.
@@ -1062,40 +1072,37 @@ def most_similar(s, vector: List[float], max_number=None):
     >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"])
     >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series
     >>> # want to find the two most similar to "I like football", which has index 0
-    >>> s_most_similar = hero.most_similar(s_pca, s_pca[0], max_number=2)
-    >>> s_most_similar  # doctest: +SKIP
-    0                       [0.0]
-    2    [2.1073424255447017e-08]
-    dtype: object
-    >>> # get text of the most similar ones (of course, the text is most similar to itself)
-    >>> s[s_most_similar.index]
+    >>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2)
+    >>> s_most_similar
     0    I like football
     2      I like sports
     dtype: object
 
     """
-    if _check_is_valid_representation(s):
-        if pd.api.types.is_sparse(s):
-            s_coo_matrix = s.sparse.to_coo()[0]
+    if _check_is_valid_representation(s_represented):
+        if pd.api.types.is_sparse(s_represented):
+            s_represented_coo_matrix = s_represented.sparse.to_coo()[0]
         else:
-            s = s.astype("Sparse")
-            s_coo_matrix = s.sparse.to_coo()[0]
+            s_represented = s_represented.astype("Sparse")
+            s_represented_coo_matrix = s_represented.sparse.to_coo()[0]
 
-        s_for_vectorization = s_coo_matrix
-        s_flat_index = s.index.levels[0]
+        s_represented_for_vectorization = s_represented_coo_matrix
+        s_represented_flat_index = s_represented.index.levels[0]
 
     else:
-        s_for_vectorization = list(s)
-        s_flat_index = s.index
+        s_represented_for_vectorization = list(s_represented)
+        s_represented_flat_index = s_represented.index
 
     s_distances = pd.Series(
-        pairwise_distances(s_for_vectorization, np.array([vector])).tolist(),
-        index=s_flat_index,
+        pairwise_distances(
+            s_represented_for_vectorization, np.array([vector])
+        ).tolist(),
+        index=s_represented_flat_index,
     )
 
     s_distances = s_distances.sort_values()
 
     if max_number is not None:
-        return s_distances[:max_number]
+        return s[s_distances[:max_number].index]
     else:
-        return s_distances
+        return s[s_distances.index]