From 23042fd43b7678ee351df93bf06eb136e36cff8d Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 9 Aug 2020 19:40:03 +0200 Subject: [PATCH 1/3] First draft for `most_similar`, closes #45 - function `most_similar` in `representation.py` Co-authored-by: Maximilian Krahn --- texthero/representation.py | 82 +++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index ba6ebddb..8ef205c1 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -10,10 +10,11 @@ from sklearn.decomposition import PCA, NMF from sklearn.cluster import KMeans, DBSCAN, MeanShift from sklearn.metrics.pairwise import cosine_similarity +from sklearn.metrics import pairwise_distances from sklearn.preprocessing import normalize as sklearn_normalize from scipy.sparse import coo_matrix -from typing import Optional, Union, Any +from typing import Optional, Union, Any, List from texthero import preprocessing @@ -1019,3 +1020,82 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: s_result.index = s.index return s_result + + +""" +Most similar +""" + + +def most_similar(s, vector: List[float], max_number=None): + """ + Return the most similar vectors in s to the given vector. + + To find the most similar documents to a document, first represent + the Pandas Series with the documents, e.g. with + :meth:`hero.representation.tfidf`_ . Then use this function + to find the most similar documents according to the representation. + Similar vectors are returned sorted by similarity descending. + + Internally, Euclidean distance is used to judge similarity. + + Series s can either be a :class:`texthero._types.RepresentationSeries` + or a :class:`texthero._types.VectorSeries`. + + Parameters + ---------- + s : :class:`texthero._types.RepresentationSeries` or + :class:`texthero._types.VectorSeries` + The Series in which to find the most similar vectors. 
+ + vector : List[float] + The vector for which we want to find the most similar documents. + + max_number: int or None, default None + Maximum amount of indexes of similar documents to return. + If None, returns all. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"]) + >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series + >>> # want to find the two most similar to "I like football", which has index 0 + >>> s_most_similar = hero.most_similar(s_pca, s_pca[0], max_number=2) + >>> s_most_similar + 0 [0.0] + 2 [2.1073424255447017e-08] + dtype: object + >>> # get text of the most similar ones (of course, the text is most similar to itself) + >>> s[s_most_similar.index] + 0 I like football + 2 I like sports + dtype: object + + """ + if _check_is_valid_representation(s): + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + else: + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix + s_flat_index = s.index.levels[0] + + else: + s_for_vectorization = list(s) + s_flat_index = s.index + + s_distances = pd.Series( + pairwise_distances(s_for_vectorization, np.array([vector])).tolist(), + index=s_flat_index, + ) + + s_distances = s_distances.sort_values() + + if max_number is not None: + return s_distances[:max_number] + else: + return s_distances From c77ff230367192eabc8cff5cb6437f67ef5919be Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 10 Aug 2020 18:34:49 +0200 Subject: [PATCH 2/3] Skip doctest (float error) --- texthero/representation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index 8ef205c1..a12074cb 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1063,7 +1063,7 @@ def 
most_similar(s, vector: List[float], max_number=None): >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series >>> # want to find the two most similar to "I like football", which has index 0 >>> s_most_similar = hero.most_similar(s_pca, s_pca[0], max_number=2) - >>> s_most_similar + >>> s_most_similar # doctest: +SKIP 0 [0.0] 2 [2.1073424255447017e-08] dtype: object From ece3a5a4dfb6eb12d2c8ff34b47bb855d7673cff Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 10 Aug 2020 20:39:16 +0200 Subject: [PATCH 3/3] Change `most_similar` to return the documents immediately --- texthero/representation.py | 57 +++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index a12074cb..1403d90d 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -17,6 +17,7 @@ from typing import Optional, Union, Any, List from texthero import preprocessing +from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries import logging import warnings @@ -1027,7 +1028,13 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: """ -def most_similar(s, vector: List[float], max_number=None): +@InputSeries(TextSeries) +def most_similar( + s: TextSeries, + s_represented: Union[VectorSeries, RepresentationSeries], + vector: List[float], + max_number=None, +) -> TextSeries: """ Return the most similar vectors in s to the given vector. @@ -1044,12 +1051,15 @@ def most_similar(s, vector: List[float], max_number=None): Parameters ---------- - s : :class:`texthero._types.RepresentationSeries` or - :class:`texthero._types.VectorSeries` - The Series in which to find the most similar vectors. + s : :class:`texthero._types.TextSeries` + The Series in which we want to find similar documents. 
+ + s_represented : :class:`texthero._types.RepresentationSeries` or + :class:`texthero._types.VectorSeries` + The Series by which the similarity is calculated. vector : List[float] - The vector for which we want to find the most similar documents. + The vector to which we want to find the most similar documents. max_number: int or None, default None Maximum amount of indexes of similar documents to return. @@ -1062,40 +1072,37 @@ def most_similar(s, vector: List[float], max_number=None): >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"]) >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series >>> # want to find the two most similar to "I like football", which has index 0 - >>> s_most_similar = hero.most_similar(s_pca, s_pca[0], max_number=2) - >>> s_most_similar # doctest: +SKIP - 0 [0.0] - 2 [2.1073424255447017e-08] - dtype: object - >>> # get text of the most similar ones (of course, the text is most similar to itself) - >>> s[s_most_similar.index] + >>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2) + >>> s_most_similar 0 I like football 2 I like sports dtype: object """ - if _check_is_valid_representation(s): - if pd.api.types.is_sparse(s): - s_coo_matrix = s.sparse.to_coo()[0] + if _check_is_valid_representation(s_represented): + if pd.api.types.is_sparse(s_represented): + s_represented_coo_matrix = s_represented.sparse.to_coo()[0] else: - s = s.astype("Sparse") - s_coo_matrix = s.sparse.to_coo()[0] + s_represented = s_represented.astype("Sparse") + s_represented_coo_matrix = s_represented.sparse.to_coo()[0] - s_for_vectorization = s_coo_matrix - s_flat_index = s.index.levels[0] + s_represented_for_vectorization = s_represented_coo_matrix + s_represented_flat_index = s_represented.index.levels[0] else: - s_for_vectorization = list(s) - s_flat_index = s.index + s_represented_for_vectorization = 
list(s_represented) + s_represented_flat_index = s_represented.index s_distances = pd.Series( - pairwise_distances(s_for_vectorization, np.array([vector])).tolist(), - index=s_flat_index, + pairwise_distances( + s_represented_for_vectorization, np.array([vector]) + ).tolist(), + index=s_represented_flat_index, ) s_distances = s_distances.sort_values() if max_number is not None: - return s_distances[:max_number] + return s[s_distances[:max_number].index] else: - return s_distances + return s[s_distances.index]