From fa342a92d4f007cebfce29f1f22a4e31fedc56c6 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 18 Aug 2020 22:06:14 +0200 Subject: [PATCH 01/42] added MultiIndex DF support suport MultiIndex as function parameter returns MultiIndex, where Representation was returned * missing: correct test Co-authored-by: Henri Froese --- tests/test_indexes.py | 18 +-- tests/test_representation.py | 63 +------- texthero/representation.py | 294 +++++++++++++---------------------- texthero/visualization.py | 4 +- 4 files changed, 115 insertions(+), 264 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index cc041c3a..af7afcd2 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -56,21 +56,9 @@ ] test_cases_representation = [ - [ - "count", - lambda x: representation.flatten(representation.count(x)), - (s_tokenized_lists,), - ], - [ - "term_frequency", - lambda x: representation.flatten(representation.term_frequency(x)), - (s_tokenized_lists,), - ], - [ - "tfidf", - lambda x: representation.flatten(representation.tfidf(x)), - (s_tokenized_lists,), - ], + ["count", representation.count, (s_tokenized_lists,),], + ["term_frequency", representation.term_frequency, (s_tokenized_lists,),], + ["tfidf", representation.tfidf, (s_tokenized_lists,),], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists,)], ["tsne", representation.tsne, (s_numeric_lists,)], diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..41b81ffa 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,16 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = pd.MultiIndex.from_tuples( - [(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")], -) - -s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples( - [(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")], -) - -s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],) +s_tokenized_output_index = [0,1] +s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above, dtype of output] @@ -182,55 +175,3 @@ def test_tfidf_formula(self): ).astype("Sparse") self.assertEqual(representation.tfidf(s), s_true) - - """ - flatten. - """ - - def test_flatten(self): - index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s), s_true, check_names=False - ) - - def test_flatten_fill_missing_with(self): - index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]], - index=["doc0", "doc1"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, fill_missing_with="FILLED"), - s_true, - check_names=False, - ) - - def test_flatten_missing_row(self): - # Simulating a row with no features, so it's completely missing from - # the representation series. 
- index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]], - index=["doc0", "doc1", "doc2"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, index=s_true.index), s_true, check_names=False - ) diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..042db71a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -27,90 +27,14 @@ """ -def flatten( - s: Union[pd.Series, pd.Series.sparse], - index: pd.Index = None, - fill_missing_with: Any = 0.0, -) -> pd.Series: - """ - Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series. - - The given Series should have a multiindex with first level being the document - and second level being individual features of that document (e.g. tdidf scores per word). - The flattened Series has one cell per document, with the cell being a list of all - the individual features of that document. - - Parameters - ---------- - s : Sparse Pandas Series or Pandas Series - The multiindexed Pandas Series to flatten. - - index : Pandas Index, optional, default to None - The index the flattened Series should have. - - fill_missing_with : Any, default to 0.0 - Value to fill the NaNs (missing values) with. This _does not_ mean - that existing values that are np.nan are replaced, but rather that - features that are not present in one document but present in others - are filled with fill_missing_with. See example below. - - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> import numpy as np - >>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word']) - >>> s = pd.Series([3, np.nan, 4], index=index) - >>> s - document word - doc0 Word1 3.0 - Word3 NaN - doc1 Word2 4.0 - dtype: float64 - >>> hero.flatten(s, fill_missing_with=0.0) - document - doc0 [3.0, 0.0, nan] - doc1 [0.0, 4.0, 0.0] - dtype: object - - """ - s = s.unstack(fill_value=fill_missing_with) - - if index is not None: - s = s.reindex(index, fill_value=fill_missing_with) - # Reindexing makes the documents for which no values - # are present in the Sparse Representation Series - # "reappear" correctly. - - s = pd.Series(s.values.tolist(), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: +def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: """ - Check if the given Pandas Series is a Document Representation Series. + Check if the given Pandas Series is a Document Term DF. - Returns true if Series is Document Representation Series, else False. + Returns true if input is Document Term DF, else False. """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
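+    # A DocumentTermDF is recognised purely by its type: a DataFrame whose
+    # columns form a MultiIndex (first level: representation name such as
+    # "count" or "tfidf", second level: term), as produced by count,
+    # term_frequency and tfidf.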
- # ) - - return True + return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -132,11 +56,11 @@ def count( min_df=1, max_df=1.0, binary=False, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using count. - Return a Document Representation Series with the + Return a Document Term DataFrame with the number of occurences of a document's words for every document. TODO add tutorial link @@ -144,10 +68,6 @@ def count( The input Series should already be tokenized. If not, it will be tokenized before count is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -177,15 +97,14 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - 0 Sentence 1 - one 1 - 1 Sentence 1 - two 1 - dtype: Sparse[int64, 0] + count + Sentence one two + 0 1 1 0 + 1 1 0 1 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -204,25 +123,23 @@ def count( ) tf_vectors_csr = tf.fit_transform(s) - tf_vectors_coo = coo_matrix(tf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tf_vectors_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("count", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tf_vectors_csr, s.index, multiindexed_columns + ) def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. - Return a Document Representation Series with the + Return a Document Term DataFrame with the term frequencies of the terms for every document. TODO add tutorial link @@ -230,11 +147,6 @@ def term_frequency( The input Series should already be tokenized. If not, it will be tokenized before term_frequency is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - - Parameters ---------- s : Pandas Series (tokenized) @@ -261,16 +173,14 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) >>> hero.term_frequency(s) - 0 Sentence 0.2 - hey 0.2 - one 0.2 - 1 Sentence 0.2 - two 0.2 - dtype: Sparse[float64, nan] + term_frequency + Sentence hey one two + 0 0.2 0.2 0.2 0.0 + 1 0.2 0.0 0.0 0.2 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. 
if not isinstance(s.iloc[0], list): @@ -291,17 +201,16 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - s_out = pd.Series.sparse.from_coo(frequency_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("term_frequency", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + frequency_coo, s.index, multiindexed_columns + ) -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: +def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. @@ -324,20 +233,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: so the result is exactly what you get applying the formula described above. - Return a Document Representation Series with the + Return a Document Term DataFrame with the tfidf of every word in the document. TODO add tutorial link The input Series should already be tokenized. If not, it will be tokenized before tfidf is calculated. - If working with big pandas Series, you might want to limit - the number of features through the max_features parameter. - - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -365,17 +267,16 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) - 0 Bye 1.000000 - Hi 1.405465 - 1 Bye 2.000000 - Test 1.405465 - dtype: Sparse[float64, nan] + tfidf + Bye Hi Test + 0 1.0 1.405465 0.000000 + 1 2.0 0.000000 1.405465 See Also -------- `TF-IDF on Wikipedia `_ - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. @@ -395,16 +296,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: tfidf_vectors_csr = tfidf.fit_transform(s) - # Result from sklearn is in Compressed Sparse Row format. - # Pandas Sparse Series can only be initialized from Coordinate format. - tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo) - - # Map word index to word name and keep original index of documents. - feature_names = tfidf.get_feature_names() - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("tfidf", word) for word in tfidf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tfidf_vectors_csr, s.index, multiindexed_columns + ) """ @@ -412,7 +310,9 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: """ -def pca(s, n_components=2, random_state=None) -> pd.Series: +def pca( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Perform principal component analysis on the given Pandas Series. @@ -434,7 +334,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or MuliIndex Sparse DataFrame n_components : Int. Default is 2. 
Number of components to keep (dimensionality of output vectors). @@ -468,10 +368,18 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.values + else: + values = list(s) + + return pd.Series(pca.fit_transform(values).tolist(), index=s.index) -def nmf(s, n_components=2, random_state=None) -> pd.Series: +def nmf( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Performs non-negative matrix factorization. @@ -491,7 +399,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -527,11 +435,17 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) def tsne( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_components=2, perplexity=30.0, learning_rate=200.0, @@ -557,7 +471,7 @@ def tsne( Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -619,7 +533,13 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) """ @@ -628,7 +548,7 @@ def tsne( def kmeans( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_clusters=5, n_init=10, max_iter=300, @@ -653,7 +573,7 @@ def kmeans( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame n_clusters: Int, default to 5. The number of clusters to separate the data into. 
@@ -686,7 +606,7 @@ def kmeans( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 1 0 @@ -702,7 +622,12 @@ def kmeans( `kmeans on Wikipedia `_ """ - vectors = list(s) + + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + kmeans = KMeans( n_clusters=n_clusters, n_init=n_init, @@ -715,7 +640,7 @@ def kmeans( def dbscan( - s, + s: Union[pd.Series, pd.DataFrame], eps=0.5, min_samples=5, metric="euclidean", @@ -743,7 +668,7 @@ def dbscan( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -783,7 +708,7 @@ def dbscan( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) 0 0 1 1 @@ -801,6 +726,11 @@ def dbscan( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + return pd.Series( DBSCAN( eps=eps, @@ -809,13 +739,13 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") def meanshift( - s, + s: Union[pd.Series, pd.DataFrame], bandwidth=None, bin_seeding=False, min_bin_freq=1, @@ -843,7 +773,7 @@ def meanshift( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -901,6 +831,11 @@ def meanshift( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.values + else: + vectors = list(s) + return pd.Series( MeanShift( bandwidth=bandwidth, @@ -909,7 +844,7 @@ def meanshift( cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") @@ -962,31 +897,18 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: `Norm on Wikipedia `_ """ + isDocumentTermDF = _check_is_valid_DocumentTermDF(s) - is_valid_representation = ( - isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2 - ) - - if not is_valid_representation: - raise TypeError( - "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - ) - # TODO after merging representation: use _check_is_valid_representation instead - - if pd.api.types.is_sparse(s): - s_coo_matrix = s.sparse.to_coo()[0] + if isDocumentTermDF: + s_for_vectorization = s.sparse.to_coo() else: - s = s.astype("Sparse") - s_coo_matrix = s.sparse.to_coo()[0] - - s_for_vectorization = s_coo_matrix + s_for_vectorization = list(s) result = sklearn_normalize( s_for_vectorization, norm=norm ) # Can handle sparse input. 
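+    # Return the result in the same format as the input: a DocumentTermDF
+    # input yields a sparse DocumentTermDF with the original MultiIndex
+    # columns, while a VectorSeries input yields a Series with one vector
+    # (list) per document.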
- result_coo = coo_matrix(result) - s_result = pd.Series.sparse.from_coo(result_coo) - s_result.index = s.index - - return s_result + if isDocumentTermDF: + return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) + else: + return pd.Series(result.tolist(), index=s.index) diff --git a/texthero/visualization.py b/texthero/visualization.py index e213285e..2426ab4d 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -63,8 +63,8 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten - >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) + >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.kmeans, n_clusters=2) >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """ From 59a9f8c0df70d8136780b3160bc1d2ca59f48b26 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 19 Aug 2020 19:39:30 +0200 Subject: [PATCH 02/42] beginning with tests --- tests/test_representation.py | 147 +++++++++++++++++------------------ texthero/representation.py | 8 +- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 41b81ffa..d4acd369 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,32 +50,84 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0,1] +s_tokenized_output_index = [0, 1] + +s_tokenized_output_index_noncontinous = [5, 7] + + +def _get_multiindex_for_tokenized_output(first_level_name): + return pd.MultiIndex.from_product( + [[first_level_name], ["!", ".", "?", "TEST", "Test"]] + ) -s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ - # format: [function_name, function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [1, 1, 2, 2, 1, 1], "int"], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("count"), + ).astype("Sparse"), + ], [ "term_frequency", representation.term_frequency, - [0.125, 0.125, 0.250, 0.250, 0.125, 0.125], - "float", + pd.DataFrame( + [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("term_frequency"), + ).astype("Sparse"), ], [ "tfidf", representation.tfidf, - [_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index], - "float", + pd.DataFrame( + [ + [ + _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here + for x in ["!", ".", "?", "TEST", "Test"] + ], + [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + ], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("tfidf"), + ).astype("Sparse"), ], ] + test_cases_vectorization_min_df = [ - # format: [function_name, 
function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [2, 1], "int"], - ["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",], - ["tfidf", representation.tfidf, [2.0, 1.0], "float",], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [2, 1], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("count", "Test")]), + ).astype("Sparse"), + ], + [ + "term_frequency", + representation.term_frequency, + pd.DataFrame( + [0.666667, 0.333333], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + ).astype("Sparse"), + ], + [ + "tfidf", + representation.tfidf, + pd.DataFrame( + [2.0, 1.0], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + ).astype("Sparse"), + ], ] @@ -91,62 +143,23 @@ class AbstractRepresentationTest(PandasTestCase): """ @parameterized.expand(test_cases_vectorization) - def test_vectorization_simple( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="int" - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="float" - ).astype(pd.SparseDtype("float", np.nan)) + def test_vectorization_simple(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( - self, name, test_function, correct_output_values, int_or_float + self, name, test_function, correct_output=None ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - result_s = test_function(s_tokenized_with_noncontinuous_index) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) @parameterized.expand(test_cases_vectorization_min_df) - def test_vectorization_min_df( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - + def test_vectorization_min_df(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -159,19 +172,3 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, 
max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") - - """ - Individual / special tests. - """ - - def test_tfidf_formula(self): - s = pd.Series(["Hi Bye", "Test Bye Bye"]) - s = preprocessing.tokenize(s) - s_true_index = pd.MultiIndex.from_tuples( - [(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")], - ) - s_true = pd.Series( - [_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index - ).astype("Sparse") - - self.assertEqual(representation.tfidf(s), s_true) diff --git a/texthero/representation.py b/texthero/representation.py index 042db71a..efabc9c6 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,11 +97,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - count - Sentence one two + count + Sentence one two 0 1 1 0 1 1 0 1 - +# FIXME columns pandas doctest See Also -------- Document Term DataFrame: TODO add tutorial link @@ -375,7 +375,7 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) - +# FIXME: merge master again def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None From 19c52de3f5ae6a1a01e4262dca00ea5177718311 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 19 Aug 2020 22:02:41 +0200 Subject: [PATCH 03/42] implemented correct sparse support *missing: test adopting for new types Co-authored-by: Henri Froese --- tests/test_representation.py | 12 ++++---- texthero/representation.py | 59 +++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index d4acd369..7c02ccd2 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -70,7 +70,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("count"), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -108,7 +108,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("count", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -123,7 +123,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "tfidf", representation.tfidf, pd.DataFrame( - [2.0, 1.0], + [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), ).astype("Sparse"), @@ -146,20 +146,20 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) + pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = 
test_function(s_tokenized, min_df=2) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): diff --git a/texthero/representation.py b/texthero/representation.py index efabc9c6..ff691212 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -101,9 +101,12 @@ def count( Sentence one two 0 1 1 0 1 1 0 1 -# FIXME columns pandas doctest + See Also -------- + + # FIXME columns pandas doctest + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -375,8 +378,11 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + + # FIXME: merge master again + def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -437,11 +443,12 @@ def nmf( nmf = NMF(n_components=n_components, init="random", random_state=random_state,) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) + return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) def tsne( @@ -535,11 +542,12 @@ def tsne( ) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) + return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) """ @@ -624,9 +632,10 @@ def kmeans( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) kmeans = KMeans( n_clusters=n_clusters, @@ -635,8 +644,8 @@ def kmeans( random_state=random_state, copy_x=True, algorithm=algorithm, - ).fit(vectors) - return pd.Series(kmeans.predict(vectors), index=s.index).astype("category") + ).fit(s_for_vectorization) + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") def dbscan( @@ -727,9 +736,10 @@ def dbscan( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) return pd.Series( DBSCAN( @@ -739,7 +749,7 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(vectors), + ).fit_predict(s_for_vectorization), index=s.index, ).astype("category") @@ -877,17 +887,15 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> idx = pd.MultiIndex.from_tuples( - ... [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word") - ... 
) - >>> s = pd.Series([1, 2, 3, 4], index=idx) + >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") - document word - 0 a 0.50 - b 1.00 - 1 c 0.75 - d 1.00 - dtype: Sparse[float64, nan] + 0 1 + a b c d + 0 0.250000 0.500000 0.75 1.000000 + 1 0.571429 0.285714 1.00 0.714286 + 2 0.400000 0.400000 0.60 1.000000 + 3 0.111111 0.222222 1.00 0.888889 See Also @@ -900,7 +908,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: isDocumentTermDF = _check_is_valid_DocumentTermDF(s) if isDocumentTermDF: - s_for_vectorization = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: s_for_vectorization = list(s) From 41f55a8a359f15ce4ba65e1e726b9e0757fc596b Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:20:02 +0200 Subject: [PATCH 04/42] added back list() and rm .tolist() --- texthero/representation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 048b42ec..025652d9 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,7 +37,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(s.values.tolist(), index=s.index) + s = pd.Series(list(s.values), index=s.index) return s @@ -415,7 +415,7 @@ def pca( else: values = list(s) - return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + return pd.Series(list(pca.fit_transform(values)), index=s.index) # FIXME: merge master again @@ -489,7 +489,7 @@ def nmf( else: s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) def tsne( @@ -589,7 +589,7 @@ def tsne( else: s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) """ @@ -963,4 +963,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(result.tolist(), index=s.index) + return pd.Series(list(result), index=s.index) From 217611a2c648db4044d240a9c12a157b94b36bca Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:21:41 +0200 Subject: [PATCH 05/42] rm .tolist() and added list() --- texthero/representation.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 025652d9..fdab73dd 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,36 +37,6 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(list(s.values), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: - """ - Check if the given Pandas Series is a Document Representation Series. - - Returns true if Series is Document Representation Series, else False. 
- - """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." - # ) - - return True - - # Warning message for not-tokenized inputs _not_tokenized_warning_message = ( "It seems like the given Pandas Series s is not tokenized. This function will" @@ -963,4 +933,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(list(result), index=s.index) + return pd.Series((result), index=s.index) From 6a3b56d1a56401880efa7cfa7dd32668e23b25ea Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:22 +0200 Subject: [PATCH 06/42] Adopted the test to the new dataframes --- tests/test_representation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 7c02ccd2..3564730e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -90,7 +90,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here for x in ["!", ".", "?", "TEST", "Test"] ], - [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), @@ -146,20 +146,28 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) + pd.testing.assert_series_equal( + pd.Series(s_tokenized_output_index_noncontinous), + pd.Series(result_s.index), + check_dtype=False, + ) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): From b8ff5611e550f5f4bc023b2b76ef8ebcff7f8021 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:35 
+0200 Subject: [PATCH 07/42] wrong format --- texthero/representation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index fdab73dd..ac0a458f 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -657,7 +657,9 @@ def kmeans( copy_x=True, algorithm=algorithm, ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype( + "category" + ) def dbscan( From e3af2f9da094505861cddc420f57490700ca88ef Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 18:48:51 +0200 Subject: [PATCH 08/42] Address most review comments. --- tests/test_representation.py | 19 ++++++++-------- texthero/representation.py | 42 +++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 3564730e..5f985996 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,9 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0, 1] +s_tokenized_output_index = pd.Index([0, 1]) -s_tokenized_output_index_noncontinous = [5, 7] +s_tokenized_output_index_noncontinous = pd.Index([5, 7]) def _get_multiindex_for_tokenized_output(first_level_name): @@ -79,7 +79,8 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("term_frequency"), - ).astype("Sparse"), + dtype="Sparse", + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -94,7 +95,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -117,7 +118,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [0.666667, 0.333333], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -126,7 +127,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -155,10 +156,8 @@ def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal( - pd.Series(s_tokenized_output_index_noncontinous), - pd.Series(result_s.index), - check_dtype=False, + pd.testing.assert_index_equal( + s_tokenized_output_index_noncontinous, result_s.index ) @parameterized.expand(test_cases_vectorization_min_df) diff --git a/texthero/representation.py b/texthero/representation.py index ac0a458f..7793cb2b 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -145,7 +145,7 @@ def term_frequency( Return a Document Term DataFrame with the term frequencies of the terms for every - document. + document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. 
If not, it will @@ -241,7 +241,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram formula described above. Return a Document Term DataFrame with the - tfidf of every word in the document. + tfidf of every word in the document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -341,9 +341,13 @@ def pca( In general, *pca* should be called after the text has already been represented to a matrix form. + PCA cannot directly handle sparse input, so when calling pca on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s : Pandas Series or MuliIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -388,9 +392,6 @@ def pca( return pd.Series(list(pca.fit_transform(values)), index=s.index) -# FIXME: merge master again - - def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -410,10 +411,12 @@ def nmf( n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. + NMF can directly handle sparse input, so when calling nmf on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -484,10 +487,12 @@ def tsne( document gets a new, low-dimensional (n_components entries) vector in such a way that the differences / similarities between documents are preserved. + T-SNE can directly handle sparse input, so when calling tsne on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -591,9 +596,12 @@ def kmeans( function that assigns a scalar (a weight) to each word), K-means will find k topics (clusters) and assign a topic to each document. + Kmeans can directly handle sparse input, so when calling kmeans on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -689,9 +697,12 @@ def dbscan( function that assigns a scalar (a weight) to each word), DBSCAN will find topics (clusters) and assign a topic to each document. + DBSCAN can directly handle sparse input, so when calling dbscan on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -795,9 +806,13 @@ def meanshift( function that assigns a scalar (a weight) to each word), mean shift will find topics (clusters) and assign a topic to each document. 
+ Menashift cannot directly handle sparse input, so when calling meanshift on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -889,11 +904,12 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. - Input has to be a Representation Series. + Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs, + the sparseness is kept. Parameters ---------- - s: Pandas Series + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) norm: str, default to "l2" One of "l1", "l2", or "max". The norm that is used. From 77ad80ecf8977a098b73c4f12c8f28951c769dfc Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 19:45:48 +0200 Subject: [PATCH 09/42] Add more unittests for representation --- tests/test_representation.py | 118 +++++++++++++++++++++++++++++++++-- texthero/representation.py | 14 ++--- 2 files changed, 118 insertions(+), 14 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 5f985996..2722289e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -132,6 +132,50 @@ def _get_multiindex_for_tokenized_output(first_level_name): ] +s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) +s_documenttermDF = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), +).astype("Sparse[float64, nan]") + + +test_cases_dim_reduction_and_clustering = [ + # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] + ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], + [ + "nmf", + representation.nmf, + pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],), + ], + [ + "tsne", + representation.tsne, + pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],), + ], + [ + "kmeans", + representation.kmeans, + pd.Series([1, 0], index=[5, 7], dtype="category"), + ], + [ + "dbscan", + representation.dbscan, + pd.Series([-1, -1], index=[5, 7], dtype="category"), + ], + [ + "meanshift", + representation.meanshift, + pd.Series([0, 1], index=[5, 7], dtype="category"), + ], + [ + "normalize", + representation.normalize, + pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],), + ], +] + + class AbstractRepresentationTest(PandasTestCase): """ Class for representation test cases. 
Most tests are @@ -147,9 +191,7 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( @@ -164,9 +206,7 @@ def test_vectorization_noncontinuous_index_kept( def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -179,3 +219,69 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction and Clustering + """ + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_vector_series_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "kmeans": + result_s = test_function(s_vector_series, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_vector_series) + else: + result_s = test_function(s_vector_series, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_documenttermDF_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "normalize": + # testing this below separately + return + + if name == "kmeans": + result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_documenttermDF) + else: + result_s = test_function(s_documenttermDF, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + def test_normalize_documenttermDF_also_as_output(self): + # normalize should also return DocumentTermDF output for DocumentTermDF + # input so we test it separately + result = representation.normalize(s_documenttermDF) + correct_output = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + ) + + pd.testing.assert_frame_equal( + result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, + ) diff --git a/texthero/representation.py b/texthero/representation.py index 7793cb2b..8e876088 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,7 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) + >>> hero.count(s) # doctest: +SKIP count Sentence one two 0 1 1 0 @@ -106,8 +106,6 @@ def count( See Also 
-------- - # FIXME columns pandas doctest - Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -177,7 +175,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) + >>> hero.term_frequency(s) # doctest: +SKIP term_frequency Sentence hey one two 0 0.2 0.2 0.2 0.0 @@ -273,7 +271,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) + >>> hero.tfidf(s) # doctest: +SKIP tfidf Bye Hi Test 0 1.0 1.405465 0.000000 @@ -900,7 +898,7 @@ def meanshift( """ -def normalize(s: pd.Series, norm="l2") -> pd.Series: +def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. @@ -920,7 +918,7 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: >>> import pandas as pd >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") + >>> hero.normalize(s, norm="max") # doctest: +SKIP 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 @@ -951,4 +949,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series((result), index=s.index) + return pd.Series(list(result), index=s.index) From bee21572fd7d33a2ba9e7c61a76b1bb47f005fba Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 18:40:13 +0200 Subject: [PATCH 10/42] Initial commit to add topic modelling Co-authored-by: Maximilian Krahn --- texthero/visualization.py | 127 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/texthero/visualization.py b/texthero/visualization.py index 94556a93..f0edc92c 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -15,6 +15,10 @@ from matplotlib.colors import LinearSegmentedColormap as lsg import matplotlib.pyplot as plt +from scipy.sparse import csr_matrix +from sklearn.preprocessing import normalize as sklearn_normalize +import pyLDAvis + from collections import Counter @@ -304,3 +308,126 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: .explode() # one word for each line .value_counts(normalize=normalize) ) + + +def plot_topics(s_document_term, s_document_topic): + + metadata_list = s_document_topic._metadata + + for item in metadata_list: + if isinstance(item, tuple): + if item[0] == "vectorizer": + vectorizer = item[1] + break + else: + # no vectorizer found + vectorizer = None + + # Get / build matrices from input + + if vectorizer: + # TODO check sparseness + # s_document_topic is output of hero.lda or hero.lsi + document_term_matrix = s_document_term.values + document_topic_matrix = s_document_topic.values + + topic_term_matrix = vectorizer.components_ + + else: + # s_document_topic is output of some hero clustering function + indexes_of_unassigned_documents = s_document_topic == -1 + s_document_term = s_document_term[~indexes_of_unassigned_documents] + s_document_topic = s_document_topic[~indexes_of_unassigned_documents] + s_document_topic.cat.remove_unused_categories(inplace=True) + + document_term_matrix = s_document_term.sparse.to_coo() + + # Construct document_topic_matrix + 
n_rows = len(s_document_topic.index) + n_cols = len(s_document_topic.values.categories) + + data = [1 for _ in range(n_rows)] + rows = range(n_rows) + columns = s_document_topic.values + + document_topic_matrix = csr_matrix( + (data, (rows, columns)), shape=(n_rows, n_cols) + ) + + topic_term_matrix = document_topic_matrix.T * document_term_matrix + + vocab = list(s_document_term.columns.levels[1]) + doc_lengths = list(s_document_term.sum(axis=1)) + term_frequency = list(s_document_term.sum(axis=0)) + + document_topic_distributions = sklearn_normalize( + document_topic_matrix, norm="l1", axis=1 + ) + + topic_term_distributions = sklearn_normalize(topic_term_matrix, norm="l1", axis=1) + print(document_term_matrix.shape, topic_term_distributions.shape) + return pyLDAvis.prepare( + **{ + "vocab": vocab, + "doc_lengths": doc_lengths, + "term_frequency": term_frequency, + "doc_topic_dists": document_topic_distributions.toarray().tolist(), + "topic_term_dists": topic_term_distributions.toarray().tolist(), + } + ) + + +""" +# plot_topics(s_document_term, s_document_topic) +import pyLDAvis +import pyLDAvis.sklearn +import pandas as pd + +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer +from sklearn.decomposition import LatentDirichletAllocation + +newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) +docs_raw = newsgroups.data + + +# TFIDF matrix & vectorizer + +tf_vectorizer = CountVectorizer(strip_accents='unicode', + stop_words='english', + lowercase=True, + token_pattern=r'\b[a-zA-Z]{3,}\b', + max_df=0.5, + min_df=100) + +tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params()) +dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw) + +# LDA + +lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0) +lda_tfidf.fit(dtm_tfidf) + + +# Prep & Visualize +vis = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer) +pyLDAvis.show(vis) +""" + + +""" +import texthero as hero +from sklearn.preprocessing import normalize as sklearn_normalize +import pyLDAvis +from scipy.sparse import csr_matrix +import pandas as pd +from sklearn.datasets import fetch_20newsgroups +newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) + +s = pd.Series(newsgroups.data) +s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) +s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) + +vis = hero.plot_topics(s_tfidf, s_cluster) +pyLDAvis.show(vis) +""" From dece7b528e050aedd684b8d2f65e0683d9d67513 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 18:41:26 +0200 Subject: [PATCH 11/42] add pyLDAvis to dependencies --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index d6103b02..291b198a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,6 +38,7 @@ install_requires = unidecode>=1.1.1 gensim>=3.6.0 matplotlib>=3.1.0 + pyLDAvis>=2.1.2 # TODO pick the correct version. 
[options.extras_require] dev = From 6387ce9cb045e1c22db3a7aa5ee6d567c9788835 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 18:48:49 +0200 Subject: [PATCH 12/42] add return_figure option --- texthero/visualization.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index f0edc92c..5037e8fa 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -310,7 +310,7 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: ) -def plot_topics(s_document_term, s_document_topic): +def plot_topics(s_document_term, s_document_topic, return_figure=False): metadata_list = s_document_topic._metadata @@ -365,8 +365,8 @@ def plot_topics(s_document_term, s_document_topic): ) topic_term_distributions = sklearn_normalize(topic_term_matrix, norm="l1", axis=1) - print(document_term_matrix.shape, topic_term_distributions.shape) - return pyLDAvis.prepare( + + figure = pyLDAvis.prepare( **{ "vocab": vocab, "doc_lengths": doc_lengths, @@ -376,6 +376,11 @@ def plot_topics(s_document_term, s_document_topic): } ) + if return_figure: + return figure + else: + pyLDAvis.show(figure) + """ # plot_topics(s_document_term, s_document_topic) From 01c081834c5d1576a4529c06755fe12bd56f2818 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 18:52:39 +0200 Subject: [PATCH 13/42] allow display in Console and Jupyter Notebooks --- texthero/visualization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 5037e8fa..68c47e18 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -379,7 +379,7 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): if return_figure: return figure else: - pyLDAvis.show(figure) + pyLDAvis.display(figure) """ From 9cd113ce1e5297841c42e6a83d0174e3fcf179b1 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 24 Aug 2020 19:32:03 +0200 Subject: [PATCH 14/42] tsvd *missing: some test Co-authored-by: Henri Froese --- texthero/representation.py | 80 +++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index 8e876088..3c87d3aa 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -7,7 +7,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.manifold import TSNE -from sklearn.decomposition import PCA, NMF +from sklearn.decomposition import PCA, NMF, TruncatedSVD from sklearn.cluster import KMeans, DBSCAN, MeanShift from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize as sklearn_normalize @@ -565,6 +565,84 @@ def tsne( return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) +def truncatedSVD( + s: Union[pd.Series, pd.DataFrame], n_components=2, n_iter=5, random_state=None, +) -> pd.Series: + """ + Performs TruncatedSVD on the given pandas series. + + TruncatedSVD is an algorithmen, which can be used to reduce the dimensions + of a given series. In natural language processing, the high-dimensional data + is usually a document-term matrix (so in texthero usually a Series after + applying :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each word). + This is used as a tool to extract the most important topics and words + of a given Series. 
In this context it is refered to as latent semantic analysis (LSA), + or Latent Semantic Analysis (LSI) + + + TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a + DocumentTermDF, the advantage of sparseness is kept. + + Parameters + ---------- + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + + n_components : int, default is 2. + Number of components to keep (dimensionality of output vectors). + For LSA, a value of 100 is recommended + + n_iter : int, optional (default: 5) + Number of iterations for randomized SVD solver. + + random_state : int, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. + + + Returns + ------- + Pandas Series with the vector calculated by truncadedSVD for the document in every + cell and the truncadedSVD object in the metadata. This will be used in the + :meth:`plot_topics` + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra","Football, Music"]) + ... + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) + >>> hero.truncatedSVD(s, random_state=42) # doctest: +SKIP + 0 [0.14433756729740624, 0.15309310892394884] + 1 [0.14433756729740663, -0.1530931089239484] + 2 [0.14433756729740646, 7.211110073938366e-17] + dtype: object + + See also + -------- + `truncatedSVD on Wikipedia `_ + + """ + truncatedSVD = TruncatedSVD( + n_components=n_components, n_iter=n_iter, random_state=random_state + ) + + if _check_is_valid_DocumentTermDF(s): + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") + else: + s_for_vectorization = list(s) + + result = pd.Series( + list(truncatedSVD.fit_transform(s_for_vectorization)), index=s.index + ) + + result._metadata.append(("vectorizer", TruncatedSVD)) + + return result + + """ Clustering """ From 187d8f576a4808dd44849349bcff09d8287a67af Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 19:32:49 +0200 Subject: [PATCH 15/42] Change display at end of function --- texthero/visualization.py | 146 +++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 64 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 68c47e18..775c1d04 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -15,7 +15,7 @@ from matplotlib.colors import LinearSegmentedColormap as lsg import matplotlib.pyplot as plt -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix, issparse from sklearn.preprocessing import normalize as sklearn_normalize import pyLDAvis @@ -310,26 +310,13 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: ) -def plot_topics(s_document_term, s_document_topic, return_figure=False): - - metadata_list = s_document_topic._metadata - - for item in metadata_list: - if isinstance(item, tuple): - if item[0] == "vectorizer": - vectorizer = item[1] - break - else: - # no vectorizer found - vectorizer = None - - # Get / build matrices from input +def _get_matrices_for_plot_topics(s_document_term, s_document_topic, vectorizer): if vectorizer: - # TODO check sparseness + # s_document_topic is output of hero.lda or hero.lsi - document_term_matrix = s_document_term.values - document_topic_matrix = s_document_topic.values + document_term_matrix = s_document_term.sparse.to_coo() + document_topic_matrix = np.array(list(s_document_topic)) topic_term_matrix = vectorizer.components_ 
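+        # Shapes involved (n_docs documents, n_terms terms, n_topics topics):
+        #   document_term_matrix  : n_docs   x n_terms   (sparse, from the document-term input)
+        #   document_topic_matrix : n_docs   x n_topics  (one topic-weight vector per document)
+        #   topic_term_matrix     : n_topics x n_terms   (components_ of the fitted topic model)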
@@ -356,70 +343,104 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): topic_term_matrix = document_topic_matrix.T * document_term_matrix - vocab = list(s_document_term.columns.levels[1]) - doc_lengths = list(s_document_term.sum(axis=1)) - term_frequency = list(s_document_term.sum(axis=0)) + return s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix + + +def _prepare_matrices_for_pyLDAvis( + document_topic_matrix, topic_term_matrix +): document_topic_distributions = sklearn_normalize( document_topic_matrix, norm="l1", axis=1 ) - topic_term_distributions = sklearn_normalize(topic_term_matrix, norm="l1", axis=1) + topic_term_distributions = sklearn_normalize( + topic_term_matrix, norm="l1", axis=1) - figure = pyLDAvis.prepare( - **{ - "vocab": vocab, - "doc_lengths": doc_lengths, - "term_frequency": term_frequency, - "doc_topic_dists": document_topic_distributions.toarray().tolist(), - "topic_term_dists": topic_term_distributions.toarray().tolist(), - } - ) - if return_figure: - return figure + # Make sparse matrices dense for pyLDAvis + if issparse(document_topic_distributions): + document_topic_distributions = document_topic_distributions.toarray().tolist() else: - pyLDAvis.display(figure) + document_topic_distributions = document_topic_distributions.tolist() + if issparse(topic_term_distributions): + topic_term_distributions = topic_term_distributions.toarray().tolist() + else: + topic_term_distributions = topic_term_distributions.tolist() -""" -# plot_topics(s_document_term, s_document_topic) -import pyLDAvis -import pyLDAvis.sklearn -import pandas as pd + return document_topic_distributions, topic_term_distributions -from sklearn.datasets import fetch_20newsgroups -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from sklearn.decomposition import LatentDirichletAllocation -newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) -docs_raw = newsgroups.data +def plot_topics(s_document_term, s_document_topic, return_figure=False): + """ + Parameters + ---------- -# TFIDF matrix & vectorizer + Examples + -------- + >>> import texthero as hero + >>> from sklearn.preprocessing import normalize as sklearn_normalize + >>> import pyLDAvis + >>> from scipy.sparse import csr_matrix + >>> import pandas as pd + >>> from sklearn.datasets import fetch_20newsgroups + >>> newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) + >>> s = pd.Series(newsgroups.data) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) + >>> s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) + >>> hero.plot_topics(s_tfidf, s_cluster) # doctest: +SKIP + + See Also + -------- + """ + metadata_list = s_document_topic._metadata -tf_vectorizer = CountVectorizer(strip_accents='unicode', - stop_words='english', - lowercase=True, - token_pattern=r'\b[a-zA-Z]{3,}\b', - max_df=0.5, - min_df=100) + for item in metadata_list: + if isinstance(item, tuple): + if item[0] == "vectorizer": + vectorizer = item[1] + break + else: + # no vectorizer found + vectorizer = None -tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params()) -dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw) + # Get / build matrices from input -# LDA + s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix = _get_matrices_for_plot_topics( + s_document_term, + s_document_topic, + vectorizer + ) -lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0) 
-lda_tfidf.fit(dtm_tfidf) + vocab = list(s_document_term.columns.levels[1]) + doc_lengths = list(s_document_term.sum(axis=1)) + term_frequency = list(s_document_term.sum(axis=0)) -# Prep & Visualize -vis = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer) -pyLDAvis.show(vis) -""" + + document_topic_distributions, topic_term_distributions = _prepare_matrices_for_pyLDAvis( + document_topic_matrix, topic_term_matrix + ) + figure = pyLDAvis.prepare( + **{ + "vocab": vocab, + "doc_lengths": doc_lengths, + "term_frequency": term_frequency, + "doc_topic_dists": document_topic_distributions, + "topic_term_dists": topic_term_distributions, + } + ) + + if return_figure: + return figure + else: + pyLDAvis.display(figure) # For Jupyter Notebooks + pyLDAvis.show(figure) # For command line / scripts + """ import texthero as hero from sklearn.preprocessing import normalize as sklearn_normalize @@ -428,11 +449,8 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): import pandas as pd from sklearn.datasets import fetch_20newsgroups newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) - s = pd.Series(newsgroups.data) s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) - -vis = hero.plot_topics(s_tfidf, s_cluster) -pyLDAvis.show(vis) +hero.plot_topics(s_tfidf, s_cluster) # doctest: +SKIP """ From 85089b13c3e11ceb5cc32e51ab62a2d69a482382 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 19:42:13 +0200 Subject: [PATCH 16/42] change display --- tests/test_visualization.py | 17 +++++++++++++++++ texthero/visualization.py | 7 ++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index d0075389..d1069788 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -79,3 +79,20 @@ def test_top_words_digits_punctuation(self): def test_wordcloud(self): s = pd.Series("one two three") self.assertEqual(visualization.wordcloud(s), None) + + """ + Test plot_topics + """ + + def test_plot_topics_clustering_input(self): + import texthero as hero + from sklearn.preprocessing import normalize as sklearn_normalize + import pyLDAvis + from scipy.sparse import csr_matrix + import pandas as pd + from sklearn.datasets import fetch_20newsgroups + newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) + s = pd.Series(newsgroups.data) + s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) + s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) + hero.plot_topics(s_tfidf, s_cluster) # doctest: +SKIP diff --git a/texthero/visualization.py b/texthero/visualization.py index 775c1d04..2873908f 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -381,9 +381,6 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): Examples -------- >>> import texthero as hero - >>> from sklearn.preprocessing import normalize as sklearn_normalize - >>> import pyLDAvis - >>> from scipy.sparse import csr_matrix >>> import pandas as pd >>> from sklearn.datasets import fetch_20newsgroups >>> newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) @@ -438,6 +435,10 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): if return_figure: return figure else: + try: + pyLDAvis.enable_notebook() + except: + pass pyLDAvis.display(figure) # For 
Jupyter Notebooks pyLDAvis.show(figure) # For command line / scripts From 77d815a1ee27f7ae61a1dd1b1ab22a147447e287 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 19:54:15 +0200 Subject: [PATCH 17/42] change display for notebook again --- tests/test_visualization.py | 31 ++++++++++++++++++++++--------- texthero/visualization.py | 13 ++++++++++--- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index d1069788..5b400039 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -3,7 +3,7 @@ import pandas as pd import doctest -from texthero import visualization +from texthero import visualization, preprocessing, representation from . import PandasTestCase @@ -85,14 +85,27 @@ def test_wordcloud(self): """ def test_plot_topics_clustering_input(self): - import texthero as hero - from sklearn.preprocessing import normalize as sklearn_normalize - import pyLDAvis - from scipy.sparse import csr_matrix - import pandas as pd + from sklearn.datasets import fetch_20newsgroups newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) s = pd.Series(newsgroups.data) - s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) - s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) - hero.plot_topics(s_tfidf, s_cluster) # doctest: +SKIP + s_tfidf = s.pipe(preprocessing.clean).pipe( + preprocessing.tokenize).pipe(representation.tfidf, max_df=0.5, min_df=100) + s_cluster = s_tfidf.pipe( + representation.pca, n_components=20).pipe(representation.dbscan) + + self.assertIsNotNone(visualization.plot_topics(s_tfidf, s_cluster, return_figure=True)) + + def test_plot_topics_lsa_lda_tsvd_input(self): + + from sklearn.datasets import fetch_20newsgroups + newsgroups = fetch_20newsgroups( + remove=('headers', 'footers', 'quotes')) + s = pd.Series(newsgroups.data) + s_tfidf = s.pipe(preprocessing.clean).pipe( + preprocessing.tokenize).pipe(representation.tfidf, max_df=0.5, min_df=100) + s_lda = s_tfidf.pipe( + representation.lda, n_components=20).pipe(representation.dbscan) + + self.assertIsNotNone(visualization.plot_topics( + s_tfidf, s_lda, return_figure=True)) diff --git a/texthero/visualization.py b/texthero/visualization.py index 2873908f..dcf578d4 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -18,6 +18,8 @@ from scipy.sparse import csr_matrix, issparse from sklearn.preprocessing import normalize as sklearn_normalize import pyLDAvis +from pyLDAvis import display as notebook_display +from pyLDAvis import show as local_display from collections import Counter @@ -375,6 +377,11 @@ def _prepare_matrices_for_pyLDAvis( def plot_topics(s_document_term, s_document_topic, return_figure=False): """ + Note: If the plot is not shown, try + doing `figure = hero.plot_topics(..., return_figure=True)` + followed by `hero.notebook_display(figure)` if you're working + in a Jupyter Notebook, else `hero.local_display(figure)`. 
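+
+    A minimal sketch of that pattern (assuming `s_document_term` and
+    `s_document_topic` are a document-term representation and a
+    clustering / topic-model output, as in the Examples below):
+
+    >>> figure = hero.plot_topics(s_document_term, s_document_topic, return_figure=True)  # doctest: +SKIP
+    >>> hero.notebook_display(figure)  # inside a Jupyter Notebook  # doctest: +SKIP
+    >>> hero.local_display(figure)  # from a script / the command line  # doctest: +SKIP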
+ Parameters ---------- @@ -434,13 +441,13 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): if return_figure: return figure + else: try: pyLDAvis.enable_notebook() + pyLDAvis.display(figure) # For Jupyter Notebooks except: - pass - pyLDAvis.display(figure) # For Jupyter Notebooks - pyLDAvis.show(figure) # For command line / scripts + pyLDAvis.show(figure) # For command line / scripts """ import texthero as hero From 242383ae66f67396989bcc947545fc75ab880edf Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 24 Aug 2020 20:04:46 +0200 Subject: [PATCH 18/42] added lda missing tests --- texthero/representation.py | 87 +++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 3c87d3aa..2cedfb06 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -7,7 +7,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.manifold import TSNE -from sklearn.decomposition import PCA, NMF, TruncatedSVD +from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation from sklearn.cluster import KMeans, DBSCAN, MeanShift from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize as sklearn_normalize @@ -571,7 +571,7 @@ def truncatedSVD( """ Performs TruncatedSVD on the given pandas series. - TruncatedSVD is an algorithmen, which can be used to reduce the dimensions + TruncatedSVD is an algorithmn, which can be used to reduce the dimensions of a given series. In natural language processing, the high-dimensional data is usually a document-term matrix (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf` or some other first @@ -610,8 +610,8 @@ def truncatedSVD( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra","Football, Music"]) - ... + >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", + ... "Football, Music"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.truncatedSVD(s, random_state=42) # doctest: +SKIP 0 [0.14433756729740624, 0.15309310892394884] @@ -621,7 +621,7 @@ def truncatedSVD( See also -------- - `truncatedSVD on Wikipedia `_ + `truncatedSVD on Wikipedia ` """ truncatedSVD = TruncatedSVD( @@ -638,11 +638,86 @@ def truncatedSVD( list(truncatedSVD.fit_transform(s_for_vectorization)), index=s.index ) - result._metadata.append(("vectorizer", TruncatedSVD)) + result._metadata.append(("vectorizer", truncatedSVD)) return result +def lda( + s: Union[pd.Series, pd.DataFrame], n_components=10, max_iter=10, random_state=None, + n_jobs = -1 +) -> pd.Series: + """ + Performs Latent Dirichlet Allocation on the given pandas series. + + Latent Dirichlet Allocation(LDA) is a topic modeling algorithm + based on Dirichlet distribution. In natural language processing + LDA is often used to categorise documents into diffenrent topics + and generate top words from these topics. In this process LDA is + used in combination with algorithms, which generate document-term- + matrixes, like :meth:`count` or :meth:`tfidf` + + TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a + DocumentTermDF, the advantage of sparseness is kept. 
+ + Parameters + ---------- + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + + n_components : int, default is 2. + Number of components to keep (in NLP context number of topics) + + max_iter : int, optional (default: 10) + The maximum number of iterations. + + random_state : int, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. + + + Returns + ------- + Pandas Series with the vector calculated by LDA for the document in every + cell and the LDA object in the metadata. This will be used in the + :meth:`plot_topics` + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", + ... "Football, Music"]) + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) + >>> hero.lda(s, random_state=42) # doctest: +SKIP + 0 [0.07272782580722714, 0.0727702366844115, 0.07... + 1 [0.07272782580700803, 0.07277023650761331, 0.0... + 2 [0.08000075593366586, 0.27990110380876265, 0.0... + dtype: object + + See also + -------- + `LDA on Wikipedia Date: Mon, 24 Aug 2020 20:06:00 +0200 Subject: [PATCH 19/42] Add tests --- tests/test_visualization.py | 22 +++++++++++---------- texthero/visualization.py | 39 ++++++++++--------------------------- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 5b400039..270947ee 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -86,26 +86,28 @@ def test_wordcloud(self): def test_plot_topics_clustering_input(self): - from sklearn.datasets import fetch_20newsgroups - newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) - s = pd.Series(newsgroups.data) + s = pd.read_csv( + "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", + columns=["text"] + )["text"] + s_tfidf = s.pipe(preprocessing.clean).pipe( preprocessing.tokenize).pipe(representation.tfidf, max_df=0.5, min_df=100) s_cluster = s_tfidf.pipe( representation.pca, n_components=20).pipe(representation.dbscan) - self.assertIsNotNone(visualization.plot_topics(s_tfidf, s_cluster, return_figure=True)) + self.assertIsNotNone(visualization.plot_topics(s_tfidf, s_cluster)) def test_plot_topics_lsa_lda_tsvd_input(self): - from sklearn.datasets import fetch_20newsgroups - newsgroups = fetch_20newsgroups( - remove=('headers', 'footers', 'quotes')) - s = pd.Series(newsgroups.data) + s = pd.read_csv( + "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", + columns=["text"] + )["text"] + s_tfidf = s.pipe(preprocessing.clean).pipe( preprocessing.tokenize).pipe(representation.tfidf, max_df=0.5, min_df=100) s_lda = s_tfidf.pipe( representation.lda, n_components=20).pipe(representation.dbscan) - self.assertIsNotNone(visualization.plot_topics( - s_tfidf, s_lda, return_figure=True)) + self.assertIsNotNone(visualization.plot_topics(s_tfidf, s_lda)) diff --git a/texthero/visualization.py b/texthero/visualization.py index dcf578d4..0c92734a 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -9,7 +9,7 @@ from wordcloud import WordCloud from texthero import preprocessing -from texthero._types import TextSeries, InputSeries +from texthero._types import TextSeries, InputSeries, DocumentTermDF import string from matplotlib.colors import LinearSegmentedColormap as lsg @@ -18,8 +18,8 @@ from scipy.sparse import csr_matrix, issparse 
from sklearn.preprocessing import normalize as sklearn_normalize import pyLDAvis -from pyLDAvis import display as notebook_display -from pyLDAvis import show as local_display +from pyLDAvis import display as display_notebook +from pyLDAvis import show as display_browser from collections import Counter @@ -374,9 +374,14 @@ def _prepare_matrices_for_pyLDAvis( return document_topic_distributions, topic_term_distributions -def plot_topics(s_document_term, s_document_topic, return_figure=False): +def plot_topics(s_document_term: DocumentTermDF, s_document_topic): """ + + **To show the plot**: + - Interactively in a Jupyter Notebook: do `hero.display_notebook(hero.plot_topics(...))` + - In a new browser window: do `hero.display_browser(hero.plot_topics(...))` + Note: If the plot is not shown, try doing `figure = hero.plot_topics(..., return_figure=True)` followed by `hero.notebook_display(figure)` if you're working @@ -429,7 +434,7 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): ) - figure = pyLDAvis.prepare( + return pyLDAvis.prepare( **{ "vocab": vocab, "doc_lengths": doc_lengths, @@ -438,27 +443,3 @@ def plot_topics(s_document_term, s_document_topic, return_figure=False): "topic_term_dists": topic_term_distributions, } ) - - if return_figure: - return figure - - else: - try: - pyLDAvis.enable_notebook() - pyLDAvis.display(figure) # For Jupyter Notebooks - except: - pyLDAvis.show(figure) # For command line / scripts - -""" -import texthero as hero -from sklearn.preprocessing import normalize as sklearn_normalize -import pyLDAvis -from scipy.sparse import csr_matrix -import pandas as pd -from sklearn.datasets import fetch_20newsgroups -newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) -s = pd.Series(newsgroups.data) -s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) -s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) -hero.plot_topics(s_tfidf, s_cluster) # doctest: +SKIP -""" From 46289f28bb6d5a597bd3ee507777448b3d90491a Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 20:09:50 +0200 Subject: [PATCH 20/42] Format; change name; remove new type Signature --- tests/test_visualization.py | 38 ++++++++++++++++++++++--------------- texthero/representation.py | 16 +++++++++------- texthero/visualization.py | 37 ++++++++++++++++-------------------- 3 files changed, 48 insertions(+), 43 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 270947ee..ea940246 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -81,33 +81,41 @@ def test_wordcloud(self): self.assertEqual(visualization.wordcloud(s), None) """ - Test plot_topics + Test visualize_topics """ - def test_plot_topics_clustering_input(self): + def test_visualize_topics_clustering_input(self): s = pd.read_csv( "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - columns=["text"] + columns=["text"], )["text"] - s_tfidf = s.pipe(preprocessing.clean).pipe( - preprocessing.tokenize).pipe(representation.tfidf, max_df=0.5, min_df=100) - s_cluster = s_tfidf.pipe( - representation.pca, n_components=20).pipe(representation.dbscan) + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf, max_df=0.5, min_df=100) + ) + s_cluster = s_tfidf.pipe(representation.pca, n_components=20).pipe( + representation.dbscan + ) - self.assertIsNotNone(visualization.plot_topics(s_tfidf, s_cluster)) + 
self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_cluster)) - def test_plot_topics_lsa_lda_tsvd_input(self): + def test_visualize_topics_lsa_lda_tsvd_input(self): s = pd.read_csv( "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - columns=["text"] + columns=["text"], )["text"] - s_tfidf = s.pipe(preprocessing.clean).pipe( - preprocessing.tokenize).pipe(representation.tfidf, max_df=0.5, min_df=100) - s_lda = s_tfidf.pipe( - representation.lda, n_components=20).pipe(representation.dbscan) + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf, max_df=0.5, min_df=100) + ) + s_lda = s_tfidf.pipe(representation.lda, n_components=20).pipe( + representation.dbscan + ) - self.assertIsNotNone(visualization.plot_topics(s_tfidf, s_lda)) + self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda)) diff --git a/texthero/representation.py b/texthero/representation.py index 2cedfb06..ebc25757 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -604,7 +604,7 @@ def truncatedSVD( ------- Pandas Series with the vector calculated by truncadedSVD for the document in every cell and the truncadedSVD object in the metadata. This will be used in the - :meth:`plot_topics` + :meth:`visualize_topics` Examples -------- @@ -644,8 +644,11 @@ def truncatedSVD( def lda( - s: Union[pd.Series, pd.DataFrame], n_components=10, max_iter=10, random_state=None, - n_jobs = -1 + s: Union[pd.Series, pd.DataFrame], + n_components=10, + max_iter=10, + random_state=None, + n_jobs=-1, ) -> pd.Series: """ Performs Latent Dirichlet Allocation on the given pandas series. @@ -679,7 +682,7 @@ def lda( ------- Pandas Series with the vector calculated by LDA for the document in every cell and the LDA object in the metadata. 
This will be used in the - :meth:`plot_topics` + :meth:`visualize_topics` Examples -------- @@ -710,14 +713,13 @@ def lda( else: s_for_vectorization = list(s) - result = pd.Series( - list(lda.fit_transform(s_for_vectorization)), index=s.index - ) + result = pd.Series(list(lda.fit_transform(s_for_vectorization)), index=s.index) result._metadata.append(("vectorizer", lda)) return result + """ Clustering """ diff --git a/texthero/visualization.py b/texthero/visualization.py index 0c92734a..67a5e71c 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -312,7 +312,7 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: ) -def _get_matrices_for_plot_topics(s_document_term, s_document_topic, vectorizer): +def _get_matrices_for_visualize_topics(s_document_term, s_document_topic, vectorizer): if vectorizer: @@ -348,17 +348,13 @@ def _get_matrices_for_plot_topics(s_document_term, s_document_topic, vectorizer) return s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix -def _prepare_matrices_for_pyLDAvis( - document_topic_matrix, topic_term_matrix -): +def _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix): document_topic_distributions = sklearn_normalize( document_topic_matrix, norm="l1", axis=1 ) - topic_term_distributions = sklearn_normalize( - topic_term_matrix, norm="l1", axis=1) - + topic_term_distributions = sklearn_normalize(topic_term_matrix, norm="l1", axis=1) # Make sparse matrices dense for pyLDAvis if issparse(document_topic_distributions): @@ -374,16 +370,16 @@ def _prepare_matrices_for_pyLDAvis( return document_topic_distributions, topic_term_distributions -def plot_topics(s_document_term: DocumentTermDF, s_document_topic): +def visualize_topics(s_document_term, s_document_topic): # TODO: add types to signature when they're merged """ **To show the plot**: - - Interactively in a Jupyter Notebook: do `hero.display_notebook(hero.plot_topics(...))` - - In a new browser window: do `hero.display_browser(hero.plot_topics(...))` + - Interactively in a Jupyter Notebook: do `hero.display_notebook(hero.visualize_topics(...))` + - In a new browser window: do `hero.display_browser(hero.visualize_topics(...))` Note: If the plot is not shown, try - doing `figure = hero.plot_topics(..., return_figure=True)` + doing `figure = hero.visualize_topics(..., return_figure=True)` followed by `hero.notebook_display(figure)` if you're working in a Jupyter Notebook, else `hero.local_display(figure)`. 
@@ -399,7 +395,7 @@ def plot_topics(s_document_term: DocumentTermDF, s_document_topic): >>> s = pd.Series(newsgroups.data) >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) >>> s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) - >>> hero.plot_topics(s_tfidf, s_cluster) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_cluster) # doctest: +SKIP See Also -------- @@ -417,22 +413,21 @@ def plot_topics(s_document_term: DocumentTermDF, s_document_topic): # Get / build matrices from input - s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix = _get_matrices_for_plot_topics( + ( s_document_term, s_document_topic, - vectorizer - ) - + document_topic_matrix, + topic_term_matrix, + ) = _get_matrices_for_visualize_topics(s_document_term, s_document_topic, vectorizer) vocab = list(s_document_term.columns.levels[1]) doc_lengths = list(s_document_term.sum(axis=1)) term_frequency = list(s_document_term.sum(axis=0)) - - document_topic_distributions, topic_term_distributions = _prepare_matrices_for_pyLDAvis( - document_topic_matrix, topic_term_matrix - ) - + ( + document_topic_distributions, + topic_term_distributions, + ) = _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix) return pyLDAvis.prepare( **{ From bcfa78d2a850958a616df9e87c4008ddec646253 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 24 Aug 2020 20:48:45 +0200 Subject: [PATCH 21/42] updatewd test --- tests/test_visualization.py | 6 +++--- texthero/visualization.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index ea940246..df839b4b 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -88,13 +88,13 @@ def test_visualize_topics_clustering_input(self): s = pd.read_csv( "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - columns=["text"], + usecols=["text"], )["text"] s_tfidf = ( s.pipe(preprocessing.clean) .pipe(preprocessing.tokenize) - .pipe(representation.tfidf, max_df=0.5, min_df=100) + .pipe(representation.tfidf) ) s_cluster = s_tfidf.pipe(representation.pca, n_components=20).pipe( representation.dbscan @@ -106,7 +106,7 @@ def test_visualize_topics_lsa_lda_tsvd_input(self): s = pd.read_csv( "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - columns=["text"], + usecols=["text"], )["text"] s_tfidf = ( diff --git a/texthero/visualization.py b/texthero/visualization.py index 67a5e71c..3b5a09ab 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -9,7 +9,7 @@ from wordcloud import WordCloud from texthero import preprocessing -from texthero._types import TextSeries, InputSeries, DocumentTermDF +from texthero._types import TextSeries, InputSeries import string from matplotlib.colors import LinearSegmentedColormap as lsg From eb2d31b0408d303a45ecdd7833d0adf47be84296 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 20:51:19 +0200 Subject: [PATCH 22/42] add docstring --- texthero/visualization.py | 101 +++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 12 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 67a5e71c..973af63f 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -9,7 +9,7 @@ from wordcloud import WordCloud from texthero import preprocessing -from texthero._types import TextSeries, InputSeries, DocumentTermDF +from texthero._types 
import TextSeries, InputSeries import string from matplotlib.colors import LinearSegmentedColormap as lsg @@ -315,19 +315,21 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: def _get_matrices_for_visualize_topics(s_document_term, s_document_topic, vectorizer): if vectorizer: + # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. - # s_document_topic is output of hero.lda or hero.lsi document_term_matrix = s_document_term.sparse.to_coo() document_topic_matrix = np.array(list(s_document_topic)) topic_term_matrix = vectorizer.components_ else: - # s_document_topic is output of some hero clustering function + # Here, s_document_topic is output of some hero clustering function. + + # First remove documents that are not assigned to any cluster. indexes_of_unassigned_documents = s_document_topic == -1 s_document_term = s_document_term[~indexes_of_unassigned_documents] s_document_topic = s_document_topic[~indexes_of_unassigned_documents] - s_document_topic.cat.remove_unused_categories(inplace=True) + s_document_topic = s_document_topic.cat.remove_unused_categories() document_term_matrix = s_document_term.sparse.to_coo() @@ -370,8 +372,40 @@ def _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix): return document_topic_distributions, topic_term_distributions -def visualize_topics(s_document_term, s_document_topic): # TODO: add types to signature when they're merged +def visualize_topics(s_document_term, s_document_topic): + # TODO: add types everywhere when they're merged """ + Visualize the topics of your dataset. First input has + to be output of one of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency` + + (tfidf suggested). + + Second input can either be the result of + clustering, so output of one of + - :meth:`texthero.representation.kmeans` + - :meth:`texthero.representation.meanshift` + - :meth:`texthero.representation.dbscan` + + or the result of a topic modelling function, so + one of + - :meth:`texthero.representation.lda` + - :meth:`texthero.representation.truncatedSVD` + + (topic modelling output suggested). + + The function uses the given clustering + or topic modelling from the second input, which relates + documents to topics. The first input + relates documents to terms. From those + two relations (documents->topics, documents->terms), + the function calculates a distribution of + documents to topics, and a distribution + of topics to terms. These distributions + are passed to `pyLDAvis `_, + which visualizes them. 
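+
+    Roughly, the preparation works like this (a sketch of the helper
+    functions in this module; `document_term_matrix` and
+    `document_topic_matrix` are the intermediate matrices described
+    above):
+
+    >>> # For clustering input, the topic-term matrix is built by a matrix
+    >>> # product; for lda / truncatedSVD it is the fitted model's components_.
+    >>> topic_term_matrix = document_topic_matrix.T * document_term_matrix  # doctest: +SKIP
+    >>> doc_topic_dists = sklearn_normalize(document_topic_matrix, norm="l1", axis=1)  # doctest: +SKIP
+    >>> topic_term_dists = sklearn_normalize(topic_term_matrix, norm="l1", axis=1)  # doctest: +SKIP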
**To show the plot**: @@ -385,20 +419,62 @@ def visualize_topics(s_document_term, s_document_topic): # TODO: add types to s Parameters ---------- + s_document_term: pd.DataFrame + + One of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency` + + s_document_topic: pd.Series + + One of + - :meth:`texthero.representation.kmeans` + - :meth:`texthero.representation.meanshift` + - :meth:`texthero.representation.dbscan` + (using clustering functions, documents + that are not assigned to a cluster are + not considered in the visualization) + or one of + - :meth:`texthero.representation.lda` + - :meth:`texthero.representation.truncatedSVD` + Examples -------- + Using Clustering: + >>> import texthero as hero >>> import pandas as pd - >>> from sklearn.datasets import fetch_20newsgroups - >>> newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) - >>> s = pd.Series(newsgroups.data) - >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) + >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", columns=["text"]) + >>> # Use max_df=0.5, min_df=100 in tfidf to speed things up (fewer features). + >>> s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) >>> s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) - >>> hero.visualize_topics(s_tfidf, s_cluster) # doctest: +SKIP + >>> # Display in a new browser window: + >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> # Display inside the current Jupyter Notebook: + >>> hero.display_notebook(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + + Using LDA: + + >>> import texthero as hero + >>> import pandas as pd + >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") + >>> # Use max_df=0.5, min_df=100 in tfidf to speed things up (fewer features). 
+ >>> s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) + >>> s_lda = s_tfidf.pipe(hero.lda, n_components=20) + >>> # Display in a new browser window: + >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> # Display inside the current Jupyter Notebook: + >>> hero.display_notebook(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + See Also -------- + `pyLDAvis `_ + + TODO add tutorial link + """ metadata_list = s_document_topic._metadata @@ -412,13 +488,14 @@ def visualize_topics(s_document_term, s_document_topic): # TODO: add types to s vectorizer = None # Get / build matrices from input - ( s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix, - ) = _get_matrices_for_visualize_topics(s_document_term, s_document_topic, vectorizer) + ) = _get_matrices_for_visualize_topics( + s_document_term, s_document_topic, vectorizer + ) vocab = list(s_document_term.columns.levels[1]) doc_lengths = list(s_document_term.sum(axis=1)) From 3a3934672959bc08d41bb48fce53dfdf63d2d39e Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Mon, 24 Aug 2020 22:49:51 +0200 Subject: [PATCH 23/42] Implement matrix multiplication changes; fix metadata error --- tests/test_visualization.py | 32 ++++++++++++++++++-------------- texthero/representation.py | 24 +++++++++--------------- texthero/visualization.py | 37 ++++++++++++++----------------------- 3 files changed, 41 insertions(+), 52 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index df839b4b..ddf73961 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -2,6 +2,7 @@ import pandas as pd import doctest +import warnings from texthero import visualization, preprocessing, representation from . 
import PandasTestCase @@ -84,38 +85,41 @@ def test_wordcloud(self): Test visualize_topics """ - def test_visualize_topics_clustering_input(self): + def test_visualize_topics_clustering_for_second_input(self): s = pd.read_csv( "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - usecols=["text"], - )["text"] + )["text"][:100] s_tfidf = ( s.pipe(preprocessing.clean) .pipe(preprocessing.tokenize) .pipe(representation.tfidf) ) - s_cluster = s_tfidf.pipe(representation.pca, n_components=20).pipe( - representation.dbscan + s_cluster = ( + s_tfidf.pipe(representation.normalize) + .pipe(representation.pca, n_components=20) + .pipe(representation.kmeans) ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_cluster)) - self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_cluster)) - - def test_visualize_topics_lsa_lda_tsvd_input(self): + def test_visualize_topics_topic_modelling_for_second_input(self): s = pd.read_csv( "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - usecols=["text"], - )["text"] + )["text"][:100] s_tfidf = ( s.pipe(preprocessing.clean) .pipe(preprocessing.tokenize) - .pipe(representation.tfidf, max_df=0.5, min_df=100) + .pipe(representation.tfidf) ) - s_lda = s_tfidf.pipe(representation.lda, n_components=20).pipe( - representation.dbscan + s_lda = s_tfidf.pipe(representation.normalize).pipe( + representation.lda, n_components=20 ) - self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda)) diff --git a/texthero/representation.py b/texthero/representation.py index ebc25757..51a2f969 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -603,8 +603,7 @@ def truncatedSVD( Returns ------- Pandas Series with the vector calculated by truncadedSVD for the document in every - cell and the truncadedSVD object in the metadata. This will be used in the - :meth:`visualize_topics` + cell. Examples -------- @@ -638,8 +637,6 @@ def truncatedSVD( list(truncatedSVD.fit_transform(s_for_vectorization)), index=s.index ) - result._metadata.append(("vectorizer", truncatedSVD)) - return result @@ -653,12 +650,12 @@ def lda( """ Performs Latent Dirichlet Allocation on the given pandas series. - Latent Dirichlet Allocation(LDA) is a topic modeling algorithm - based on Dirichlet distribution. In natural language processing - LDA is often used to categorise documents into diffenrent topics - and generate top words from these topics. In this process LDA is - used in combination with algorithms, which generate document-term- - matrixes, like :meth:`count` or :meth:`tfidf` + Latent Dirichlet Allocation(LDA) is a topic modeling algorithm + based on Dirichlet distribution. In natural language processing + LDA is often used to categorise documents into diffenrent topics + and generate top words from these topics. In this process LDA is + used in combination with algorithms, which generate document-term- + matrixes, like :meth:`count` or :meth:`tfidf` TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a DocumentTermDF, the advantage of sparseness is kept. @@ -671,7 +668,7 @@ def lda( Number of components to keep (in NLP context number of topics) max_iter : int, optional (default: 10) - The maximum number of iterations. + The maximum number of iterations. 
random_state : int, default=None Determines the random number generator. Pass an int for reproducible @@ -681,8 +678,7 @@ def lda( Returns ------- Pandas Series with the vector calculated by LDA for the document in every - cell and the LDA object in the metadata. This will be used in the - :meth:`visualize_topics` + cell. Examples -------- @@ -715,8 +711,6 @@ def lda( result = pd.Series(list(lda.fit_transform(s_for_vectorization)), index=s.index) - result._metadata.append(("vectorizer", lda)) - return result diff --git a/texthero/visualization.py b/texthero/visualization.py index 973af63f..05ff7dd9 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -312,15 +312,18 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: ) -def _get_matrices_for_visualize_topics(s_document_term, s_document_topic, vectorizer): +def _get_matrices_for_visualize_topics( + s_document_term, s_document_topic, clustering_function_used +): - if vectorizer: + if not clustering_function_used: # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. document_term_matrix = s_document_term.sparse.to_coo() document_topic_matrix = np.array(list(s_document_topic)) - topic_term_matrix = vectorizer.components_ + # topic_term_matrix = vectorizer.components_ + topic_term_matrix = document_topic_matrix.T * document_term_matrix else: # Here, s_document_topic is output of some hero clustering function. @@ -412,11 +415,6 @@ def visualize_topics(s_document_term, s_document_topic): - Interactively in a Jupyter Notebook: do `hero.display_notebook(hero.visualize_topics(...))` - In a new browser window: do `hero.display_browser(hero.visualize_topics(...))` - Note: If the plot is not shown, try - doing `figure = hero.visualize_topics(..., return_figure=True)` - followed by `hero.notebook_display(figure)` if you're working - in a Jupyter Notebook, else `hero.local_display(figure)`. - Parameters ---------- s_document_term: pd.DataFrame @@ -446,10 +444,11 @@ def visualize_topics(s_document_term, s_document_topic): >>> import texthero as hero >>> import pandas as pd - >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", columns=["text"]) - >>> # Use max_df=0.5, min_df=100 in tfidf to speed things up (fewer features). - >>> s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) - >>> s_cluster = s_tfidf.pipe(hero.pca, n_components=20).pipe(hero.dbscan) + >>> # Take first 1000 documents of some dataset. + >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv")[:100] + >>> # Use max_df=0.5, min_df=10 in tfidf to speed things up (fewer features). 
+ >>> s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=10) + >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=20).pipe(hero.dbscan) >>> # Display in a new browser window: >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: @@ -476,16 +475,7 @@ def visualize_topics(s_document_term, s_document_topic): TODO add tutorial link """ - metadata_list = s_document_topic._metadata - - for item in metadata_list: - if isinstance(item, tuple): - if item[0] == "vectorizer": - vectorizer = item[1] - break - else: - # no vectorizer found - vectorizer = None + clustering_function_used = s_document_topic.dtype.name == "category" # Get / build matrices from input ( @@ -494,7 +484,7 @@ def visualize_topics(s_document_term, s_document_topic): document_topic_matrix, topic_term_matrix, ) = _get_matrices_for_visualize_topics( - s_document_term, s_document_topic, vectorizer + s_document_term, s_document_topic, clustering_function_used ) vocab = list(s_document_term.columns.levels[1]) @@ -513,5 +503,6 @@ def visualize_topics(s_document_term, s_document_topic): "term_frequency": term_frequency, "doc_topic_dists": document_topic_distributions, "topic_term_dists": topic_term_distributions, + "R": 15, } ) From 4ad7ee8f0181c80b14589ee8dae175a52f3a6658 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 25 Aug 2020 08:36:16 +0200 Subject: [PATCH 24/42] added test for lda and tSVD --- tests/test_indexes.py | 2 ++ tests/test_representation.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index af7afcd2..ceafad73 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -62,6 +62,8 @@ ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists,)], ["tsne", representation.tsne, (s_numeric_lists,)], + ["truncatedSVD", representation.tsne, (s_numeric_lists, 1)], + ["lda", representation.tsne, (s_numeric_lists, 1)], ["kmeans", representation.kmeans, (s_numeric_lists, 1)], ["dbscan", representation.dbscan, (s_numeric_lists,)], ["meanshift", representation.meanshift, (s_numeric_lists,)], diff --git a/tests/test_representation.py b/tests/test_representation.py index 2722289e..04a014ec 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -158,6 +158,12 @@ def _get_multiindex_for_tokenized_output(first_level_name): representation.kmeans, pd.Series([1, 0], index=[5, 7], dtype="category"), ], + [ + "truncatedSVD", + representation.truncatedSVD, + pd.Series([[1.0], [0.0]], index=[5, 7],), + ], + ["lda", representation.lda, pd.Series([[1.0], [1.0]], index=[5, 7],),], [ "dbscan", representation.dbscan, @@ -234,6 +240,8 @@ def test_dim_reduction_and_clustering_with_vector_series_input( result_s = test_function(s_vector_series, random_state=42, n_clusters=2) elif name == "dbscan" or name == "meanshift" or name == "normalize": result_s = test_function(s_vector_series) + elif name == "lda" or name == "truncatedSVD": + result_s = test_function(s_vector_series, n_components=1, random_state=42) else: result_s = test_function(s_vector_series, random_state=42) @@ -260,6 +268,8 @@ def test_dim_reduction_and_clustering_with_documenttermDF_input( result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) elif name == "dbscan" or name == "meanshift" or name == "normalize": result_s = test_function(s_documenttermDF) + elif name == "lda" or name == 
"truncatedSVD": + result_s = test_function(s_documenttermDF, n_components=1, random_state=42) else: result_s = test_function(s_documenttermDF, random_state=42) From 65504bbf5c16b2353b826bfef9475ba23979d7d2 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Tue, 25 Aug 2020 11:53:52 +0200 Subject: [PATCH 25/42] Implement top_words_per_document, top_words_per_topic, topics_from_topic_model Co-authored-by: Maximilian Krahn --- tests/test_visualization.py | 28 ++- texthero/representation.py | 357 +++++++++++++++++++++--------------- texthero/visualization.py | 196 ++++++++++++++++++-- 3 files changed, 410 insertions(+), 171 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index ddf73961..68d028c6 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -87,9 +87,14 @@ def test_wordcloud(self): def test_visualize_topics_clustering_for_second_input(self): - s = pd.read_csv( - "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - )["text"][:100] + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) s_tfidf = ( s.pipe(preprocessing.clean) @@ -98,8 +103,8 @@ def test_visualize_topics_clustering_for_second_input(self): ) s_cluster = ( s_tfidf.pipe(representation.normalize) - .pipe(representation.pca, n_components=20) - .pipe(representation.kmeans) + .pipe(representation.pca, n_components=2) + .pipe(representation.kmeans, n_clusters=2) ) with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -107,9 +112,14 @@ def test_visualize_topics_clustering_for_second_input(self): def test_visualize_topics_topic_modelling_for_second_input(self): - s = pd.read_csv( - "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv", - )["text"][:100] + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) s_tfidf = ( s.pipe(preprocessing.clean) @@ -117,7 +127,7 @@ def test_visualize_topics_topic_modelling_for_second_input(self): .pipe(representation.tfidf) ) s_lda = s_tfidf.pipe(representation.normalize).pipe( - representation.lda, n_components=20 + representation.lda, n_components=2 ) with warnings.catch_warnings(): diff --git a/texthero/representation.py b/texthero/representation.py index 51a2f969..fb28b24a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -565,155 +565,6 @@ def tsne( return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) -def truncatedSVD( - s: Union[pd.Series, pd.DataFrame], n_components=2, n_iter=5, random_state=None, -) -> pd.Series: - """ - Performs TruncatedSVD on the given pandas series. - - TruncatedSVD is an algorithmn, which can be used to reduce the dimensions - of a given series. In natural language processing, the high-dimensional data - is usually a document-term matrix (so in texthero usually a Series after - applying :meth:`texthero.representation.tfidf` or some other first - representation function that assigns a scalar (a weight) to each word). - This is used as a tool to extract the most important topics and words - of a given Series. In this context it is refered to as latent semantic analysis (LSA), - or Latent Semantic Analysis (LSI) - - - TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a - DocumentTermDF, the advantage of sparseness is kept. 
- - Parameters - ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) - - n_components : int, default is 2. - Number of components to keep (dimensionality of output vectors). - For LSA, a value of 100 is recommended - - n_iter : int, optional (default: 5) - Number of iterations for randomized SVD solver. - - random_state : int, default=None - Determines the random number generator. Pass an int for reproducible - results across multiple function calls. - - - Returns - ------- - Pandas Series with the vector calculated by truncadedSVD for the document in every - cell. - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", - ... "Football, Music"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) - >>> hero.truncatedSVD(s, random_state=42) # doctest: +SKIP - 0 [0.14433756729740624, 0.15309310892394884] - 1 [0.14433756729740663, -0.1530931089239484] - 2 [0.14433756729740646, 7.211110073938366e-17] - dtype: object - - See also - -------- - `truncatedSVD on Wikipedia ` - - """ - truncatedSVD = TruncatedSVD( - n_components=n_components, n_iter=n_iter, random_state=random_state - ) - - if _check_is_valid_DocumentTermDF(s): - s_coo = s.sparse.to_coo() - s_for_vectorization = s_coo.astype("float64") - else: - s_for_vectorization = list(s) - - result = pd.Series( - list(truncatedSVD.fit_transform(s_for_vectorization)), index=s.index - ) - - return result - - -def lda( - s: Union[pd.Series, pd.DataFrame], - n_components=10, - max_iter=10, - random_state=None, - n_jobs=-1, -) -> pd.Series: - """ - Performs Latent Dirichlet Allocation on the given pandas series. - - Latent Dirichlet Allocation(LDA) is a topic modeling algorithm - based on Dirichlet distribution. In natural language processing - LDA is often used to categorise documents into diffenrent topics - and generate top words from these topics. In this process LDA is - used in combination with algorithms, which generate document-term- - matrixes, like :meth:`count` or :meth:`tfidf` - - TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a - DocumentTermDF, the advantage of sparseness is kept. - - Parameters - ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) - - n_components : int, default is 2. - Number of components to keep (in NLP context number of topics) - - max_iter : int, optional (default: 10) - The maximum number of iterations. - - random_state : int, default=None - Determines the random number generator. Pass an int for reproducible - results across multiple function calls. - - - Returns - ------- - Pandas Series with the vector calculated by LDA for the document in every - cell. - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", - ... "Football, Music"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) - >>> hero.lda(s, random_state=42) # doctest: +SKIP - 0 [0.07272782580722714, 0.0727702366844115, 0.07... - 1 [0.07272782580700803, 0.07277023650761331, 0.0... - 2 [0.08000075593366586, 0.27990110380876265, 0.0... - dtype: object - - See also - -------- - `LDA on Wikipedia pd.Series: + """ + Performs TruncatedSVD on the given pandas series. + + TruncatedSVD is an algorithmn, which can be used to reduce the dimensions + of a given series. 
In natural language processing, the high-dimensional data + is usually a document-term matrix (so in texthero usually a Series after + applying :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each word). + This is used as a tool to extract the most important topics and words + of a given Series. In this context it is refered to as latent semantic analysis (LSA), + or Latent Semantic Analysis (LSI) + + + TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a + DocumentTermDF, the advantage of sparseness is kept. + + Parameters + ---------- + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + + n_components : int, default is 2. + Number of components to keep (dimensionality of output vectors). + For LSA, a value of 100 is recommended + + n_iter : int, optional (default: 5) + Number of iterations for randomized SVD solver. + + random_state : int, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. + + + Returns + ------- + Pandas Series with the vector calculated by truncadedSVD for the document in every + cell. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", + ... "Football, Music"]) + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) + >>> hero.truncatedSVD(s, random_state=42) # doctest: +SKIP + 0 [0.14433756729740624, 0.15309310892394884] + 1 [0.14433756729740663, -0.1530931089239484] + 2 [0.14433756729740646, 7.211110073938366e-17] + dtype: object + + See also + -------- + `truncatedSVD on Wikipedia ` + + """ + truncatedSVD = TruncatedSVD( + n_components=n_components, n_iter=n_iter, random_state=random_state + ) + + if _check_is_valid_DocumentTermDF(s): + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") + else: + s_for_vectorization = list(s) + + result = pd.Series( + list(truncatedSVD.fit_transform(s_for_vectorization)), index=s.index + ) + + return result + + +def lda( + s: Union[pd.Series, pd.DataFrame], + n_components=10, + max_iter=10, + random_state=None, + n_jobs=-1, +) -> pd.Series: + """ + Performs Latent Dirichlet Allocation on the given pandas series. + + Latent Dirichlet Allocation(LDA) is a topic modeling algorithm + based on Dirichlet distribution. In natural language processing + LDA is often used to categorise documents into diffenrent topics + and generate top words from these topics. In this process LDA is + used in combination with algorithms, which generate document-term- + matrixes, like :meth:`count` or :meth:`tfidf` + + TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a + DocumentTermDF, the advantage of sparseness is kept. + + Parameters + ---------- + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + + n_components : int, default is 2. + Number of components to keep (in NLP context number of topics) + + max_iter : int, optional (default: 10) + The maximum number of iterations. + + random_state : int, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. + + + Returns + ------- + Pandas Series with the vector calculated by LDA for the document in every + cell. 
+ + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", + ... "Football, Music"]) + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) + >>> hero.lda(s, random_state=42) # doctest: +SKIP + 0 [0.07272782580722714, 0.0727702366844115, 0.07... + 1 [0.07272782580700803, 0.07277023650761331, 0.0... + 2 [0.08000075593366586, 0.27990110380876265, 0.0... + dtype: object + + See also + -------- + `LDA on Wikipedia >> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> # Use Latent Dirichlet Allocation to relate documents to topics.s + >>> s_lda = s_tfidf.pipe(hero.lda, n_components=2) + >>> # Extract the best-matching topic per document. + >>> hero.topics_from_topic_model(s_lda) # doctest: +SKIP + 0 1 + 1 0 + 2 1 + 3 0 + dtype: category + Categories (2, int64): [0, 1] + + + See Also + -------- + TODO add tutorial link + + :meth:`texthero.visualization.top_words_per_topic`_ to find the top words + per topic after applying this function. + + """ + + document_topic_matrix = np.matrix(s_document_topic.tolist()) + + cluster_IDs = np.argmax(document_topic_matrix, axis=1).getA1() + + return pd.Series(cluster_IDs, index=s_document_topic.index, dtype="category") + """ Normalization. diff --git a/texthero/visualization.py b/texthero/visualization.py index 05ff7dd9..3f2e71ac 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -17,6 +17,7 @@ from scipy.sparse import csr_matrix, issparse from sklearn.preprocessing import normalize as sklearn_normalize + import pyLDAvis from pyLDAvis import display as display_notebook from pyLDAvis import show as display_browser @@ -430,7 +431,7 @@ def visualize_topics(s_document_term, s_document_topic): - :meth:`texthero.representation.kmeans` - :meth:`texthero.representation.meanshift` - :meth:`texthero.representation.dbscan` - (using clustering functions, documents + (using clustering functkmeansions, documents that are not assigned to a cluster are not considered in the visualization) or one of @@ -444,11 +445,9 @@ def visualize_topics(s_document_term, s_document_topic): >>> import texthero as hero >>> import pandas as pd - >>> # Take first 1000 documents of some dataset. - >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv")[:100] - >>> # Use max_df=0.5, min_df=10 in tfidf to speed things up (fewer features). 
- >>> s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=10) - >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=20).pipe(hero.dbscan) + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) >>> # Display in a new browser window: >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: @@ -458,12 +457,11 @@ def visualize_topics(s_document_term, s_document_topic): >>> import texthero as hero >>> import pandas as pd - >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") - >>> # Use max_df=0.5, min_df=100 in tfidf to speed things up (fewer features). - >>> s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df=0.5, min_df=100) - >>> s_lda = s_tfidf.pipe(hero.lda, n_components=20) + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> s_lda = s_tfidf.pipe(hero.lda, n_components=5) >>> # Display in a new browser window: - >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_lda)) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: >>> hero.display_notebook(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP @@ -496,7 +494,7 @@ def visualize_topics(s_document_term, s_document_topic): topic_term_distributions, ) = _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix) - return pyLDAvis.prepare( + figure = pyLDAvis.prepare( **{ "vocab": vocab, "doc_lengths": doc_lengths, @@ -504,5 +502,179 @@ def visualize_topics(s_document_term, s_document_topic): "doc_topic_dists": document_topic_distributions, "topic_term_dists": topic_term_distributions, "R": 15, + "sort_topics": False, } ) + + return figure + + +def top_words_per_topic(s_document_term, s_clusters, n_words=5): + # TODO: add types everywhere when they're merged + """ + Find the top words per topic of your dataset. First input has + to be output of one of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency` + + (tfidf suggested). + + Second input has to be the result of + clustering, so output of one of + - :meth:`texthero.representation.kmeans` + - :meth:`texthero.representation.meanshift` + - :meth:`texthero.representation.dbscan`. + + The function uses the given clustering + from the second input, which relates + documents to topics. The first input + relates documents to terms. From those + two relations (documents->topics, documents->terms), + the function calculates a distribution of + documents to topics, and a distribution + of topics to terms. These distributions + are used to find the most relevant + terms per topic. 
+ + Parameters + ---------- + s_document_term: pd.DataFrame + + One of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency` + + s_clusters: pd.Series + + One of + - :meth:`texthero.representation.kmeans` + - :meth:`texthero.representation.meanshift` + - :meth:`texthero.representation.dbscan` + - :meth:`texthero.representation.topics_from_topic_model` + + n_words: int, default to 5 + Number of top words per topic, should + be <= 30. + + Examples + -------- + Using Clustering: + + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) + >>> hero.top_words_per_topic(s_tfidf, s_cluster) # doctest: +SKIP + Category + 0 [sports, football, soccer] + 1 [music, violin, orchestra] + Name: Term, dtype: object + + See Also + -------- + `pyLDAvis `_ + + TODO add tutorial link + + """ + + pyLDAvis_result = visualize_topics(s_document_term, s_clusters).to_dict() + + df_topics_and_their_top_words = pd.DataFrame(pyLDAvis_result["tinfo"]) + + # Throw out topic "Default" + df_topics_and_their_top_words = df_topics_and_their_top_words[ + df_topics_and_their_top_words["Category"] != "Default" + ] + + n_topics = df_topics_and_their_top_words["Category"].nunique() + + # Our topics / clusters begin at 0 -> use i-1 + replace_dict = {"Topic{}".format(i): i - 1 for i in range(1, n_topics + 1)} + + df_topics_and_their_top_words["Category"] = df_topics_and_their_top_words[ + "Category" + ].replace(replace_dict) + + df_topics_and_their_top_words = df_topics_and_their_top_words.sort_values( + ["Category", "Freq"], ascending=[1, 0] + ) + + s_topics_with_top_words = df_topics_and_their_top_words.groupby("Category")[ + "Term" + ].apply(list) + + s_topics_with_top_words = s_topics_with_top_words.apply(lambda x: x[:n_words]) + + return s_topics_with_top_words + + +def top_words_per_document(s_document_term, n_words=3): + # TODO: add types everywhere when they're merged + """ + Find the top words per topic of your dataset. First input has + to be output of one of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency` + + (tfidf suggested). + + TODO + + Parameters + ---------- + s_document_term: pd.DataFrame + + One of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency` + + n_words: int, default to 3 + Number of top words per topic, should + be <= 30. 
+
+    Examples
+    --------
+    Using Clustering:
+
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"])
+    >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf)
+    >>> hero.top_words_per_document(s_tfidf, n_words=2) # doctest: +SKIP
+    Category
+    0    [sports, football, soccer]
+    1    [music, violin, orchestra]
+    Name: Term, dtype: object
+
+    See Also
+    --------
+    `pyLDAvis `_
+
+    TODO add tutorial link
+
+    """
+
+    s_cluster = pd.Series(s_document_term.index.tolist())
+
+    s_top_words_per_document = top_words_per_topic(
+        s_document_term, s_cluster.astype("category"), n_words=n_words
+    )
+
+    return s_top_words_per_document.reindex(s.index)
+
+
+"""
+TODO
+
+- tests for top_words_per_document, top_words_per_topic, topics_from_topic_model
+    -> try second one also with error when category == -1 somewhere
+
+- docstrings of all functions (also private helpers) + comments
+
+"""

From 76e76896e50f2ca01bb67de9cf207c72c1d5899d Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Tue, 25 Aug 2020 12:41:54 +0200
Subject: [PATCH 26/42] added tests for topic functions

---
 tests/test_representation.py | 11 +++++++-
 tests/test_visualization.py  | 54 ++++++++++++++++++++++++++++++++++++
 texthero/visualization.py    |  9 +++---
 3 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/tests/test_representation.py b/tests/test_representation.py
index 04a014ec..ea2390ee 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -179,6 +179,11 @@ def _get_multiindex_for_tokenized_output(first_level_name):
         representation.normalize,
         pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],),
     ],
+    [
+        "topics_from_topic_model",
+        representation.topics_from_topic_model,
+        pd.Series([0, 0], index=[5, 7], dtype="category"),
+    ],
 ]


@@ -242,6 +247,10 @@ def test_dim_reduction_and_clustering_with_vector_series_input(
             result_s = test_function(s_vector_series)
         elif name == "lda" or name == "truncatedSVD":
             result_s = test_function(s_vector_series, n_components=1, random_state=42)
+        elif name == "topics_from_topic_model":
+            result_s = test_function(
+                representation.lda(s_vector_series, n_components=1, random_state=42)
+            )
         else:
             result_s = test_function(s_vector_series, random_state=42)

@@ -260,7 +269,7 @@ def test_dim_reduction_and_clustering_with_documenttermDF_input(
 ):
     s_true = correct_output

-    if name == "normalize":
+    if name == "normalize" or name == "topics_from_topic_model":
         # testing this below separately
         return

diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 68d028c6..4ef7426c 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -133,3 +133,57 @@ def test_visualize_topics_topic_modelling_for_second_input(self):
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda))
+
+    def test_top_words_per_document(self):
+        s = pd.Series(
+            [
+                "Football, Sports, Soccer",
+                "music, violin, orchestra",
+                "football, fun, sports",
+                "music, band, guitar",
+            ]
+        )
+
+        s_tfidf = (
+            s.pipe(preprocessing.clean)
+            .pipe(preprocessing.tokenize)
+            .pipe(representation.tfidf)
+        )
+        s_result = visualization.top_words_per_document(s_tfidf, n_words=2)
+
+        s_true = pd.Series(
+            [
+                ["soccer", "sports"],
+                ["violin", "orchestra"],
+                ["fun", "sports"],
+                ["guitar", "band"],
+            ],
+            name="Term",
+        )
+        pd.testing.assert_series_equal(s_result, s_true)
+
+    def 
test_top_words_per_topic(self): + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_cluster = ( + s_tfidf.pipe(representation.normalize) + .pipe(representation.pca, n_components=2, random_state=42) + .pipe(representation.kmeans, n_clusters=2, random_state=42) + ) + s_result = visualization.top_words_per_topic(s_tfidf, s_cluster, n_words=3) + s_true = pd.Series( + [["music", "violin", "orchestra"],["sports", "football", "soccer"]], + name="Term" + ) + pd.testing.assert_series_equal(s_result, s_true, check_names=False) diff --git a/texthero/visualization.py b/texthero/visualization.py index 3f2e71ac..0b886a80 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -647,9 +647,10 @@ def top_words_per_document(s_document_term, n_words=3): >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.top_words_per_document(s_tfidf, n_words=2) # doctest: +SKIP - Category - 0 [sports, football, soccer] - 1 [music, violin, orchestra] + 0 [soccer, sports] + 1 [violin, orchestra] + 2 [fun, sports] + 3 [guitar, band] Name: Term, dtype: object See Also @@ -666,7 +667,7 @@ def top_words_per_document(s_document_term, n_words=3): s_document_term, s_cluster.astype("category"), n_words=n_words ) - return s_top_words_per_document.reindex(s.index) + return s_top_words_per_document.reindex(s_document_term.index) """ From 3dfd5281e5c68105486b26a1e393fdb56d594113 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Tue, 25 Aug 2020 13:27:46 +0200 Subject: [PATCH 27/42] Add docstrings and function comments to new topic modelling functions (and helpers) --- tests/test_visualization.py | 8 +- texthero/representation.py | 38 +++-- texthero/visualization.py | 277 +++++++++++++++++++++++++----------- 3 files changed, 223 insertions(+), 100 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 68d028c6..3e52f3c9 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -108,7 +108,9 @@ def test_visualize_topics_clustering_for_second_input(self): ) with warnings.catch_warnings(): warnings.simplefilter("ignore") - self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_cluster)) + self.assertIsNotNone( + visualization.visualize_topics(s_tfidf, s_cluster, return_figure=True) + ) def test_visualize_topics_topic_modelling_for_second_input(self): @@ -132,4 +134,6 @@ def test_visualize_topics_topic_modelling_for_second_input(self): with warnings.catch_warnings(): warnings.simplefilter("ignore") - self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda)) + self.assertIsNotNone( + visualization.visualize_topics(s_tfidf, s_lda, return_figure=True) + ) diff --git a/texthero/representation.py b/texthero/representation.py index fb28b24a..a451320f 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -896,17 +896,16 @@ def truncatedSVD( s: Union[pd.Series, pd.DataFrame], n_components=2, n_iter=5, random_state=None, ) -> pd.Series: """ - Performs TruncatedSVD on the given pandas series. + Perform TruncatedSVD on the given pandas Series. 
- TruncatedSVD is an algorithmn, which can be used to reduce the dimensions + TruncatedSVD is an algorithmn which can be used to reduce the dimensions of a given series. In natural language processing, the high-dimensional data is usually a document-term matrix (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf` or some other first representation function that assigns a scalar (a weight) to each word). This is used as a tool to extract the most important topics and words - of a given Series. In this context it is refered to as latent semantic analysis (LSA), - or Latent Semantic Analysis (LSI) - + of a given Series. In this context it is also referred to as + Latent Semantic Analysis (LSA) or Latent Semantic Indexing (LSI). TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a DocumentTermDF, the advantage of sparseness is kept. @@ -917,7 +916,8 @@ def truncatedSVD( n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). - For LSA, a value of 100 is recommended + When using truncatedSVD for Topic Modelling, this needs to be + the number of topics. n_iter : int, optional (default: 5) Number of iterations for randomized SVD solver. @@ -947,7 +947,7 @@ def truncatedSVD( See also -------- - `truncatedSVD on Wikipedia ` + `truncatedSVD on Wikipedia `_ """ truncatedSVD = TruncatedSVD( @@ -989,10 +989,12 @@ def lda( Parameters ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + s : pd.Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. - Number of components to keep (in NLP context number of topics) + Number of components to keep (dimensionality of output vectors). + When using truncatedSVD for Topic Modelling, this needs to be + the number of topics. max_iter : int, optional (default: 10) The maximum number of iterations. @@ -1001,7 +1003,6 @@ def lda( Determines the random number generator. Pass an int for reproducible results across multiple function calls. - Returns ------- Pandas Series with the vector calculated by LDA for the document in every @@ -1022,7 +1023,7 @@ def lda( See also -------- - `LDA on Wikipedia pd.Series: # TODO: add types everywhere when they're merged """ Find the topics from a topic model. Input has @@ -1049,7 +1050,8 @@ def topics_from_topic_model(s_document_topic): - :meth:`texthero.representation.lda` - :meth:`texthero.representation.truncatedSVD`, so the output of one of Texthero's Topic Modelling - functions. + functions that returns a relation + between documents and topics. The function uses the given relation of documents to topics to calculate the @@ -1094,6 +1096,16 @@ def topics_from_topic_model(s_document_topic): document_topic_matrix = np.matrix(s_document_topic.tolist()) + # The document_topic_matrix relates documents to topics, + # so it shows for each document (so for each row), how + # strongly that document belongs to a topic. So + # document_topic_matrix[X][Y] = how strongly document X belongs to topic Y. + # We use argmax to find the index of the topic that a document + # belongs most strongly to for each document (so for each row). + # E.g. when the first row of the document_topic_matrix is + # [0.2, 0.1, 0.2, 0.5], then the first document will be put into + # topic / cluster 3 as the third entry (counting from 0) is + # the best matching topic. 
cluster_IDs = np.argmax(document_topic_matrix, axis=1).getA1() return pd.Series(cluster_IDs, index=s_document_topic.index, dtype="category") diff --git a/texthero/visualization.py b/texthero/visualization.py index 3f2e71ac..ad0e5cea 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -19,10 +19,9 @@ from sklearn.preprocessing import normalize as sklearn_normalize import pyLDAvis -from pyLDAvis import display as display_notebook -from pyLDAvis import show as display_browser from collections import Counter +from typing import Tuple def scatterplot( @@ -314,22 +313,60 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: def _get_matrices_for_visualize_topics( - s_document_term, s_document_topic, clustering_function_used + s_document_term: pd.DataFrame, + s_document_topic: pd.Series, + clustering_function_used: bool, ): + # TODO: add Hero types everywhere when they're merged + """ + Helper function for visualize_topics. Used to extract and + calculate the matrices that pyLDAvis needs. + + Recieves as first argument s_document_term, which is the output of + tfidf / count / term_frequency. From this, s_document_term.values + are the document_term_matrix in the code. + + Recieves as second argument s_document_topic, which is either + the output of a clustering function (so a categorical Series) + or the output of a topic modelling function (so a VectorSeries). + + In the first case (that's when clustering_function_used=True), + we create the document_topic_matrix + through the clusterIDs. So if document X is in cluster Y, + then document_topic_matrix[X][Y] = 1. + + For example, when + `s_document_topic = pd.Series([0, 2, 2, 1, 0, 1], dtype="category")`, + then the document_topic_matrix is + 1 0 0 + 0 0 1 + 0 0 1 + 0 1 0 + 1 0 0 + 0 1 0 + + So e.g. document zero is in cluster 0, so document_topic_matrix[0][0] = 1. + + In the second case (that's when lda or truncatedSVD were used), + their output is already the document_topic_matrix that relates + documents to topics. + + We then have in both cases the document_term_matrix and the document_topic_matrix. + pyLDAvis still needs the topic_term_matrix, which we get through + topic_term_matrix = document_term_matrix.T * document_topic_matrix. + """ if not clustering_function_used: # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. document_term_matrix = s_document_term.sparse.to_coo() document_topic_matrix = np.array(list(s_document_topic)) - # topic_term_matrix = vectorizer.components_ - topic_term_matrix = document_topic_matrix.T * document_term_matrix - else: # Here, s_document_topic is output of some hero clustering function. # First remove documents that are not assigned to any cluster. + # They have clusterID == -1. indexes_of_unassigned_documents = s_document_topic == -1 s_document_term = s_document_term[~indexes_of_unassigned_documents] s_document_topic = s_document_topic[~indexes_of_unassigned_documents] @@ -337,25 +374,54 @@ def _get_matrices_for_visualize_topics( document_term_matrix = s_document_term.sparse.to_coo() - # Construct document_topic_matrix - n_rows = len(s_document_topic.index) - n_cols = len(s_document_topic.values.categories) - - data = [1 for _ in range(n_rows)] - rows = range(n_rows) + # Construct document_topic_matrix from the cluster category Series + # as described in the docstring. 
+ n_rows = len(s_document_topic.index) # n_rows = number of documents + n_cols = len(s_document_topic.values.categories) # n_cols = number of clusters + + # Will get binary matrix: + # document_topic_matrix[X][Y] = 1 <=> document X is in cluster Y. + # We construct this matrix sparsely in CSR format + # -> need the data (will only insert 1s, nothing else), + # the rows (so in which rows we want to insert, which is all of them + # as every document belongs to a cluster), + # and we need the columns (so in which cluster we want to insert, + # which is exactly the clusterID values). + data = [1 for _ in range(n_rows)] # Will insert one 1 per row. + rows = range(n_rows) # rows are just [0, 1, ..., n_rows] columns = s_document_topic.values + # Construct the sparse matrix. document_topic_matrix = csr_matrix( (data, (rows, columns)), shape=(n_rows, n_cols) ) - topic_term_matrix = document_topic_matrix.T * document_term_matrix + topic_term_matrix = document_topic_matrix.T * document_term_matrix return s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix -def _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix): +def _prepare_matrices_for_pyLDAvis( + document_topic_matrix: np.matrix, topic_term_matrix: np.matrix +): + # TODO: add types everywhere when they're merged + """ + Helper function for visualize_topics. Used to prepare the + document_topic_matrix and the topic_term_matrix for pyLDAvis. + + First normalizes both matrices to get the + document_topic_distributions and topic_term_distributions matrix. + For example, the first row of document_topic_distributions + has the probabilities of document zero to belong to the + different topics (so every row sums up to 1 (this is later + checked by pyLDAvis)). + So document_topic_matrix[i][j] = proportion of document i + that belongs to topic j. + + Then densify the (potentially) sparse matrices for pyLDAvis. + """ + # Get distributions through normalization. document_topic_distributions = sklearn_normalize( document_topic_matrix, norm="l1", axis=1 ) @@ -363,6 +429,7 @@ def _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix): topic_term_distributions = sklearn_normalize(topic_term_matrix, norm="l1", axis=1) # Make sparse matrices dense for pyLDAvis + if issparse(document_topic_distributions): document_topic_distributions = document_topic_distributions.toarray().tolist() else: @@ -376,7 +443,12 @@ def _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix): return document_topic_distributions, topic_term_distributions -def visualize_topics(s_document_term, s_document_topic): +def visualize_topics( + s_document_term: pd.DataFrame, + s_document_topic: pd.Series, + show_in_new_window=False, + return_figure=False, +): # TODO: add types everywhere when they're merged """ Visualize the topics of your dataset. 
First input has @@ -413,31 +485,37 @@ def visualize_topics(s_document_term, s_document_topic): **To show the plot**: - - Interactively in a Jupyter Notebook: do `hero.display_notebook(hero.visualize_topics(...))` - - In a new browser window: do `hero.display_browser(hero.visualize_topics(...))` + - Interactively in a Jupyter Notebook: set show_in_new_window to False + - In a new browser window: set show_in_new_window to True Parameters ---------- s_document_term: pd.DataFrame - - One of - - :meth:`texthero.representation.tfidf` - - :meth:`texthero.representation.count` - - :meth:`texthero.representation.term_frequency` + One of + :meth:`texthero.representation.tfidf` + :meth:`texthero.representation.count` + :meth:`texthero.representation.term_frequency` s_document_topic: pd.Series - - One of - - :meth:`texthero.representation.kmeans` - - :meth:`texthero.representation.meanshift` - - :meth:`texthero.representation.dbscan` - (using clustering functkmeansions, documents - that are not assigned to a cluster are - not considered in the visualization) - or one of - - :meth:`texthero.representation.lda` - - :meth:`texthero.representation.truncatedSVD` - + One of + :meth:`texthero.representation.kmeans` + :meth:`texthero.representation.meanshift` + :meth:`texthero.representation.dbscan` + (using clustering functkmeansions, documents + that are not assigned to a cluster are + not considered in the visualization) + or one of + :meth:`texthero.representation.lda` + :meth:`texthero.representation.truncatedSVD` + + show_in_new_window: bool, default to True + Whether to open a new browser window or + show the visualization inline (only + supported in Jupyter Notebooks). + + return_figure: bool, default False + Whether to return the figure + instead of visualizing it. Examples -------- @@ -449,9 +527,9 @@ def visualize_topics(s_document_term, s_document_topic): >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) >>> # Display in a new browser window: - >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=True) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.display_notebook(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=False) # doctest: +SKIP Using LDA: @@ -461,9 +539,9 @@ def visualize_topics(s_document_term, s_document_topic): >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_lda = s_tfidf.pipe(hero.lda, n_components=5) >>> # Display in a new browser window: - >>> hero.display_browser(hero.visualize_topics(s_tfidf, s_lda)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_lda, show_in_new_window=True) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.display_notebook(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_lda, show_in_new_window=False) # doctest: +SKIP See Also @@ -473,9 +551,12 @@ def visualize_topics(s_document_term, s_document_topic): TODO add tutorial link """ + # Bool to note whether a clustering function or topic modelling + # functions was used for s_document_topic. 
clustering_function_used = s_document_topic.dtype.name == "category" # Get / build matrices from input + # (see helper function docstring for explanation) ( s_document_term, s_document_topic, @@ -489,11 +570,14 @@ def visualize_topics(s_document_term, s_document_topic): doc_lengths = list(s_document_term.sum(axis=1)) term_frequency = list(s_document_term.sum(axis=0)) + # Prepare matrices for input to pyLDAvis + # (see helper function docstring for explanation) ( document_topic_distributions, topic_term_distributions, ) = _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix) + # Create pyLDAvis visualization. figure = pyLDAvis.prepare( **{ "vocab": vocab, @@ -506,10 +590,21 @@ def visualize_topics(s_document_term, s_document_topic): } ) - return figure + if return_figure: + return figure + else: + # Different pyLDAvis functions + # for showing in new window and + # showing inside a notebook. + if show_in_new_window: + pyLDAvis.show(figure) + else: + pyLDAvis.display(figure) -def top_words_per_topic(s_document_term, s_clusters, n_words=5): +def top_words_per_topic( + s_document_term: pd.DataFrame, s_clusters: pd.Series, n_words=5 +): # TODO: add types everywhere when they're merged """ Find the top words per topic of your dataset. First input has @@ -540,24 +635,28 @@ def top_words_per_topic(s_document_term, s_clusters, n_words=5): Parameters ---------- s_document_term: pd.DataFrame - - One of - - :meth:`texthero.representation.tfidf` - - :meth:`texthero.representation.count` - - :meth:`texthero.representation.term_frequency` + One of + :meth:`texthero.representation.tfidf` + :meth:`texthero.representation.count` + :meth:`texthero.representation.term_frequency` s_clusters: pd.Series - - One of - - :meth:`texthero.representation.kmeans` - - :meth:`texthero.representation.meanshift` - - :meth:`texthero.representation.dbscan` - - :meth:`texthero.representation.topics_from_topic_model` + One of + :meth:`texthero.representation.kmeans` + :meth:`texthero.representation.meanshift` + :meth:`texthero.representation.dbscan` + :meth:`texthero.representation.topics_from_topic_model` n_words: int, default to 5 Number of top words per topic, should be <= 30. + Returns + ------- + Series with the topic IDs as index and + a list of n_words relevant words per + topic as values. + Examples -------- Using Clustering: @@ -568,20 +667,22 @@ def top_words_per_topic(s_document_term, s_clusters, n_words=5): >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) >>> hero.top_words_per_topic(s_tfidf, s_cluster) # doctest: +SKIP - Category 0 [sports, football, soccer] 1 [music, violin, orchestra] - Name: Term, dtype: object + dtype: object See Also -------- `pyLDAvis `_ + and their methodology on how to find relevant terms. 
TODO add tutorial link """ - pyLDAvis_result = visualize_topics(s_document_term, s_clusters).to_dict() + pyLDAvis_result = visualize_topics( + s_document_term, s_clusters, return_figure=True + ).to_dict() df_topics_and_their_top_words = pd.DataFrame(pyLDAvis_result["tinfo"]) @@ -609,13 +710,16 @@ def top_words_per_topic(s_document_term, s_clusters, n_words=5): s_topics_with_top_words = s_topics_with_top_words.apply(lambda x: x[:n_words]) + # Remove series name "Term" from pyLDAvis + s_topics_with_top_words = s_topics_with_top_words.rename(None) + return s_topics_with_top_words -def top_words_per_document(s_document_term, n_words=3): +def top_words_per_document(s_document_term: pd.DataFrame, n_words=3): # TODO: add types everywhere when they're merged """ - Find the top words per topic of your dataset. First input has + Find the top words per document of your dataset. First input has to be output of one of - :meth:`texthero.representation.tfidf` - :meth:`texthero.representation.count` @@ -623,58 +727,61 @@ def top_words_per_document(s_document_term, n_words=3): (tfidf suggested). - TODO + The function assigns every document + to its own cluster (or "topic") and then uses + :meth:`top_words_per_topic` to find + the top words for every document. Parameters ---------- s_document_term: pd.DataFrame - - One of - - :meth:`texthero.representation.tfidf` - - :meth:`texthero.representation.count` - - :meth:`texthero.representation.term_frequency` + One of + :meth:`texthero.representation.tfidf` + :meth:`texthero.representation.count` + :meth:`texthero.representation.term_frequency` n_words: int, default to 3 - Number of top words per topic, should + Number of words to fetch per topic, should be <= 30. + Returns + ------- + Series with the document IDs as index and + a list of n_words relevant words per + document as values. + Examples -------- - Using Clustering: - >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.top_words_per_document(s_tfidf, n_words=2) # doctest: +SKIP - Category - 0 [sports, football, soccer] - 1 [music, violin, orchestra] - Name: Term, dtype: object + 0 [soccer, sports] + 1 [violin, orchestra] + 2 [fun, sports] + 3 [guitar, band] + dtype: object + >>> # We can see that the function tries to + >>> # find terms that distinguish the documents, + >>> # so "music" is not chosen for documents + >>> # 1 and 3 as it's found in both. See Also -------- - `pyLDAvis `_ + :meth:`top_words_per_topic` TODO add tutorial link """ + # Create a categorical Series that has + # one new cluster for every document. 
+ s_cluster = pd.Series(s_document_term.index.tolist(), dtype="category") - s_cluster = pd.Series(s_document_term.index.tolist()) - + # Call top_words_per_topic with the new cluster series + # (so every document is one distinct "topic") s_top_words_per_document = top_words_per_topic( - s_document_term, s_cluster.astype("category"), n_words=n_words + s_document_term, s_cluster, n_words=n_words ) - return s_top_words_per_document.reindex(s.index) - - -""" -TODO - -- tests for top_words_per_document, top_words_per_topic, topics_from_topic_model - -> try second one also with error when category == -1 somewhere - -- docstrings of all functions (also private helpers) + comments - -""" + return s_top_words_per_document.reindex(s_document_term.index) From 79fc37e1dc6c1df588af4e9f395d74c7802f7c97 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 25 Aug 2020 13:57:31 +0200 Subject: [PATCH 28/42] fixed index and test finsish PR Co-authored-by: Henri Froese --- tests/test_indexes.py | 34 +++++++++++++++++++++++++++++++--- tests/test_visualization.py | 4 +--- texthero/visualization.py | 4 +++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index ceafad73..2984616b 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -12,6 +12,13 @@ s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6]) s_numeric = pd.Series([5.0], index=[5]) s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6]) +df_document_term = pd.DataFrame( + [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + index=[5, 6], + columns=pd.MultiIndex.from_product([["test"], ["!", ".", "?", "TEST", "Test"]]), + dtype="Sparse", +) + # Define all test cases. Every test case is a list # of [name of test case, function to test, tuple of valid input for the function]. 
@@ -67,9 +74,20 @@ ["kmeans", representation.kmeans, (s_numeric_lists, 1)], ["dbscan", representation.dbscan, (s_numeric_lists,)], ["meanshift", representation.meanshift, (s_numeric_lists,)], + [ + "topics_from_topic_model", + representation.topics_from_topic_model, + (s_numeric_lists,), + ], ] -test_cases_visualization = [] +test_cases_visualization = [ + [ + "top_words_per_document", + visualization.top_words_per_document, + (df_document_term,), + ], +] test_cases = ( test_cases_nlp @@ -96,12 +114,22 @@ class AbstractIndexTest(PandasTestCase): def test_correct_index(self, name, test_function, valid_input): s = valid_input[0] result_s = test_function(*valid_input) - t_same_index = pd.Series(s.values, s.index) + + if isinstance(s, pd.Series): + t_same_index = pd.Series(s.values, s.index) + else: + t_same_index = pd.DataFrame(s.values, s.index) + self.assertTrue(result_s.index.equals(t_same_index.index)) @parameterized.expand(test_cases) def test_incorrect_index(self, name, test_function, valid_input): s = valid_input[0] result_s = test_function(*valid_input) - t_different_index = pd.Series(s.values, index=None) + + if isinstance(s, pd.Series): + t_different_index = pd.Series(s.values, index=None) + else: + t_different_index = pd.DataFrame(s.values, index=None) + self.assertFalse(result_s.index.equals(t_different_index.index)) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index ae3b114b..34383d8f 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -162,7 +162,6 @@ def test_top_words_per_document(self): ["fun", "sports"], ["guitar", "band"], ], - name="Term", ) pd.testing.assert_series_equal(s_result, s_true) @@ -187,7 +186,6 @@ def test_top_words_per_topic(self): ) s_result = visualization.top_words_per_topic(s_tfidf, s_cluster, n_words=3) s_true = pd.Series( - [["music", "violin", "orchestra"],["sports", "football", "soccer"]], - name="Term" + [["music", "violin", "orchestra"], ["sports", "football", "soccer"]], ) pd.testing.assert_series_equal(s_result, s_true, check_names=False) diff --git a/texthero/visualization.py b/texthero/visualization.py index ad0e5cea..e909aa08 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -776,7 +776,9 @@ def top_words_per_document(s_document_term: pd.DataFrame, n_words=3): """ # Create a categorical Series that has # one new cluster for every document. 
- s_cluster = pd.Series(s_document_term.index.tolist(), dtype="category") + s_cluster = pd.Series( + np.arange(len(s_document_term)), index=s_document_term.index, dtype="category" + ) # Call top_words_per_topic with the new cluster series # (so every document is one distinct "topic") From 2db883d19c92d205fc7556e8428dadc0e12025b6 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Tue, 25 Aug 2020 17:09:09 +0200 Subject: [PATCH 29/42] - Fix display options - Clean up some docstrings --- tests/test_representation.py | 4 ++-- tests/test_visualization.py | 6 ++---- texthero/representation.py | 6 +++--- texthero/visualization.py | 40 ++++++++++-------------------------- 4 files changed, 18 insertions(+), 38 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index ea2390ee..b4d48e3d 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -236,7 +236,7 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): """ @parameterized.expand(test_cases_dim_reduction_and_clustering) - def test_dim_reduction_and_clustering_with_vector_series_input( + def test_dim_reduction_and_clustering_and_topic_modelling_with_vector_series_input( self, name, test_function, correct_output ): s_true = correct_output @@ -264,7 +264,7 @@ def test_dim_reduction_and_clustering_with_vector_series_input( ) @parameterized.expand(test_cases_dim_reduction_and_clustering) - def test_dim_reduction_and_clustering_with_documenttermDF_input( + def test_dim_reduction_and_clustering_and_topic_modelling_with_documenttermDF_input( self, name, test_function, correct_output ): s_true = correct_output diff --git a/tests/test_visualization.py b/tests/test_visualization.py index ae3b114b..638b5d80 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -109,7 +109,7 @@ def test_visualize_topics_clustering_for_second_input(self): with warnings.catch_warnings(): warnings.simplefilter("ignore") self.assertIsNotNone( - visualization.visualize_topics(s_tfidf, s_cluster, return_figure=True) + visualization.visualize_topics(s_tfidf, s_cluster) ) def test_visualize_topics_topic_modelling_for_second_input(self): @@ -135,7 +135,7 @@ def test_visualize_topics_topic_modelling_for_second_input(self): with warnings.catch_warnings(): warnings.simplefilter("ignore") self.assertIsNotNone( - visualization.visualize_topics(s_tfidf, s_lda, return_figure=True) + visualization.visualize_topics(s_tfidf, s_lda) ) def test_top_words_per_document(self): @@ -162,7 +162,6 @@ def test_top_words_per_document(self): ["fun", "sports"], ["guitar", "band"], ], - name="Term", ) pd.testing.assert_series_equal(s_result, s_true) @@ -188,6 +187,5 @@ def test_top_words_per_topic(self): s_result = visualization.top_words_per_topic(s_tfidf, s_cluster, n_words=3) s_true = pd.Series( [["music", "violin", "orchestra"],["sports", "football", "soccer"]], - name="Term" ) pd.testing.assert_series_equal(s_result, s_true, check_names=False) diff --git a/texthero/representation.py b/texthero/representation.py index a451320f..83624436 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -977,14 +977,14 @@ def lda( """ Performs Latent Dirichlet Allocation on the given pandas series. - Latent Dirichlet Allocation(LDA) is a topic modeling algorithm + Latent Dirichlet Allocation (LDA) is a topic modeling algorithm based on Dirichlet distribution. 
In natural language processing LDA is often used to categorise documents into diffenrent topics and generate top words from these topics. In this process LDA is - used in combination with algorithms, which generate document-term- + used in combination with algorithms which generate document-term- matrixes, like :meth:`count` or :meth:`tfidf` - TruncatedSVD can directly handle sparse input, so when calling truncatedSVD on a + LDA can directly handle sparse input, so when calling truncatedSVD on a DocumentTermDF, the advantage of sparseness is kept. Parameters diff --git a/texthero/visualization.py b/texthero/visualization.py index ad0e5cea..96da42c2 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -19,6 +19,8 @@ from sklearn.preprocessing import normalize as sklearn_normalize import pyLDAvis +from pyLDAvis import display as notebook_display +from pyLDAvis import show as browser_display from collections import Counter from typing import Tuple @@ -324,7 +326,7 @@ def _get_matrices_for_visualize_topics( Recieves as first argument s_document_term, which is the output of tfidf / count / term_frequency. From this, s_document_term.values - are the document_term_matrix in the code. + is the document_term_matrix in the code. Recieves as second argument s_document_topic, which is either the output of a clustering function (so a categorical Series) @@ -444,10 +446,7 @@ def _prepare_matrices_for_pyLDAvis( def visualize_topics( - s_document_term: pd.DataFrame, - s_document_topic: pd.Series, - show_in_new_window=False, - return_figure=False, + s_document_term: pd.DataFrame, s_document_topic: pd.Series, return_figure=False, ): # TODO: add types everywhere when they're merged """ @@ -485,8 +484,8 @@ def visualize_topics( **To show the plot**: - - Interactively in a Jupyter Notebook: set show_in_new_window to False - - In a new browser window: set show_in_new_window to True + - Interactively in a Jupyter Notebook: use `hero.notebook_display(hero.visualize_topics(...))` + - In a new browser window: `hero.browser_display(hero.visualize_topics(...))` Parameters ---------- @@ -508,15 +507,6 @@ def visualize_topics( :meth:`texthero.representation.lda` :meth:`texthero.representation.truncatedSVD` - show_in_new_window: bool, default to True - Whether to open a new browser window or - show the visualization inline (only - supported in Jupyter Notebooks). - - return_figure: bool, default False - Whether to return the figure - instead of visualizing it. - Examples -------- Using Clustering: @@ -527,9 +517,9 @@ def visualize_topics( >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) >>> # Display in a new browser window: - >>> hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=True) # doctest: +SKIP + >>> hero.browser_display(hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=True)) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=False) # doctest: +SKIP + >>> hero.notebook_display(hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=False)) # doctest: +SKIP Using LDA: @@ -590,16 +580,7 @@ def visualize_topics( } ) - if return_figure: - return figure - else: - # Different pyLDAvis functions - # for showing in new window and - # showing inside a notebook. 
- if show_in_new_window: - pyLDAvis.show(figure) - else: - pyLDAvis.display(figure) + return figure def top_words_per_topic( @@ -619,7 +600,8 @@ def top_words_per_topic( clustering, so output of one of - :meth:`texthero.representation.kmeans` - :meth:`texthero.representation.meanshift` - - :meth:`texthero.representation.dbscan`. + - :meth:`texthero.representation.dbscan` + - :meth:`texthero.representation.topics_from_topic_model` The function uses the given clustering from the second input, which relates From 72a773604fb9cc06832cac7ab138e8aa966f1553 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Tue, 25 Aug 2020 17:28:41 +0200 Subject: [PATCH 30/42] remove return_figure parameter --- texthero/visualization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 944f6043..adb91003 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -446,7 +446,7 @@ def _prepare_matrices_for_pyLDAvis( def visualize_topics( - s_document_term: pd.DataFrame, s_document_topic: pd.Series, return_figure=False, + s_document_term: pd.DataFrame, s_document_topic: pd.Series ): # TODO: add types everywhere when they're merged """ From ebc4171c25d298ff6e34ad1689130e8c4507e64e Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Tue, 25 Aug 2020 18:41:25 +0200 Subject: [PATCH 31/42] Fix errors and bugs. --- tests/test_visualization.py | 8 ++----- texthero/visualization.py | 45 +++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 20b18208..4458c276 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -108,9 +108,7 @@ def test_visualize_topics_clustering_for_second_input(self): ) with warnings.catch_warnings(): warnings.simplefilter("ignore") - self.assertIsNotNone( - visualization.visualize_topics(s_tfidf, s_cluster) - ) + self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_cluster)) def test_visualize_topics_topic_modelling_for_second_input(self): @@ -134,9 +132,7 @@ def test_visualize_topics_topic_modelling_for_second_input(self): with warnings.catch_warnings(): warnings.simplefilter("ignore") - self.assertIsNotNone( - visualization.visualize_topics(s_tfidf, s_lda) - ) + self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda)) def test_top_words_per_document(self): s = pd.Series( diff --git a/texthero/visualization.py b/texthero/visualization.py index adb91003..7c8a7744 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -275,7 +275,7 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: Tokenization: split by space and remove all punctuations that are not between characters. - Parameters + Parameters ---------- normalize : optional, default to False. When set to true, return normalized values. @@ -423,7 +423,7 @@ def _prepare_matrices_for_pyLDAvis( Then densify the (potentially) sparse matrices for pyLDAvis. """ - # Get distributions through normalization. 
+ # Get distributions through normalization document_topic_distributions = sklearn_normalize( document_topic_matrix, norm="l1", axis=1 ) @@ -445,9 +445,7 @@ def _prepare_matrices_for_pyLDAvis( return document_topic_distributions, topic_term_distributions -def visualize_topics( - s_document_term: pd.DataFrame, s_document_topic: pd.Series -): +def visualize_topics(s_document_term: pd.DataFrame, s_document_topic: pd.Series): # TODO: add types everywhere when they're merged """ Visualize the topics of your dataset. First input has @@ -464,12 +462,7 @@ def visualize_topics( - :meth:`texthero.representation.meanshift` - :meth:`texthero.representation.dbscan` - or the result of a topic modelling function, so - one of - - :meth:`texthero.representation.lda` - - :meth:`texthero.representation.truncatedSVD` - - (topic modelling output suggested). + or the result of :meth:`texthero.representation.lda`. The function uses the given clustering or topic modelling from the second input, which relates @@ -517,9 +510,9 @@ def visualize_topics( >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) >>> # Display in a new browser window: - >>> hero.browser_display(hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=True)) # doctest: +SKIP + >>> hero.browser_display(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.notebook_display(hero.visualize_topics(s_tfidf, s_cluster, show_in_new_window=False)) # doctest: +SKIP + >>> hero.notebook_display(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP Using LDA: @@ -527,11 +520,11 @@ def visualize_topics( >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) - >>> s_lda = s_tfidf.pipe(hero.lda, n_components=5) + >>> s_lda = s_tfidf.pipe(hero.lda, n_components=2) >>> # Display in a new browser window: - >>> hero.visualize_topics(s_tfidf, s_lda, show_in_new_window=True) # doctest: +SKIP + >>> hero.browser_display(hero.visualize_topics(s_tfidf, s_lda)) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.visualize_topics(s_tfidf, s_lda, show_in_new_window=False) # doctest: +SKIP + >>> hero.notebook_display(hero.visualize_topics(s_tfidf, s_lda)) # doctest: +SKIP See Also @@ -662,9 +655,7 @@ def top_words_per_topic( """ - pyLDAvis_result = visualize_topics( - s_document_term, s_clusters, return_figure=True - ).to_dict() + pyLDAvis_result = visualize_topics(s_document_term, s_clusters).to_dict() df_topics_and_their_top_words = pd.DataFrame(pyLDAvis_result["tinfo"]) @@ -769,3 +760,19 @@ def top_words_per_document(s_document_term: pd.DataFrame, n_words=3): ) return s_top_words_per_document.reindex(s_document_term.index) + + +""" +import texthero as hero +import pandas as pd +df = pd.read_csv( + "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv" +) + + +s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df = 0.5, min_df = 100) +s_lda = s_tfidf.pipe(hero.truncatedSVD, n_components=5) +# a, b = hero.visualize_topics(s_tfidf, s_lda) +hero.browser_display(hero.visualize_topics(s_tfidf, s_lda)) + +""" From 9d85c1487275f4cba74659bf95a7e39bfc7e0fdc Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Tue, 25 Aug 2020 18:43:01 +0200 Subject: 
[PATCH 32/42] remove test-docstring at the end --- texthero/visualization.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 7c8a7744..109d739c 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -760,19 +760,3 @@ def top_words_per_document(s_document_term: pd.DataFrame, n_words=3): ) return s_top_words_per_document.reindex(s_document_term.index) - - -""" -import texthero as hero -import pandas as pd -df = pd.read_csv( - "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv" -) - - -s_tfidf = df["text"].pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf, max_df = 0.5, min_df = 100) -s_lda = s_tfidf.pipe(hero.truncatedSVD, n_components=5) -# a, b = hero.visualize_topics(s_tfidf, s_lda) -hero.browser_display(hero.visualize_topics(s_tfidf, s_lda)) - -""" From 64edbaf3a752848584d21fe1604d5818375cc2d7 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sat, 29 Aug 2020 18:27:17 +0200 Subject: [PATCH 33/42] Start implementing discussed changes Co-authored-by: Maximilian Krahn --- texthero/visualization.py | 198 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/texthero/visualization.py b/texthero/visualization.py index 109d739c..8f0549a5 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -760,3 +760,201 @@ def top_words_per_document(s_document_term: pd.DataFrame, n_words=3): ) return s_top_words_per_document.reindex(s_document_term.index) + + +# NEW PIPELINE: STEP 1 +def topic_matrices( + s_document_term: pd.DataFrame, + s_document_topic: pd.Series, +): + # TODO: add Hero types everywhere when they're merged + # FIXME: new docstring ~ DocumentTerm & DocumentTopic -> DocumentTopic & TopicTerm. + """ + + Helper function for visualize_topics. Used to extract and + calculate the matrices that pyLDAvis needs. + + Recieves as first argument s_document_term, which is the output of + tfidf / count / term_frequency. From this, s_document_term.values + is the document_term_matrix in the code. + + Recieves as second argument s_document_topic, which is either + the output of a clustering function (so a categorical Series) + or the output of a topic modelling function (so a VectorSeries). + + In the first case (that's when clustering_function_used=True), + we create the document_topic_matrix + through the clusterIDs. So if document X is in cluster Y, + then document_topic_matrix[X][Y] = 1. + + For example, when + `s_document_topic = pd.Series([0, 2, 2, 1, 0, 1], dtype="category")`, + then the document_topic_matrix is + 1 0 0 + 0 0 1 + 0 0 1 + 0 1 0 + 1 0 0 + 0 1 0 + + So e.g. document zero is in cluster 0, so document_topic_matrix[0][0] = 1. + + In the second case (that's when lda or truncatedSVD were used), + their output is already the document_topic_matrix that relates + documents to topics. + + We then have in both cases the document_term_matrix and the document_topic_matrix. + pyLDAvis still needs the topic_term_matrix, which we get through + topic_term_matrix = document_term_matrix.T * document_topic_matrix. + + + Docuement Topic Matrix Topic Term Matrix + 1 2 3 1 2 3 + 0 1 2 3 0 1 2 3 + 1 4 5 6 , 1 4 5 6 + + + """ + # Bool to note whether a clustering function or topic modelling + # functions was used for s_document_topic. + clustering_function_used = s_document_topic.dtype.name == "category" + + if not clustering_function_used: + # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. 
+ + document_term_matrix = s_document_term.sparse.to_coo() + document_topic_matrix = np.array(list(s_document_topic)) + n_topics = len(document_topic_matrix[0]) + + else: + # Here, s_document_topic is output of some hero clustering function. + + # First remove documents that are not assigned to any cluster. + # They have clusterID == -1. + indexes_of_unassigned_documents = s_document_topic == -1 + s_document_term = s_document_term[~indexes_of_unassigned_documents] + s_document_topic = s_document_topic[~indexes_of_unassigned_documents] + s_document_topic = s_document_topic.cat.remove_unused_categories() + + document_term_matrix = s_document_term.sparse.to_coo() + + # Construct document_topic_matrix from the cluster category Series + # as described in the docstring. + n_rows = len(s_document_topic.index) # n_rows = number of documents + # n_cols = number of clusters + n_topics = n_cols = len(s_document_topic.values.categories) + + # Will get binary matrix: + # document_topic_matrix[X][Y] = 1 <=> document X is in cluster Y. + # We construct this matrix sparsely in CSR format + # -> need the data (will only insert 1s, nothing else), + # the rows (so in which rows we want to insert, which is all of them + # as every document belongs to a cluster), + # and we need the columns (so in which cluster we want to insert, + # which is exactly the clusterID values). + data = [1 for _ in range(n_rows)] # Will insert one 1 per row. + rows = range(n_rows) # rows are just [0, 1, ..., n_rows] + columns = s_document_topic.values + + # Construct the sparse matrix. + document_topic_matrix = csr_matrix( + (data, (rows, columns)), shape=(n_rows, n_cols) + ) + + topic_term_matrix = document_topic_matrix.T * document_term_matrix + + # Create s_document_topic and s_topic_term (both multiindexed) + + # Create s_document_topic + s_document_topic_columns = pd.MultiIndex.from_product( + [["Document Topic Matrix"], range(n_topics)] + ) + if isinstance(document_topic_matrix, csr_matrix): + s_document_topic = pd.DataFrame.sparse.from_spmatrix( + document_topic_matrix, + columns=s_document_topic_columns, + index=s_document_term.index + ) + + else: + s_document_topic = pd.DataFrame( + document_topic_matrix, + columns=s_document_topic_columns, + index=s_document_term.index + ) + + # Create s_topic_term + s_topic_term_columns = pd.MultiIndex.from_product( + [["Topic Term Matrix"], s_document_term.columns.tolist()] + ) + if isinstance(topic_term_matrix, csr_matrix): + s_topic_term = pd.DataFrame.sparse.from_spmatrix( + topic_term_matrix, + columns=s_topic_term_columns + ) + + else: + s_topic_term = pd.DataFrame( + topic_term_matrix, + columns=s_topic_term_columns + ) + + return s_document_topic, s_topic_term + +# New Pipeline: Step 2 +# Users just need to l1-normalize + + +# New Pipeline: Step 3 +def relevant_terms_per_topic( + s_document_term, + s_document_topic_distribution, + s_topic_term_distribution, + return_figure=False +): + """ + Use LDAvis to get topics & relevant terms. + """ + + # Define parameters for pyLDAvis. + vocab = s_document_term.columns.levels[1].tolist() + doc_lengths = list(s_document_term.sum(axis=1)) + term_frequency = list(s_document_term.sum(axis=0)) + + doc_topic_dists = s_document_topic_distribution.values.tolist() + topic_term_dists = s_topic_term_distribution.values.tolist() + + # Create pyLDAvis visualization. 
+ figure = pyLDAvis.prepare( + **{ + "vocab": vocab, + "doc_lengths": doc_lengths, + "term_frequency": term_frequency, + "doc_topic_dists": doc_topic_dists, + "topic_term_dists": topic_term_dists, + "R": 15, + "sort_topics": False, + } + ) + + # TODO Extract relevant info etc. from figure + if return_figure: + return figure + +""" +Visualize_Topics: + Step 1: v/ + Step 2: v/ + Step 3: -> currently doing that to already return relevant_words_per_topic + + -> Wrapper calls 1-3 and 3 w/ return_figure=True and plots that + +Top_Words_per_Topic: + like above + +Top_Words_per_Document: + have a look at that + +topics_from_topic_model: + have a look at that +""" From 69af26b5540b9139fbe30ef288e60af4ef789e34 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 30 Aug 2020 18:20:53 +0200 Subject: [PATCH 34/42] Finish implementing the suggested changes. Co-authored-by: Maximilian Krahn --- .travis.yml | 2 +- setup.cfg | 2 +- tests/test_indexes.py | 7 +- tests/test_representation.py | 168 +++++++++ tests/test_visualization.py | 60 +--- texthero/representation.py | 493 +++++++++++++++++++++++++- texthero/visualization.py | 654 ++++------------------------------- 7 files changed, 746 insertions(+), 640 deletions(-) diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index 291b198a..df282b8d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black>=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 diff --git a/tests/test_indexes.py b/tests/test_indexes.py index 2984616b..a55af203 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -79,16 +79,15 @@ representation.topics_from_topic_model, (s_numeric_lists,), ], -] - -test_cases_visualization = [ [ "top_words_per_document", - visualization.top_words_per_document, + representation.relevant_words_per_document, (df_document_term,), ], ] +test_cases_visualization = [] + test_cases = ( test_cases_nlp + test_cases_preprocessing diff --git a/tests/test_representation.py b/tests/test_representation.py index b4d48e3d..c4b55e54 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -304,3 +304,171 @@ def test_normalize_documenttermDF_also_as_output(self): pd.testing.assert_frame_equal( result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, ) + + """ + Test Topic Modelling (not all are suitable for parameterization). + `topics_from_topic_model, lda, truncatedSVD` already tested above. 
+ + Here, we test + `relevant_words_per_document, relevant_words_per_topic, topic_matrices` + """ + + def test_relevant_words_per_document(self): + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) + + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_result = representation.relevant_words_per_document(s_tfidf, n_words=2) + + s_true = pd.Series( + [ + ["soccer", "sports"], + ["violin", "orchestra"], + ["fun", "sports"], + ["guitar", "band"], + ], + ) + pd.testing.assert_series_equal(s_result, s_true) + + def test_relevant_words_per_topic(self): + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_cluster = ( + s_tfidf.pipe(representation.normalize) + .pipe(representation.pca, n_components=2, random_state=42) + .pipe(representation.kmeans, n_clusters=2, random_state=42) + ) + + s_document_topic, s_topic_term = representation.topic_matrices( + s_tfidf, s_cluster + ) + s_document_topic_distribution = representation.normalize( + s_document_topic, norm="l1" + ) + s_topic_term_distribution = representation.normalize(s_topic_term, norm="l1") + + s_result = representation.relevant_words_per_topic( + s_tfidf, s_document_topic_distribution, s_topic_term_distribution, n_words=3 + ) + s_true = pd.Series( + [["music", "violin", "orchestra"], ["sports", "football", "soccer"]], + ) + pd.testing.assert_series_equal(s_result, s_true, check_names=False) + + def test_topic_matrices_clustering_for_second_input(self): + + s = pd.Series(["Football", "Music", "Football", "Music",]) + + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_cluster = ( + s_tfidf.pipe(representation.normalize) + .pipe(representation.pca, n_components=2, random_state=42) + .pipe(representation.kmeans, n_clusters=2, random_state=42) + ) + + s_document_topic_result, s_topic_term_result = representation.topic_matrices( + s_tfidf, s_cluster + ) + + s_document_topic_true = pd.DataFrame( + [[0, 1], [1, 0], [0, 1], [1, 0]], + columns=pd.MultiIndex.from_tuples( + [("Document Topic Matrix", 0), ("Document Topic Matrix", 1)] + ), + ) + + s_topic_term_true = pd.DataFrame( + [[0.0, 3.021651], [3.021651, 0.0]], + columns=pd.MultiIndex.from_tuples( + [("Topic Term Matrix", "football"), ("Topic Term Matrix", "music")] + ), + ) + + pd.testing.assert_frame_equal( + s_document_topic_result, + s_document_topic_true, + check_less_precise=True, + check_dtype=False, + ) + + pd.testing.assert_frame_equal( + s_topic_term_result, + s_topic_term_true, + check_less_precise=True, + check_dtype=False, + ) + + def test_visualize_topics_topic_modelling_for_second_input(self): + + s = pd.Series(["Football", "Music", "Football", "Music",]) + + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_lda = s_tfidf.pipe(representation.normalize).pipe( + representation.lda, n_components=2, random_state=42 + ) + + s_document_topic_result, s_topic_term_result = representation.topic_matrices( + s_tfidf, s_lda + ) + + s_document_topic_true = pd.DataFrame( + [ + [0.744417, 0.255583], + [0.255583, 0.744417], + [0.744417, 0.255583], + [0.255583, 0.744417], + ], + columns=pd.MultiIndex.from_tuples( + [("Document Topic 
Matrix", 0), ("Document Topic Matrix", 1)] + ), + ) + + s_topic_term_true = pd.DataFrame( + [[2.249368, 0.772283], [0.772283, 2.249369]], + columns=pd.MultiIndex.from_tuples( + [("Topic Term Matrix", "football"), ("Topic Term Matrix", "music")] + ), + ) + + pd.testing.assert_frame_equal( + s_document_topic_result, + s_document_topic_true, + check_less_precise=True, + check_dtype=False, + ) + + pd.testing.assert_frame_equal( + s_topic_term_result, + s_topic_term_true, + check_less_precise=True, + check_dtype=False, + ) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 4458c276..3e52f3c9 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -108,7 +108,9 @@ def test_visualize_topics_clustering_for_second_input(self): ) with warnings.catch_warnings(): warnings.simplefilter("ignore") - self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_cluster)) + self.assertIsNotNone( + visualization.visualize_topics(s_tfidf, s_cluster, return_figure=True) + ) def test_visualize_topics_topic_modelling_for_second_input(self): @@ -132,56 +134,6 @@ def test_visualize_topics_topic_modelling_for_second_input(self): with warnings.catch_warnings(): warnings.simplefilter("ignore") - self.assertIsNotNone(visualization.visualize_topics(s_tfidf, s_lda)) - - def test_top_words_per_document(self): - s = pd.Series( - [ - "Football, Sports, Soccer", - "music, violin, orchestra", - "football, fun, sports", - "music, band, guitar", - ] - ) - - s_tfidf = ( - s.pipe(preprocessing.clean) - .pipe(preprocessing.tokenize) - .pipe(representation.tfidf) - ) - s_result = visualization.top_words_per_document(s_tfidf, n_words=2) - - s_true = pd.Series( - [ - ["soccer", "sports"], - ["violin", "orchestra"], - ["fun", "sports"], - ["guitar", "band"], - ], - ) - pd.testing.assert_series_equal(s_result, s_true) - - def test_top_words_per_topic(self): - s = pd.Series( - [ - "Football, Sports, Soccer", - "music, violin, orchestra", - "football, fun, sports", - "music, band, guitar", - ] - ) - s_tfidf = ( - s.pipe(preprocessing.clean) - .pipe(preprocessing.tokenize) - .pipe(representation.tfidf) - ) - s_cluster = ( - s_tfidf.pipe(representation.normalize) - .pipe(representation.pca, n_components=2, random_state=42) - .pipe(representation.kmeans, n_clusters=2, random_state=42) - ) - s_result = visualization.top_words_per_topic(s_tfidf, s_cluster, n_words=3) - s_true = pd.Series( - [["music", "violin", "orchestra"], ["sports", "football", "soccer"]], - ) - pd.testing.assert_series_equal(s_result, s_true, check_names=False) + self.assertIsNotNone( + visualization.visualize_topics(s_tfidf, s_lda, return_figure=True) + ) diff --git a/texthero/representation.py b/texthero/representation.py index 83624436..9fa6e093 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -11,7 +11,9 @@ from sklearn.cluster import KMeans, DBSCAN, MeanShift from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize as sklearn_normalize -from scipy.sparse import coo_matrix +from scipy.sparse import coo_matrix, csr_matrix, issparse + +import pyLDAvis from typing import Optional, Union, Any @@ -1111,6 +1113,495 @@ def topics_from_topic_model(s_document_topic: pd.Series) -> pd.Series: return pd.Series(cluster_IDs, index=s_document_topic.index, dtype="category") +def topic_matrices( + s_document_term: pd.DataFrame, s_document_topic: pd.Series, +): + # TODO: add Hero types everywhere when they're merged + """ + Get a DocumentTopic Matrix and a TopicTerm 
Matrix (both as DataFrames)
+    from a DocumentTerm Matrix and a DocumentTopic Matrix.
+
+    Receives as first argument s_document_term, which is the
+    output of one of
+    - :meth:`texthero.representation.tfidf`
+    - :meth:`texthero.representation.count`
+    - :meth:`texthero.representation.term_frequency`.
+
+    Receives as second argument s_document_topic, which is either
+    the output of a clustering function
+    or the output of a topic modelling function,
+    so one of
+    - :meth:`texthero.representation.kmeans`
+    - :meth:`texthero.representation.dbscan`
+    - :meth:`texthero.representation.meanshift`
+    - :meth:`texthero.representation.lda`.
+
+    Both these matrices (the first one relating documents to
+    terms and the second one relating documents to topics)
+    are used to generate a DocumentTopic Matrix
+    (relating documents to topics) and a
+    TopicTerm Matrix (relating topics to terms).
+
+    When the second argument is the output of a clustering
+    function, we create the document_topic_matrix
+    through the cluster-IDs. So if document X is in cluster Y,
+    then document_topic_matrix[X][Y] = 1.
+
+    For example, when
+    `s_document_topic = pd.Series([0, 2, 2, 1], dtype="category")`,
+    then the document_topic_matrix is
+    ```python
+    1 0 0
+    0 0 1
+    0 0 1
+    0 1 0
+    ```
+
+    When the second argument is the output of a topic modelling function,
+    its output already is the document_topic_matrix that relates
+    documents to topics.
+
+    In both cases we then have the DocumentTerm Matrix and the DocumentTopic Matrix.
+    From these, we get the TopicTerm Matrix through
+    topic_term_matrix = document_term_matrix.T * document_topic_matrix.
+
+    Parameters
+    ----------
+    s_document_term : pd.DataFrame
+        Output of one of
+        :meth:`texthero.representation.tfidf`,
+        :meth:`texthero.representation.count`,
+        :meth:`texthero.representation.term_frequency`.
+
+    s_document_topic : pd.Series
+        Output of one of
+        :meth:`texthero.representation.kmeans`,
+        :meth:`texthero.representation.dbscan`,
+        :meth:`texthero.representation.meanshift`,
+        :meth:`texthero.representation.lda`.
+
+    Returns
+    -------
+    Tuple of DataFrames.
+
+    The first one is a
+    DocumentTopic DataFrame where the rows
+    are the documents and the columns are the
+    topics. So the entry in row X and column Y
+    says how strongly document X belongs
+    to topic Y.
+
+    The second one is a
+    TopicTerm DataFrame where the rows
+    are the topics and the columns are the
+    terms. So the entry in row X and column Y
+    says how strongly term Y belongs
+    to topic X.
+ + Examples + -------- + Using Clustering: + + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) + >>> s_document_topic, s_topic_term = hero.topic_matrices(s_tfidf, s_cluster) + >>> s_document_topic # doctest: +SKIP + Document Topic Matrix + 0 1 + 0 1 0 + 1 0 1 + 2 1 0 + 3 0 1 + >>> s_topic_term # doctest: +SKIP + Topic Term Matrix + band football fun guitar music orchestra soccer sports violin + 0 0.000000 3.021651 1.916291 0.000000 0.000000 0.000000 1.916291 3.021651 0.000000 + 1 1.916291 0.000000 0.000000 1.916291 3.021651 1.916291 0.000000 0.000000 1.916291 + + Using LDA: + + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> s_lda = s_tfidf.pipe(hero.lda, n_components=2) + >>> s_document_topic, s_topic_term = hero.topic_matrices(s_tfidf, s_lda) + >>> s_document_topic # doctest: +SKIP + Document Topic Matrix + 0 1 + 0 0.912814 0.087186 + 1 0.082094 0.917906 + 2 0.912814 0.087186 + 3 0.875660 0.124340 + >>> s_topic_term # doctest: +SKIP + Topic Term Matrix + band football fun guitar music orchestra soccer sports violin + 0 1.678019 2.758205 1.749217 1.678019 1.447000 0.157316 1.749217 2.758205 0.157316 + 1 0.238271 0.263446 0.167074 0.238271 1.574651 1.758974 0.167074 0.263446 1.758974 + + See Also + -------- + TODO add tutorial link + + """ + # Bool to note whether a clustering function or topic modelling + # functions was used for s_document_topic. + clustering_function_used = s_document_topic.dtype.name == "category" + + if not clustering_function_used: + # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. + + document_term_matrix = s_document_term.sparse.to_coo() + document_topic_matrix = np.array(list(s_document_topic)) + n_topics = len(document_topic_matrix[0]) + + else: + # Here, s_document_topic is output of some hero clustering function. + + # First remove documents that are not assigned to any cluster. + # They have clusterID == -1. + indexes_of_unassigned_documents = s_document_topic == -1 + s_document_term = s_document_term[~indexes_of_unassigned_documents] + s_document_topic = s_document_topic[~indexes_of_unassigned_documents] + s_document_topic = s_document_topic.cat.remove_unused_categories() + + document_term_matrix = s_document_term.sparse.to_coo() + + # Construct document_topic_matrix from the cluster category Series + # as described in the docstring. + n_rows = len(s_document_topic.index) # n_rows = number of documents + # n_cols = number of clusters + n_topics = n_cols = len(s_document_topic.values.categories) + + # Will get binary matrix: + # document_topic_matrix[X][Y] = 1 <=> document X is in cluster Y. + # We construct this matrix sparsely in CSR format + # -> need the data (will only insert 1s, nothing else), + # the rows (so in which rows we want to insert, which is all of them + # as every document belongs to a cluster), + # and we need the columns (so in which cluster we want to insert, + # which is exactly the clusterID values). + data = [1 for _ in range(n_rows)] # Will insert one 1 per row. 
+        rows = range(n_rows)  # rows are just [0, 1, ..., n_rows - 1]
+        columns = s_document_topic.values
+
+        # Construct the sparse matrix.
+        document_topic_matrix = csr_matrix(
+            (data, (rows, columns)), shape=(n_rows, n_cols)
+        )
+
+    topic_term_matrix = document_topic_matrix.T * document_term_matrix
+
+    # Create s_document_topic and s_topic_term (both multiindexed)
+
+    # Create s_document_topic
+    s_document_topic_columns = pd.MultiIndex.from_product(
+        [["Document Topic Matrix"], range(n_topics)]
+    )
+
+    if issparse(document_topic_matrix):
+        s_document_topic = pd.DataFrame.sparse.from_spmatrix(
+            document_topic_matrix,
+            columns=s_document_topic_columns,
+            index=s_document_term.index,
+        )
+
+    else:
+        s_document_topic = pd.DataFrame(
+            document_topic_matrix,
+            columns=s_document_topic_columns,
+            index=s_document_term.index,
+            dtype="Sparse",
+        )
+
+    # Create s_topic_term
+    s_topic_term_columns = pd.MultiIndex.from_product(
+        [["Topic Term Matrix"], s_document_term.columns.levels[1].tolist()]
+    )
+
+    if issparse(topic_term_matrix):
+        s_topic_term = pd.DataFrame.sparse.from_spmatrix(
+            topic_term_matrix, columns=s_topic_term_columns
+        )
+
+    else:
+        s_topic_term = pd.DataFrame(
+            topic_term_matrix, columns=s_topic_term_columns, dtype="Sparse"
+        )
+
+    return s_document_topic, s_topic_term
+
+
+def relevant_words_per_topic(
+    s_document_term,
+    s_document_topic_distribution,
+    s_topic_term_distribution,
+    n_words=10,
+    return_figure=False,
+):
+    """
+    Use `LDAvis `_
+    to find the most relevant words for each topic.
+
+    First input is a DocumentTerm Matrix, so the
+    output of one of
+    - :meth:`texthero.representation.tfidf`
+    - :meth:`texthero.representation.count`
+    - :meth:`texthero.representation.term_frequency`.
+
+    Second input is a DocumentTopic Distribution,
+    so the l1-normalized (e.g. with :meth:`hero.representation.normalize`_)
+    first output of :meth:`hero.representation.topic_matrices`_.
+
+    Third input is a TopicTerm Distribution,
+    so the l1-normalized (e.g. with :meth:`hero.representation.normalize`_)
+    second output of :meth:`hero.representation.topic_matrices`_.
+
+    This function uses the three given relations
+    (documents->terms, documents->topics, topics->terms)
+    to find and return the most relevant words for each topic.
+    The `pyLDAvis library `_
+    is used to find relevant words.
+
+    Parameters
+    ----------
+    s_document_term : pd.DataFrame
+        Output of one of
+        :meth:`texthero.representation.tfidf`,
+        :meth:`texthero.representation.count`,
+        :meth:`texthero.representation.term_frequency`.
+
+    s_document_topic_distribution : pd.DataFrame
+        L1-Normalized first output of
+        :meth:`texthero.representation.topic_matrices`.
+
+    s_topic_term_distribution : pd.DataFrame
+        L1-Normalized second output of
+        :meth:`texthero.representation.topic_matrices`.
+
+    n_words : int, default to 10
+        Number of top words per topic, should
+        be <= 30.
+
+    Returns
+    -------
+    Pandas Series with the topic IDs as index and
+    a list of n_words relevant words per
+    topic as values.
+ + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) + >>> s_document_topic, s_topic_term = hero.topic_matrices(s_tfidf, s_cluster) + >>> s_document_topic_distribution = hero.normalize(s_document_topic, norm="l1") + >>> s_topic_term_distribution = hero.normalize(s_topic_term, norm="l1") + >>> hero.relevant_words_per_topic(s_tfidf, s_document_topic_distribution, s_topic_term_distribution, n_words=2) # doctest: +SKIP + Topic + 0 [music, violin] + 1 [sports, football] + dtype: object + + See Also + -------- + `pyLDAvis `_ + for the methodology on how to find relevant terms. + + TODO add tutorial link + """ + + # Define parameters for pyLDAvis. + vocab = s_document_term.columns.levels[1].tolist() + doc_lengths = list(s_document_term.sum(axis=1)) + term_frequency = list(s_document_term.sum(axis=0)) + + doc_topic_dists = s_document_topic_distribution.values.tolist() + topic_term_dists = s_topic_term_distribution.values.tolist() + + # Create pyLDAvis visualization. + figure = pyLDAvis.prepare( + **{ + "vocab": vocab, + "doc_lengths": doc_lengths, + "term_frequency": term_frequency, + "doc_topic_dists": doc_topic_dists, + "topic_term_dists": topic_term_dists, + "R": 15, + "sort_topics": False, + } + ) + + if return_figure: + return figure + + # Extract relevant data from LDAvis output. + # Most of the output is only useful for + # the visualization internally (e.g. + # term frequencies, ...). + # We're only interested in the + # relevant words per topic + # which LDAvis returns in the "tinfo" field. + + pyLDAvis_data = figure.to_dict() + + # The top words per topic are in "tinfo". + # We're not calculating/... anything below here, + # only parsing the LDAvis output into a nice Series + # we can return. + df_topics_and_their_relevant_words = pd.DataFrame(pyLDAvis_data["tinfo"]) + + # Throw out topic "Default" + df_topics_and_their_relevant_words = df_topics_and_their_relevant_words[ + df_topics_and_their_relevant_words["Category"] != "Default" + ] + + # Our topics / clusters begin at 0 -> use i-1 and rename e.g. "Topic4" to "3". + n_topics = df_topics_and_their_relevant_words["Category"].nunique() + + replace_dict = {"Topic{}".format(i): i - 1 for i in range(1, n_topics + 1)} + + df_topics_and_their_relevant_words["Category"] = df_topics_and_their_relevant_words[ + "Category" + ].replace(replace_dict) + + # Sort first by topic, then by word frequency. + df_topics_and_their_relevant_words = df_topics_and_their_relevant_words.sort_values( + ["Category", "Freq"], ascending=[1, 0] + ) + + # Group by topic and combine the relevant words into a list. + s_topics_with_relevant_words = df_topics_and_their_relevant_words.groupby( + "Category" + )["Term"].apply(list) + + # Take the top n_words words for each topic. + s_topics_with_relevant_words = s_topics_with_relevant_words.apply( + lambda x: x[:n_words] + ) + + # Replace pyLDAvis names with ours. 
+ s_topics_with_relevant_words = s_topics_with_relevant_words.rename(None) + s_topics_with_relevant_words.index.name = "Topic" + + return s_topics_with_relevant_words + + +def relevant_words_per_document(s_document_term, n_words=10): + """ + Combine several Texthero functions to get the + most relevant words of every document in your dataset. + + Using this function is equivalent to doing the following: + + ```python + + >>> # New Series where every document is its own cluster. + >>> s_cluster = pd.Series( + ... np.arange(len(s_document_term)), index=s_document_term.index, dtype="category") # doctest: +SKIP + >>> s_document_topic, s_topic_term = hero.topic_matrices(s_document_term, s_cluster) # doctest: +SKIP + >>> s_document_topic_distribution = hero.normalize(s_document_topic, norm="l1") # doctest: +SKIP + >>> s_topic_term_distribution = hero.normalize(s_topic_term, norm="l1") # doctest: +SKIP + >>> relevant_words_per_topic( + ... s_document_term, + ... s_document_topic_distribution, + ... s_topic_term_distribution) # doctest: +SKIP + + ``` + + First input has to be output of one of + - :meth:`texthero.representation.tfidf` + - :meth:`texthero.representation.count` + - :meth:`texthero.representation.term_frequency`. + + The function assigns every document + to its own cluster (or "topic") and then uses + :meth:`topic_matrices`_ and + :meth:`relevant_words_per_topic`_ to find + the most relevant words for every document + with `pyLDAvis `_ . + + Parameters + ---------- + s_document_term: pd.DataFrame + Output of one of + :meth:`texthero.representation.tfidf` + :meth:`texthero.representation.count` + :meth:`texthero.representation.term_frequency` + + n_words: int, default to 10 + Number of words to fetch per topic, should + be <= 30. + + Returns + ------- + Series with the documents as index and + a list of n_words relevant words per + document as values. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series( + ... ["Football, Sports, Soccer, Golf", + ... "music, violin, orchestra", + ... "football, fun, sports", + ... "music, band, guitar"]) + >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) + >>> hero.relevant_words_per_document(s_tfidf, n_words=2) # doctest: +SKIP + 0 [soccer, golf] + 1 [violin, orchestra] + 2 [fun, sports] + 3 [guitar, band] + dtype: object + >>> # We can see that the function tries to + >>> # find terms that distinguish the documents, + >>> # so e.g. "music" is not chosen for documents + >>> # 1 and 3 as it's found in both of them. + + See Also + -------- + `pyLDAvis `_ + for the methodology on how to find relevant terms. + + :meth:`texthero.representation.topic_matrices`_ + + :meth:`texthero.representation.relevant_words_per_topic`_ + + TODO add tutorial link + """ + + # Create a categorical Series that has + # one new cluster for every document. + s_cluster = pd.Series( + np.arange(len(s_document_term)), index=s_document_term.index, dtype="category" + ) + + # Get topic matrices. + s_document_topic, s_topic_term = topic_matrices(s_document_term, s_cluster) + + # Get topic distributions through normalization. 
+ s_document_topic_distribution = normalize(s_document_topic, norm="l1") + s_topic_term_distribution = normalize(s_topic_term, norm="l1") + + # Call relevant_words_per_topic with the new cluster series + # (so every document is treated as one distinct "topic") + s_relevant_words_per_document = relevant_words_per_topic( + s_document_term, + s_document_topic_distribution, + s_topic_term_distribution, + n_words=n_words, + ) + + return s_relevant_words_per_document.reindex(s_document_term.index) + + """ Normalization. """ diff --git a/texthero/visualization.py b/texthero/visualization.py index 8f0549a5..de048dd6 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -8,7 +8,7 @@ from wordcloud import WordCloud -from texthero import preprocessing +from texthero import preprocessing, representation from texthero._types import TextSeries, InputSeries import string @@ -19,8 +19,6 @@ from sklearn.preprocessing import normalize as sklearn_normalize import pyLDAvis -from pyLDAvis import display as notebook_display -from pyLDAvis import show as browser_display from collections import Counter from typing import Tuple @@ -314,147 +312,36 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: ) -def _get_matrices_for_visualize_topics( +def visualize_topics( s_document_term: pd.DataFrame, s_document_topic: pd.Series, - clustering_function_used: bool, -): - # TODO: add Hero types everywhere when they're merged - """ - Helper function for visualize_topics. Used to extract and - calculate the matrices that pyLDAvis needs. - - Recieves as first argument s_document_term, which is the output of - tfidf / count / term_frequency. From this, s_document_term.values - is the document_term_matrix in the code. - - Recieves as second argument s_document_topic, which is either - the output of a clustering function (so a categorical Series) - or the output of a topic modelling function (so a VectorSeries). - - In the first case (that's when clustering_function_used=True), - we create the document_topic_matrix - through the clusterIDs. So if document X is in cluster Y, - then document_topic_matrix[X][Y] = 1. - - For example, when - `s_document_topic = pd.Series([0, 2, 2, 1, 0, 1], dtype="category")`, - then the document_topic_matrix is - 1 0 0 - 0 0 1 - 0 0 1 - 0 1 0 - 1 0 0 - 0 1 0 - - So e.g. document zero is in cluster 0, so document_topic_matrix[0][0] = 1. - - In the second case (that's when lda or truncatedSVD were used), - their output is already the document_topic_matrix that relates - documents to topics. - - We then have in both cases the document_term_matrix and the document_topic_matrix. - pyLDAvis still needs the topic_term_matrix, which we get through - topic_term_matrix = document_term_matrix.T * document_topic_matrix. - - """ - if not clustering_function_used: - # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. - - document_term_matrix = s_document_term.sparse.to_coo() - document_topic_matrix = np.array(list(s_document_topic)) - - else: - # Here, s_document_topic is output of some hero clustering function. - - # First remove documents that are not assigned to any cluster. - # They have clusterID == -1. 
- indexes_of_unassigned_documents = s_document_topic == -1 - s_document_term = s_document_term[~indexes_of_unassigned_documents] - s_document_topic = s_document_topic[~indexes_of_unassigned_documents] - s_document_topic = s_document_topic.cat.remove_unused_categories() - - document_term_matrix = s_document_term.sparse.to_coo() - - # Construct document_topic_matrix from the cluster category Series - # as described in the docstring. - n_rows = len(s_document_topic.index) # n_rows = number of documents - n_cols = len(s_document_topic.values.categories) # n_cols = number of clusters - - # Will get binary matrix: - # document_topic_matrix[X][Y] = 1 <=> document X is in cluster Y. - # We construct this matrix sparsely in CSR format - # -> need the data (will only insert 1s, nothing else), - # the rows (so in which rows we want to insert, which is all of them - # as every document belongs to a cluster), - # and we need the columns (so in which cluster we want to insert, - # which is exactly the clusterID values). - data = [1 for _ in range(n_rows)] # Will insert one 1 per row. - rows = range(n_rows) # rows are just [0, 1, ..., n_rows] - columns = s_document_topic.values - - # Construct the sparse matrix. - document_topic_matrix = csr_matrix( - (data, (rows, columns)), shape=(n_rows, n_cols) - ) - - topic_term_matrix = document_topic_matrix.T * document_term_matrix - - return s_document_term, s_document_topic, document_topic_matrix, topic_term_matrix - - -def _prepare_matrices_for_pyLDAvis( - document_topic_matrix: np.matrix, topic_term_matrix: np.matrix + notebook=True, + return_figure=False, ): - # TODO: add types everywhere when they're merged - """ - Helper function for visualize_topics. Used to prepare the - document_topic_matrix and the topic_term_matrix for pyLDAvis. - - First normalizes both matrices to get the - document_topic_distributions and topic_term_distributions matrix. - For example, the first row of document_topic_distributions - has the probabilities of document zero to belong to the - different topics (so every row sums up to 1 (this is later - checked by pyLDAvis)). - So document_topic_matrix[i][j] = proportion of document i - that belongs to topic j. - - Then densify the (potentially) sparse matrices for pyLDAvis. """ - - # Get distributions through normalization - document_topic_distributions = sklearn_normalize( - document_topic_matrix, norm="l1", axis=1 - ) - - topic_term_distributions = sklearn_normalize(topic_term_matrix, norm="l1", axis=1) - - # Make sparse matrices dense for pyLDAvis - - if issparse(document_topic_distributions): - document_topic_distributions = document_topic_distributions.toarray().tolist() - else: - document_topic_distributions = document_topic_distributions.tolist() - - if issparse(topic_term_distributions): - topic_term_distributions = topic_term_distributions.toarray().tolist() - else: - topic_term_distributions = topic_term_distributions.tolist() - - return document_topic_distributions, topic_term_distributions - - -def visualize_topics(s_document_term: pd.DataFrame, s_document_topic: pd.Series): - # TODO: add types everywhere when they're merged - """ - Visualize the topics of your dataset. First input has + Combine several Texthero functions to get a + `pyLDAvis `_ visualization + straight from document_term_matrix and document_topic_matrix. 
+ + Using this function is equivalent to doing the following: + ```python + + >>> import pyLDAvis # doctest: +SKIP + >>> s_document_topic, s_topic_term = hero.topic_matrices(s_document_term, s_document_topic) # doctest: +SKIP + >>> s_document_topic_distribution = hero.normalize(s_document_topic, norm="l1") # doctest: +SKIP + >>> s_topic_term_distribution = hero.normalize(s_topic_term, norm="l1") # doctest: +SKIP + >>> figure = hero.relevant_words_per_topic(s_document_term, s_document_topic_distribution, s_topic_term_distribution, return_figure=True) # doctest: +SKIP + >>> # in a Jupyter Notebook + >>> pyLDAvis.display(figure) # doctest: +SKIP + >>> # otherwise + >>> pyLDAvis.show(figure) # doctest: +SKIP + ``` + + First input has to be output of one of - :meth:`texthero.representation.tfidf` - :meth:`texthero.representation.count` - - :meth:`texthero.representation.term_frequency` - - (tfidf suggested). + - :meth:`texthero.representation.term_frequency`. Second input can either be the result of clustering, so output of one of @@ -471,34 +358,36 @@ def visualize_topics(s_document_term: pd.DataFrame, s_document_topic: pd.Series) two relations (documents->topics, documents->terms), the function calculates a distribution of documents to topics, and a distribution - of topics to terms. These distributions - are passed to `pyLDAvis `_, - which visualizes them. - + of topics to terms, using :meth:`hero.topic_matrices`_ + and :meth:`hero.representation.normalize`_. - **To show the plot**: - - Interactively in a Jupyter Notebook: use `hero.notebook_display(hero.visualize_topics(...))` - - In a new browser window: `hero.browser_display(hero.visualize_topics(...))` + These distributions are passed to + :meth:`hero.relevant_words_per_topic`_, which + uses `pyLDAvis `_ + to visualize the topics and terms. Parameters ---------- - s_document_term: pd.DataFrame - One of - :meth:`texthero.representation.tfidf` - :meth:`texthero.representation.count` - :meth:`texthero.representation.term_frequency` - - s_document_topic: pd.Series - One of - :meth:`texthero.representation.kmeans` - :meth:`texthero.representation.meanshift` - :meth:`texthero.representation.dbscan` - (using clustering functkmeansions, documents - that are not assigned to a cluster are - not considered in the visualization) - or one of - :meth:`texthero.representation.lda` - :meth:`texthero.representation.truncatedSVD` + s_document_term : pd.DataFrame + Output of one of + :meth:`texthero.representation.tfidf`, + :meth:`texthero.representation.count`, + :meth:`texthero.representation.term_frequency`. + + s_document_topic : pd.Series + Output of one of + :meth:`texthero.representation.kmeans`, + :meth:`texthero.representation.dbscan`, + :meth:`texthero.representation.meanshift`, + :meth:`texthero.representation.lda`. + + notebook : bool, default True + Whether to show the visualization inside + a Jupyter Notebook or open a new browser tab. + Set this to False when not inside a Jupyter Notebook. + return_figure : bool, default False + Whether to only return the figure instead + of showing it. 
Examples -------- @@ -510,9 +399,9 @@ def visualize_topics(s_document_term: pd.DataFrame, s_document_topic: pd.Series) >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) >>> # Display in a new browser window: - >>> hero.browser_display(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_cluster, notebook=False) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.notebook_display(hero.visualize_topics(s_tfidf, s_cluster)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_cluster, notebook=True) # doctest: +SKIP Using LDA: @@ -522,439 +411,46 @@ def visualize_topics(s_document_term: pd.DataFrame, s_document_topic: pd.Series) >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> s_lda = s_tfidf.pipe(hero.lda, n_components=2) >>> # Display in a new browser window: - >>> hero.browser_display(hero.visualize_topics(s_tfidf, s_lda)) # doctest: +SKIP + >>> hero.visualize_topics(s_tfidf, s_cluster, notebook=False) # doctest: +SKIP >>> # Display inside the current Jupyter Notebook: - >>> hero.notebook_display(hero.visualize_topics(s_tfidf, s_lda)) # doctest: +SKIP - - - See Also - -------- - `pyLDAvis `_ - - TODO add tutorial link - - """ - # Bool to note whether a clustering function or topic modelling - # functions was used for s_document_topic. - clustering_function_used = s_document_topic.dtype.name == "category" - - # Get / build matrices from input - # (see helper function docstring for explanation) - ( - s_document_term, - s_document_topic, - document_topic_matrix, - topic_term_matrix, - ) = _get_matrices_for_visualize_topics( - s_document_term, s_document_topic, clustering_function_used - ) - - vocab = list(s_document_term.columns.levels[1]) - doc_lengths = list(s_document_term.sum(axis=1)) - term_frequency = list(s_document_term.sum(axis=0)) - - # Prepare matrices for input to pyLDAvis - # (see helper function docstring for explanation) - ( - document_topic_distributions, - topic_term_distributions, - ) = _prepare_matrices_for_pyLDAvis(document_topic_matrix, topic_term_matrix) - - # Create pyLDAvis visualization. - figure = pyLDAvis.prepare( - **{ - "vocab": vocab, - "doc_lengths": doc_lengths, - "term_frequency": term_frequency, - "doc_topic_dists": document_topic_distributions, - "topic_term_dists": topic_term_distributions, - "R": 15, - "sort_topics": False, - } - ) - - return figure - - -def top_words_per_topic( - s_document_term: pd.DataFrame, s_clusters: pd.Series, n_words=5 -): - # TODO: add types everywhere when they're merged - """ - Find the top words per topic of your dataset. First input has - to be output of one of - - :meth:`texthero.representation.tfidf` - - :meth:`texthero.representation.count` - - :meth:`texthero.representation.term_frequency` - - (tfidf suggested). - - Second input has to be the result of - clustering, so output of one of - - :meth:`texthero.representation.kmeans` - - :meth:`texthero.representation.meanshift` - - :meth:`texthero.representation.dbscan` - - :meth:`texthero.representation.topics_from_topic_model` - - The function uses the given clustering - from the second input, which relates - documents to topics. The first input - relates documents to terms. From those - two relations (documents->topics, documents->terms), - the function calculates a distribution of - documents to topics, and a distribution - of topics to terms. 
These distributions - are used to find the most relevant - terms per topic. - - Parameters - ---------- - s_document_term: pd.DataFrame - One of - :meth:`texthero.representation.tfidf` - :meth:`texthero.representation.count` - :meth:`texthero.representation.term_frequency` - - s_clusters: pd.Series - One of - :meth:`texthero.representation.kmeans` - :meth:`texthero.representation.meanshift` - :meth:`texthero.representation.dbscan` - :meth:`texthero.representation.topics_from_topic_model` - - n_words: int, default to 5 - Number of top words per topic, should - be <= 30. - - Returns - ------- - Series with the topic IDs as index and - a list of n_words relevant words per - topic as values. - - Examples - -------- - Using Clustering: - - >>> import texthero as hero - >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) - >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) - >>> s_cluster = s_tfidf.pipe(hero.normalize).pipe(hero.pca, n_components=2).pipe(hero.kmeans, n_clusters=2) - >>> hero.top_words_per_topic(s_tfidf, s_cluster) # doctest: +SKIP - 0 [sports, football, soccer] - 1 [music, violin, orchestra] - dtype: object + >>> hero.visualize_topics(s_tfidf, s_cluster, notebook=True) # doctest: +SKIP See Also -------- `pyLDAvis `_ - and their methodology on how to find relevant terms. - - TODO add tutorial link - - """ - - pyLDAvis_result = visualize_topics(s_document_term, s_clusters).to_dict() - - df_topics_and_their_top_words = pd.DataFrame(pyLDAvis_result["tinfo"]) + for the methodology on how to find relevant terms. - # Throw out topic "Default" - df_topics_and_their_top_words = df_topics_and_their_top_words[ - df_topics_and_their_top_words["Category"] != "Default" - ] + :meth:`texthero.representation.topic_matrices`_ - n_topics = df_topics_and_their_top_words["Category"].nunique() - - # Our topics / clusters begin at 0 -> use i-1 - replace_dict = {"Topic{}".format(i): i - 1 for i in range(1, n_topics + 1)} - - df_topics_and_their_top_words["Category"] = df_topics_and_their_top_words[ - "Category" - ].replace(replace_dict) - - df_topics_and_their_top_words = df_topics_and_their_top_words.sort_values( - ["Category", "Freq"], ascending=[1, 0] - ) - - s_topics_with_top_words = df_topics_and_their_top_words.groupby("Category")[ - "Term" - ].apply(list) - - s_topics_with_top_words = s_topics_with_top_words.apply(lambda x: x[:n_words]) - - # Remove series name "Term" from pyLDAvis - s_topics_with_top_words = s_topics_with_top_words.rename(None) - - return s_topics_with_top_words - - -def top_words_per_document(s_document_term: pd.DataFrame, n_words=3): - # TODO: add types everywhere when they're merged - """ - Find the top words per document of your dataset. First input has - to be output of one of - - :meth:`texthero.representation.tfidf` - - :meth:`texthero.representation.count` - - :meth:`texthero.representation.term_frequency` - - (tfidf suggested). - - The function assigns every document - to its own cluster (or "topic") and then uses - :meth:`top_words_per_topic` to find - the top words for every document. - - Parameters - ---------- - s_document_term: pd.DataFrame - One of - :meth:`texthero.representation.tfidf` - :meth:`texthero.representation.count` - :meth:`texthero.representation.term_frequency` - - n_words: int, default to 3 - Number of words to fetch per topic, should - be <= 30. 
- - Returns - ------- - Series with the document IDs as index and - a list of n_words relevant words per - document as values. - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, band, guitar"]) - >>> s_tfidf = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) - >>> hero.top_words_per_document(s_tfidf, n_words=2) # doctest: +SKIP - 0 [soccer, sports] - 1 [violin, orchestra] - 2 [fun, sports] - 3 [guitar, band] - dtype: object - >>> # We can see that the function tries to - >>> # find terms that distinguish the documents, - >>> # so "music" is not chosen for documents - >>> # 1 and 3 as it's found in both. - - See Also - -------- - :meth:`top_words_per_topic` + :meth:`texthero.representation.relevant_words_per_topic`_ TODO add tutorial link """ - # Create a categorical Series that has - # one new cluster for every document. - s_cluster = pd.Series( - np.arange(len(s_document_term)), index=s_document_term.index, dtype="category" - ) - - # Call top_words_per_topic with the new cluster series - # (so every document is one distinct "topic") - s_top_words_per_document = top_words_per_topic( - s_document_term, s_cluster, n_words=n_words - ) - - return s_top_words_per_document.reindex(s_document_term.index) - - -# NEW PIPELINE: STEP 1 -def topic_matrices( - s_document_term: pd.DataFrame, - s_document_topic: pd.Series, -): - # TODO: add Hero types everywhere when they're merged - # FIXME: new docstring ~ DocumentTerm & DocumentTopic -> DocumentTopic & TopicTerm. - """ - - Helper function for visualize_topics. Used to extract and - calculate the matrices that pyLDAvis needs. - - Recieves as first argument s_document_term, which is the output of - tfidf / count / term_frequency. From this, s_document_term.values - is the document_term_matrix in the code. - - Recieves as second argument s_document_topic, which is either - the output of a clustering function (so a categorical Series) - or the output of a topic modelling function (so a VectorSeries). - - In the first case (that's when clustering_function_used=True), - we create the document_topic_matrix - through the clusterIDs. So if document X is in cluster Y, - then document_topic_matrix[X][Y] = 1. - - For example, when - `s_document_topic = pd.Series([0, 2, 2, 1, 0, 1], dtype="category")`, - then the document_topic_matrix is - 1 0 0 - 0 0 1 - 0 0 1 - 0 1 0 - 1 0 0 - 0 1 0 - - So e.g. document zero is in cluster 0, so document_topic_matrix[0][0] = 1. - - In the second case (that's when lda or truncatedSVD were used), - their output is already the document_topic_matrix that relates - documents to topics. - - We then have in both cases the document_term_matrix and the document_topic_matrix. - pyLDAvis still needs the topic_term_matrix, which we get through - topic_term_matrix = document_term_matrix.T * document_topic_matrix. - - - Docuement Topic Matrix Topic Term Matrix - 1 2 3 1 2 3 - 0 1 2 3 0 1 2 3 - 1 4 5 6 , 1 4 5 6 - - - """ - # Bool to note whether a clustering function or topic modelling - # functions was used for s_document_topic. - clustering_function_used = s_document_topic.dtype.name == "category" - - if not clustering_function_used: - # Here, s_document_topic is output of hero.lda or hero.truncatedSVD. 
- - document_term_matrix = s_document_term.sparse.to_coo() - document_topic_matrix = np.array(list(s_document_topic)) - n_topics = len(document_topic_matrix[0]) - - else: - # Here, s_document_topic is output of some hero clustering function. - - # First remove documents that are not assigned to any cluster. - # They have clusterID == -1. - indexes_of_unassigned_documents = s_document_topic == -1 - s_document_term = s_document_term[~indexes_of_unassigned_documents] - s_document_topic = s_document_topic[~indexes_of_unassigned_documents] - s_document_topic = s_document_topic.cat.remove_unused_categories() - - document_term_matrix = s_document_term.sparse.to_coo() - - # Construct document_topic_matrix from the cluster category Series - # as described in the docstring. - n_rows = len(s_document_topic.index) # n_rows = number of documents - # n_cols = number of clusters - n_topics = n_cols = len(s_document_topic.values.categories) - - # Will get binary matrix: - # document_topic_matrix[X][Y] = 1 <=> document X is in cluster Y. - # We construct this matrix sparsely in CSR format - # -> need the data (will only insert 1s, nothing else), - # the rows (so in which rows we want to insert, which is all of them - # as every document belongs to a cluster), - # and we need the columns (so in which cluster we want to insert, - # which is exactly the clusterID values). - data = [1 for _ in range(n_rows)] # Will insert one 1 per row. - rows = range(n_rows) # rows are just [0, 1, ..., n_rows] - columns = s_document_topic.values - - # Construct the sparse matrix. - document_topic_matrix = csr_matrix( - (data, (rows, columns)), shape=(n_rows, n_cols) - ) - - topic_term_matrix = document_topic_matrix.T * document_term_matrix - - # Create s_document_topic and s_topic_term (both multiindexed) - - # Create s_document_topic - s_document_topic_columns = pd.MultiIndex.from_product( - [["Document Topic Matrix"], range(n_topics)] + # Get topic matrices. + s_document_topic, s_topic_term = representation.topic_matrices( + s_document_term, s_document_topic ) - if isinstance(document_topic_matrix, csr_matrix): - s_document_topic = pd.DataFrame.sparse.from_spmatrix( - document_topic_matrix, - columns=s_document_topic_columns, - index=s_document_term.index - ) - else: - s_document_topic = pd.DataFrame( - document_topic_matrix, - columns=s_document_topic_columns, - index=s_document_term.index - ) - - # Create s_topic_term - s_topic_term_columns = pd.MultiIndex.from_product( - [["Topic Term Matrix"], s_document_term.columns.tolist()] + # Get topic distributions through normalization. + s_document_topic_distribution = representation.normalize( + s_document_topic, norm="l1" ) - if isinstance(topic_term_matrix, csr_matrix): - s_topic_term = pd.DataFrame.sparse.from_spmatrix( - topic_term_matrix, - columns=s_topic_term_columns - ) - - else: - s_topic_term = pd.DataFrame( - topic_term_matrix, - columns=s_topic_term_columns - ) - - return s_document_topic, s_topic_term - -# New Pipeline: Step 2 -# Users just need to l1-normalize - - -# New Pipeline: Step 3 -def relevant_terms_per_topic( - s_document_term, - s_document_topic_distribution, - s_topic_term_distribution, - return_figure=False -): - """ - Use LDAvis to get topics & relevant terms. - """ + s_topic_term_distribution = representation.normalize(s_topic_term, norm="l1") - # Define parameters for pyLDAvis. 
- vocab = s_document_term.columns.levels[1].tolist() - doc_lengths = list(s_document_term.sum(axis=1)) - term_frequency = list(s_document_term.sum(axis=0)) - - doc_topic_dists = s_document_topic_distribution.values.tolist() - topic_term_dists = s_topic_term_distribution.values.tolist() - - # Create pyLDAvis visualization. - figure = pyLDAvis.prepare( - **{ - "vocab": vocab, - "doc_lengths": doc_lengths, - "term_frequency": term_frequency, - "doc_topic_dists": doc_topic_dists, - "topic_term_dists": topic_term_dists, - "R": 15, - "sort_topics": False, - } + # Get the pyLDAvis figure. + figure = representation.relevant_words_per_topic( + s_document_term, + s_document_topic_distribution, + s_topic_term_distribution, + return_figure=True, ) - # TODO Extract relevant info etc. from figure if return_figure: return figure -""" -Visualize_Topics: - Step 1: v/ - Step 2: v/ - Step 3: -> currently doing that to already return relevant_words_per_topic - - -> Wrapper calls 1-3 and 3 w/ return_figure=True and plots that - -Top_Words_per_Topic: - like above - -Top_Words_per_Document: - have a look at that - -topics_from_topic_model: - have a look at that -""" + # Visualize it. + if notebook: + pyLDAvis.display(figure) + else: + pyLDAvis.show(figure) From b9eaf1fa9126507e28ecca970b42634f83e1a4e3 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sat, 12 Sep 2020 11:35:19 +0200 Subject: [PATCH 35/42] incorporate suggested changes from review --- texthero/representation.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 9fa6e093..034b4f8e 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -977,29 +977,32 @@ def lda( n_jobs=-1, ) -> pd.Series: """ - Performs Latent Dirichlet Allocation on the given pandas series. + Performs Latent Dirichlet Allocation on the given Pandas Series + or DataFrame. Latent Dirichlet Allocation (LDA) is a topic modeling algorithm based on Dirichlet distribution. In natural language processing - LDA is often used to categorise documents into diffenrent topics + LDA is often used to categorize documents into different topics and generate top words from these topics. In this process LDA is used in combination with algorithms which generate document-term- - matrixes, like :meth:`count` or :meth:`tfidf` + matrices, like :meth:`count`, :meth:`tfidf` or :meth:`term_frequency`. - LDA can directly handle sparse input, so when calling truncatedSVD on a - DocumentTermDF, the advantage of sparseness is kept. + LDA can directly handle sparse input, so when calling LDA on a + sparse DataFrame, the advantage of sparseness is kept. Parameters ---------- - s : pd.Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + s : pd.Series (VectorSeries) or Sparse pd.DataFrame - n_components : int, default is 2. + n_components : int, default is 10. Number of components to keep (dimensionality of output vectors). When using truncatedSVD for Topic Modelling, this needs to be the number of topics. max_iter : int, optional (default: 10) - The maximum number of iterations. + The maximum number of iterations. In each interation, + the algorithm gets closer to convergence. Set this higher + for potentially better results, but also longer runtime. random_state : int, default=None Determines the random number generator. 
Pass an int for reproducible @@ -1007,8 +1010,8 @@ def lda( Returns ------- - Pandas Series with the vector calculated by LDA for the document in every - cell. + Pandas Series (VectorSeries) with the vector calculated by LDA + for the document in every cell. Examples -------- From 6c30a5e531d70548e255e288eadc5c8b3e606be0 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 18 Sep 2020 20:18:43 +0200 Subject: [PATCH 36/42] Fix pyLDAvis PCoA issue. PCoA is implemented in a sub-optimal way in the pyLDAvis library. We change this (by adding 1 character to their code). Co-authored-by: Maximilian Krahn --- texthero/_helper.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/texthero/_helper.py b/texthero/_helper.py index 6319c056..fcb2fbe3 100644 --- a/texthero/_helper.py +++ b/texthero/_helper.py @@ -2,7 +2,9 @@ Useful helper functions for the texthero library. """ +import pyLDAvis import pandas as pd +import numpy as np import functools import warnings @@ -71,3 +73,57 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +""" +For representation.relevant_words_per_topic: + +Redefinition of PCoA from pyLDAvis to support +big datasets. The only thing we change is the line +`eigvals, eigvecs = np.linalg.eigh(B)`, which was before +`eigvals, eigvecs = np.linalg.eig(B)`. Apart from that, +every line is the same as in pyLDAvis! Without this change, +we get complex eigenvalues with all complex components = 0 +due to floating point errors, see e.g. +https://stackoverflow.com/questions/8765310/scipy-linalg-eig-return-complex-eigenvalues-for-covariance-matrix + +The change is safe and makes sense as the input matrix `pair_dists` +(pairwise distances) is always a symmetric matrix. + +""" + + +def _hero_pcoa(pair_dists, n_components=2): + """Principal Coordinate Analysis, + aka Classical Multidimensional Scaling + """ + # code referenced from skbio.stats.ordination.pcoa + # https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py + + # pairwise distance matrix is assumed symmetric + pair_dists = np.asarray(pair_dists, np.float64) + + # perform SVD on double centred distance matrix + n = pair_dists.shape[0] + H = np.eye(n) - np.ones((n, n)) / n + B = -H.dot(pair_dists ** 2).dot(H) / 2 + eigvals, eigvecs = np.linalg.eigh(B) # CHANGED BY US + + # Take first n_components of eigenvalues and eigenvectors + # sorted in decreasing order + ix = eigvals.argsort()[::-1][:n_components] + eigvals = eigvals[ix] + eigvecs = eigvecs[:, ix] + + # replace any remaining negative eigenvalues and associated eigenvectors with zeroes + # at least 1 eigenvalue must be zero + eigvals[np.isclose(eigvals, 0)] = 0 + if np.any(eigvals < 0): + ix_neg = eigvals < 0 + eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape) + eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape) + + return np.sqrt(eigvals) * eigvecs + + +pyLDAvis._prepare._pcoa = _hero_pcoa From d12ba7e62b1d9d5d8271d1647dc3cc6dc42d977a Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 18 Sep 2020 20:32:19 +0200 Subject: [PATCH 37/42] Add comment to docstring. 
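
The document-term DataFrame handed to relevant_words_per_topic and
relevant_words_per_document has to keep the full vocabulary (no max_df, min_df
or max_features), since vocab, doc_lengths and term_frequency for pyLDAvis are
all derived from that same DataFrame. A minimal sketch of a compliant call,
mirroring the tests added later in this series (illustrative only):

    import pandas as pd
    from texthero import preprocessing, representation

    s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra"])
    s_tfidf = (
        s.pipe(preprocessing.clean)
        .pipe(preprocessing.tokenize)
        .pipe(representation.tfidf)  # no max_df / min_df / max_features here
    )
    representation.relevant_words_per_document(s_tfidf, n_words=2)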
--- texthero/representation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index 034b4f8e..c5e12f9b 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1355,6 +1355,11 @@ def relevant_words_per_topic( - :meth:`texthero.representation.count` - :meth:`texthero.representation.term_frequency`. + The document-term-matrix has to include all + terms that are present in the documents + (i.e. you _cannot_ use the parameters max_df, + min_df, or max_features). + Second input is a DocumentTopic Distribution, so the l1-normalized (e.g. with :meth:`hero.representation.normalize`_) first output of :meth:`hero.visualization.topic_matrices`_. @@ -1376,6 +1381,10 @@ def relevant_words_per_topic( :meth:`texthero.representation.tfidf`, :meth:`texthero.representation.count`, :meth:`texthero.representation.term_frequency`. + All terms from the corpus have to be present + (i.e. you _cannot_ use the parameters max_df, + min_df, or max_features when computing + s_document_term). s_document_topic_distribution : pd.DataFrame L1-Normalized first output of @@ -1522,6 +1531,11 @@ def relevant_words_per_document(s_document_term, n_words=10): - :meth:`texthero.representation.count` - :meth:`texthero.representation.term_frequency`. + The document-term-matrix has to include all + terms that are present in the documents + (i.e. you _cannot_ use the parameters max_df, + min_df, or max_features). + The function assigns every document to its own cluster (or "topic") and then uses :meth:`topic_matrices`_ and @@ -1535,7 +1549,11 @@ def relevant_words_per_document(s_document_term, n_words=10): Output of one of :meth:`texthero.representation.tfidf` :meth:`texthero.representation.count` - :meth:`texthero.representation.term_frequency` + :meth:`texthero.representation.term_frequency`. + All terms from the corpus have to be present + (i.e. you _cannot_ use the parameters max_df, + min_df, or max_features when computing + s_document_term). n_words: int, default to 10 Number of words to fetch per topic, should From a57192502a7c9afa3298e0e1bddd9465ea5b14aa Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 18 Sep 2020 20:46:13 +0200 Subject: [PATCH 38/42] import _helper in __init__ to overwrite pyLDAvis change --- texthero/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/texthero/__init__.py b/texthero/__init__.py index 66e891e9..8998dd32 100644 --- a/texthero/__init__.py +++ b/texthero/__init__.py @@ -16,3 +16,5 @@ from .nlp import * from . import stopwords + +from . import _helper From a75aebea13f178d56885f3258ecb4c3559bdc0fe Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 18 Sep 2020 21:00:49 +0200 Subject: [PATCH 39/42] enable auto-display for jupyter notebooks --- texthero/visualization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index de048dd6..6633ab84 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -451,6 +451,8 @@ def visualize_topics( # Visualize it. if notebook: - pyLDAvis.display(figure) + # Import here as non-notebook users don't have this. 
+ import IPython + return IPython.display.display(pyLDAvis.display(figure)) else: pyLDAvis.show(figure) From cfc78d94aea23d59ff089b883189aad0efc8bbc7 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 22 Sep 2020 11:51:48 +0200 Subject: [PATCH 40/42] fixed vector series, as pca returns an array --- tests/test_representation.py | 173 ++++++++++++++++++++++++++++++++++- texthero/_types.py | 3 +- texthero/representation.py | 31 ++++--- texthero/visualization.py | 1 + 4 files changed, 187 insertions(+), 21 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index ab52d711..7c452854 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -256,11 +256,6 @@ def test_dim_reduction_and_clustering_with_dataframe_input( atol=0.1, check_category_order=False, ) - s_cluster = ( - s_tfidf.pipe(representation.normalize) - .pipe(representation.pca, n_components=2, random_state=42) - .pipe(representation.kmeans, n_clusters=2, random_state=42) - ) def test_normalize_DataFrame_also_as_output(self): # normalize should also return DataFrame output for DataFrame @@ -273,3 +268,171 @@ def test_normalize_DataFrame_also_as_output(self): pd.testing.assert_frame_equal( result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, ) + + """ + Test Topic Modelling (not all are suitable for parameterization). + `topics_from_topic_model, lda, truncatedSVD` already tested above. + + Here, we test + `relevant_words_per_document, relevant_words_per_topic, topic_matrices` + """ + + def test_relevant_words_per_document(self): + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) + + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_result = representation.relevant_words_per_document(s_tfidf, n_words=2) + + s_true = pd.Series( + [ + ["soccer", "sports"], + ["violin", "orchestra"], + ["fun", "sports"], + ["guitar", "band"], + ], + ) + pd.testing.assert_series_equal(s_result, s_true) + + def test_relevant_words_per_topic(self): + s = pd.Series( + [ + "Football, Sports, Soccer", + "music, violin, orchestra", + "football, fun, sports", + "music, band, guitar", + ] + ) + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_cluster = ( + s_tfidf.pipe(representation.normalize) + .pipe(representation.pca, n_components=2, random_state=42) + .pipe(representation.kmeans, n_clusters=2, random_state=42) + ) + + s_document_topic, s_topic_term = representation.topic_matrices( + s_tfidf, s_cluster + ) + s_document_topic_distribution = representation.normalize( + s_document_topic, norm="l1" + ) + s_topic_term_distribution = representation.normalize(s_topic_term, norm="l1") + + s_result = representation.relevant_words_per_topic( + s_tfidf, s_document_topic_distribution, s_topic_term_distribution, n_words=3 + ) + s_true = pd.Series( + [["music", "violin", "orchestra"], ["sports", "football", "soccer"]], + ) + pd.testing.assert_series_equal(s_result, s_true, check_names=False) + + def test_topic_matrices_clustering_for_second_input(self): + + s = pd.Series(["Football", "Music", "Football", "Music",]) + + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_cluster = ( + s_tfidf.pipe(representation.normalize) + .pipe(representation.pca, n_components=2, random_state=42) + .pipe(representation.kmeans, n_clusters=2, 
random_state=42) + ) + + s_document_topic_result, s_topic_term_result = representation.topic_matrices( + s_tfidf, s_cluster + ) + + s_document_topic_true = pd.DataFrame( + [[0, 1], [1, 0], [0, 1], [1, 0]], + columns=pd.MultiIndex.from_tuples( + [("Document Topic Matrix", 0), ("Document Topic Matrix", 1)] + ), + ) + + s_topic_term_true = pd.DataFrame( + [[0.0, 3.021651], [3.021651, 0.0]], + columns=pd.MultiIndex.from_tuples( + [("Topic Term Matrix", "football"), ("Topic Term Matrix", "music")] + ), + ) + + pd.testing.assert_frame_equal( + s_document_topic_result, + s_document_topic_true, + check_less_precise=True, + check_dtype=False, + ) + + pd.testing.assert_frame_equal( + s_topic_term_result, + s_topic_term_true, + check_less_precise=True, + check_dtype=False, + ) + + def test_visualize_topics_topic_modelling_for_second_input(self): + + s = pd.Series(["Football", "Music", "Football", "Music",]) + + s_tfidf = ( + s.pipe(preprocessing.clean) + .pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + ) + s_lda = s_tfidf.pipe(representation.normalize).pipe( + representation.lda, n_components=2, random_state=42 + ) + + s_document_topic_result, s_topic_term_result = representation.topic_matrices( + s_tfidf, s_lda + ) + + s_document_topic_true = pd.DataFrame( + [ + [0.744417, 0.255583], + [0.255583, 0.744417], + [0.744417, 0.255583], + [0.255583, 0.744417], + ], + columns=pd.MultiIndex.from_tuples( + [("Document Topic Matrix", 0), ("Document Topic Matrix", 1)] + ), + ) + + s_topic_term_true = pd.DataFrame( + [[2.249368, 0.772283], [0.772283, 2.249369]], + columns=pd.MultiIndex.from_tuples( + [("Topic Term Matrix", "football"), ("Topic Term Matrix", "music")] + ), + ) + + pd.testing.assert_frame_equal( + s_document_topic_result, + s_document_topic_true, + check_less_precise=True, + check_dtype=False, + ) + + pd.testing.assert_frame_equal( + s_topic_term_result, + s_topic_term_true, + check_less_precise=True, + check_dtype=False, + ) diff --git a/texthero/_types.py b/texthero/_types.py index 16125109..597aa128 100644 --- a/texthero/_types.py +++ b/texthero/_types.py @@ -59,6 +59,7 @@ def tfidf(s: TokenSeries) -> DataFrame: import functools import pandas as pd +import numpy as np from typing import Tuple @@ -198,7 +199,7 @@ def is_numeric(x): return True def is_list_of_numbers(cell): - return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell) + return isinstance(cell, (list, tuple, np.ndarray)) and all(is_numeric(x) for x in cell) try: first_non_nan_value = s.loc[s.first_valid_index()] diff --git a/texthero/representation.py b/texthero/representation.py index b028e53e..3cbf2dae 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -684,7 +684,7 @@ def kmeans( @InputSeries([VectorSeries, DataFrame]) def dbscan( - s, + input_matrix, eps=0.5, min_samples=5, metric="euclidean", @@ -917,7 +917,7 @@ def meanshift( def truncatedSVD( - s: Union[pd.Series, pd.DataFrame], n_components=2, n_iter=5, random_state=None, + input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, n_iter=5, random_state=None, ) -> pd.Series: """ Perform TruncatedSVD on the given pandas Series. @@ -936,7 +936,7 @@ def truncatedSVD( Parameters ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). 
@@ -978,21 +978,22 @@ def truncatedSVD(
         n_components=n_components, n_iter=n_iter, random_state=random_state
     )
 
-    if _check_is_valid_DocumentTermDF(s):
-        s_coo = s.sparse.to_coo()
-        s_for_vectorization = s_coo.astype("float64")
+
+    if isinstance(input_matrix, pd.DataFrame):
+        input_matrix_coo = input_matrix.sparse.to_coo()
+        input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
-        s_for_vectorization = list(s)
+        input_matrix_for_vectorization = list(input_matrix)
 
     result = pd.Series(
-        list(truncatedSVD.fit_transform(s_for_vectorization)), index=s.index
+        list(truncatedSVD.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index
     )
 
     return result
 
 
 def lda(
-    s: Union[pd.Series, pd.DataFrame],
+    input_matrix: Union[pd.Series, pd.DataFrame],
     n_components=10,
     max_iter=10,
     random_state=None,
@@ -1014,7 +1015,7 @@ def lda(
 
     Parameters
     ----------
-    s : pd.Series (VectorSeries) or Sparse pd.DataFrame
+    input_matrix : pd.Series (VectorSeries) or Sparse pd.DataFrame
 
     n_components : int, default is 10.
         Number of components to keep (dimensionality of output vectors).
@@ -1058,13 +1059,13 @@ def lda(
         n_components=n_components, max_iter=max_iter, random_state=random_state
     )
 
-    if _check_is_valid_DocumentTermDF(s):
-        s_coo = s.sparse.to_coo()
-        s_for_vectorization = s_coo.astype("float64")
+    if isinstance(input_matrix, pd.DataFrame):
+        input_matrix_coo = input_matrix.sparse.to_coo()
+        input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
-        s_for_vectorization = list(s)
+        input_matrix_for_vectorization = list(input_matrix)
 
-    result = pd.Series(list(lda.fit_transform(s_for_vectorization)), index=s.index)
+    result = pd.Series(list(lda.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index)
 
     return result
 
 
diff --git a/texthero/visualization.py b/texthero/visualization.py
index 12f44f0c..38e51369 100644
--- a/texthero/visualization.py
+++ b/texthero/visualization.py
@@ -453,6 +453,7 @@ def visualize_topics(
 
     if notebook:
         # Import here as non-notebook users don't have this.
import IPython + return IPython.display.display(pyLDAvis.display(figure)) else: pyLDAvis.show(figure) From 4c5aa0b1cec4b20908f7f5971bb72365eae07591 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 22 Sep 2020 12:04:12 +0200 Subject: [PATCH 41/42] fixed the last merged issues --- tests/test_representation.py | 16 ++++------------ texthero/representation.py | 10 +++------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 7c452854..5692bcb5 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -361,16 +361,12 @@ def test_topic_matrices_clustering_for_second_input(self): s_document_topic_true = pd.DataFrame( [[0, 1], [1, 0], [0, 1], [1, 0]], - columns=pd.MultiIndex.from_tuples( - [("Document Topic Matrix", 0), ("Document Topic Matrix", 1)] - ), + columns=[ 0,1] ) s_topic_term_true = pd.DataFrame( [[0.0, 3.021651], [3.021651, 0.0]], - columns=pd.MultiIndex.from_tuples( - [("Topic Term Matrix", "football"), ("Topic Term Matrix", "music")] - ), + columns= ["football", "music"] ) pd.testing.assert_frame_equal( @@ -411,16 +407,12 @@ def test_visualize_topics_topic_modelling_for_second_input(self): [0.744417, 0.255583], [0.255583, 0.744417], ], - columns=pd.MultiIndex.from_tuples( - [("Document Topic Matrix", 0), ("Document Topic Matrix", 1)] - ), + columns=[0, 1] ) s_topic_term_true = pd.DataFrame( [[2.249368, 0.772283], [0.772283, 2.249369]], - columns=pd.MultiIndex.from_tuples( - [("Topic Term Matrix", "football"), ("Topic Term Matrix", "music")] - ), + columns=["football","music"], ) pd.testing.assert_frame_equal( diff --git a/texthero/representation.py b/texthero/representation.py index 3cbf2dae..43ff1998 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1324,9 +1324,7 @@ def topic_matrices( # Create s_document_topic and s_topic_term (both multiindexed) # Create s_document_topic - s_document_topic_columns = pd.MultiIndex.from_product( - [["Document Topic Matrix"], range(n_topics)] - ) + s_document_topic_columns = list(range(n_topics)) if issparse(document_topic_matrix): s_document_topic = pd.DataFrame.sparse.from_spmatrix( @@ -1344,9 +1342,7 @@ def topic_matrices( ) # Create s_topic_term - s_topic_term_columns = pd.MultiIndex.from_product( - [["Topic Term Matrix"], s_document_term.columns.levels[1].tolist()] - ) + s_topic_term_columns = list(s_document_term.columns) if issparse(topic_term_matrix): s_topic_term = pd.DataFrame.sparse.from_spmatrix( @@ -1452,7 +1448,7 @@ def relevant_words_per_topic( """ # Define parameters for pyLDAvis. 
- vocab = s_document_term.columns.levels[1].tolist() + vocab = list(s_document_term.columns) doc_lengths = list(s_document_term.sum(axis=1)) term_frequency = list(s_document_term.sum(axis=0)) From dc42ed1e6ced0c33a13d15fe1abc581163a55c75 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 22 Sep 2020 14:59:17 +0200 Subject: [PATCH 42/42] fix formatting --- tests/test_representation.py | 11 ++++------- texthero/_types.py | 4 +++- texthero/representation.py | 14 ++++++++++---- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 5692bcb5..6b1cda84 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -360,13 +360,11 @@ def test_topic_matrices_clustering_for_second_input(self): ) s_document_topic_true = pd.DataFrame( - [[0, 1], [1, 0], [0, 1], [1, 0]], - columns=[ 0,1] + [[0, 1], [1, 0], [0, 1], [1, 0]], columns=[0, 1] ) s_topic_term_true = pd.DataFrame( - [[0.0, 3.021651], [3.021651, 0.0]], - columns= ["football", "music"] + [[0.0, 3.021651], [3.021651, 0.0]], columns=["football", "music"] ) pd.testing.assert_frame_equal( @@ -407,12 +405,11 @@ def test_visualize_topics_topic_modelling_for_second_input(self): [0.744417, 0.255583], [0.255583, 0.744417], ], - columns=[0, 1] + columns=[0, 1], ) s_topic_term_true = pd.DataFrame( - [[2.249368, 0.772283], [0.772283, 2.249369]], - columns=["football","music"], + [[2.249368, 0.772283], [0.772283, 2.249369]], columns=["football", "music"], ) pd.testing.assert_frame_equal( diff --git a/texthero/_types.py b/texthero/_types.py index 597aa128..897797bc 100644 --- a/texthero/_types.py +++ b/texthero/_types.py @@ -199,7 +199,9 @@ def is_numeric(x): return True def is_list_of_numbers(cell): - return isinstance(cell, (list, tuple, np.ndarray)) and all(is_numeric(x) for x in cell) + return isinstance(cell, (list, tuple, np.ndarray)) and all( + is_numeric(x) for x in cell + ) try: first_non_nan_value = s.loc[s.first_valid_index()] diff --git a/texthero/representation.py b/texthero/representation.py index 43ff1998..7003f4ea 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -917,7 +917,10 @@ def meanshift( def truncatedSVD( - input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, n_iter=5, random_state=None, + input_matrix: Union[pd.Series, pd.DataFrame], + n_components=2, + n_iter=5, + random_state=None, ) -> pd.Series: """ Perform TruncatedSVD on the given pandas Series. @@ -978,7 +981,6 @@ def truncatedSVD( n_components=n_components, n_iter=n_iter, random_state=random_state ) - if isinstance(input_matrix, pd.DataFrame): input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") @@ -986,7 +988,8 @@ def truncatedSVD( input_matrix_for_vectorization = list(input_matrix) result = pd.Series( - list(truncatedSVD.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index + list(truncatedSVD.fit_transform(input_matrix_for_vectorization)), + index=input_matrix.index, ) return result @@ -1065,7 +1068,10 @@ def lda( else: input_matrix_for_vectorization = list(s) - result = pd.Series(list(lda.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index) + result = pd.Series( + list(lda.fit_transform(input_matrix_for_vectorization)), + index=input_matrix.index, + ) return result
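
A short end-to-end sketch of the pipeline this series builds up, mirroring the
calls used in the tests above (illustrative only, assuming the function names
introduced in these patches):

    import pandas as pd
    from texthero import preprocessing, representation

    s = pd.Series(
        [
            "Football, Sports, Soccer",
            "music, violin, orchestra",
            "football, fun, sports",
            "music, band, guitar",
        ]
    )

    # Document-term DataFrame; keep the full vocabulary.
    s_tfidf = (
        s.pipe(preprocessing.clean)
        .pipe(preprocessing.tokenize)
        .pipe(representation.tfidf)
    )

    # The second input can come from clustering ...
    s_cluster = (
        s_tfidf.pipe(representation.normalize)
        .pipe(representation.pca, n_components=2, random_state=42)
        .pipe(representation.kmeans, n_clusters=2, random_state=42)
    )
    # ... or from a topic model such as LDA.
    s_lda = s_tfidf.pipe(representation.normalize).pipe(
        representation.lda, n_components=2, random_state=42
    )

    # Topic matrices from the document-term DataFrame plus either s_cluster or s_lda,
    # l1-normalized into distributions, then relevant words per topic.
    s_document_topic, s_topic_term = representation.topic_matrices(s_tfidf, s_cluster)
    s_document_topic_distribution = representation.normalize(s_document_topic, norm="l1")
    s_topic_term_distribution = representation.normalize(s_topic_term, norm="l1")
    representation.relevant_words_per_topic(
        s_tfidf,
        s_document_topic_distribution,
        s_topic_term_distribution,
        n_words=3,
    )

    # visualize_topics (visualization.py above) wraps these steps and renders
    # the pyLDAvis figure, inline in notebooks or via pyLDAvis.show otherwise.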