From ec37e13f7f8a272a26d6875fbdf30867df120204 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 21 Nov 2024 10:56:29 +0100 Subject: [PATCH 01/38] Fixing changelog with correct account --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 84ac28256..b96fb767e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -77,7 +77,7 @@ Minor changes Dockès `. * Added a `DropColumnIfNull` transformer that drops columns that contain only null - values. :pr:`1115` by :user: `Riccardo Cappuzzo ` + values. :pr:`1115` by :user: `Riccardo Cappuzzo ` Bug fixes --------- From 4f7e46e34031ce0c4f133416177cffa89b635eb2 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 26 Nov 2024 16:55:21 +0100 Subject: [PATCH 02/38] Initial commit --- example_string_encoder.py | 33 ++++++++++++++++++++++++++ skrub/_string_encoder.py | 50 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 example_string_encoder.py create mode 100644 skrub/_string_encoder.py diff --git a/example_string_encoder.py b/example_string_encoder.py new file mode 100644 index 000000000..79ad3de60 --- /dev/null +++ b/example_string_encoder.py @@ -0,0 +1,33 @@ +# %% test string encoder +import polars as pl +from sklearn.decomposition import PCA +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import Pipeline + +from skrub._string_encoder import StringEncoder + +corpus = [ + "this is the first document", + "this document is the second document", + "and this is the third one", + "is this the first document", +] +column = pl.Series(name="this_column", values=corpus) + +# %% + +pipe = Pipeline( + [ + ("tfidf", TfidfVectorizer()), + ("pca", PCA(n_components=2)), + ] +) +# %% +a = pipe.fit_transform(corpus) + +# %% +se = StringEncoder(2) + +# %% +r = se.fit_transform(column) +# %% diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py new file mode 100644 index 000000000..523eefb92 --- /dev/null +++ b/skrub/_string_encoder.py @@ -0,0 +1,50 @@ +from sklearn.decomposition import PCA +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import Pipeline + +from . import _dataframe as sbd +from ._on_each_column import SingleColumnTransformer + + +class StringEncoder(SingleColumnTransformer): + """_summary_ + + Parameters + ---------- + + """ + + def __init__(self, pca_components=30): + self.pca_components = pca_components + + def _transform(self, X): + # TODO: vocabulary? 
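+        # Fit a tf-idf vectorization followed by a PCA projection on the
+        # input strings, then return the reduced representation.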
+ self.pipe = Pipeline( + [ + ("tfidf", TfidfVectorizer()), + ("pca", PCA(n_components=self.pca_components)), + ] + ).fit(X) + + return self.pipe.transform(X) + + def get_feature_names_out(self, X): + name = sbd.name(X) + if not name: + name = "pca" + names = [f"{name}_{idx}" for idx in range(self.pca_components)] + return names + + def fit_transform(self, X, y=None): + del y + + return self.transform(X) + + def transform(self, X): + # check_is_fitted(self) + + result = self._transform(sbd.to_numpy(X)) + names = self.get_feature_names_out(X) + result = sbd.make_dataframe_like(X, dict(zip(names, result.T))) + result = sbd.copy_index(X, result) + return result From 583250bb94b9cdf6c0a6dd3b6924aa284eb1379a Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 27 Nov 2024 15:09:22 +0100 Subject: [PATCH 03/38] Update --- skrub/_string_encoder.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 523eefb92..0d0891497 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -1,7 +1,3 @@ -from sklearn.decomposition import PCA -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.pipeline import Pipeline - from . import _dataframe as sbd from ._on_each_column import SingleColumnTransformer @@ -19,12 +15,6 @@ def __init__(self, pca_components=30): def _transform(self, X): # TODO: vocabulary? - self.pipe = Pipeline( - [ - ("tfidf", TfidfVectorizer()), - ("pca", PCA(n_components=self.pca_components)), - ] - ).fit(X) return self.pipe.transform(X) From 8686d7f4c0b7e55c8dc7c2b7896e10a402f0d6fc Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 5 Dec 2024 16:15:32 +0100 Subject: [PATCH 04/38] Updated object and added test --- skrub/_string_encoder.py | 69 +++++++++++++++++++++++++++---- skrub/tests/test_stringencoder.py | 38 +++++++++++++++++ 2 files changed, 98 insertions(+), 9 deletions(-) create mode 100644 skrub/tests/test_stringencoder.py diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 0d0891497..89ce58ced 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -1,12 +1,21 @@ +from sklearn.decomposition import PCA +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import Pipeline +from sklearn.utils.validation import check_is_fitted + from . import _dataframe as sbd from ._on_each_column import SingleColumnTransformer class StringEncoder(SingleColumnTransformer): - """_summary_ + """Generate a lightweight string encoding of a given column. First, apply a + tf-idf vectorization of the text, then reduce the dimensionality with a PCA + decomposition with the given number of parameters. Parameters ---------- + pca_components : int + Number of components to be used for the PCA decomposition. """ @@ -14,9 +23,13 @@ def __init__(self, pca_components=30): self.pca_components = pca_components def _transform(self, X): - # TODO: vocabulary? + result = self.pipe.transform(sbd.to_numpy(X)) + + names = self.get_feature_names_out(X) + result = sbd.make_dataframe_like(X, dict(zip(names, result.T))) + result = sbd.copy_index(X, result) - return self.pipe.transform(X) + return result def get_feature_names_out(self, X): name = sbd.name(X) @@ -26,15 +39,53 @@ def get_feature_names_out(self, X): return names def fit_transform(self, X, y=None): + """Fit the encoder and transform a column. + + Parameters + ---------- + X : Pandas or Polars series. + The column to transform. + y : None. 
Ignored + + Returns + ------- + A Pandas or Polars dataframe (depending on input) with shape + (len(X), pca_components). New features will be named `{col_name}_{component}` + if the series has a name, and `pca_{component}` if it does not. + """ del y + self.pipe = Pipeline( + [ + ("tfidf", TfidfVectorizer()), + ("pca", PCA(n_components=self.pca_components)), + ] + ) + + self.pipe.fit(sbd.to_numpy(X)) + + self._is_fitted = True return self.transform(X) def transform(self, X): - # check_is_fitted(self) + """Transform a column. - result = self._transform(sbd.to_numpy(X)) - names = self.get_feature_names_out(X) - result = sbd.make_dataframe_like(X, dict(zip(names, result.T))) - result = sbd.copy_index(X, result) - return result + Parameters + ---------- + X : Pandas or Polars series. + The column to transform. + + Returns + ------- + A Pandas or Polars dataframe (depending on input) with shape + (len(X), pca_components). New features will be named `{col_name}_{component}` + if the series has a name, and `pca_{component}` if it does not. + """ + check_is_fitted(self) + return self._transform(X) + + def __sklearn_is_fitted__(self): + """ + Check fitted status and return a Boolean value. + """ + return hasattr(self, "_is_fitted") and self._is_fitted diff --git a/skrub/tests/test_stringencoder.py b/skrub/tests/test_stringencoder.py new file mode 100644 index 000000000..b3d298f19 --- /dev/null +++ b/skrub/tests/test_stringencoder.py @@ -0,0 +1,38 @@ +import pytest +from sklearn.decomposition import PCA +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import Pipeline + +from skrub import _dataframe as sbd +from skrub._string_encoder import StringEncoder + + +@pytest.fixture +def encode_column(df_module): + corpus = [ + "this is the first document", + "this document is the second document", + "and this is the third one", + "is this the first document", + ] + + return df_module.make_column("test_column", corpus) + + +def test_encoding(encode_column, df_module): + pipe = Pipeline( + [ + ("tfidf", TfidfVectorizer()), + ("pca", PCA(n_components=2)), + ] + ) + check = pipe.fit_transform(sbd.to_numpy(encode_column)) + + names = [f"test_column_{idx}" for idx in range(2)] + + check_df = df_module.make_dataframe(dict(zip(names, check.T))) + + se = StringEncoder(2) + result = se.fit_transform(encode_column) + + df_module.assert_frame_equal(check_df, result) From eb4de978780b450890e29a47fdbf8d694ceb2262 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 5 Dec 2024 16:26:11 +0100 Subject: [PATCH 05/38] quick update to changelog --- CHANGES.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index c2c43c21b..7b1417558 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,13 @@ Ongoing development Skrub is a very recent package. It is currently undergoing fast development and backward compatibility is not ensured. +Release 0.4.1 +============= + +New features +------------ +* Added :class:`StringEncoder`. 
:pr:`1159` by :user:`Riccardo Cappuzzo ` + Release 0.4.0 ============= From 96423ba94e555350e9b30b28b1da4a30eb560f16 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 5 Dec 2024 16:26:23 +0100 Subject: [PATCH 06/38] Fixed test --- skrub/tests/test_stringencoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skrub/tests/test_stringencoder.py b/skrub/tests/test_stringencoder.py index b3d298f19..eb6052bc5 100644 --- a/skrub/tests/test_stringencoder.py +++ b/skrub/tests/test_stringencoder.py @@ -35,4 +35,8 @@ def test_encoding(encode_column, df_module): se = StringEncoder(2) result = se.fit_transform(encode_column) + # Converting dtypes to avoid nullable shenanigans + check_df = sbd.pandas_convert_dtypes(check_df) + result = sbd.pandas_convert_dtypes(result) + df_module.assert_frame_equal(check_df, result) From 3a1f6ebf0b00c9e9a4bb2f722d1ecf16855862ef Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 14:26:12 +0100 Subject: [PATCH 07/38] Replacing PCA with TruncatedSVD --- skrub/_string_encoder.py | 26 +++++++++++++------------- skrub/tests/test_stringencoder.py | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 89ce58ced..186899153 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -1,4 +1,4 @@ -from sklearn.decomposition import PCA +from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import Pipeline from sklearn.utils.validation import check_is_fitted @@ -9,18 +9,18 @@ class StringEncoder(SingleColumnTransformer): """Generate a lightweight string encoding of a given column. First, apply a - tf-idf vectorization of the text, then reduce the dimensionality with a PCA - decomposition with the given number of parameters. + tf-idf vectorization of the text, then reduce the dimensionality with a + truncated SVD decomposition with the given number of parameters. Parameters ---------- - pca_components : int + components : int Number of components to be used for the PCA decomposition. """ - def __init__(self, pca_components=30): - self.pca_components = pca_components + def __init__(self, components=30): + self.components = components def _transform(self, X): result = self.pipe.transform(sbd.to_numpy(X)) @@ -34,8 +34,8 @@ def _transform(self, X): def get_feature_names_out(self, X): name = sbd.name(X) if not name: - name = "pca" - names = [f"{name}_{idx}" for idx in range(self.pca_components)] + name = "tsvd" + names = [f"{name}_{idx}" for idx in range(self.components)] return names def fit_transform(self, X, y=None): @@ -50,14 +50,14 @@ def fit_transform(self, X, y=None): Returns ------- A Pandas or Polars dataframe (depending on input) with shape - (len(X), pca_components). New features will be named `{col_name}_{component}` - if the series has a name, and `pca_{component}` if it does not. + (len(X), tsvd_components). New features will be named `{col_name}_{component}` + if the series has a name, and `tsvd_{component}` if it does not. """ del y self.pipe = Pipeline( [ ("tfidf", TfidfVectorizer()), - ("pca", PCA(n_components=self.pca_components)), + ("tsvd", TruncatedSVD(n_components=self.components)), ] ) @@ -78,8 +78,8 @@ def transform(self, X): Returns ------- A Pandas or Polars dataframe (depending on input) with shape - (len(X), pca_components). New features will be named `{col_name}_{component}` - if the series has a name, and `pca_{component}` if it does not. 
+ (len(X), components). New features will be named `{col_name}_{component}` + if the series has a name, and `tsvd_{component}` if it does not. """ check_is_fitted(self) return self._transform(X) diff --git a/skrub/tests/test_stringencoder.py b/skrub/tests/test_stringencoder.py index eb6052bc5..649652978 100644 --- a/skrub/tests/test_stringencoder.py +++ b/skrub/tests/test_stringencoder.py @@ -1,5 +1,5 @@ import pytest -from sklearn.decomposition import PCA +from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import Pipeline @@ -23,7 +23,7 @@ def test_encoding(encode_column, df_module): pipe = Pipeline( [ ("tfidf", TfidfVectorizer()), - ("pca", PCA(n_components=2)), + ("tsvd", TruncatedSVD(n_components=2)), ] ) check = pipe.fit_transform(sbd.to_numpy(encode_column)) From 398f9db9dff5438e00b9a3f3465d3f291b32f7ad Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 14:52:25 +0100 Subject: [PATCH 08/38] Updated init --- skrub/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skrub/__init__.py b/skrub/__init__.py index 571c3e295..bcf21bcff 100644 --- a/skrub/__init__.py +++ b/skrub/__init__.py @@ -17,6 +17,7 @@ from ._reporting import TableReport, patch_display, unpatch_display from ._select_cols import DropCols, SelectCols from ._similarity_encoder import SimilarityEncoder +from ._string_encoder import StringEncoder from ._table_vectorizer import TableVectorizer from ._tabular_learner import tabular_learner from ._text_encoder import TextEncoder @@ -53,5 +54,6 @@ "SelectCols", "DropCols", "TextEncoder", + "StringEncoder", "column_associations", ] From 3a45f192a5284cd85320a2faf7ab1734f65a9ea0 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 14:52:40 +0100 Subject: [PATCH 09/38] Updated example to add StringEncoder --- examples/02_text_with_string_encoders.py | 28 +++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py index 009a30382..d99397861 100644 --- a/examples/02_text_with_string_encoders.py +++ b/examples/02_text_with_string_encoders.py @@ -132,7 +132,7 @@ def plot_gap_feature_importance(X_trans): # We set ``n_components`` to 30; however, to achieve the best performance, we would # need to find the optimal value for this hyperparameter using either |GridSearchCV| # or |RandomizedSearchCV|. We skip this part to keep the computation time for this -# example small. +# small example. # # Recall that the ROC AUC is a metric that quantifies the ranking power of estimators, # where a random estimator scores 0.5, and an oracle —providing perfect predictions— @@ -221,6 +221,25 @@ def plot_box_results(named_results): plot_box_results(results) +# %% +# |TextEncoder| embeddings are very strong, but they are also quite expensive to +# train. A simpler, faster alternative for encoding strings is the |StringEncoder|, +# which works by first performing a tf-idf vectorization of the text, and then +# following it with TruncatedSVD to reduce the number of dimensions to, in this +# case, 30. 
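+#
+# Conceptually, the |StringEncoder| used here is close to the following
+# scikit-learn pipeline (a rough sketch: the actual transformer also takes
+# care of dataframe output and column naming):
+#
+# .. code-block:: python
+#
+#     from sklearn.decomposition import TruncatedSVD
+#     from sklearn.feature_extraction.text import TfidfVectorizer
+#     from sklearn.pipeline import make_pipeline
+#
+#     make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=30))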
+from skrub import StringEncoder
+
+string_encoder = StringEncoder(components=30)
+
+string_encoder_pipe = clone(gap_pipe).set_params(
+    **{"tablevectorizer__high_cardinality": string_encoder}
+)
+string_encoder_results = cross_validate(string_encoder_pipe, X, y, scoring="roc_auc")
+results.append(("StringEncoder", string_encoder_results))
+
+plot_box_results(results)
+
+
 # %%
 # The performance of the |TextEncoder| is significantly stronger than that of
 # the syntactic encoders, which is expected. But how long does it take to load
@@ -232,7 +251,7 @@ def plot_box_results(named_results):
 
 def plot_performance_tradeoff(results):
     fig, ax = plt.subplots(figsize=(5, 4), dpi=200)
-    markers = ["s", "o", "^"]
+    markers = ["s", "o", "^", "x"]
     for idx, (name, result) in enumerate(results):
         ax.scatter(
            result["fit_time"],
@@ -293,8 +312,12 @@ def plot_performance_tradeoff(results):
 # During the subsequent cross-validation iterations, the model is simply copied,
 # which reduces computation time for the remaining folds.
 #
+# Interestingly, |StringEncoder| achieves performance remarkably similar to that
+# of |GapEncoder|, while being significantly faster.
+#
 # Conclusion
 # ----------
 # In conclusion, |TextEncoder| provides powerful vectorization for text, but at
 # the cost of longer computation times and the need for additional dependencies,
-# such as torch.
+# such as torch. |StringEncoder| represents a simpler alternative that can provide
+# good performance at a fraction of the cost of more complex methods.

From 51856b35687d67e3c9ea2c6551968cfb1d6733fc Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Mon, 9 Dec 2024 14:58:31 +0100
Subject: [PATCH 10/38] Updating changelog.

---
 CHANGES.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 03f16dd7e..8c61fc252 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -14,7 +14,9 @@ It is currently undergoing fast development and backward compatibility is not en
 
 New features
 ------------
-
+* The :class:`StringEncoder` encodes strings using tf-idf and truncated SVD
+  decomposition and provides a cheaper alternative to :class:`TextEncoder`.
+  :pr:`1159` by :user:`Riccardo Cappuzzo`.
 
 Changes
 -------

From 58a3559c9ed47eb915ea8eee68a1d18da819c94a Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Mon, 9 Dec 2024 15:24:03 +0100
Subject: [PATCH 11/38] =?UTF-8?q?=F0=9F=93=9D=20Updating=20docstrings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 skrub/_string_encoder.py | 62 +++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py
index 186899153..95fbeb3e3 100644
--- a/skrub/_string_encoder.py
+++ b/skrub/_string_encoder.py
@@ -8,15 +8,50 @@
 
 
 class StringEncoder(SingleColumnTransformer):
-    """Generate a lightweight string encoding of a given column. First, apply a
-    tf-idf vectorization of the text, then reduce the dimensionality with a
-    truncated SVD decomposition with the given number of parameters.
+    """Generate a lightweight string encoding of a given column using tf-idf \
+    vectorization and truncated SVD.
+
+    First, apply a tf-idf vectorization of the text, then reduce the dimensionality
+    with a truncated SVD decomposition with the given number of components.
+
+    New features will be named `{col_name}_{component}` if the series has a name,
+    and `tsvd_{component}` if it does not.
Parameters ---------- components : int Number of components to be used for the PCA decomposition. + See Also + -------- + MinHashEncoder : + Encode string columns as a numeric array with the minhash method. + GapEncoder : + Encode string columns by constructing latent topics. + SimilarityEncoder : + Encode string columns as a numeric array with n-gram string similarity. + TextEncoder : + Encode string columns using pre-trained language models. + + Examples + -------- + >>> import pandas as pd + >>> from skrub import StringEncoder + + We will encode the comments using 2 components: + + >>> enc = StringEncoder(components=2) + >>> X = pd.Series([ + ... "The professor snatched a good interview out of the jaws of these questions.", + ... "Bookmarking this to watch later.", + ... "When you don't know the lyrics of the song except the chorus", + ... ], name='video comments') + + >>> enc.fit_transform(X) # doctest: +SKIP + video comments_0 video comments_1 + 0 8.218069e-01 4.557474e-17 + 1 6.971618e-16 1.000000e+00 + 2 8.218069e-01 -3.046564e-16 """ def __init__(self, components=30): @@ -25,13 +60,13 @@ def __init__(self, components=30): def _transform(self, X): result = self.pipe.transform(sbd.to_numpy(X)) - names = self.get_feature_names_out(X) + names = self._get_feature_names_out(X) result = sbd.make_dataframe_like(X, dict(zip(names, result.T))) result = sbd.copy_index(X, result) return result - def get_feature_names_out(self, X): + def _get_feature_names_out(self, X): name = sbd.name(X) if not name: name = "tsvd" @@ -43,15 +78,15 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : Pandas or Polars series. + X : Pandas or Polars series The column to transform. - y : None. Ignored + y : None + Unused. Here for compatibility with scikit-learn. Returns ------- - A Pandas or Polars dataframe (depending on input) with shape - (len(X), tsvd_components). New features will be named `{col_name}_{component}` - if the series has a name, and `tsvd_{component}` if it does not. + X_out: Pandas or Polars dataframe with shape (len(X), tsvd_components) + The embedding representation of the input. """ del y self.pipe = Pipeline( @@ -72,14 +107,13 @@ def transform(self, X): Parameters ---------- - X : Pandas or Polars series. + X : Pandas or Polars series The column to transform. Returns ------- - A Pandas or Polars dataframe (depending on input) with shape - (len(X), components). New features will be named `{col_name}_{component}` - if the series has a name, and `tsvd_{component}` if it does not. + X_out: Pandas or Polars dataframe with shape (len(X), tsvd_components) + The embedding representation of the input. """ check_is_fitted(self) return self._transform(X) From 8e4fce2850b959a7a1c1463ab63eb608a4f39ef5 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 15:35:58 +0100 Subject: [PATCH 12/38] =?UTF-8?q?=F0=9F=93=9D=20Fixing=20example?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/02_text_with_string_encoders.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py index d99397861..aa5abb8cd 100644 --- a/examples/02_text_with_string_encoders.py +++ b/examples/02_text_with_string_encoders.py @@ -17,6 +17,9 @@ .. |TextEncoder| replace:: :class:`~skrub.TextEncoder` +.. |StringEncoder| replace:: + :class:`~skrub.StringEncoder` + .. 
|TableReport| replace:: :class:`~skrub.TableReport` From afdb361b8a80456e77c30c6a03009afc3df57597 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 16:20:31 +0100 Subject: [PATCH 13/38] =?UTF-8?q?=E2=9C=85=20Fixing=20tests=20and=20renami?= =?UTF-8?q?ng=20test=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- skrub/_string_encoder.py | 37 +++++++++++-------- ...tringencoder.py => test_string_encoder.py} | 13 ++++++- 2 files changed, 33 insertions(+), 17 deletions(-) rename skrub/tests/{test_stringencoder.py => test_string_encoder.py} (71%) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 95fbeb3e3..4d245124c 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -19,7 +19,7 @@ class StringEncoder(SingleColumnTransformer): Parameters ---------- - components : int + n_components : int Number of components to be used for the PCA decomposition. See Also @@ -40,7 +40,7 @@ class StringEncoder(SingleColumnTransformer): We will encode the comments using 2 components: - >>> enc = StringEncoder(components=2) + >>> enc = StringEncoder(n_components=2) >>> X = pd.Series([ ... "The professor snatched a good interview out of the jaws of these questions.", ... "Bookmarking this to watch later.", @@ -54,24 +54,26 @@ class StringEncoder(SingleColumnTransformer): 2 8.218069e-01 -3.046564e-16 """ - def __init__(self, components=30): - self.components = components + def __init__(self, n_components=30): + self.n_components = n_components def _transform(self, X): result = self.pipe.transform(sbd.to_numpy(X)) - names = self._get_feature_names_out(X) - result = sbd.make_dataframe_like(X, dict(zip(names, result.T))) + result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T))) result = sbd.copy_index(X, result) return result - def _get_feature_names_out(self, X): - name = sbd.name(X) - if not name: - name = "tsvd" - names = [f"{name}_{idx}" for idx in range(self.components)] - return names + def get_feature_names_out(self): + """Get output feature names for transformation. + + Returns + ------- + feature_names_out : list of str objects + Transformed feature names. + """ + return list(self.all_outputs_) def fit_transform(self, X, y=None): """Fit the encoder and transform a column. @@ -85,17 +87,22 @@ def fit_transform(self, X, y=None): Returns ------- - X_out: Pandas or Polars dataframe with shape (len(X), tsvd_components) + X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components) The embedding representation of the input. """ del y self.pipe = Pipeline( [ ("tfidf", TfidfVectorizer()), - ("tsvd", TruncatedSVD(n_components=self.components)), + ("tsvd", TruncatedSVD(n_components=self.n_components)), ] ) + name = sbd.name(X) + if not name: + name = "tsvd" + self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components)] + self.pipe.fit(sbd.to_numpy(X)) self._is_fitted = True @@ -112,7 +119,7 @@ def transform(self, X): Returns ------- - X_out: Pandas or Polars dataframe with shape (len(X), tsvd_components) + X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components) The embedding representation of the input. 
""" check_is_fitted(self) diff --git a/skrub/tests/test_stringencoder.py b/skrub/tests/test_string_encoder.py similarity index 71% rename from skrub/tests/test_stringencoder.py rename to skrub/tests/test_string_encoder.py index 649652978..8117c6ff0 100644 --- a/skrub/tests/test_stringencoder.py +++ b/skrub/tests/test_string_encoder.py @@ -16,7 +16,7 @@ def encode_column(df_module): "is this the first document", ] - return df_module.make_column("test_column", corpus) + return df_module.make_column("col1", corpus) def test_encoding(encode_column, df_module): @@ -28,7 +28,7 @@ def test_encoding(encode_column, df_module): ) check = pipe.fit_transform(sbd.to_numpy(encode_column)) - names = [f"test_column_{idx}" for idx in range(2)] + names = [f"col1_{idx}" for idx in range(2)] check_df = df_module.make_dataframe(dict(zip(names, check.T))) @@ -40,3 +40,12 @@ def test_encoding(encode_column, df_module): result = sbd.pandas_convert_dtypes(result) df_module.assert_frame_equal(check_df, result) + + +def test_get_feature_names_out(encode_column): + """Test that ``get_feature_names_out`` returns the correct feature names.""" + encoder = StringEncoder(n_components=4) + + encoder.fit(encode_column) + expected_columns = ["col1_0", "col1_1", "col1_2", "col1_3"] + assert encoder.get_feature_names_out() == expected_columns From 6c6d884c1ebc58970259ebc05825162f2c2ea667 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 16:24:08 +0100 Subject: [PATCH 14/38] =?UTF-8?q?=E2=9C=85=20Fixing=20coverage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- skrub/tests/test_string_encoder.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 8117c6ff0..c9dd8213d 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -42,10 +42,27 @@ def test_encoding(encode_column, df_module): df_module.assert_frame_equal(check_df, result) -def test_get_feature_names_out(encode_column): +def test_get_feature_names_out(encode_column, df_module): """Test that ``get_feature_names_out`` returns the correct feature names.""" encoder = StringEncoder(n_components=4) encoder.fit(encode_column) expected_columns = ["col1_0", "col1_1", "col1_2", "col1_3"] assert encoder.get_feature_names_out() == expected_columns + + # Checking that a series with an empty name generates the proper column names + X = df_module.make_column( + None, + [ + "this is the first document", + "this document is the second document", + "and this is the third one", + "is this the first document", + ], + ) + + encoder = StringEncoder(n_components=4) + + encoder.fit(X) + expected_columns = ["tsvd_0", "tsvd_1", "tsvd_2", "tsvd_3"] + assert encoder.get_feature_names_out() == expected_columns From 9366d90ebfc2f8ba8b1d5494d493bb3c20ed9ad0 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 9 Dec 2024 16:36:29 +0100 Subject: [PATCH 15/38] =?UTF-8?q?=F0=9F=90=9B=20Fixing=20the=20name=20of?= =?UTF-8?q?=20a=20variable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/02_text_with_string_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py index aa5abb8cd..c511157cc 100644 --- a/examples/02_text_with_string_encoders.py +++ b/examples/02_text_with_string_encoders.py @@ -232,7 +232,7 @@ def 
plot_box_results(named_results):
 # case, 30.
 from skrub import StringEncoder
 
-string_encoder = StringEncoder(components=30)
+string_encoder = StringEncoder(n_components=30)
 
 string_encoder_pipe = clone(gap_pipe).set_params(
     **{"tablevectorizer__high_cardinality": string_encoder}

From e8f308e2d4885b49454bd4f8c62b26a6d91ccc9e Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Wed, 11 Dec 2024 14:13:13 +0100
Subject: [PATCH 16/38] Addressing comments in review

---
 CHANGES.rst                              |  2 +-
 example_string_encoder.py                | 33 ------------------------
 examples/02_text_with_string_encoders.py |  5 ++--
 skrub/_string_encoder.py                 | 22 ++++++++--------
 4 files changed, 15 insertions(+), 47 deletions(-)
 delete mode 100644 example_string_encoder.py

diff --git a/CHANGES.rst b/CHANGES.rst
index 968d4d75d..58f07f4ca 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -18,7 +18,7 @@ Release 0.4.1
 New features
 ------------
 * The :class:`StringEncoder` encodes strings using tf-idf and truncated SVD
-  decomposition and provides a cheaper alternative to :class:`TextEncoder`.
+  decomposition and provides a cheaper alternative to :class:`GapEncoder`.
   :pr:`1159` by :user:`Riccardo Cappuzzo`.
 
 Changes
diff --git a/example_string_encoder.py b/example_string_encoder.py
deleted file mode 100644
index 79ad3de60..000000000
--- a/example_string_encoder.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# %% test string encoder
-import polars as pl
-from sklearn.decomposition import PCA
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.pipeline import Pipeline
-
-from skrub._string_encoder import StringEncoder
-
-corpus = [
-    "this is the first document",
-    "this document is the second document",
-    "and this is the third one",
-    "is this the first document",
-]
-column = pl.Series(name="this_column", values=corpus)
-
-# %%
-
-pipe = Pipeline(
-    [
-        ("tfidf", TfidfVectorizer()),
-        ("pca", PCA(n_components=2)),
-    ]
-)
-# %%
-a = pipe.fit_transform(corpus)
-
-# %%
-se = StringEncoder(2)
-
-# %%
-r = se.fit_transform(column)
-# %%
diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py
index c511157cc..b81431462 100644
--- a/examples/02_text_with_string_encoders.py
+++ b/examples/02_text_with_string_encoders.py
@@ -226,8 +226,9 @@ def plot_box_results(named_results):
 
 # %%
 # |TextEncoder| embeddings are very strong, but they are also quite expensive to
-# train. A simpler, faster alternative for encoding strings is the |StringEncoder|,
-# which works by first performing a tf-idf vectorization of the text, and then
+# use. A simpler, faster alternative for encoding strings is the |StringEncoder|,
+# which works by first performing a tf-idf vectorization (computing vectors of
+# rescaled word counts, see https://en.wikipedia.org/wiki/Tf%E2%80%93idf) of the text, and then
 # following it with TruncatedSVD to reduce the number of dimensions to, in this
 # case, 30.
 from skrub import StringEncoder
diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py
index 4d245124c..0163d30c4 100644
--- a/skrub/_string_encoder.py
+++ b/skrub/_string_encoder.py
@@ -57,14 +57,6 @@ class StringEncoder(SingleColumnTransformer):
     def __init__(self, n_components=30):
         self.n_components = n_components
 
-    def _transform(self, X):
-        result = self.pipe.transform(sbd.to_numpy(X))
-
-        result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T)))
-        result = sbd.copy_index(X, result)
-
-        return result
-
     def get_feature_names_out(self):
        """Get output feature names for transformation.
@@ -103,11 +95,11 @@ def fit_transform(self, X, y=None): name = "tsvd" self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components)] - self.pipe.fit(sbd.to_numpy(X)) + result = self.pipe.fit_transform(sbd.to_numpy(X)) self._is_fitted = True - return self.transform(X) + return self._transform(X, result) def transform(self, X): """Transform a column. @@ -123,7 +115,15 @@ def transform(self, X): The embedding representation of the input. """ check_is_fitted(self) - return self._transform(X) + + result = self.pipe.transform(sbd.to_numpy(X)) + return self._transform(X, result) + + def _transform(self, X, result): + result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T))) + result = sbd.copy_index(X, result) + + return result def __sklearn_is_fitted__(self): """ From 8ea92d81d962748b5576f18899a6f3c2772508d3 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 12 Dec 2024 17:21:50 +0100 Subject: [PATCH 17/38] Updating code to benchmark --- examples/benchmarking_string_encoder.py | 272 ++++++++++++++++++++++++ skrub/_string_encoder.py | 56 ++++- 2 files changed, 321 insertions(+), 7 deletions(-) create mode 100644 examples/benchmarking_string_encoder.py diff --git a/examples/benchmarking_string_encoder.py b/examples/benchmarking_string_encoder.py new file mode 100644 index 000000000..36fd958de --- /dev/null +++ b/examples/benchmarking_string_encoder.py @@ -0,0 +1,272 @@ +# %% +# Benchmarking different parameters for the StringEncoder transformer + +# %% +from skrub.datasets import fetch_toxicity + +dataset = fetch_toxicity() +X, y = dataset.X, dataset.y +X["is_toxic"] = y + +y = X.pop("is_toxic").map({"Toxic": 1, "Not Toxic": 0}) + +# %% +from skrub import TableReport + +TableReport(X) + +# %% +import numpy as np +from matplotlib import pyplot as plt +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.model_selection import cross_validate +from sklearn.pipeline import make_pipeline + +from skrub import TableVectorizer + + +def plot_box_results(named_results): + fig, ax = plt.subplots() + names, scores = zip( + *[(name, result["test_score"]) for name, result in named_results] + ) + ax.boxplot(scores) + ax.set_xticks(range(1, len(names) + 1), labels=list(names), size=12) + ax.set_ylabel("ROC AUC", size=14) + plt.title( + "AUC distribution across folds (higher is better)", + size=14, + ) + plt.show() + + +def plot_performance_tradeoff(results): + fig, ax = plt.subplots(figsize=(5, 4), dpi=200) + # markers = ["s", "o", "^", "x"] + for idx, (name, result) in enumerate(results): + ax.scatter( + result["fit_time"], + result["test_score"], + label=name, + # marker=markers[idx], + ) + mean_fit_time = np.mean(result["fit_time"]) + mean_score = np.mean(result["test_score"]) + ax.scatter( + mean_fit_time, + mean_score, + color="k", + # marker=markers[idx], + ) + std_fit_time = np.std(result["fit_time"]) + std_score = np.std(result["test_score"]) + ax.errorbar( + x=mean_fit_time, + y=mean_score, + yerr=std_score, + fmt="none", + c="k", + capsize=2, + ) + ax.errorbar( + x=mean_fit_time, + y=mean_score, + xerr=std_fit_time, + fmt="none", + c="k", + capsize=2, + ) + + ax.set_xlabel("Time to fit (seconds)") + ax.set_ylabel("ROC AUC") + ax.set_title("Prediction performance / training time trade-off") + + ax.annotate( + "", + xy=(1.5, 0.98), + xytext=(8.5, 0.90), + arrowprops=dict(arrowstyle="->", mutation_scale=15), + ) + # ax.text(8, 0.86, "Best time / \nperformance trade-off") + ax.legend(bbox_to_anchor=(1, 0.3)) + plt.show() + + +# %% +from skrub 
import StringEncoder + +results = [] + +# %% +default_pipe = make_pipeline( + TableVectorizer(high_cardinality=StringEncoder(n_components=30)), + HistGradientBoostingClassifier(), +) +gap_results = cross_validate(default_pipe, X, y, scoring="roc_auc") +results.append(("tfidf_default", gap_results)) + +plot_box_results(results) + +# %% +hashing_pipe = make_pipeline( + TableVectorizer(high_cardinality=StringEncoder(n_components=30)), + HistGradientBoostingClassifier(), +) +results_ = cross_validate(hashing_pipe, X, y, scoring="roc_auc") +results.append(("hashing_default", results_)) + +plot_box_results(results) + +# %% +configurations = { + "ngram_range": [(1, 1), (3, 4)], + "analyzer": ["word", "char", "char_wb"], + "vectorizer": ["tfidf"], + "n_components": [30], + # "tf_idf_followup": [True], +} + +# %% +from sklearn.model_selection import ParameterGrid + +config_grid = ParameterGrid(configurations) + +import polars as pl +from tqdm import tqdm + + +# %% +def format_name(params): + s = ( + f'{params["vectorizer"]},' + + f'{params["ngram_range"]},' + + f'{params["analyzer"]},' + + f'{params["tf_idf_followup"]}' + ) + return s + + +results = [] + + +for params in tqdm(config_grid, total=len(config_grid)): + print(params) + this_pipe = make_pipeline( + TableVectorizer(high_cardinality=StringEncoder(**params)), + HistGradientBoostingClassifier(), + ) + results_ = cross_validate(this_pipe, X, y, scoring="roc_auc") + print(results_) + params.update( + { + "fit_time": list(results_["fit_time"]), + "test_score": list(results_["test_score"]), + "ngram_range": str(params["ngram_range"]), + } + ) + results.append(params) + +df = pl.from_dicts(results) + +# %% +df = df.with_columns( + mean_fit_time=pl.col("fit_time").list.mean(), + mean_score=pl.col("test_score").list.mean(), + std_fit_time=pl.col("fit_time").list.std(), + std_score=pl.col("test_score").list.std(), +) + +# %% +plot_performance_tradeoff(results) + +# %% + +# %% +import pandas as pd +import seaborn as sns + + +def pareto_frontier_plot( + data, + x_var, + y_var, + hue_var, + # palette, + # hue_order, + ax, + ax_title=None, + ax_xlabel="", +): + if not isinstance(data, pd.DataFrame): + raise ValueError() + x = data[x_var] + y = data[y_var] + + # ax.set_xscale("log") + + xs = np.array(x) + ys = np.array(y) + perm = np.argsort(xs) + xs = xs[perm] + ys = ys[perm] + + sns.scatterplot( + data=data, + x=x_var, + y=y_var, + hue=hue_var, + ax=ax, + palette="tab10", + # hue_order=hue_order, + ) + + # for row in df.iter_rows(named=True): + # mean_fit_time = row["mean_fit_time"] + # mean_score = row["mean_score"] + # std_fit_time = row["std_fit_time"] + # std_score = row["std_score"] + + # ax.errorbar(mean_fit_time, mean_score, std_fit_time, std_score, c="k") + + xs_pareto = [xs[0], xs[0]] + ys_pareto = [ys[0], ys[0]] + for i in range(1, len(xs)): + if ys[i] > ys_pareto[-1]: + xs_pareto.append(xs[i]) + ys_pareto.append(ys_pareto[-1]) + xs_pareto.append(xs[i]) + ys_pareto.append(ys[i]) + xs_pareto.append(ax.get_xlim()[1]) + ys_pareto.append(ys_pareto[-1]) + + ax.plot(xs_pareto, ys_pareto, "--", color="k", linewidth=2, zorder=0.8) + ax.set_ylabel("") + # ax.set_title(ax_title) + h, l = ax.get_legend_handles_labels() + # ax.legend( + # h, + # [constants.LABEL_MAPPING[hue_var][_] for _ in l], + # title=None, + # ) + ax.set_xlabel(ax_xlabel) + + # ax.set_ylim([-0.5, 0.6]) + # ax.axhspan(0, -0.5, zorder=0, alpha=0.05, color="red") + + optimal_y = ys_pareto[-1] + return (h, l), optimal_y + + +# %% +fig, axs = plt.subplots(1, 3, figsize=(10, 3)) + +for ax, 
hue_var in zip(axs, ["analyzer", "ngram_range", "vectorizer"]): + pareto_frontier_plot( + df.to_pandas(), + x_var="mean_fit_time", + y_var="mean_score", + hue_var=hue_var, + ax=ax, + ) + +# %% diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 0163d30c4..75404b4ce 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -1,5 +1,9 @@ from sklearn.decomposition import TruncatedSVD -from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.feature_extraction.text import ( + HashingVectorizer, + TfidfTransformer, + TfidfVectorizer, +) from sklearn.pipeline import Pipeline from sklearn.utils.validation import check_is_fitted @@ -54,8 +58,23 @@ class StringEncoder(SingleColumnTransformer): 2 8.218069e-01 -3.046564e-16 """ - def __init__(self, n_components=30): + def __init__( + self, + n_components=30, + vectorizer="tfidf", + ngram_range=(1, 1), + tf_idf_followup=False, + n_features=None, + max_features=None, + analyzer="word", + ): self.n_components = n_components + self.vectorizer = vectorizer + self.ngram_range = ngram_range + self.tf_idf_followup = tf_idf_followup + self.n_features = n_features + self.max_features = max_features + self.analyzer = analyzer def get_feature_names_out(self): """Get output feature names for transformation. @@ -83,12 +102,35 @@ def fit_transform(self, X, y=None): The embedding representation of the input. """ del y - self.pipe = Pipeline( - [ - ("tfidf", TfidfVectorizer()), - ("tsvd", TruncatedSVD(n_components=self.n_components)), + + if self.vectorizer == "tfidf": + self.pipe = Pipeline( + [ + ( + "tfidf", + TfidfVectorizer( + ngram_range=self.ngram_range, analyzer=self.analyzer + ), + ), + ("tsvd", TruncatedSVD(n_components=self.n_components)), + ] + ) + + elif self.vectorizer == "hashing": + pipe_elements = [ + ( + "hashing", + HashingVectorizer( + ngram_range=self.ngram_range, analyzer=self.analyzer + ), + ), ] - ) + if self.tf_idf_followup: + pipe_elements.append(("tfidf", TfidfTransformer())) + pipe_elements.append(("tsvd", TruncatedSVD(n_components=self.n_components))) + self.pipe = Pipeline(pipe_elements) + else: + raise ValueError(f"Unknown vectorizer {self.vectorizer}.") name = sbd.name(X) if not name: From 8411a83bcdd1c85047662157a177196244063253 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 12 Dec 2024 18:38:26 +0100 Subject: [PATCH 18/38] updating code --- examples/benchmarking_string_encoder.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/benchmarking_string_encoder.py b/examples/benchmarking_string_encoder.py index 36fd958de..0c1c44fb4 100644 --- a/examples/benchmarking_string_encoder.py +++ b/examples/benchmarking_string_encoder.py @@ -21,6 +21,9 @@ from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection import cross_validate from sklearn.pipeline import make_pipeline +import pandas as pd +import seaborn as sns + from skrub import TableVectorizer @@ -119,11 +122,11 @@ def plot_performance_tradeoff(results): # %% configurations = { - "ngram_range": [(1, 1), (3, 4)], + "ngram_range": [(1, 1),(1,2) ,(3, 4)], "analyzer": ["word", "char", "char_wb"], - "vectorizer": ["tfidf"], + "vectorizer": ["tfidf", "hashing"], "n_components": [30], - # "tf_idf_followup": [True], + "tf_idf_followup": [True, False], } # %% @@ -176,6 +179,8 @@ def format_name(params): std_score=pl.col("test_score").list.std(), ) +df.write_csv("results.csv") + # %% plot_performance_tradeoff(results) @@ -268,5 +273,5 @@ def pareto_frontier_plot( 
hue_var=hue_var, ax=ax, ) - +fig.savefig("results.png") # %% From 190ce2a1ba1ac5fa05f75e9259c4b1d74802875a Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 13 Dec 2024 11:05:56 +0100 Subject: [PATCH 19/38] Updating script --- examples/benchmarking_string_encoder.py | 208 ++++++++---------------- examples/results.png | Bin 0 -> 39551 bytes results.parquet | Bin 0 -> 7524 bytes 3 files changed, 64 insertions(+), 144 deletions(-) create mode 100644 examples/results.png create mode 100644 results.parquet diff --git a/examples/benchmarking_string_encoder.py b/examples/benchmarking_string_encoder.py index 0c1c44fb4..d5a029cd9 100644 --- a/examples/benchmarking_string_encoder.py +++ b/examples/benchmarking_string_encoder.py @@ -1,145 +1,60 @@ # %% # Benchmarking different parameters for the StringEncoder transformer +# This script is used to test different parameters to use with the StringEncoder +# and see which configurations work best. +# +# For the moment, I am only considering the Toxicity dataset to test the performance, +# and more tables should be tested to have more reliable results. It's still a +# good start. +# +# The version of the StringEncoder used here will be simplified for the next release. # %% -from skrub.datasets import fetch_toxicity - -dataset = fetch_toxicity() -X, y = dataset.X, dataset.y -X["is_toxic"] = y - -y = X.pop("is_toxic").map({"Toxic": 1, "Not Toxic": 0}) - -# %% -from skrub import TableReport - -TableReport(X) - -# %% +# Import all the required libraries import numpy as np +import pandas as pd +import polars as pl +import seaborn as sns from matplotlib import pyplot as plt from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection import cross_validate from sklearn.pipeline import make_pipeline -import pandas as pd -import seaborn as sns - - -from skrub import TableVectorizer - - -def plot_box_results(named_results): - fig, ax = plt.subplots() - names, scores = zip( - *[(name, result["test_score"]) for name, result in named_results] - ) - ax.boxplot(scores) - ax.set_xticks(range(1, len(names) + 1), labels=list(names), size=12) - ax.set_ylabel("ROC AUC", size=14) - plt.title( - "AUC distribution across folds (higher is better)", - size=14, - ) - plt.show() - - -def plot_performance_tradeoff(results): - fig, ax = plt.subplots(figsize=(5, 4), dpi=200) - # markers = ["s", "o", "^", "x"] - for idx, (name, result) in enumerate(results): - ax.scatter( - result["fit_time"], - result["test_score"], - label=name, - # marker=markers[idx], - ) - mean_fit_time = np.mean(result["fit_time"]) - mean_score = np.mean(result["test_score"]) - ax.scatter( - mean_fit_time, - mean_score, - color="k", - # marker=markers[idx], - ) - std_fit_time = np.std(result["fit_time"]) - std_score = np.std(result["test_score"]) - ax.errorbar( - x=mean_fit_time, - y=mean_score, - yerr=std_score, - fmt="none", - c="k", - capsize=2, - ) - ax.errorbar( - x=mean_fit_time, - y=mean_score, - xerr=std_fit_time, - fmt="none", - c="k", - capsize=2, - ) - - ax.set_xlabel("Time to fit (seconds)") - ax.set_ylabel("ROC AUC") - ax.set_title("Prediction performance / training time trade-off") - - ax.annotate( - "", - xy=(1.5, 0.98), - xytext=(8.5, 0.90), - arrowprops=dict(arrowstyle="->", mutation_scale=15), - ) - # ax.text(8, 0.86, "Best time / \nperformance trade-off") - ax.legend(bbox_to_anchor=(1, 0.3)) - plt.show() +from tqdm import tqdm +from skrub import StringEncoder, TableVectorizer # %% -from skrub import StringEncoder - -results = [] +# Import the toxicity 
dataset and prepare it for the experiments. +from skrub.datasets import fetch_toxicity -# %% -default_pipe = make_pipeline( - TableVectorizer(high_cardinality=StringEncoder(n_components=30)), - HistGradientBoostingClassifier(), -) -gap_results = cross_validate(default_pipe, X, y, scoring="roc_auc") -results.append(("tfidf_default", gap_results)) +dataset = fetch_toxicity() +X, y = dataset.X, dataset.y +X["is_toxic"] = y -plot_box_results(results) +y = X.pop("is_toxic").map({"Toxic": 1, "Not Toxic": 0}) -# %% -hashing_pipe = make_pipeline( - TableVectorizer(high_cardinality=StringEncoder(n_components=30)), - HistGradientBoostingClassifier(), -) -results_ = cross_validate(hashing_pipe, X, y, scoring="roc_auc") -results.append(("hashing_default", results_)) +from skrub import TableReport -plot_box_results(results) +TableReport(X) # %% +# Prepare the parameter grid to evaluate. +from sklearn.model_selection import ParameterGrid + configurations = { - "ngram_range": [(1, 1),(1,2) ,(3, 4)], + "ngram_range": [(1, 1), (1, 2), (3, 4)], "analyzer": ["word", "char", "char_wb"], "vectorizer": ["tfidf", "hashing"], "n_components": [30], "tf_idf_followup": [True, False], } -# %% -from sklearn.model_selection import ParameterGrid - config_grid = ParameterGrid(configurations) -import polars as pl -from tqdm import tqdm - # %% def format_name(params): + # Simple helper function to format the labels s = ( f'{params["vectorizer"]},' + f'{params["ngram_range"]},' @@ -149,8 +64,10 @@ def format_name(params): return s -results = [] +# %% +# Run the experiments and save all the results in a dataframe. +results = [] for params in tqdm(config_grid, total=len(config_grid)): print(params) @@ -179,25 +96,17 @@ def format_name(params): std_score=pl.col("test_score").list.std(), ) -df.write_csv("results.csv") +df.write_parquet("results.parquet") -# %% -plot_performance_tradeoff(results) # %% - -# %% -import pandas as pd -import seaborn as sns - - +# Build the Pareto frontier plot for a given set of variables, and color the +# dots by a specific variable. def pareto_frontier_plot( data, x_var, y_var, hue_var, - # palette, - # hue_order, ax, ax_title=None, ax_xlabel="", @@ -222,17 +131,8 @@ def pareto_frontier_plot( hue=hue_var, ax=ax, palette="tab10", - # hue_order=hue_order, ) - # for row in df.iter_rows(named=True): - # mean_fit_time = row["mean_fit_time"] - # mean_score = row["mean_score"] - # std_fit_time = row["std_fit_time"] - # std_score = row["std_score"] - - # ax.errorbar(mean_fit_time, mean_score, std_fit_time, std_score, c="k") - xs_pareto = [xs[0], xs[0]] ys_pareto = [ys[0], ys[0]] for i in range(1, len(xs)): @@ -246,23 +146,16 @@ def pareto_frontier_plot( ax.plot(xs_pareto, ys_pareto, "--", color="k", linewidth=2, zorder=0.8) ax.set_ylabel("") - # ax.set_title(ax_title) + ax.set_xscale("log") h, l = ax.get_legend_handles_labels() - # ax.legend( - # h, - # [constants.LABEL_MAPPING[hue_var][_] for _ in l], - # title=None, - # ) ax.set_xlabel(ax_xlabel) - # ax.set_ylim([-0.5, 0.6]) - # ax.axhspan(0, -0.5, zorder=0, alpha=0.05, color="red") - - optimal_y = ys_pareto[-1] - return (h, l), optimal_y + return (h, l) # %% +# Use the function defined above to plot three different Pareto plots that are +# colored by hue_var. 
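# A configuration sits on the dashed Pareto frontier when no other
# configuration both fits faster and reaches a higher mean score.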
fig, axs = plt.subplots(1, 3, figsize=(10, 3))

for ax, hue_var in zip(axs, ["analyzer", "ngram_range", "vectorizer"]):
    pareto_frontier_plot(
        df.to_pandas(),
        x_var="mean_fit_time",
        y_var="mean_score",
        hue_var=hue_var,
        ax=ax,
    )
fig.savefig("results.png")
# %%
# Boxplots comparing the test scores obtained with the different analyzers
sns.catplot(
    data=df.to_pandas(),
    x="analyzer",
    y="test_score",
    hue="ngram_range",
    kind="box",
    col="vectorizer",
)
# %%
g = sns.catplot(
    data=df.to_pandas(),
    x="analyzer",
    y="fit_time",
    hue="ngram_range",
    kind="box",
    col="vectorizer",
)
g.set(ylim=(0, 30))
# %%
# From the results, it's clear that the tfidf vectorizer is much faster than the
# hashing vectorizer, and achieves similar, if not better, test scores. The best
# ngram_range is (3, 4), and `char` and `char_wb` are the better analyzers.
#
# As mentioned before, this is a preliminary study, but it's already providing
# interesting results and an indication of what should be used as default
# parameters for the StringEncoder.
diff --git a/examples/results.png b/examples/results.png
new file mode 100644
index 0000000000000000000000000000000000000000..633fc6d75fb37939ba4b5c03eb81a2af476a4a4f
GIT binary patch
literal 39551
[39551 bytes of binary image data omitted]
z>E>09l4Ebwz`ZKbn6tvCYM=~){M!@g3mwo;aNBQ?Lc5z6g5jkP8tCaw1IhIV_&&>( zzYkzIlSvI1U!Cmh*B`R7SV4AEu6eaGBjZ*}OADX}@2YoM2rxlqat9Pncj3JL11DLo z)qJblF4_PEW@5 z3u+cb*~*w;nsAOv<`97Vut>K-EHs;0Jh5Wm2{f5EG=GrXzI{9O_wQ$VdbA*f2YR@d@Ucf_#h58X|aD?-ymtqB zZ_CPfk*=tGgD$&`h>g!0*-e0NPi`uMupUxW&Z#3v2RHXIHbgvo8~7BBFPmnAcotnx z6fG%qFG_D}<-`ekrxW7d`v}Py(v#x=?XbBpYZ|o#i4b(4CUM_glrlj zCML{gmHPwF2CovKBVt78ErPP=3UpOy!;hz?r|)vJc>A7-Q~3#s+{56Bi;H_mbZcp8 z>8VMEkG2-7ZXxE-Ne8h>Ht!<)c^78Nl*ijypEHp`6RxBmKTv^+pX#8y<~4dq+rQd_ z`+K20H~o(N2Dl3xcnD~L*}vw*F|FJ?HMUO+FDO^_#m&WQC~g9hf(s*G9>Rfsy(X8l z55k#06l96T#e{%-D}(VZRH6P*$)nUjH^3w9+G`NtvA#~sNNa0TfnD;5dGI`_=1@Uu z$M@~qw+a>ax;P7eiVrr~qB7Up4t-sN1)|6>Yoqzd>PC{poz+OC@k9%VSqwLbO4@Fs z1Ox`w6S;b2I7hK*RQSS%4v&f|S5M_4&F196?B}L7SbIkQqB}MGNXLy=klPA%ef*q| z{pBN(M8h5Bx05(^fgP+hbiO#(?l2}_F^agth~-RALA!D{*J?%5Bx{17a{cpr8A{nG zA}_Vuaa6Tahu4q$q0#CCungLe3cY5G`1;rY3ya*QGSTJxr%%@xH?StkoEj`phRST2 zK)S*Cv$2%HF9$<$2#qfRG}LT>#@j1&2`=vVk*LT zRYzSh+r_Q$_1X8$V!x-g^jP_w?yF$QFld`hRL;cY)|k%PEvZ(OSE7fN%f+khjuS>|)(Y;+RM|PPq zKH{q+6QJ`>5-RC1f58gt_xcp%;qQl`i2APF_w%6it(Z*)yev_rE9~{-;vX4m_3h6@ z6q)c}TczBg|Lr1Pr*Cva@EEr{$6QNgfZ{{LCQ4b!(?v!@Va}jtV(0D`R@9t~_wNnG zkQh2uk~>wp7Zc_Vi$(=nqa1bUdOvFm5uwouHwVSMj zsB4g*<_xiz^?ml$I><>1EPed&n9|DgwhabmYzx)xm1hJW-#kT)&a8aSUax#tO_Jfx z4?l}IhAT$NBd)yOcpApN_&bxYe)@=V)9|#%eDuF<6Ixg$DlJ9#Cd678B|-5)LSKo7 zE}HeU&T+=onUJW+n>aX)|p4<*req7*o%fw#Tu(b&n3Uyl;+uQfB6D zKssOlo-3AP_2z24yNrW&N2fridt@8~^$O*coGXd4BEl|LX2pGH3+dDo;RIp-QCP@X zu-Pw*UXxgjTCR9yKSC+UpyTSsQ0ws`o(-JYYqYeqs2_IqW&Y6Lq{d6UDSIs(qsBz8 zCh;{TD{JSm>zx<3bw1npCN7;_B|TN5r@K#0%R}4Dlk}y=_|=H6Wj0s9)r|{^8k^)sL`7ib4u%os)RI09%zJpLF5`qthd{7eHx27H8~aCL*Q3i-JI;Wj+n ze?HKsNcSH070;N0xDuX*i0h#Cq%AQck+|L7KEM+`f!Ql17uP`g+37om`MK3qfjGV~8}rs^y|r+ltg4RB z9l6Y2H2<>INWS+lNuKeM*%)_rr^0V}jT`*ton&jI_V2TToSm7uxVR`$o&V3j8UC-o zsZ*%3zQ4~v`^S&wU$3V^GS44s`!>D*(^#<41p%?6Rpw+se;pq^shfZETSHMBj8qVQ z1}y#O+RTH|{Ib6bgwFFLSMtxRRt@NtYM&o4$F4kS8!oo_9r2zr$IY2u4gPni+?wdQ z@I9ZBur(VyyIQS>2}wyblrr*lciO(r1?=-ZRPMZQkTH4D#7ggOuq3o1xZ0Oh=CXQvCzi5tQo1l_1Q-Ow`dHmrXXbVQ+ZvxQhkLhE7B z>(m*>4LmLru6szsKODgD_O%)Vh@o+s64Dg>wTu>N1qJ-Fva&6(1(^mc@wT0vuh^|w z_~lO_YL7l$!VBZ>0z@l_vry&amMZe9cp4ma-n0fp0`Ae3C{{ z#uX~_$Zr#HqMl=_S1a(~-n!$_QBf0sLl;Y8PV%3g2~764|05Fi&Sd7fDZp)0Nx+n~ zV#SIAIbjw?QBhHub0$rQya7!8d_2>PPs+CM2tEP4g~X!z+nUe!LEVP6Yh8i6YA#*A z92gbV1som~03O^73Nc{Y`{LcZy}A-zWNmW(5fM-@81s-DY5xAJ{;OT;nBukK{-}3MrD{@X` wkw#Zzg@6Mjr@6c<ph}6!sxYM={xkoM=ghSb@nHZ0Pgg&ebxsLQ07}T}n*aa+ literal 0 HcmV?d00001 diff --git a/results.parquet b/results.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f56f4d23628c5adddcc70511d22c819d367a858c GIT binary patch literal 7524 zcmd5>dpuNI``v^8F_S#SO3`ZT5 zfEu8dHmDeiAc*D_>PJ{Y7aYYj`3fYlDRluq`C|fpcqwEZiBa>vgz8(SIz8=1zY<6f= zL>S-LMxs9@XBK8^8U}hn6J22gZvse#GYBFIAGp47G?pp;Opbp*h~HN-MU<~Ynl-H< zK=_<3IH_Fhnwh+0qz{W58Xh3Xmn5`3n1Z+~&^ZG{MI4&#v~+ZId-*~)5axjk{VS+lay zr0C2t6WArLe2}K-xJb_7W-UXvIL#c6_Nb(cM|bn-?nB4B zpNc&yW+rO4t&-D@yIibY7@^}VN)hbl?EfYry-v>WlfRze*<=}%3G+Qi=4L6d&1Nwc+VMfANJ_pNJz zen3V88WptiXvMDwUYakZ)!;4ZV?B*ch@@?!N_NB8`bI>wpg~+-Gr{I){>$6;ri-b@ zk2a23<&1PVQ-b`#Yv^`I)@%Er0C&Y6uv{1tcr)WIKyS>%;Rt>3kJ*gI*wAPm@ zEvj*)+D~HrpX@nllJZEKXJ6RQD_GkQl^-!Yt-cqEhUs(WtFPxLy6`*h{* zgY;abeqM%XKTqLpKd;}vpEo8lXmVJJ&LiUW#DSciJ)@d#=;%R5&dN3;dOMmWRGSc| z^hV^?{AR6Pq4Y)Bvh=-NgE8qFdft5s`iQSo2eO;nwqxfLWD&O&*}1!=DaYCp{D+K9F7i1(XT zUH7({+wFy(GW7fsb-LN%L%K;pFOOCrVTn4Y2hgluW@_I92Y${)TZtNE5>H7-f0uKK zXR1BQ8?f&!3Lij@Mb0hj(>&%PcXe6itL&~2bDcWSOEcjWtT znrD?99CB*S0rh}e_bm#Jv%X$Q=`#-g{}*)U>c81v$ELUJO$x|40Hl~}$iL`1PTywk)ZIn#{h|J~CFO5#lw 
zv$&#zn#@R@l=5pul$@kry33plkAz?LXk=`zYO}bs)a<$_#c}7lh?s;u`^F6BX>g;Q zR;-%m-m=j}PkY@vUdPEomBy_h|BsTZUb2kk0GzoL)jjP(q`-o#cZQ zT=}wJ)Q)&+jJs=wxx|Hu^j>{KP*xUGFIuiej6MA_q=LuX^K$Huani$Tya_= z<3-SKl*m5MSaL#J+_Q+8;}Gs=`z&hLy>lzX&mQBZmpv!JK-I(5DwBKw&_lX^I z$1Xl?h-)utH`WY!*uc#HSvSwx_;Fy*glok7Yh{bCsARmnNUU5CFC8}&=(cf#`QQZc zu*ZR4=kygh%fHAlG*fJNw8Hrh&YgL+YA-s{)GAdbyY=0=e2qSaJdu>)ge`tB5V*0$ z_Hz36NNwfd&PdxnKSML%9~l>$=QVk1b>&k|V~;X=GIl=8KgxbfW3>-+_N5$0mSug? zJ>{A>P3HCDwA2G3PRB+HcL#3PUwX6W()A^{<4b0{=i7aZ6;GdE131NFQ8G7^Z`}@Fp|5(@udTkB7^1Hw>+#s|V%wtU2M-Qk z?VI~s_PXlFjYS6Z3-{_{{Ily*wxz$dh?j_K9hLE@xIN5v2p=gf_t|T!utX}`b|T%K zn@E_eR#cEvBcs#2?Y_oyuK#!xOGNI}+lCkAMK`RKsL#LWY?3_jt7|}1Xxj3@u*)auBWKUv0?K z*Nr1`N8UOYR*e=eo@Gud+OhLTZJpEI#@8<=Iv!hduV+CwHK6_bWLfI+fH~ru&-7d& z$eNj!kLLf*)OlDvYi}X)DSER?LLXO0RZdY&_3&>>?c%z-7CKd@dUmc*Su)wKqocOC zU&i;cZL&pB;;%i)qM`kb5+?^N-23+o%B4qZRaR1d-Se>g`kHW^7gC&d^!|?XTcS=# zZY1X!9vd1ID=+Z&xZ1hCKzdv9di$D#w^JH+t7a%axOm~}tRrJ1+NI5&yZh&`>g_}8 zJvXe>S-D8CA4_t5I9v}7D>VFn*^z%jDG6l$j{LQ47VgM(1oX8Cb`Jb}5Q1mD_|;{$ z@F3(1uawP=GaE7m**6O@lbSjuV`swfi6#JVo##(eP43s#)&1$)3gDPecHhgycVE@e zHZduE1fZa*+lcx@!$4Uqo@#P3H8r(cVj#~{hK4)XR47F=5txCJ0MlHmt!x`+QT#6o z4LeC8DU6Qa+pXkF2LEEF>GF&V`_Efv)06X>nyw7f?fTm%x87hb>Hu)Cj50;5+E}$$ zx4N$=#TRH+16}k!*Z5jdPds9ZcC5FHz8S1|GvOPdxV*BB1 z4Y5D<03W+EV}qeRU{uMD@IHG0D4Eo9M2`;uM`fkHb5R4}P$m092B#nFo?sbmSvU-$ zHAMzjWekJL;r-s2mrIm(K$sp)VY%K_UKMK}-cz3H=d<+OC_1>EKX~Fst;IKJ+ zWsdX!@N7~4b2fDVyg$9-9rt8ENGPkl`+C&?_;ZK-y*=*3z;yE5t+twBFxb7+J^sWn z*t4WI4_7q=GSfdy-VhlBiak4aXqJzHVO68u{!ODmaWaWFdl>p<>PSM809)J@I+~4x>SAl=X>rWvs-UD57 z#M%3+PlAc@xr1ttJAq(oKmVFJ6pMQfikII%{-h)yNbg+!yPVH3$nY^P(;dwPc8Y3G z*G0B~Gnc!LkCz_-3O0iV=VsS~EI;ngbM4LnfwMw6lh3UtN;z{aRE^7D=T2Ri6X|DzP=|_O3bk zGq{&J686`{ETD1V5IOteSzr*Yv?z6UB5>1qc$0YcBryL}^Dg#&CMf*jMT%d=KCpIc zjCLw59n7isu*%Ol2kMh`(?~X1;LUJH+2;5x(4GHN`jt87!B7DG6>D`isM5&IO{Qf4 z0m%gIzy-xR7r+{?fJ3{M<$?2~BQ?Kl$^{d58w2G(<^Y4C;#=`0xqw~zO0(=nE)cs6 zNDr4?1lH;;hlUdKz+_V@=|XcBI9JSN1Gis6MUbSyfejZxxFi?9HZdO*_IR9}GjRcQ z?>#85JSPVnS9`R%w&px|)qS>ZP&NFIU?8T|>JZWmI~TEcXDTo1h}I^E7#`MF)om~NwZ@&`FfLU?E>7mi1z)$yNeY3hiL zGUSC4VCNT=ttma>5|Tg_fvSJ6_)KXF)m)~P^`WVm_ahz^voVIo^a&*X$0CYHP)P_T z_-I6+w$Skfmhk=$eg5b#U&2#}LT`YnjxV;Ksjkntj}R@%%H(oN8%}wqeDtps8;RXE zr~>h&1*(lu21S(rlNurMPaBj-%&vP0RTgc$J6{?0%3MD@?5Q-|Fz&9&t#H$!cp%q48KS^0|DxFD!f2T@7tivYt z)CzM=mi7#$P^x?#_1h<1Vn2qOV~wGHq6<*}tur0*0|qt7hq|z021+Ok|Fv@nG06_% zoLh@gYBHI@`679RkiPNmBwoOfdih8^iy5TPIWGkBjs2cjg~4RnKtjZFYoaAZoH+iy z0=A8jq#2Hm>zr3`SpGi2Oj4R86}t>+W()?y9R4y4BjIzj1J;Dk{I0bO28Xel!GL!M z#!`DIkBY+NRt$#Uv>pfEP5l_K+l*n$VA%RWb@&6t_L!QDgDF(tz~Kx%Y=kv7&J-%b zzu2F3F&GH}C?AS1f<2JIddx1oJ5Jl^UmO@_SST%Y*sjj{f8#!d&%%aT@Z&v&?+EAhb$r;%7yY;5#ZNA#izC~MZ{Exh zn=fXX!QlIY=H_AteJzOP|7X7Zd=7?l@%6S~bKR!q!;ha0<}~5>IYQyr{flWbsQmQ> zQ=je*`4M%$P!VQdyO!_T*E{5-{sV#yCLbL{qweCDq&=vSBz*icKyU>99> zkQ>`5*h@E><3;z@VKJus`ycS-5A_Q6-sbA8jF&$&)}3d-(sW{{wm88S4N5 literal 0 HcmV?d00001 From a43488e2301b9576bad8b90f53aae412b8a05c37 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 13 Dec 2024 11:06:13 +0100 Subject: [PATCH 20/38] a --- results.csv | 1 + 1 file changed, 1 insertion(+) create mode 100644 results.csv diff --git a/results.csv b/results.csv new file mode 100644 index 000000000..526b6af12 --- /dev/null +++ b/results.csv @@ -0,0 +1 @@ +analyzer,n_components,ngram_range,tf_idf_followup,vectorizer,fit_time,test_score,mean_fit_time,mean_score,std_fit_time,std_score From cdfaf1ab56310c9219b2e7b757e49779fe73c0b1 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 13 Dec 2024 11:08:06 +0100 Subject: [PATCH 21/38] Removing some files used for prototyping --- 
examples/benchmarking_string_encoder.py | 197 ------------------------ results.csv | 1 - results.parquet | Bin 7524 -> 0 bytes 3 files changed, 198 deletions(-) delete mode 100644 examples/benchmarking_string_encoder.py delete mode 100644 results.csv delete mode 100644 results.parquet diff --git a/examples/benchmarking_string_encoder.py b/examples/benchmarking_string_encoder.py deleted file mode 100644 index d5a029cd9..000000000 --- a/examples/benchmarking_string_encoder.py +++ /dev/null @@ -1,197 +0,0 @@ -# %% -# Benchmarking different parameters for the StringEncoder transformer -# This script is used to test different parameters to use with the StringEncoder -# and see which configurations work best. -# -# For the moment, I am only considering the Toxicity dataset to test the performance, -# and more tables should be tested to have more reliable results. It's still a -# good start. -# -# The version of the StringEncoder used here will be simplified for the next release. - -# %% -# Import all the required libraries -import numpy as np -import pandas as pd -import polars as pl -import seaborn as sns -from matplotlib import pyplot as plt -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.model_selection import cross_validate -from sklearn.pipeline import make_pipeline -from tqdm import tqdm - -from skrub import StringEncoder, TableVectorizer - -# %% -# Import the toxicity dataset and prepare it for the experiments. -from skrub.datasets import fetch_toxicity - -dataset = fetch_toxicity() -X, y = dataset.X, dataset.y -X["is_toxic"] = y - -y = X.pop("is_toxic").map({"Toxic": 1, "Not Toxic": 0}) - -from skrub import TableReport - -TableReport(X) - -# %% -# Prepare the parameter grid to evaluate. -from sklearn.model_selection import ParameterGrid - -configurations = { - "ngram_range": [(1, 1), (1, 2), (3, 4)], - "analyzer": ["word", "char", "char_wb"], - "vectorizer": ["tfidf", "hashing"], - "n_components": [30], - "tf_idf_followup": [True, False], -} - -config_grid = ParameterGrid(configurations) - - -# %% -def format_name(params): - # Simple helper function to format the labels - s = ( - f'{params["vectorizer"]},' - + f'{params["ngram_range"]},' - + f'{params["analyzer"]},' - + f'{params["tf_idf_followup"]}' - ) - return s - - -# %% -# Run the experiments and save all the results in a dataframe. - -results = [] - -for params in tqdm(config_grid, total=len(config_grid)): - print(params) - this_pipe = make_pipeline( - TableVectorizer(high_cardinality=StringEncoder(**params)), - HistGradientBoostingClassifier(), - ) - results_ = cross_validate(this_pipe, X, y, scoring="roc_auc") - print(results_) - params.update( - { - "fit_time": list(results_["fit_time"]), - "test_score": list(results_["test_score"]), - "ngram_range": str(params["ngram_range"]), - } - ) - results.append(params) - -df = pl.from_dicts(results) - -# %% -df = df.with_columns( - mean_fit_time=pl.col("fit_time").list.mean(), - mean_score=pl.col("test_score").list.mean(), - std_fit_time=pl.col("fit_time").list.std(), - std_score=pl.col("test_score").list.std(), -) - -df.write_parquet("results.parquet") - - -# %% -# Build the Pareto frontier plot for a given set of variables, and color the -# dots by a specific variable. 
-def pareto_frontier_plot( - data, - x_var, - y_var, - hue_var, - ax, - ax_title=None, - ax_xlabel="", -): - if not isinstance(data, pd.DataFrame): - raise ValueError() - x = data[x_var] - y = data[y_var] - - # ax.set_xscale("log") - - xs = np.array(x) - ys = np.array(y) - perm = np.argsort(xs) - xs = xs[perm] - ys = ys[perm] - - sns.scatterplot( - data=data, - x=x_var, - y=y_var, - hue=hue_var, - ax=ax, - palette="tab10", - ) - - xs_pareto = [xs[0], xs[0]] - ys_pareto = [ys[0], ys[0]] - for i in range(1, len(xs)): - if ys[i] > ys_pareto[-1]: - xs_pareto.append(xs[i]) - ys_pareto.append(ys_pareto[-1]) - xs_pareto.append(xs[i]) - ys_pareto.append(ys[i]) - xs_pareto.append(ax.get_xlim()[1]) - ys_pareto.append(ys_pareto[-1]) - - ax.plot(xs_pareto, ys_pareto, "--", color="k", linewidth=2, zorder=0.8) - ax.set_ylabel("") - ax.set_xscale("log") - h, l = ax.get_legend_handles_labels() - ax.set_xlabel(ax_xlabel) - - return (h, l) - - -# %% -# Use the function defined above to plot three different Pareto plots that are -# colored by hue_var. -fig, axs = plt.subplots(1, 3, figsize=(10, 3)) - -for ax, hue_var in zip(axs, ["analyzer", "ngram_range", "vectorizer"]): - pareto_frontier_plot( - df.to_pandas(), - x_var="mean_fit_time", - y_var="mean_score", - hue_var=hue_var, - ax=ax, - ) -fig.savefig("results.png") -# %% -# Boxplots comparing the test score for different analyzers -sns.catplot( - data=df.to_pandas(), - x="analyzer", - y="test_score", - hue="ngram_range", - kind="box", - col="vectorizer", -) -# %% -g = sns.catplot( - data=df.to_pandas(), - x="analyzer", - y="fit_time", - hue="ngram_range", - kind="box", - col="vectorizer", -) -g.set(ylim=(0, 30)) -# %% -# From the results, it's clear that the tfidf vectorizer is much faster than the -# hashing vectorizer, and achieves similar if not better test score. The best -# ngram_range is (3,4), and `char` and `char_wb` are the better analyzers. -# -# As mentioned before, this is a preliminary study, but it's already providing -# interesting results and an indication of what should be used as default -# parameters for the StringEncoder. 
diff --git a/results.csv b/results.csv
deleted file mode 100644
index 526b6af12..000000000
--- a/results.csv
+++ /dev/null
@@ -1 +0,0 @@
-analyzer,n_components,ngram_range,tf_idf_followup,vectorizer,fit_time,test_score,mean_fit_time,mean_score,std_fit_time,std_score
diff --git a/results.parquet b/results.parquet
deleted file mode 100644
index f56f4d23628c5adddcc70511d22c819d367a858c..0000000000000000000000000000000000000000
Binary files a/results.parquet and /dev/null differ
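The next patch rebuilds the encoder around the two vectorizer options benchmarked above. Below is a rough standalone sketch of the two internal pipelines it constructs, under the patch's defaults of `ngram_range=(3, 4)` and `analyzer="char_wb"`; this is a sketch, not the skrub implementation itself, and the toy corpus and `n_components=2` are illustrative:

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import (
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.pipeline import Pipeline

corpus = np.array(["one two three", "two three four", "three four five"])

# "tfidf" variant: tf-idf n-gram weights, then truncated SVD.
tfidf_pipe = Pipeline(
    [
        ("tfidf", TfidfVectorizer(ngram_range=(3, 4), analyzer="char_wb")),
        ("tsvd", TruncatedSVD(n_components=2)),
    ]
)

# "hashing" variant: stateless hashing, tf-idf reweighting, then truncated SVD.
hashing_pipe = Pipeline(
    [
        ("hashing", HashingVectorizer(ngram_range=(3, 4), analyzer="char_wb")),
        ("tfidf", TfidfTransformer()),
        ("tsvd", TruncatedSVD(n_components=2)),
    ]
)

print(tfidf_pipe.fit_transform(corpus).shape)    # (3, 2)
print(hashing_pipe.fit_transform(corpus).shape)  # (3, 2)
```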
From c0c066f735e6337c09f5711ce36d870857c93484 Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Fri, 13 Dec 2024 11:43:03 +0100
Subject: [PATCH 22/38] Added new parameters, fixed docstring, added error checking

---
 skrub/_string_encoder.py           | 51 ++++++++++++++-----
 skrub/tests/test_string_encoder.py | 82 ++++++++++++++++++++++++++++--
 2 files changed, 117 insertions(+), 16 deletions(-)

diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py
index 75404b4ce..d7be3d299 100644
--- a/skrub/_string_encoder.py
+++ b/skrub/_string_encoder.py
@@ -23,8 +23,23 @@ class StringEncoder(SingleColumnTransformer):
 
     Parameters
     ----------
-    n_components : int
-        Number of components to be used for the PCA decomposition.
+    n_components : int, default=30
+        Number of components to be used for the PCA decomposition. Must be a
+        positive integer.
+    vectorizer : str, "tfidf" or "hashing"
+        Vectorizer to apply to the strings, either `tfidf` or `hashing` for
+        scikit-learn TfidfVectorizer or HashingVectorizer respectively.
+
+    ngram_range : tuple of (int, int), default=(3, 4)
+        The lower and upper boundary of the range of n-values for different
+        n-grams to be extracted. All values of n such that min_n <= n <= max_n
+        will be used. For example an `ngram_range` of `(1, 1)` means only unigrams,
+        `(1, 2)` means unigrams and bigrams, and `(2, 2)` means only bigrams.
+
+    analyzer : str, "char", "word" or "char_wb", default="char_wb"
+        Whether the feature should be made of word or character n-grams.
+        Option 'char_wb' creates character n-grams only from text inside word
+        boundaries; n-grams at the edges of words are padded with space.
 
     See Also
     --------
@@ -62,18 +77,12 @@ def __init__(
         self,
         n_components=30,
         vectorizer="tfidf",
-        ngram_range=(1, 1),
-        tf_idf_followup=False,
-        n_features=None,
-        max_features=None,
-        analyzer="word",
+        ngram_range=(3, 4),
+        analyzer="char_wb",
     ):
         self.n_components = n_components
         self.vectorizer = vectorizer
         self.ngram_range = ngram_range
-        self.tf_idf_followup = tf_idf_followup
-        self.n_features = n_features
-        self.max_features = max_features
         self.analyzer = analyzer
 
     def get_feature_names_out(self):
@@ -103,6 +112,25 @@ def fit_transform(self, X, y=None):
         """
         del y
 
+        # Validate the parameters before building the pipeline.
+        if self.analyzer not in ["char_wb", "char", "word"]:
+            raise ValueError(f"Unknown analyzer {self.analyzer}")
+
+        if len(self.ngram_range) != 2:
+            raise ValueError(
+                f"`ngram_range` must have length 2, found {len(self.ngram_range)}."
+            )
+        if not all(isinstance(x, int) and x > 0 for x in self.ngram_range):
+            raise ValueError(
+                "Values in `ngram_range` must be positive integers, "
+                f"found {self.ngram_range} instead."
+            )
+
+        if not (isinstance(self.n_components, int) and self.n_components > 0):
+            raise ValueError(
+                f"`n_components` must be a positive integer, found {self.n_components}"
+            )
+
         if self.vectorizer == "tfidf":
             self.pipe = Pipeline(
                 [
@@ -125,8 +153,7 @@ def fit_transform(self, X, y=None):
                     ),
                 ),
             ]
-            if self.tf_idf_followup:
-                pipe_elements.append(("tfidf", TfidfTransformer()))
+            pipe_elements.append(("tfidf", TfidfTransformer()))
             pipe_elements.append(("tsvd", TruncatedSVD(n_components=self.n_components)))
             self.pipe = Pipeline(pipe_elements)
         else:
diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py
index c9dd8213d..34d377981 100644
--- a/skrub/tests/test_string_encoder.py
+++ b/skrub/tests/test_string_encoder.py
@@ -1,6 +1,6 @@
 import pytest
 from sklearn.decomposition import TruncatedSVD
-from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
 from sklearn.pipeline import Pipeline
 
 from skrub import _dataframe as sbd
@@ -20,10 +20,15 @@ def encode_column(df_module):
 
 
 def test_encoding(encode_column, df_module):
+    ngram_range = (3, 4)
+    analyzer = "char_wb"
+    n_components = 2
+
+    # tfidf vectorizer
     pipe = Pipeline(
         [
-            ("tfidf", TfidfVectorizer()),
-            ("tsvd", TruncatedSVD(n_components=2)),
+            ("tfidf", TfidfVectorizer(ngram_range=ngram_range, analyzer=analyzer)),
+            ("tsvd", TruncatedSVD(n_components=n_components)),
         ]
     )
     check = pipe.fit_transform(sbd.to_numpy(encode_column))
@@ -32,7 +37,12 @@ def test_encoding(encode_column, df_module):
 
     check_df = df_module.make_dataframe(dict(zip(names, check.T)))
 
-    se = StringEncoder(2)
+    se = StringEncoder(
+        n_components=n_components,
+        vectorizer="tfidf",
+        ngram_range=ngram_range,
+        analyzer=analyzer,
+    )
     result = se.fit_transform(encode_column)
 
     # Converting dtypes to avoid nullable shenanigans
@@ -42,6 +52,70 @@ def test_encoding(encode_column, df_module):
     df_module.assert_frame_equal(check_df, result)
 
 
+def test_hashing(encode_column, df_module):
+    ngram_range = (3, 4)
+    analyzer = "char_wb"
+    n_components = 2
+
+    # hashing vectorizer
+    pipe = Pipeline(
+        [
+            ("hashing", HashingVectorizer(ngram_range=ngram_range, analyzer=analyzer)),
+            ("tsvd", TruncatedSVD(n_components=n_components)),
+        ]
+    )
+    check = pipe.fit_transform(sbd.to_numpy(encode_column))
+
+    names = [f"col1_{idx}" for idx in range(n_components)]
+
+    check_df = df_module.make_dataframe(dict(zip(names, check.T)))
+
+    se = StringEncoder(
+        n_components=n_components,
+        vectorizer="hashing",
+        ngram_range=ngram_range,
+        analyzer=analyzer,
+    )
+    result = se.fit_transform(encode_column)
+
+    # Converting dtypes to avoid nullable shenanigans
+    check_df = sbd.pandas_convert_dtypes(check_df)
+    result = sbd.pandas_convert_dtypes(result)
+
+    df_module.assert_frame_equal(check_df, result)
+
+
+def test_error_checking(encode_column):
+    n_components = -1
+    vectorizer = "notavectorizer"
+    ngram_range = (-1, 2)
+    analyzer = "noanalyzer"
+
+    se = StringEncoder(
+        n_components=n_components,
+    )
+    with pytest.raises(ValueError):
+        se.fit_transform(encode_column)
+
+    se = StringEncoder(
+        vectorizer=vectorizer,
+    )
+    with pytest.raises(ValueError):
+        se.fit_transform(encode_column)
+
+    se = StringEncoder(
+        ngram_range=ngram_range,
+    )
+    with pytest.raises(ValueError):
+        se.fit_transform(encode_column)
+
+    se = StringEncoder(
+        analyzer=analyzer,
+    )
+    with pytest.raises(ValueError):
+        se.fit_transform(encode_column)
+
+
 def test_get_feature_names_out(encode_column, df_module):
     """Test that ``get_feature_names_out`` returns the correct feature names."""
     encoder = StringEncoder(n_components=4)

From 887e04762b98f9ee50e9e6822c0841683988b65d Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Fri, 13 Dec 2024 11:47:33 +0100
Subject: [PATCH 23/38] Removing an unnecessary file

---
 examples/results.png | Bin 39551 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 examples/results.png

diff --git a/examples/results.png b/examples/results.png
deleted file mode 100644
index 633fc6d75fb37939ba4b5c03eb81a2af476a4a4f..0000000000000000000000000000000000000000
Binary files a/examples/results.png and /dev/null differ
zKgRRmavwbP7Jv6H(Uj~d^nkukK7A!6Wm6Ii-xwZv@Zfja>Yb|Faqs05IOpTMX>CX?KdaV-OJ;k2Uty%!Ji7v<=0yi$fg}$goGi>=)+rvj>E2Jwb3@v=RXGY1*O$2EUL^Dxv zW`-Pt5FQ{zIL!e;)Bv%h@d8dXG_?B0#?bHIgJJkf(cxMaDDYoMN&N%}g_Yvk3pF*@ z(~5sJURfNuhV##8tei|Pz}j;JMrx?o{JSYmnm_XM5X4EjEk3||X>(gg_)Or*f`ELV z&pSl5y?ku-lXW5Fg|L;TmJ@-mVeP>N`|RtRqe_0~j_fiXds8rl!F+4P z*A9J%XmvoxIL-z>ahUcdgng{}l)ouaEc&;3#dT5S;5qj^l7&6x!VCBf>z0i<5<@CjOgh z`)8dgl$KjNJ9#k#>F_9EvuLwNvl@R#QOP98RCGq$6uUW%@qc)>BmH*h0QbSW(S{9K zr1yWC)FI{d{rmT|a+OWmU(Aod}_2f&&} z$atc;EvZFCMZsB~;Dc^@pQGyb>vn+MJEl!Y&0RT+W* zc2T1%?aJt+zNtKgP}0bWdd2Ip{cYhd{Ul+?$1OqWq`6YzB6`8RFq*fB$?I zXQ%+&jouICP_b1VC1#!j1EXf$!3moYs52xRH!_A|S|8{f zpc$ae{5h1Y5M9{T4krH_Z!hgGBkaqVI{iOap?Qau;7tmy?k!MY{Q+DWxM;np8V{KI zH9A1du6q+&?Gd^x>YP_NmaTfIKKIsDzA!7E$oG`hL!9(Kg30(A3kNk`LO=wBP$a1b zhnV;?_z&y|Le0g%#H6#-{R8~(r)Otp>wUd(vgnW+HthQ?&B+^^cAml6p7!Ont*4o7 zaZ8ZRE<`A4XhTtl1RfX)H)ci>19eIy6e9>G2qZ@*P+t$&*luLdWY|Gh11xr=fdn!2 zhtUzL)2;mgV|E*IAZr8KGI{sUp(DNq?f+4)R=ONpkvlC>0p2J9Z1^!$W>9oPlW=EI zkV+;JaS<5#pNnUr&Gu^1T$?L9zE?7T&Z2hMc%A&>9X-fcrJXy#XCX(s?mChsfQ)MW zOI%!s>*2VMWbzEuUY0`*=kT8&Pz%Eov}j`~4vC919fX374+VM&EPFJ@*xgJHKdP9hRt$;}tol%~Z+O=Ks ze`|O%xn?;Mu~$6d zF^&i0ByNOgKBHp&SuoW)!WLFsd?-2yC0QPDug=4rxAI^N1wZmiKu2{V`w;R(A`ZjX z-X2rHL#ID7qN>NYl&ykv2h=h=dXtMzP_`@QzC#0EPu+WP_uv396)g^uIC7uV(f_Ld zzS%*=1#ZLI#TigWviiXxh0G!&p+`ukJ9PJ`C9gwlINl%BL~b1< zx^po3ky1cl{6Q`TRoZ9&omVN+aoiSDAKo0?-WoB^eq_I)20P~^tRk{3WcTyhuFMq> ziJaeANts?bdhKJX)Z}w_Md^HevvWs${obkb;6-7@$Ky ztdTV0A<^2vC={4b`eZ~N(ySDtP~6TQ8>kY7v05$+*%Jd34-UV72alH~P|5+7ckS)l zw?(6!pu!R-*9I*Q!ZjfX15^!zc9*3%iEg1V0HfsV2ciByFkwe63+_%?py_5A|c;(IgK5oSBXN?i^++P6Ifq*IG>sN;!Q-Bs* z5=H%yNhTn)dsp)Twf*ZOPcX6Oj34fBknCsEdj<~BSg*1^ZTV|U+4mujX z8%uC{dWWqcxFmdjAJ6+oq;d(cX+{n-ac}%5EBS;))(R-&3u)=B9nvS|f0;&nAd!((`uCLMUsBOlFIHu~ar{=%I2>d0Msgfide3_Ff8{%Xq5r4n z9=?FgM&8f`5fC|ukf1DqE&S(TeUwT45Q5CJWVvAh8{W8aUbyeSb2aP{csg z)&VN*N0HGFvTNnv!~2b!Be(f~TJ+SBnI#RbB6_iPh=FPU{QZAh^Q329Oc^VKk|O)e zCPblJ4u}xBKOYpQ`GqQ2WKlu0E^{*~gjCK}Z-E^&@AJO{WxgQuPeZtT-&X4P@RdCg z)DT2^_XlAnh^?MJCNWy_9A3l^Ei%q=nbjBI2+9#}L?{kL0vq(7;da0z8~~;eI2iDR z4xoV`o@uv)(ngp8J?I+{P#^%I#N4i}yx5;RDh-lF>jJ>kdF_Rzl z@`w+oMlQNeDY3ukf}=p4y4B4Rck)N7?JiMUWO@Wm#SP43g*cuquj^#d4c_3Pg$;{% z#dl~YTn^~<~DrDcWzhOml8kY~h{oZ+cGg!MTDkSWt z9&J{TwLRJ?#0Q2P0$wQmED88^2-xB0vJ#e^UQ&Op?d?w*b=1uPb8D_lDuf z(AFT)!bAnldthYf_iCQO4{v!&kmuT%P%u%47}7UUaBeZ7E!jH%A)Qqg&wFO)UC=z) zto-x?Sn1rR+^=AO-`j?_Sy)}BB`?V>4@s|ReXvKz%~CfgrYYLPC=V+(0Yeu8c|B z?bC2z@Ony-P2wl=_nN3|GuKyGSxn3_Eldr}Ni`1b1 zU-usn5@hrWR#a3h{r!y#gmFYxAFx<3eTzw5JiQ;#m38tBx_X&5PL^8IqQbx)+?BQLwU)8x@7$13lV;Ldpb0tb%ABEqS+R$7(Y@wREaIEjE z3%kQ2Hub~mEQ~96R%DOl&~JDg!A}qsFVKB8DmmEX{9ggl2XPbBtWaGY`<2mE@3b-@ zQ{k8bKVxl$$z@y>dy0Du_XW7bv|0^zh-_ZMH~R&F#);BU%Gr6Q{vn)KDKub zER!|wd(ois8;XSR!p7N^VsPle^Z>w(Lx2J88E`v0vdk)7Ac0A8kf3>vpL;>nKqN~b zV}h@N5{^vMW!roF0^HxaK#BtuDaMZB7o_rC^8@It!!>-W zJF6rHozgyIZvVFOFw0eT0Im$gun2ey8l0gKFqe>l`gd!8pO%X&(udsXrsuSY(h8Is z?4W-8`0*orCmAqK$PpUNE@5PN0wm~2DP3-FUfy%iwK?OqpqLK~PaHsopey{!@&2x! 
zoQP^2Dqi(0yr0+z^_%_cXJRS)ZXbqe9?N7u-JT9d*BP{$u!rtqq4kiT?#Ke;@_99e3_SdUZ9!vxpz4<}tfb zQ&21-DKgANe*sEgyPlcWanT!2yMM;xTDZP8vlF53^y93g)Bc?-Y;df4^nF-WKc805 ze|!hp$fEQ0u(CDI8Nt&srQ!gAzaWot0I(g=qQDg)qUwPQh~GPao+DHmBC~;P{{@Ik zAz=b02hpE`@B+L75nDT$LluP-oB;lg3L-!hU@^n+u0>~&*ayYRBmI10aEk&jd!$hS z!s#v8nUMJbh9U+zvFRvO(^w$nhX{y3JeE)R;I%EC~O(Vx!Cl?>~}DlSmZ=;-ET2jC13wGAT~J;g2JAS7i!U zLbKx8U}gdW&H<$aXiO_4mdGoYKsLU;wKWA<@Grm>K|3A`%h4A$Ln!P3iVx6t1;F7E z#A(2U=W9H!9vvSKuU`%ue4hu$D@tbO07TpZgw5v1>S_10?HdqNK*2@(wzfs))-771FE`b&z>Px5zwE|4GkG6fTjY3icm;^ zs@)+Y>wxzQx_7gJvw2qZaAS58zK_nM& z8SGAbhA2RDOM>}<$)PVYrtrsepDP3De?dw?MMdQS3km?UI*`GdoI1k+dJO8~W2ltY zF!qj)95kHQ3mF^!7Mp9rK>1Qbwey0<${n87K})R z4#C9%hIk@~wu94xiS(R2X~D~+gBmr=vO5EKEmTpBZ`NOcZJDf_nGOEt-I5irsejSz zZD@m_-yZtRXBD5C(s3r9H^`2YY6#P zc4@vjk(9`IDEPsB5##EutG=zGIGA{kYcaJKoY12jQl%cR2luVmmH8eRJ#ng39C@1_ zmfpwCvMW#TJQ=~Fc<687XrhOsY{csN%mttGO%5J6rFPB3Th_y;=Q>zA&9@#uY;TVj znCwzHn|3>B=NH^vn~XOQD)z%@k9m?kxfc`=a0@w7m4R{1_qc#GO$*XzXeUFR`pRoj zSba9n*12i{h2U2qcR`TFfJ#MufnHFM+`MA@I_PcSn0o}aL-O+STEKyD1Ly-SESAs+ z0m}sy#cplLBO@b2`+V}bA1nFSLxZVt#~`ZpYQh+dpB=ZPt9)y&mrZ7>mEMIe6`N?g z{dbHY`uqdSVXP`jI;9Tb+e3%LVqdCPa@)7~cBQj{B4=uY_ zKy8-R*Bi!4t;gecPk$u3)?uisssj3UyL>z-O2Y{qiBBZ=?rG;LcaM(xr>0(1O2)f| zUur-gx3adDRZu|7RK(0w^bZZaYcf`HM=a?M5V-fYW(A%+DV{WdiK>|JB~zAH%@QO& zzkO|Ia*rc_K3bMd+!D7MU2k1G*C^;`$n#Bjq*Y@RKGdwUgq?*4+7^a$yFQTk_@ z;Rh^i%GlRnq^!ZupFc~a$^l_P;DNopJuNtlul~w$*nx8~@z<~GK-UuQRfLD*ot@<$ zK9}T_yxqIMY`FejRNVZAh^&{&4>1>A{@$ja*bjoIDKlmy4wF3scLf(2NtZo2x>1{? zX$WFIw9#QwUa{;^5lM~E*#D_15JzsqO`j0dKn*M^4(nC z+gW5}o-cIvzlP&^T5ijBr}+%(JiEqicO}(0y>3R`f4lXv>SH!ZRjp?58NUBzE>X&e z3>u#^Iohg+mW)jB=tMEZ!320NkXh6M?(kYgWo7?Vqr?JLM)3GP^hk|Wxl_bR-w12r z_~~LSPjX+Jp5`$tZnR}IQ3}P4I29iv(||?7FqdcnX5A&KaU;U+Gga(Pb@s%{(T{dv zX!}tCh81I1(+Qn_TwYAv;@;@`NI3=WU5I`WR5nkh;8cEYYs*<=K8ZEeN}Y+Qc~(|e zEg{#Mg(CnacyVS)V9&o=cJmM>Z|CuALAZlSeqn- z>5}pvGNrkZmS{B*7QRX~#_CL4M*o8THI98$Z)7ceSi)DzB6=YmOq!N#%Cya+tYCEK z7|8RG*0_+E7=Poe((6zgZS!a%!ZxU30dvJzRNDplh6@m-+<vdKAw5AZZ!y-zy2S=P`BYN7=#b)IOlS4kdxZxouX^K^h%;_{ z1o>@=%kcgc6^Oviot=$HMGqpEStz1JtgZ8vHdWQt0|y2arM?T285NSiKd@8lfC9%V zR2hfBdT?MDK)&wuB=`>lzNDc7>;aktS<9EwS8$fKy&bTZ1qk@uR*6fRwNi%tHqwi3 zkom2w7~TB%0Td#$zhca4A+728%VEKk4GK$;R*-&ozu<{?Tp$L&EWWx6DH(v#uf=qH zf&`B@I8MNMz?45E^XsFUR4GcqtG*$f94|36;a1;Q+bEhEKfi6(cX?XlxTmbf#<(E! 
z>E>09l4Ebwz`ZKbn6tvCYM=~){M!@g3mwo;aNBQ?Lc5z6g5jkP8tCaw1IhIV_&&>( zzYkzIlSvI1U!Cmh*B`R7SV4AEu6eaGBjZ*}OADX}@2YoM2rxlqat9Pncj3JL11DLo z)qJblF4_PEW@5 z3u+cb*~*w;nsAOv<`97Vut>K-EHs;0Jh5Wm2{f5EG=GrXzI{9O_wQ$VdbA*f2YR@d@Ucf_#h58X|aD?-ymtqB zZ_CPfk*=tGgD$&`h>g!0*-e0NPi`uMupUxW&Z#3v2RHXIHbgvo8~7BBFPmnAcotnx z6fG%qFG_D}<-`ekrxW7d`v}Py(v#x=?XbBpYZ|o#i4b(4CUM_glrlj zCML{gmHPwF2CovKBVt78ErPP=3UpOy!;hz?r|)vJc>A7-Q~3#s+{56Bi;H_mbZcp8 z>8VMEkG2-7ZXxE-Ne8h>Ht!<)c^78Nl*ijypEHp`6RxBmKTv^+pX#8y<~4dq+rQd_ z`+K20H~o(N2Dl3xcnD~L*}vw*F|FJ?HMUO+FDO^_#m&WQC~g9hf(s*G9>Rfsy(X8l z55k#06l96T#e{%-D}(VZRH6P*$)nUjH^3w9+G`NtvA#~sNNa0TfnD;5dGI`_=1@Uu z$M@~qw+a>ax;P7eiVrr~qB7Up4t-sN1)|6>Yoqzd>PC{poz+OC@k9%VSqwLbO4@Fs z1Ox`w6S;b2I7hK*RQSS%4v&f|S5M_4&F196?B}L7SbIkQqB}MGNXLy=klPA%ef*q| z{pBN(M8h5Bx05(^fgP+hbiO#(?l2}_F^agth~-RALA!D{*J?%5Bx{17a{cpr8A{nG zA}_Vuaa6Tahu4q$q0#CCungLe3cY5G`1;rY3ya*QGSTJxr%%@xH?StkoEj`phRST2 zK)S*Cv$2%HF9$<$2#qfRG}LT>#@j1&2`=vVk*LT zRYzSh+r_Q$_1X8$V!x-g^jP_w?yF$QFld`hRL;cY)|k%PEvZ(OSE7fN%f+khjuS>|)(Y;+RM|PPq zKH{q+6QJ`>5-RC1f58gt_xcp%;qQl`i2APF_w%6it(Z*)yev_rE9~{-;vX4m_3h6@ z6q)c}TczBg|Lr1Pr*Cva@EEr{$6QNgfZ{{LCQ4b!(?v!@Va}jtV(0D`R@9t~_wNnG zkQh2uk~>wp7Zc_Vi$(=nqa1bUdOvFm5uwouHwVSMj zsB4g*<_xiz^?ml$I><>1EPed&n9|DgwhabmYzx)xm1hJW-#kT)&a8aSUax#tO_Jfx z4?l}IhAT$NBd)yOcpApN_&bxYe)@=V)9|#%eDuF<6Ixg$DlJ9#Cd678B|-5)LSKo7 zE}HeU&T+=onUJW+n>aX)|p4<*req7*o%fw#Tu(b&n3Uyl;+uQfB6D zKssOlo-3AP_2z24yNrW&N2fridt@8~^$O*coGXd4BEl|LX2pGH3+dDo;RIp-QCP@X zu-Pw*UXxgjTCR9yKSC+UpyTSsQ0ws`o(-JYYqYeqs2_IqW&Y6Lq{d6UDSIs(qsBz8 zCh;{TD{JSm>zx<3bw1npCN7;_B|TN5r@K#0%R}4Dlk}y=_|=H6Wj0s9)r|{^8k^)sL`7ib4u%os)RI09%zJpLF5`qthd{7eHx27H8~aCL*Q3i-JI;Wj+n ze?HKsNcSH070;N0xDuX*i0h#Cq%AQck+|L7KEM+`f!Ql17uP`g+37om`MK3qfjGV~8}rs^y|r+ltg4RB z9l6Y2H2<>INWS+lNuKeM*%)_rr^0V}jT`*ton&jI_V2TToSm7uxVR`$o&V3j8UC-o zsZ*%3zQ4~v`^S&wU$3V^GS44s`!>D*(^#<41p%?6Rpw+se;pq^shfZETSHMBj8qVQ z1}y#O+RTH|{Ib6bgwFFLSMtxRRt@NtYM&o4$F4kS8!oo_9r2zr$IY2u4gPni+?wdQ z@I9ZBur(VyyIQS>2}wyblrr*lciO(r1?=-ZRPMZQkTH4D#7ggOuq3o1xZ0Oh=CXQvCzi5tQo1l_1Q-Ow`dHmrXXbVQ+ZvxQhkLhE7B z>(m*>4LmLru6szsKODgD_O%)Vh@o+s64Dg>wTu>N1qJ-Fva&6(1(^mc@wT0vuh^|w z_~lO_YL7l$!VBZ>0z@l_vry&amMZe9cp4ma-n0fp0`Ae3C{{ z#uX~_$Zr#HqMl=_S1a(~-n!$_QBf0sLl;Y8PV%3g2~764|05Fi&Sd7fDZp)0Nx+n~ zV#SIAIbjw?QBhHub0$rQya7!8d_2>PPs+CM2tEP4g~X!z+nUe!LEVP6Yh8i6YA#*A z92gbV1som~03O^73Nc{Y`{LcZy}A-zWNmW(5fM-@81s-DY5xAJ{;OT;nBukK{-}3MrD{@X` wkw#Zzg@6Mjr@6c<ph}6!sxYM={xkoM=ghSb@nHZ0Pgg&ebxsLQ07}T}n*aa+ From af3b0870c36f892d03a77388da0d15ed5b996a37 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:33:00 +0100 Subject: [PATCH 24/38] Update examples/02_text_with_string_encoders.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Dockès --- examples/02_text_with_string_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py index b81431462..54a4ed71e 100644 --- a/examples/02_text_with_string_encoders.py +++ b/examples/02_text_with_string_encoders.py @@ -322,5 +322,5 @@ def plot_performance_tradeoff(results): # ---------- # In conclusion, |TextEncoder| provides powerful vectorization for text, but at # the cost of longer computation times and the need for additional dependencies, -# such as torch. \StringEncoder| represents a simpler alternative that can provide +# such as torch. 
|StringEncoder| represents a simpler alternative that can provide # good performance at a fraction of the cost of more complex methods. From 09b55a1f4ca20cfe5c24f9811f9f815f83eaf467 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 13 Dec 2024 15:07:44 +0100 Subject: [PATCH 25/38] Adding another example (needs formatting) --- ..._with_string_encoders_employee_salaries.py | 295 ++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 examples/02_text_with_string_encoders_employee_salaries.py diff --git a/examples/02_text_with_string_encoders_employee_salaries.py b/examples/02_text_with_string_encoders_employee_salaries.py new file mode 100644 index 000000000..4f5de564e --- /dev/null +++ b/examples/02_text_with_string_encoders_employee_salaries.py @@ -0,0 +1,295 @@ +""" +.. _example_string_encoders: + +===================================================== +Various string encoders: a sentiment analysis example +===================================================== + +In this example, we explore the performance of string and categorical encoders +available in skrub. + +.. |GapEncoder| replace:: + :class:`~skrub.GapEncoder` + +.. |MinHashEncoder| replace:: + :class:`~skrub.MinHashEncoder` + +.. |TextEncoder| replace:: + :class:`~skrub.TextEncoder` + +.. |StringEncoder| replace:: + :class:`~skrub.StringEncoder` + +.. |TableReport| replace:: + :class:`~skrub.TableReport` + +.. |TableVectorizer| replace:: + :class:`~skrub.TableVectorizer` + +.. |pipeline| replace:: + :class:`~sklearn.pipeline.Pipeline` + +.. |HistGradientBoostingRegressor| replace:: + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` + +.. |RandomizedSearchCV| replace:: + :class:`~sklearn.model_selection.RandomizedSearchCV` + +.. |GridSearchCV| replace:: + :class:`~sklearn.model_selection.GridSearchCV` +""" + +# %% +# The Toxicity dataset +# -------------------- +# We focus on the toxicity dataset, a corpus of 1,000 tweets, evenly balanced +# between the binary labels "Toxic" and "Not Toxic". +# Our goal is to classify each entry between these two labels, using only the +# text of the tweets as features. +from skrub.datasets import fetch_employee_salaries + +dataset = fetch_employee_salaries() +X, y = dataset.X, dataset.y + +# %% +# When it comes to displaying large chunks of text, the |TableReport| is especially +# useful! Click on any cell below to expand and read the tweet in full. +from skrub import TableReport + +TableReport(X) + +# %% +# GapEncoder +# ---------- +# First, let's vectorize our text column using the |GapEncoder|, one of the +# `high cardinality categorical encoders `_ +# provided by skrub. +# As introduced in the :ref:`previous example`, the |GapEncoder| +# performs matrix factorization for topic modeling. It builds latent topics by +# capturing combinations of substrings that frequently co-occur, and encoded vectors +# correspond to topic activations. +# +# To interpret these latent topics, we select for each of them a few labels from +# the input data with the highest activations. In the example below we select 3 labels +# to summarize each topic. +from skrub import GapEncoder + +gap = GapEncoder(n_components=30) +X_trans = gap.fit_transform(X["text"]) +# Add the original text as a first column +X_trans.insert(0, "text", X["text"]) +TableReport(X_trans) + +# %% +# We can use a heatmap to highlight the highest activations, making them more visible +# for comparison against the original text and vectors above. 
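# %%
# A minimal sketch of such a heatmap, assuming ``X_trans`` is the pandas
# dataframe built above (raw text in the first column, topic activations in
# the remaining ones); this snippet is illustrative, not part of skrub itself:
import matplotlib.pyplot as plt

activations = X_trans.drop(columns=["text"]).head(10)
fig, ax = plt.subplots(figsize=(10, 4))
im = ax.imshow(activations.to_numpy(), aspect="auto", cmap="viridis")
ax.set_xlabel("GapEncoder topic")
ax.set_ylabel("Row")
fig.colorbar(im, ax=ax, label="Topic activation")
plt.show()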
+ +import matplotlib.pyplot as plt +import numpy as np + +# %% +# Now that we have an understanding of the vectors produced by the |GapEncoder|, +# let's evaluate its performance in toxicity classification. The |GapEncoder| excels +# at handling categorical columns with high cardinality, but here the column consists +# of free-form text. Sentences are generally longer, with more unique ngrams than +# high cardinality categories. +# +# To benchmark the performance of the |GapEncoder| against the toxicity dataset, +# we integrate it into a |TableVectorizer|, as introduced in the +# :ref:`previous example`, +# and create a |pipeline| by appending a |HistGradientBoostingRegressor|, which +# consumes the vectors produced by the |GapEncoder|. +# +# We set ``n_components`` to 30; however, to achieve the best performance, we would +# need to find the optimal value for this hyperparameter using either |GridSearchCV| +# or |RandomizedSearchCV|. We skip this part to keep the computation time for this +# small example. +# +# Recall that the ROC AUC is a metric that quantifies the ranking power of estimators, +# where a random estimator scores 0.5, and an oracle —providing perfect predictions— +# scores 1. +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.model_selection import cross_validate +from sklearn.pipeline import make_pipeline + +from skrub import TableVectorizer + + +# %% +def plot_box_results(named_results): + fig, ax = plt.subplots() + names, scores = zip( + *[(name, result["test_score"]) for name, result in named_results] + ) + ax.boxplot(scores, vert=False) + ax.set_yticks(range(1, len(names) + 1), labels=list(names), size=12) + ax.set_xlabel("R2 score", size=14) + plt.title( + "R2 score across folds (higher is better)", + size=14, + ) + plt.show() + + +# %% +results = [] + +gap_pipe = make_pipeline( + TableVectorizer(high_cardinality=GapEncoder(n_components=30)), + HistGradientBoostingRegressor(), +) +gap_results = cross_validate(gap_pipe, X, y, scoring="r2") +results.append(("GapEncoder", gap_results)) + +plot_box_results(results) + +# %% +# MinHashEncoder +# -------------- +# We now compare these results with the |MinHashEncoder|, which is faster +# and produces vectors better suited for tree-based estimators like +# |HistGradientBoostingRegressor|. To do this, we can simply replace +# the |GapEncoder| with the |MinHashEncoder| in the previous pipeline +# using ``set_params()``. +from sklearn.base import clone + +from skrub import MinHashEncoder + +minhash_pipe = clone(gap_pipe).set_params( + **{"tablevectorizer__high_cardinality": MinHashEncoder(n_components=30)} +) +minhash_results = cross_validate(minhash_pipe, X, y, scoring="r2") +results.append(("MinHashEncoder", minhash_results)) + +plot_box_results(results) + +# %% +# Remarkably, the vectors produced by the |MinHashEncoder| offer less predictive +# power than those from the |GapEncoder| on this dataset. +# +# TextEncoder +# ----------- +# Let's now shift our focus to pre-trained deep learning encoders. Our previous +# encoders are syntactic models that we trained directly on the toxicity dataset. +# To generate more powerful vector representations for free-form text and diverse +# entries, we can instead use semantic models, such as BERT, which have been trained +# on very large datasets. +# +# |TextEncoder| enables you to integrate any Sentence Transformer model from the +# Hugging Face Hub (or from your local disk) into your |pipeline| to transform a text +# column in a dataframe. 
By default, |TextEncoder| uses the e5-small-v2 model. +from skrub import TextEncoder + +text_encoder = TextEncoder( + "sentence-transformers/paraphrase-albert-small-v2", + device="cpu", +) +text_encoder_pipe = clone(gap_pipe).set_params( + **{"tablevectorizer__high_cardinality": text_encoder} +) +text_encoder_results = cross_validate(text_encoder_pipe, X, y, scoring="r2") +results.append(("TextEncoder", text_encoder_results)) + +plot_box_results(results) + +# %% +# |TextEncoder| embeddings are very strong, but they are also quite expensive to +# use. A simpler, faster alternative for encoding strings is the |StringEncoder|, +# which works by first performing a tf-idf (computing vectors of rescaled word +# counts, [wiki](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) of the text, and then +# following it with TruncatedSVD to reduce the number of dimensions to, in this +# case, 30. +from skrub import StringEncoder + +string_encoder = StringEncoder(n_components=30, ngram_range=(1, 1), analyzer="word") + +string_encoder_pipe = clone(gap_pipe).set_params( + **{"tablevectorizer__high_cardinality": string_encoder} +) +string_encoder_results = cross_validate(string_encoder_pipe, X, y, scoring="r2") +results.append(("StringEncoder,word,(1,1)", string_encoder_results)) + +# %% +plot_box_results(results) + + +# %% +# The performance of the |TextEncoder| is significantly stronger than that of +# the syntactic encoders, which is expected. But how long does it take to load +# and vectorize text on a CPU using a Sentence Transformer model? Below, we display +# the tradeoff between predictive accuracy and training time. Note that since we are +# not training the Sentence Transformer model, the "fitting time" refers to the +# time taken for vectorization. + + +def plot_performance_tradeoff(results): + fig, ax = plt.subplots(figsize=(5, 4), dpi=200) + markers = ["s", "o", "^", "x", "+"] + for idx, (name, result) in enumerate(results): + ax.scatter( + result["fit_time"], + result["test_score"], + label=name, + marker=markers[idx], + ) + mean_fit_time = np.mean(result["fit_time"]) + mean_score = np.mean(result["test_score"]) + ax.scatter( + mean_fit_time, + mean_score, + color="k", + marker=markers[idx], + ) + std_fit_time = np.std(result["fit_time"]) + std_score = np.std(result["test_score"]) + ax.errorbar( + x=mean_fit_time, + y=mean_score, + yerr=std_score, + fmt="none", + c="k", + capsize=2, + ) + ax.errorbar( + x=mean_fit_time, + y=mean_score, + xerr=std_fit_time, + fmt="none", + c="k", + capsize=2, + ) + + ax.set_xlabel("Time to fit (seconds)") + ax.set_ylabel("ROC AUC") + ax.set_title("Prediction performance / training time trade-off") + + ax.annotate( + "", + xy=(1.5, 0.98), + xytext=(8.5, 0.90), + arrowprops=dict(arrowstyle="->", mutation_scale=15), + ) + ax.text(8, 0.86, "Best time / \nperformance trade-off") + ax.legend(bbox_to_anchor=(1, 0.3)) + plt.show() + + +plot_performance_tradeoff(results) + +# %% +# The black points represent the average time to fit and AUC for each vectorizer, +# and the width of the bars represents one standard deviation +# +# The green outlier dot on the right side of the plot corresponds to the first time +# the Sentence Transformers model was downloaded and loaded into memory. +# During the subsequent cross-validation iterations, the model is simply copied, +# which reduces computation time for the remaining folds. +# +# Interestingly, |StringEncoder| has a performance remarkably similar to that of +# |GapEncoder|, while being significantly faster. 
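# %%
# For intuition, the core of |StringEncoder| can be approximated with plain
# scikit-learn pieces. This is a rough sketch for illustration only; the
# actual implementation also takes care of dataframe output and column names:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

string_encoder_sketch = Pipeline(
    [
        ("tfidf", TfidfVectorizer(ngram_range=(1, 1), analyzer="word")),
        ("tsvd", TruncatedSVD(n_components=30)),
    ]
)
# Calling ``string_encoder_sketch.fit_transform`` on a list of strings returns
# a dense array with 30 columns, one row per input string.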
+# Conclusion +# ---------- +# In conclusion, |TextEncoder| provides powerful vectorization for text, but at +# the cost of longer computation times and the need for additional dependencies, +# such as torch. \StringEncoder| represents a simpler alternative that can provide +# good performance at a fraction of the cost of more complex methods. From 2bb353dda21befe770fc46e03fbfb60a7fb3e208 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 13 Dec 2024 16:19:39 +0100 Subject: [PATCH 26/38] Simplified error checking --- skrub/_string_encoder.py | 15 --------------- skrub/tests/test_string_encoder.py | 8 ++++---- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index d7be3d299..f7f905aaa 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -116,21 +116,6 @@ def fit_transform(self, X, y=None): if self.analyzer not in ["char_wb", "char", "word"]: raise ValueError(f"Unknown analyzer {self.analyzer}") - if not all(isinstance(x, int) and x > 0 for x in self.ngram_range): - raise ValueError( - "Values in `ngram_range` must be positive integers, " - f"found {self.ngram_range} instead." - ) - if not len(self.ngram_range) == 2: - raise ValueError( - f"`ngram_range` must have length 2, found {len(self.ngram_range)}." - ) - - if not isinstance(self.n_components, int) and self.n_components > 0: - raise ValueError( - f"`n_components` must be a positive integer, found {self.n_components}" - ) - if self.vectorizer == "tfidf": self.pipe = Pipeline( [ diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 34d377981..13398347d 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -88,8 +88,8 @@ def test_hashing(encode_column, df_module): def test_error_checking(encode_column): n_components = -1 vectorizer = "notavectorizer" - ngram_range = (-1, 2) - analyzer = "noanalyzer" + ngram_range = "a" + analyzer = "notanalyzer" se = StringEncoder( n_components=n_components, @@ -104,13 +104,13 @@ def test_error_checking(encode_column): se.fit_transform(encode_column) se = StringEncoder( - ngram_range=ngram_range, + analyzer=analyzer, ) with pytest.raises(ValueError): se.fit_transform(encode_column) se = StringEncoder( - analyzer=analyzer, + ngram_range=ngram_range, ) with pytest.raises(ValueError): se.fit_transform(encode_column) From 7783565be33b7941c4b28018aa038eca8fdbc19e Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 16 Dec 2024 14:10:27 +0100 Subject: [PATCH 27/38] Fixing hashing test. 
--- skrub/tests/test_string_encoder.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 13398347d..152a77172 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -1,6 +1,10 @@ import pytest from sklearn.decomposition import TruncatedSVD -from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer +from sklearn.feature_extraction.text import ( + HashingVectorizer, + TfidfTransformer, + TfidfVectorizer, +) from sklearn.pipeline import Pipeline from skrub import _dataframe as sbd @@ -19,7 +23,7 @@ def encode_column(df_module): return df_module.make_column("col1", corpus) -def test_encoding(encode_column, df_module): +def test_tfidf_vectorizer(encode_column, df_module): ngram_range = (3, 4) analyzer = "char_wb" n_components = 2 @@ -52,7 +56,8 @@ def test_encoding(encode_column, df_module): df_module.assert_frame_equal(check_df, result) -def test_hashing(encode_column, df_module): +def test_hashing_vectorizer(encode_column, df_module): + # Testing is less strict because HashingVectorizer is not deterministic. ngram_range = (3, 4) analyzer = "char_wb" n_components = 2 @@ -60,7 +65,8 @@ def test_hashing(encode_column, df_module): #### hashing vectorizer pipe = Pipeline( [ - ("tfidf", HashingVectorizer(ngram_range=ngram_range, analyzer=analyzer)), + ("hashing", HashingVectorizer(ngram_range=ngram_range, analyzer=analyzer)), + ("tfidf", TfidfTransformer()), ("tsvd", TruncatedSVD(n_components=n_components)), ] ) @@ -82,7 +88,14 @@ def test_hashing(encode_column, df_module): check_df = sbd.pandas_convert_dtypes(check_df) result = sbd.pandas_convert_dtypes(result) - df_module.assert_frame_equal(check_df, result) + assert check_df.shape == result.shape + assert type(check_df) == type(result) + + assert len(se.pipe.named_steps) == len(pipe.named_steps) + + for name, estimator in se.pipe.named_steps.items(): + assert name in pipe.named_steps + assert isinstance(estimator, type(pipe.named_steps[name])) def test_error_checking(encode_column): From 3ff3f1ad06ee730014fb1974b896add5b1077120 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 16 Dec 2024 14:25:49 +0100 Subject: [PATCH 28/38] Making coverage happy --- skrub/tests/test_string_encoder.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 152a77172..81fcf0895 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -55,6 +55,12 @@ def test_tfidf_vectorizer(encode_column, df_module): df_module.assert_frame_equal(check_df, result) + # Making coverage happy + result_transform = se.transform(encode_column) + result_transform = sbd.pandas_convert_dtypes(result_transform) + + df_module.assert_frame_equal(result, result_transform) + def test_hashing_vectorizer(encode_column, df_module): # Testing is less strict because HashingVectorizer is not deterministic. 
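For reference, the "hashing" pipeline exercised by the updated test can be
reproduced standalone. A minimal sketch, assuming scikit-learn is installed
(the corpus below is illustrative):

    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
    from sklearn.pipeline import Pipeline

    corpus = ["first line of text", "second line of text", "yet another line"]
    pipe = Pipeline(
        [
            ("hashing", HashingVectorizer(ngram_range=(3, 4), analyzer="char_wb")),
            ("tfidf", TfidfTransformer()),
            ("tsvd", TruncatedSVD(n_components=2)),
        ]
    )
    embeddings = pipe.fit_transform(corpus)  # dense array of shape (3, 2)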
From 144ab11aec7e0dcdc48e1069b490c1eab0b36200 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 16 Dec 2024 15:19:39 +0100 Subject: [PATCH 29/38] Updating code for clarity --- ..._with_string_encoders_employee_salaries.py | 173 +++++------------- 1 file changed, 44 insertions(+), 129 deletions(-) diff --git a/examples/02_text_with_string_encoders_employee_salaries.py b/examples/02_text_with_string_encoders_employee_salaries.py index 4f5de564e..874b729cc 100644 --- a/examples/02_text_with_string_encoders_employee_salaries.py +++ b/examples/02_text_with_string_encoders_employee_salaries.py @@ -40,79 +40,21 @@ """ # %% -# The Toxicity dataset -# -------------------- -# We focus on the toxicity dataset, a corpus of 1,000 tweets, evenly balanced -# between the binary labels "Toxic" and "Not Toxic". -# Our goal is to classify each entry between these two labels, using only the -# text of the tweets as features. from skrub.datasets import fetch_employee_salaries dataset = fetch_employee_salaries() X, y = dataset.X, dataset.y -# %% -# When it comes to displaying large chunks of text, the |TableReport| is especially -# useful! Click on any cell below to expand and read the tweet in full. -from skrub import TableReport - -TableReport(X) # %% # GapEncoder # ---------- -# First, let's vectorize our text column using the |GapEncoder|, one of the -# `high cardinality categorical encoders `_ -# provided by skrub. -# As introduced in the :ref:`previous example`, the |GapEncoder| -# performs matrix factorization for topic modeling. It builds latent topics by -# capturing combinations of substrings that frequently co-occur, and encoded vectors -# correspond to topic activations. -# -# To interpret these latent topics, we select for each of them a few labels from -# the input data with the highest activations. In the example below we select 3 labels -# to summarize each topic. -from skrub import GapEncoder - -gap = GapEncoder(n_components=30) -X_trans = gap.fit_transform(X["text"]) -# Add the original text as a first column -X_trans.insert(0, "text", X["text"]) -TableReport(X_trans) - -# %% -# We can use a heatmap to highlight the highest activations, making them more visible -# for comparison against the original text and vectors above. - import matplotlib.pyplot as plt -import numpy as np - -# %% -# Now that we have an understanding of the vectors produced by the |GapEncoder|, -# let's evaluate its performance in toxicity classification. The |GapEncoder| excels -# at handling categorical columns with high cardinality, but here the column consists -# of free-form text. Sentences are generally longer, with more unique ngrams than -# high cardinality categories. -# -# To benchmark the performance of the |GapEncoder| against the toxicity dataset, -# we integrate it into a |TableVectorizer|, as introduced in the -# :ref:`previous example`, -# and create a |pipeline| by appending a |HistGradientBoostingRegressor|, which -# consumes the vectors produced by the |GapEncoder|. -# -# We set ``n_components`` to 30; however, to achieve the best performance, we would -# need to find the optimal value for this hyperparameter using either |GridSearchCV| -# or |RandomizedSearchCV|. We skip this part to keep the computation time for this -# small example. -# -# Recall that the ROC AUC is a metric that quantifies the ranking power of estimators, -# where a random estimator scores 0.5, and an oracle —providing perfect predictions— -# scores 1. 
from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.model_selection import cross_validate from sklearn.pipeline import make_pipeline -from skrub import TableVectorizer +from skrub import GapEncoder, TableVectorizer # %% @@ -131,26 +73,27 @@ def plot_box_results(named_results): plt.show() -# %% +# %% Base GapEncoder results = [] gap_pipe = make_pipeline( TableVectorizer(high_cardinality=GapEncoder(n_components=30)), HistGradientBoostingRegressor(), ) -gap_results = cross_validate(gap_pipe, X, y, scoring="r2") +gap_results = cross_validate(gap_pipe, X, y, scoring="r2", verbose=1) results.append(("GapEncoder", gap_results)) -plot_box_results(results) +# %% GapEncoder with add_words=True +gap_pipe = make_pipeline( + TableVectorizer(high_cardinality=GapEncoder(n_components=30, add_words=True)), + HistGradientBoostingRegressor(), +) +gap_results = cross_validate(gap_pipe, X, y, scoring="r2", verbose=1) +results.append(("GapEncoder - add_words", gap_results)) # %% # MinHashEncoder # -------------- -# We now compare these results with the |MinHashEncoder|, which is faster -# and produces vectors better suited for tree-based estimators like -# |HistGradientBoostingRegressor|. To do this, we can simply replace -# the |GapEncoder| with the |MinHashEncoder| in the previous pipeline -# using ``set_params()``. from sklearn.base import clone from skrub import MinHashEncoder @@ -158,26 +101,10 @@ def plot_box_results(named_results): minhash_pipe = clone(gap_pipe).set_params( **{"tablevectorizer__high_cardinality": MinHashEncoder(n_components=30)} ) -minhash_results = cross_validate(minhash_pipe, X, y, scoring="r2") +minhash_results = cross_validate(minhash_pipe, X, y, scoring="r2", verbose=1) results.append(("MinHashEncoder", minhash_results)) -plot_box_results(results) - -# %% -# Remarkably, the vectors produced by the |MinHashEncoder| offer less predictive -# power than those from the |GapEncoder| on this dataset. -# -# TextEncoder -# ----------- -# Let's now shift our focus to pre-trained deep learning encoders. Our previous -# encoders are syntactic models that we trained directly on the toxicity dataset. -# To generate more powerful vector representations for free-form text and diverse -# entries, we can instead use semantic models, such as BERT, which have been trained -# on very large datasets. -# -# |TextEncoder| enables you to integrate any Sentence Transformer model from the -# Hugging Face Hub (or from your local disk) into your |pipeline| to transform a text -# column in a dataframe. By default, |TextEncoder| uses the e5-small-v2 model. +# %% TextEncoder from skrub import TextEncoder text_encoder = TextEncoder( @@ -187,44 +114,48 @@ def plot_box_results(named_results): text_encoder_pipe = clone(gap_pipe).set_params( **{"tablevectorizer__high_cardinality": text_encoder} ) -text_encoder_results = cross_validate(text_encoder_pipe, X, y, scoring="r2") +text_encoder_results = cross_validate(text_encoder_pipe, X, y, scoring="r2", verbose=1) results.append(("TextEncoder", text_encoder_results)) -plot_box_results(results) - -# %% -# |TextEncoder| embeddings are very strong, but they are also quite expensive to -# use. A simpler, faster alternative for encoding strings is the |StringEncoder|, -# which works by first performing a tf-idf (computing vectors of rescaled word -# counts, [wiki](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) of the text, and then -# following it with TruncatedSVD to reduce the number of dimensions to, in this -# case, 30. 
+# %% StringEncoder from skrub import StringEncoder -string_encoder = StringEncoder(n_components=30, ngram_range=(1, 1), analyzer="word") +string_encoder = StringEncoder(n_components=30, ngram_range=(3, 4), analyzer="char_wb") string_encoder_pipe = clone(gap_pipe).set_params( **{"tablevectorizer__high_cardinality": string_encoder} ) string_encoder_results = cross_validate(string_encoder_pipe, X, y, scoring="r2") -results.append(("StringEncoder,word,(1,1)", string_encoder_results)) +results.append(("StringEncoder - char_wb, (3,4)", string_encoder_results)) + +# %% Drop column +drop_pipe = clone(gap_pipe).set_params(**{"tablevectorizer__high_cardinality": "drop"}) +drop_results = cross_validate(drop_pipe, X, y, scoring="r2") +results.append(("Drop", drop_results)) + + +# %% OrdinalEncoder +from sklearn.preprocessing import OrdinalEncoder + +ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value") + +ordinal_encoder_pipe = clone(gap_pipe).set_params( + **{"tablevectorizer__high_cardinality": ordinal_encoder} +) +ordinal_encoder_results = cross_validate(ordinal_encoder_pipe, X, y, scoring="r2") +results.append(("OrdinalEncoder", ordinal_encoder_results)) # %% plot_box_results(results) # %% -# The performance of the |TextEncoder| is significantly stronger than that of -# the syntactic encoders, which is expected. But how long does it take to load -# and vectorize text on a CPU using a Sentence Transformer model? Below, we display -# the tradeoff between predictive accuracy and training time. Note that since we are -# not training the Sentence Transformer model, the "fitting time" refers to the -# time taken for vectorization. +import numpy as np def plot_performance_tradeoff(results): fig, ax = plt.subplots(figsize=(5, 4), dpi=200) - markers = ["s", "o", "^", "x", "+"] + markers = ["s", "o", "^", "x", "+", "v", "1"] for idx, (name, result) in enumerate(results): ax.scatter( result["fit_time"], @@ -260,36 +191,20 @@ def plot_performance_tradeoff(results): ) ax.set_xlabel("Time to fit (seconds)") - ax.set_ylabel("ROC AUC") + ax.set_ylabel("R2") ax.set_title("Prediction performance / training time trade-off") - ax.annotate( - "", - xy=(1.5, 0.98), - xytext=(8.5, 0.90), - arrowprops=dict(arrowstyle="->", mutation_scale=15), - ) - ax.text(8, 0.86, "Best time / \nperformance trade-off") - ax.legend(bbox_to_anchor=(1, 0.3)) + # ax.annotate( + # "", + # xy=(1.5, 0.98), + # xytext=(8.5, 0.90), + # arrowprops=dict(arrowstyle="->", mutation_scale=15), + # ) + # ax.text(8, 0.86, "Best time / \nperformance trade-off") + ax.legend() plt.show() plot_performance_tradeoff(results) # %% -# The black points represent the average time to fit and AUC for each vectorizer, -# and the width of the bars represents one standard deviation -# -# The green outlier dot on the right side of the plot corresponds to the first time -# the Sentence Transformers model was downloaded and loaded into memory. -# During the subsequent cross-validation iterations, the model is simply copied, -# which reduces computation time for the remaining folds. -# -# Interestingly, |StringEncoder| has a performance remarkably similar to that of -# |GapEncoder|, while being significantly faster. -# Conclusion -# ---------- -# In conclusion, |TextEncoder| provides powerful vectorization for text, but at -# the cost of longer computation times and the need for additional dependencies, -# such as torch. \StringEncoder| represents a simpler alternative that can provide -# good performance at a fraction of the cost of more complex methods. 
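The updated benchmark above also tries ``GapEncoder(n_components=30,
add_words=True)``, which augments the character n-gram representation with
whole words when building latent topics. A minimal standalone sketch, assuming
skrub and pandas are installed (the column values are illustrative):

    import pandas as pd
    from skrub import GapEncoder

    column = pd.Series(
        ["Office of the Sheriff", "Department of Finance", "Office of Finance"],
        name="department",
    )
    encoder = GapEncoder(n_components=2, add_words=True)
    activations = encoder.fit_transform(column)  # one column per latent topic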
From ffc0d7325388aba23421a5e5b9a73995e0f4f445 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 16 Dec 2024 15:53:22 +0100 Subject: [PATCH 30/38] Updating docstring --- skrub/_string_encoder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index f7f905aaa..5565b5b59 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -13,19 +13,19 @@ class StringEncoder(SingleColumnTransformer): """Generate a lightweight string encoding of a given column using tf-idf \ - vectorization and truncated SVD. + vectorization and truncated singular value decomposition (SVD). First, apply a tf-idf vectorization of the text, then reduce the dimensionality - with a truncated SVD decomposition with the given number of parameters. + with a truncated SVD with the given number of parameters. - New features will be named `{col_name}_{component}` if the series has a name, - and `tsvd_{component}` if it does not. + New features will be named ``{col_name}_{component}`` if the series has a name, + and ``tsvd_{component}`` if it does not. Parameters ---------- n_components : int, default=30 - Number of components to be used for the PCA decomposition. Must be a - positive integer. + Number of components to be used for the singular value decomposition (SVD). + Must be a positive integer. vectorizer : str, "tfidf" or "hashing" Vectorizer to apply to the strings, either `tfidf` or `hashing` for scikit-learn TfidfVectorizer or HashingVectorizer respectively. From c5c3a732b2e0b2af17dbadaaf93c10fe425c5690 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 16 Dec 2024 15:53:47 +0100 Subject: [PATCH 31/38] Fixing a bug --- examples/02_text_with_string_encoders_employee_salaries.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/02_text_with_string_encoders_employee_salaries.py b/examples/02_text_with_string_encoders_employee_salaries.py index 874b729cc..1f557ebeb 100644 --- a/examples/02_text_with_string_encoders_employee_salaries.py +++ b/examples/02_text_with_string_encoders_employee_salaries.py @@ -135,9 +135,12 @@ def plot_box_results(named_results): # %% OrdinalEncoder +import numpy as np from sklearn.preprocessing import OrdinalEncoder -ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value") +ordinal_encoder = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan +) ordinal_encoder_pipe = clone(gap_pipe).set_params( **{"tablevectorizer__high_cardinality": ordinal_encoder} From b103ca6ea6f8ef31d5e3f50cbb4b1403646c1eee Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:56:39 +0100 Subject: [PATCH 32/38] Update skrub/_string_encoder.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Dockès --- skrub/_string_encoder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index f7f905aaa..48d468f23 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -47,8 +47,6 @@ class StringEncoder(SingleColumnTransformer): Encode string columns as a numeric array with the minhash method. GapEncoder : Encode string columns by constructing latent topics. - SimilarityEncoder : - Encode string columns as a numeric array with n-gram string similarity. TextEncoder : Encode string columns using pre-trained language models. 
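The fix in PATCH 31 above is needed because scikit-learn's ``OrdinalEncoder``
raises an error when ``handle_unknown="use_encoded_value"`` is passed without
an explicit ``unknown_value``. A minimal sketch of the corrected construction:

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    ordinal_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan
    )
    # categories unseen during fit are now encoded as NaN instead of raising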
From d9242fa6ec3608052923de23f1268dc334d440b1 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 16 Dec 2024 15:57:03 +0100 Subject: [PATCH 33/38] Updating docstring --- skrub/_string_encoder.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 5565b5b59..72b60a754 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -31,15 +31,15 @@ class StringEncoder(SingleColumnTransformer): scikit-learn TfidfVectorizer or HashingVectorizer respectively. ngram_range : tuple of (int, int) pairs, default=(3,4) - Whether the feature should be made of word or character n-grams. - Option ‘char_wb’ creates character n-grams only from text inside word - boundaries; n-grams at the edges of words are padded with space. - - analyzer : str, "char", "word" or "char_wb", default="char_wb" The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n - will be used. For example an `ngram_range` of `(1, 1)` means only unigrams, - `(1, 2)` means unigrams and bigrams, and `(2, 2)` means only bigrams. + will be used. For example an ``ngram_range`` of ``(1, 1)`` means only unigrams, + ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means only bigrams. + + analyzer : str, "char", "word" or "char_wb", default="char_wb" + Whether the feature should be made of word or character n-grams. + Option ``char_wb`` creates character n-grams only from text inside word + boundaries; n-grams at the edges of words are padded with space. See Also -------- From 64c43c3b73f414de0e1f9d86296ca9c2ec18bba6 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 17 Dec 2024 11:15:58 +0100 Subject: [PATCH 34/38] Updating tests and code to address corner cases --- skrub/_string_encoder.py | 64 +++++++++++++++++++----------- skrub/tests/test_string_encoder.py | 36 +++++++++++++++-- 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 1b5476fb7..7c119619f 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -1,3 +1,5 @@ +import warnings + from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import ( HashingVectorizer, @@ -115,41 +117,50 @@ def fit_transform(self, X, y=None): raise ValueError(f"Unknown analyzer {self.analyzer}") if self.vectorizer == "tfidf": - self.pipe = Pipeline( + self.vectorizer_ = TfidfVectorizer( + ngram_range=self.ngram_range, analyzer=self.analyzer + ) + elif self.vectorizer == "hashing": + self.vectorizer_ = Pipeline( [ ( - "tfidf", - TfidfVectorizer( + "hashing", + HashingVectorizer( ngram_range=self.ngram_range, analyzer=self.analyzer ), ), - ("tsvd", TruncatedSVD(n_components=self.n_components)), + ("tfidf", TfidfTransformer()), ] ) - - elif self.vectorizer == "hashing": - pipe_elements = [ - ( - "hashing", - HashingVectorizer( - ngram_range=self.ngram_range, analyzer=self.analyzer - ), - ), - ] - pipe_elements.append(("tfidf", TfidfTransformer())) - pipe_elements.append(("tsvd", TruncatedSVD(n_components=self.n_components))) - self.pipe = Pipeline(pipe_elements) else: raise ValueError(f"Unknown vectorizer {self.vectorizer}.") - name = sbd.name(X) - if not name: - name = "tsvd" - self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components)] + X_out = self.vectorizer_.fit_transform(sbd.to_numpy(X)) - result = self.pipe.fit_transform(sbd.to_numpy(X)) + if (min_shape := min(X_out.shape)) >= 
self.n_components:
+            self.tsvd_ = TruncatedSVD(n_components=self.n_components)
+            result = self.tsvd_.fit_transform(X_out)
+        else:
+            warnings.warn(
+                f"The matrix shape is {(X_out.shape)}, and its minimum is "
+                f"{min_shape}, which is too small to fit a truncated SVD with "
+                f"n_components={self.n_components}. "
+                "The embeddings will be truncated by keeping the first "
+                f"{self.n_components} dimensions instead. "
+            )
+            # self.n_components can be greater than the number
+            # of dimensions of result.
+            # Therefore, self.n_components_ below stores the resulting
+            # number of dimensions of result.
+            result = X_out[:, : self.n_components].toarray()
 
         self._is_fitted = True
+        self.n_components_ = result.shape[1]
+
+        name = sbd.name(X)
+        if not name:
+            name = "tsvd"
+        self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components_)]
 
         return self._transform(X, result)
 
@@ -163,12 +174,17 @@ def transform(self, X):
 
         Returns
         -------
-        X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components)
+        result: Pandas or Polars dataframe with shape (len(X), tsvd_n_components)
             The embedding representation of the input.
         """
         check_is_fitted(self)
 
-        result = self.pipe.transform(sbd.to_numpy(X))
+        X_out = self.vectorizer_.transform(sbd.to_numpy(X))
+        if hasattr(self, "tsvd_"):
+            result = self.tsvd_.transform(X_out)
+        else:
+            result = X_out[:, : self.n_components].toarray()
+
         return self._transform(X, result)
 
     def _transform(self, X, result):
diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py
index 81fcf0895..0c1f1fb5b 100644
--- a/skrub/tests/test_string_encoder.py
+++ b/skrub/tests/test_string_encoder.py
@@ -1,4 +1,5 @@
 import pytest
+from sklearn.base import clone
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import (
     HashingVectorizer,
@@ -95,11 +96,11 @@ def test_hashing_vectorizer(encode_column, df_module):
     result = sbd.pandas_convert_dtypes(result)
 
     assert check_df.shape == result.shape
-    assert type(check_df) == type(result)
+    assert isinstance(check_df, type(result))
 
-    assert len(se.pipe.named_steps) == len(pipe.named_steps)
+    assert all(hasattr(se, x) for x in ["tsvd_", "vectorizer_"])
 
-    for name, estimator in se.pipe.named_steps.items():
+    for name, estimator in se.vectorizer_.named_steps.items():
         assert name in pipe.named_steps
         assert isinstance(estimator, type(pipe.named_steps[name]))
 
@@ -159,3 +160,32 @@ def test_get_feature_names_out(encode_column, df_module):
     encoder.fit(X)
     expected_columns = ["tsvd_0", "tsvd_1", "tsvd_2", "tsvd_3"]
     assert encoder.get_feature_names_out() == expected_columns
+
+
+def test_n_components(df_module):
+    ngram_range = (3, 4)
+    analyzer = "char_wb"
+    n_components = 2
+
+    encoder = StringEncoder(
+        n_components=n_components,
+        vectorizer="tfidf",
+        ngram_range=ngram_range,
+        analyzer=analyzer,
+    )
+
+    X = df_module.make_column("", ["hello sir", "hola que tal"])
+
+    encoder_2 = clone(encoder).set_params(n_components=2).fit(X)
+    for meth in ("fit_transform", "transform"):
+        X_out = getattr(encoder_2, meth)(X)
+        assert sbd.shape(X_out)[1] == 2
+        assert encoder_2.n_components_ == 2
+
+    encoder_30 = clone(encoder).set_params(n_components=30)
+    with pytest.warns(UserWarning, match="The embeddings will be truncated"):
+        for meth in ("fit_transform", "transform"):
+            X_out = getattr(encoder_30, meth)(X)
+            assert not hasattr(encoder_30, "tsvd_")
+            assert sbd.shape(X_out)[1] == 30
+            assert encoder_30.n_components_ == 30

From eb0a13187eb6e2a5759609d67b66b5e937d7320a Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Tue, 17 Dec 2024 12:01:27 +0100
Subject: [PATCH 35/38] Updating docs for encoders

---
 doc/encoding.rst | 48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/doc/encoding.rst b/doc/encoding.rst
index 31bb03524..7ebaa073a 100644
--- a/doc/encoding.rst
+++ b/doc/encoding.rst
@@ -10,10 +10,22 @@ Encoding or vectorizing creates numerical features from the data,
 converting dataframes, strings, dates... Different encoders are suited for
 different types of data.
 
-.. _dirty_categories:
+Summary
+.......
+:class:`StringEncoder` should be used in most cases when working with high-cardinality
+features, as it provides good performance on both categorical features (e.g.,
+work titles, city names etc.) and free-flowing text (reviews, comments etc.),
+while being very efficient and quick to fit.
+
+:class:`GapEncoder` provides better performance on dirty categories, while
+:class:`TextEncoder` works better on free-flowing text. However, both encoders
+are much slower to execute, and in the case of ``TextEncoder``, additional
+dependencies are needed.
+
+:class:`MinHashEncoder` may scale better in case of large datasets, but its
+performance is in general not as good as that of the other methods.
 
-Encoding string columns
--------------------------
+.. _dirty_categories:
 
 Non-normalized entries and dirty categories
 ............................................
@@ -59,11 +71,31 @@ Text with diverse entries
 
 When strings in a column are not dirty categories, but rather diverse
 entries of text (names, open-ended or free-flowing text) it is useful to
-use language models of various sizes to represent string columns as embeddings.
-Depending on the task and dataset, this approach may lead to significant improvements
-in the quality of predictions, albeit with potential increases in memory usage and computation time.
+use methods that can address the variety of terms that can appear. Skrub provides
+two encoders that handle these cases by representing string columns as embeddings:
+:class:`TextEncoder` and :class:`StringEncoder`.
 
-Skrub integrates these language models as scikit-learn transformers, allowing them
+Depending on the task and dataset, this approach may lead to significant improvements
+in the quality of predictions, albeit with potential increases in memory usage
+and computation time in the case of :class:`TextEncoder`.
+
+Vectorizing text
+----------------
+A lightweight solution for handling diverse strings is to first apply a
+`tf-idf vectorization `_, then
+follow it with a dimensionality reduction algorithm such as
+`TruncatedSVD `_
+to limit the number of features: the :class:`StringEncoder` implements this
+operation.
+
+In simpler terms, :class:`StringEncoder` builds a sparse matrix that counts the
+number of times each word appears in all documents (where a document in this case
+is a string in the column to encode), and then reduces the size of the sparse
+matrix to a limited number of features for the training operation.
+
+Using language models
+---------------------
+Skrub integrates language models as scikit-learn transformers, allowing them
 to be easily plugged into :class:`TableVectorizer` and
 :class:`~sklearn.pipeline.Pipeline`.
 
@@ -98,7 +130,7 @@ like any other pre-trained model. For more information, see the
 
 Encoding dates
---------------
+..............
The :class:`DatetimeEncoder` encodes date and time: it represent them as time in seconds since a fixed date, but also added features useful to From 92683311d660399768fd9afa957a58b0781213ce Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:56:40 +0100 Subject: [PATCH 36/38] Delete examples/02_text_with_string_encoders_employee_salaries.py --- ..._with_string_encoders_employee_salaries.py | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 examples/02_text_with_string_encoders_employee_salaries.py diff --git a/examples/02_text_with_string_encoders_employee_salaries.py b/examples/02_text_with_string_encoders_employee_salaries.py deleted file mode 100644 index 1f557ebeb..000000000 --- a/examples/02_text_with_string_encoders_employee_salaries.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -.. _example_string_encoders: - -===================================================== -Various string encoders: a sentiment analysis example -===================================================== - -In this example, we explore the performance of string and categorical encoders -available in skrub. - -.. |GapEncoder| replace:: - :class:`~skrub.GapEncoder` - -.. |MinHashEncoder| replace:: - :class:`~skrub.MinHashEncoder` - -.. |TextEncoder| replace:: - :class:`~skrub.TextEncoder` - -.. |StringEncoder| replace:: - :class:`~skrub.StringEncoder` - -.. |TableReport| replace:: - :class:`~skrub.TableReport` - -.. |TableVectorizer| replace:: - :class:`~skrub.TableVectorizer` - -.. |pipeline| replace:: - :class:`~sklearn.pipeline.Pipeline` - -.. |HistGradientBoostingRegressor| replace:: - :class:`~sklearn.ensemble.HistGradientBoostingRegressor` - -.. |RandomizedSearchCV| replace:: - :class:`~sklearn.model_selection.RandomizedSearchCV` - -.. 
-    :class:`~sklearn.model_selection.GridSearchCV`
-"""
-
-# %%
-from skrub.datasets import fetch_employee_salaries
-
-dataset = fetch_employee_salaries()
-X, y = dataset.X, dataset.y
-
-
-# %%
-# GapEncoder
-# ----------
-import matplotlib.pyplot as plt
-from sklearn.ensemble import HistGradientBoostingRegressor
-from sklearn.model_selection import cross_validate
-from sklearn.pipeline import make_pipeline
-
-from skrub import GapEncoder, TableVectorizer
-
-
-# %%
-def plot_box_results(named_results):
-    fig, ax = plt.subplots()
-    names, scores = zip(
-        *[(name, result["test_score"]) for name, result in named_results]
-    )
-    ax.boxplot(scores, vert=False)
-    ax.set_yticks(range(1, len(names) + 1), labels=list(names), size=12)
-    ax.set_xlabel("R2 score", size=14)
-    plt.title(
-        "R2 score across folds (higher is better)",
-        size=14,
-    )
-    plt.show()
-
-
-# %% Base GapEncoder
-results = []
-
-gap_pipe = make_pipeline(
-    TableVectorizer(high_cardinality=GapEncoder(n_components=30)),
-    HistGradientBoostingRegressor(),
-)
-gap_results = cross_validate(gap_pipe, X, y, scoring="r2", verbose=1)
-results.append(("GapEncoder", gap_results))
-
-# %% GapEncoder with add_words=True
-gap_pipe = make_pipeline(
-    TableVectorizer(high_cardinality=GapEncoder(n_components=30, add_words=True)),
-    HistGradientBoostingRegressor(),
-)
-gap_results = cross_validate(gap_pipe, X, y, scoring="r2", verbose=1)
-results.append(("GapEncoder - add_words", gap_results))
-
-# %%
-# MinHashEncoder
-# --------------
-from sklearn.base import clone
-
-from skrub import MinHashEncoder
-
-minhash_pipe = clone(gap_pipe).set_params(
-    **{"tablevectorizer__high_cardinality": MinHashEncoder(n_components=30)}
-)
-minhash_results = cross_validate(minhash_pipe, X, y, scoring="r2", verbose=1)
-results.append(("MinHashEncoder", minhash_results))
-
-# %% TextEncoder
-from skrub import TextEncoder
-
-text_encoder = TextEncoder(
-    "sentence-transformers/paraphrase-albert-small-v2",
-    device="cpu",
-)
-text_encoder_pipe = clone(gap_pipe).set_params(
-    **{"tablevectorizer__high_cardinality": text_encoder}
-)
-text_encoder_results = cross_validate(text_encoder_pipe, X, y, scoring="r2", verbose=1)
-results.append(("TextEncoder", text_encoder_results))
-
-# %% StringEncoder
-from skrub import StringEncoder
-
-string_encoder = StringEncoder(n_components=30, ngram_range=(3, 4), analyzer="char_wb")
-
-string_encoder_pipe = clone(gap_pipe).set_params(
-    **{"tablevectorizer__high_cardinality": string_encoder}
-)
-string_encoder_results = cross_validate(string_encoder_pipe, X, y, scoring="r2")
-results.append(("StringEncoder - char_wb, (3,4)", string_encoder_results))
-
-# %% Drop column
-drop_pipe = clone(gap_pipe).set_params(**{"tablevectorizer__high_cardinality": "drop"})
-drop_results = cross_validate(drop_pipe, X, y, scoring="r2")
-results.append(("Drop", drop_results))
-
-
-# %% OrdinalEncoder
-import numpy as np
-from sklearn.preprocessing import OrdinalEncoder
-
-ordinal_encoder = OrdinalEncoder(
-    handle_unknown="use_encoded_value", unknown_value=np.nan
-)
-
-ordinal_encoder_pipe = clone(gap_pipe).set_params(
-    **{"tablevectorizer__high_cardinality": ordinal_encoder}
-)
-ordinal_encoder_results = cross_validate(ordinal_encoder_pipe, X, y, scoring="r2")
-results.append(("OrdinalEncoder", ordinal_encoder_results))
-
-# %%
-plot_box_results(results)
-
-
-# %%
-import numpy as np
-
-
-def plot_performance_tradeoff(results):
-    fig, ax = plt.subplots(figsize=(5, 4), dpi=200)
-    markers = ["s", "o", "^", "x", "+", "v", "1"]
-    for idx, (name, result) in enumerate(results):
-        ax.scatter(
-            result["fit_time"],
-            result["test_score"],
-            label=name,
-            marker=markers[idx],
-        )
-        mean_fit_time = np.mean(result["fit_time"])
-        mean_score = np.mean(result["test_score"])
-        ax.scatter(
-            mean_fit_time,
-            mean_score,
-            color="k",
-            marker=markers[idx],
-        )
-        std_fit_time = np.std(result["fit_time"])
-        std_score = np.std(result["test_score"])
-        ax.errorbar(
-            x=mean_fit_time,
-            y=mean_score,
-            yerr=std_score,
-            fmt="none",
-            c="k",
-            capsize=2,
-        )
-        ax.errorbar(
-            x=mean_fit_time,
-            y=mean_score,
-            xerr=std_fit_time,
-            fmt="none",
-            c="k",
-            capsize=2,
-        )
-
-    ax.set_xlabel("Time to fit (seconds)")
-    ax.set_ylabel("R2")
-    ax.set_title("Prediction performance / training time trade-off")
-
-    # ax.annotate(
-    #     "",
-    #     xy=(1.5, 0.98),
-    #     xytext=(8.5, 0.90),
-    #     arrowprops=dict(arrowstyle="->", mutation_scale=15),
-    # )
-    # ax.text(8, 0.86, "Best time / \nperformance trade-off")
-    ax.legend()
-    plt.show()
-
-
-plot_performance_tradeoff(results)
-
-# %%

From 49553d982d2ddb6266e14b038a2628b02aedc951 Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Thu, 19 Dec 2024 14:36:25 +0100
Subject: [PATCH 37/38] Adding StringEncoder to doc index

---
 doc/reference/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/reference/index.rst b/doc/reference/index.rst
index 8d7646848..dfb151114 100644
--- a/doc/reference/index.rst
+++ b/doc/reference/index.rst
@@ -46,6 +46,7 @@ Encoding a column
    DatetimeEncoder
    ToCategorical
    ToDatetime
+   StringEncoder
 
 .. autosummary::
    :toctree: generated/

From a0afc68dab5993ae1ee21d681b00df0181f6ba82 Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo
Date: Thu, 19 Dec 2024 14:40:07 +0100
Subject: [PATCH 38/38] Doc fixes

---
 examples/02_text_with_string_encoders.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py
index 54a4ed71e..e92c17feb 100644
--- a/examples/02_text_with_string_encoders.py
+++ b/examples/02_text_with_string_encoders.py
@@ -61,7 +61,7 @@
 
 # %%
 # GapEncoder
-# ----------
+# ^^^^^^^^^^
 # First, let's vectorize our text column using the |GapEncoder|, one of the
 # `high cardinality categorical encoders `_
 # provided by skrub.
@@ -177,7 +177,7 @@ def plot_box_results(named_results):
 
 # %%
 # MinHashEncoder
-# --------------
+# ^^^^^^^^^^^^^^
 # We now compare these results with the |MinHashEncoder|, which is faster
 # and produces vectors better suited for tree-based estimators like
 # |HistGradientBoostingClassifier|. To do this, we can simply replace
@@ -200,7 +200,7 @@ def plot_box_results(named_results):
 # power than those from the |GapEncoder| on this dataset.
 #
 # TextEncoder
-# -----------
+# ^^^^^^^^^^^
 # Let's now shift our focus to pre-trained deep learning encoders. Our previous
 # encoders are syntactic models that we trained directly on the toxicity dataset.
 # To generate more powerful vector representations for free-form text and diverse
@@ -225,10 +225,12 @@ plot_box_results(results)
 
 # %%
+# StringEncoder
+# ^^^^^^^^^^^^^
 # |TextEncoder| embeddings are very strong, but they are also quite expensive to
 # use. A simpler, faster alternative for encoding strings is the |StringEncoder|,
 # which works by first performing a tf-idf (computing vectors of rescaled word
-# counts, [wiki](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) of the text, and then
+# counts of the text; see `tf-idf <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_), and then
 # following it with TruncatedSVD to reduce the number of dimensions to, in this
 # case, 30.
 from skrub import StringEncoder
@@ -318,6 +320,7 @@ def plot_performance_tradeoff(results):
 #
 # Interestingly, |StringEncoder| has performance remarkably similar to that of
 # |GapEncoder|, while being significantly faster.
+#
 # Conclusion
 # ----------
 # In conclusion, |TextEncoder| provides powerful vectorization for text, but at