diff --git a/cu_cat/__init__.py b/cu_cat/__init__.py index c81a06de9..8c3448880 100644 --- a/cu_cat/__init__.py +++ b/cu_cat/__init__.py @@ -18,7 +18,7 @@ from ._datetime_encoder import DatetimeEncoder from ._dep_manager import DepManager from ._gap_encoder import GapEncoder # type: ignore -from ._table_vectorizer import SuperVectorizer, TableVectorizer +from ._table_vectorizer import TableVectorizer from ._version import get_versions @@ -29,7 +29,7 @@ __all__ = [ "DatetimeEncoder", "GapEncoder", - "SuperVectorizer", + "TableVectorizer", "TableVectorizer", "DepManager", "deduplicate", diff --git a/cu_cat/_dep_manager.py b/cu_cat/_dep_manager.py index b9eff79a4..49923ce55 100644 --- a/cu_cat/_dep_manager.py +++ b/cu_cat/_dep_manager.py @@ -2,6 +2,21 @@ class DepManager: + """Easily keep track of dependencies in a project. + Parameters + ---------- + package : str + The name of the package to import. + + Attributes + ---------- + pkgs : dict + A dictionary of the imported packages. + + Examples + -------- + >>> numpy = deps.numpy + """ def __init__(self): self.pkgs = {} diff --git a/cu_cat/_gap_encoder.py b/cu_cat/_gap_encoder.py index 9a21d7194..c44417977 100644 --- a/cu_cat/_gap_encoder.py +++ b/cu_cat/_gap_encoder.py @@ -760,11 +760,7 @@ class GapEncoder(BaseEstimator, TransformerMixin): See Also -------- - :class:`~cu_cat.MinHashEncoder` : - Encode string columns as a numeric array with the minhash method. - :class:`~cu_cat.SimilarityEncoder` : - Encode string columns as a numeric array with n-gram string similarity. - :class:`~cu_cat.deduplicate` : + :class:`~cu_cat.deduplicate` : Deduplicate data by hierarchically clustering similar strings. 
References diff --git a/cu_cat/_table_vectorizer.py b/cu_cat/_table_vectorizer.py index e22cbcc5d..fc24827b8 100644 --- a/cu_cat/_table_vectorizer.py +++ b/cu_cat/_table_vectorizer.py @@ -372,6 +372,47 @@ class TableVectorizer(ColumnTransformer): columns might be shuffled, e.g., ['job', 'year', 'name'], but every call to :func:`TableVectorizer.transform` on this instance will return this order. + + Examples + -------- + + First we can import the necessary modules and create a sample dataset: + + >>> from time import time + >>> from cu_cat._table_vectorizer import TableVectorizer as cu_TableVectorizer + >>> from sklearn.datasets import fetch_20newsgroups + + Let's subsample from the non-normalized 20 newsgroups data: + + >>> n_samples = 2000 # speed boost improves as n_samples increases, up to the limit of GPU memory + + >>> news, _ = fetch_20newsgroups( + shuffle=True, + random_state=1, + remove=("headers", "footers", "quotes"), + return_X_y=True, + ) + + For fun, let's time :meth:`~cu_cat.TableVectorizer.fit_transform`: + + >>> news = news[:n_samples] + >>> news=pd.DataFrame(news) + >>> table_vec = cu_TableVectorizer() + >>> t = time() + >>> aa = table_vec.fit_transform((news)) + >>> ct = time() - t + + Now let's compare with the same operation on the CPU: + + >>> from dirty_cat._table_vectorizer import TableVectorizer as dirty_TableVectorizer + + >>> t = time() + >>> bb = dirty_TableVectorizer().fit_transform(news) + >>> dt = time() - t + + >>> print(f"cu_cat: {ct:.2f}s, dirty_cat: {dt:.2f}s, speedup: {dt/ct:.2f}x") + + cu_cat: 58.76s, dirty_cat: 84.54s, speedup: 1.44x """ transformers_: List[Tuple[str, Union[str, TransformerMixin], List[str]]] @@ -874,9 +915,3 @@ def get_feature_names(self, input_features=None) -> List[str]: return self.get_feature_names_out(input_features) #### AttributeError: Transformer numeric (type StandardScaler) does not provide get_feature_names. 
- -@deprecated("use TableVectorizer instead.") -class SuperVectorizer(TableVectorizer): - """Deprecated name of TableVectorizer.""" - - pass diff --git a/docs/source/cu_cat.rst b/docs/source/cu_cat.rst index 2bba5d6d4..603a9fc22 100644 --- a/docs/source/cu_cat.rst +++ b/docs/source/cu_cat.rst @@ -1,31 +1,32 @@ -Dependency_Manager + +Gap_Encoder ================== -.. automodule:: cu_cat._dep_manager +.. automodule:: cu_cat._gap_encoder :members: :undoc-members: :show-inheritance: -Gap_Encoder +Table_Vectorizer ================== -.. automodule:: cu_cat._gap_encoder +.. automodule:: cu_cat._table_vectorizer :members: :undoc-members: :show-inheritance: -Table_Vectorizer +DeDuplicater ================== -.. automodule:: cu_cat._table_vectorizer +.. automodule:: cu_cat._deduplicater :members: :undoc-members: :show-inheritance: -Versioneer +Dependency_Manager ================== -.. automodule:: cu_cat._version +.. automodule:: cu_cat._dep_manager :members: :undoc-members: :show-inheritance: