Skip to content

Commit

Permalink
docs update
Browse files Browse the repository at this point in the history
  • Loading branch information
dcolinmorgan committed Feb 5, 2024
1 parent 1caed39 commit 6876c6b
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 21 deletions.
4 changes: 2 additions & 2 deletions cu_cat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from ._datetime_encoder import DatetimeEncoder
from ._dep_manager import DepManager
from ._gap_encoder import GapEncoder # type: ignore
from ._table_vectorizer import SuperVectorizer, TableVectorizer
from ._table_vectorizer import TableVectorizer

from ._version import get_versions

Expand All @@ -29,7 +29,7 @@
__all__ = [
"DatetimeEncoder",
"GapEncoder",
"SuperVectorizer",
"TableVectorizer",
"TableVectorizer",
"DepManager",
"deduplicate",
Expand Down
15 changes: 15 additions & 0 deletions cu_cat/_dep_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,21 @@


class DepManager:
"""Easily keep track of dependencies in a project.
Parameters
----------
package : str
The name of the package to import.
Attributes
----------
pkgs : dict
A dictionary of the imported packages.
Examples
--------
>>> deps = DepManager()
>>> numpy = deps.numpy
"""
def __init__(self):
self.pkgs = {}

Expand Down
6 changes: 1 addition & 5 deletions cu_cat/_gap_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,11 +760,7 @@ class GapEncoder(BaseEstimator, TransformerMixin):
See Also
--------
:class:`~cu_cat.MinHashEncoder` :
Encode string columns as a numeric array with the minhash method.
:class:`~cu_cat.SimilarityEncoder` :
Encode string columns as a numeric array with n-gram string similarity.
:class:`~cu_cat.deduplicate` :
:class:`~cu_cat.deduplicate` :
Deduplicate data by hierarchically clustering similar strings.
References
Expand Down
47 changes: 41 additions & 6 deletions cu_cat/_table_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,47 @@ class TableVectorizer(ColumnTransformer):
columns might be shuffled, e.g., ['job', 'year', 'name'], but every call
to :func:`TableVectorizer.transform` on this instance will return this
order.
Examples
--------
First we can import the necessary modules and create a sample dataset:
>>> from time import time
>>> import pandas as pd
>>> from cu_cat._table_vectorizer import TableVectorizer as cu_TableVectorizer
>>> from sklearn.datasets import fetch_20newsgroups
Let's subsample from the newsgroup, non-normalized data:
>>> n_samples = 2000 # speed boost improves as n_samples increases, to the limit of gpu mem
>>> news, _ = fetch_20newsgroups(
...     shuffle=True,
...     random_state=1,
...     remove=("headers", "footers", "quotes"),
...     return_X_y=True,
... )
For fun, let's time :meth:`~cu_cat.TableVectorizer.fit_transform`:
>>> news = news[:n_samples]
>>> news=pd.DataFrame(news)
>>> table_vec = cu_TableVectorizer()
>>> t = time()
>>> aa = table_vec.fit_transform((news))
>>> ct = time() - t
Now let's compare with the same operation on the CPU:
>>> from dirty_cat._table_vectorizer import TableVectorizer as dirty_TableVectorizer
>>> t = time()
>>> bb = dirty_TableVectorizer().fit_transform(news)
>>> dt = time() - t
>>> print(f"cu_cat: {ct:.2f}s, dirty_cat: {dt:.2f}s, speedup: {dt/ct:.2f}x")
cu_cat: 58.76s, dirty_cat: 84.54s, speedup: 1.44x
"""

transformers_: List[Tuple[str, Union[str, TransformerMixin], List[str]]]
Expand Down Expand Up @@ -874,9 +915,3 @@ def get_feature_names(self, input_features=None) -> List[str]:
return self.get_feature_names_out(input_features)

#### AttributeError: Transformer numeric (type StandardScaler) does not provide get_feature_names.

@deprecated("use TableVectorizer instead.")
class SuperVectorizer(TableVectorizer):
    """Legacy alias for :class:`TableVectorizer`.

    This subclass adds no attributes or methods of its own; it only keeps
    the former ``SuperVectorizer`` name available. New code should use
    :class:`TableVectorizer` directly.
    """
17 changes: 9 additions & 8 deletions docs/source/cu_cat.rst
Original file line number Diff line number Diff line change
@@ -1,31 +1,32 @@
Dependency_Manager

Gap_Encoder
==================

.. automodule:: cu_cat._dep_manager
.. automodule:: cu_cat._gap_encoder
:members:
:undoc-members:
:show-inheritance:

Gap_Encoder
Table_Vectorizer
==================

.. automodule:: cu_cat._gap_encoder
.. automodule:: cu_cat._table_vectorizer
:members:
:undoc-members:
:show-inheritance:

Table_Vectorizer
DeDuplicater
==================

.. automodule:: cu_cat._table_vectorizer
.. automodule:: cu_cat._deduplicater
:members:
:undoc-members:
:show-inheritance:

Versioneer
Dependency_Manager
==================

.. automodule:: cu_cat._version
.. automodule:: cu_cat._dep_manager
:members:
:undoc-members:
:show-inheritance:

0 comments on commit 6876c6b

Please sign in to comment.