diff --git a/cu_cat/VERSION.txt b/cu_cat/VERSION.txt
index 66f742f87..3dccf65f0 100644
--- a/cu_cat/VERSION.txt
+++ b/cu_cat/VERSION.txt
@@ -1 +1 @@
-0.07.09
+0.07.10
diff --git a/cu_cat/_table_vectorizer.py b/cu_cat/_table_vectorizer.py
index 2c42ba56e..78836257c 100644
--- a/cu_cat/_table_vectorizer.py
+++ b/cu_cat/_table_vectorizer.py
@@ -635,12 +635,12 @@ def fit_transform(self, X, y=None):
         _nunique_values = {  # Cache results
             col: X[col].nunique() for col in categorical_columns
         }
-        low_cardinality_columns = [
+        low_card_cat_columns = [
             col
             for col in categorical_columns
             if _nunique_values[col] < self.cardinality_threshold
         ]
-        high_cardinality_columns = [
+        high_card_cat_columns = [
             col
             for col in categorical_columns
             if _nunique_values[col] >= self.cardinality_threshold
@@ -654,15 +654,15 @@ def fit_transform(self, X, y=None):
             all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [  # type: ignore
                 ("numeric", self.numerical_transformer, numeric_columns),
                 ("datetime", self.datetime_transformer_, datetime_columns),
-                ("low_cardinarlity", self.low_card_cat_transformer_, low_cardinality_columns),
-                ("high_cardinarlity", self.high_card_cat_transformer_, high_cardinality_columns),
+                ("low_card_str", self.low_card_cat_transformer_, low_card_cat_columns),
+                ("high_card_str", self.high_card_cat_transformer_, high_card_cat_columns),
             ]
         else:
             all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [  # type: ignore
                 ("numeric", self.numerical_transformer, numeric_columns),
                 # ("datetime", self.datetime_transformer_, datetime_columns), ## commented out if in dt format so pyg can handle
-                ("low_cardinarlity", self.low_card_cat_transformer_, low_cardinality_columns),
-                ("high_cardinarlity", self.high_card_cat_transformer_, high_cardinality_columns),
+                ("low_card_str", self.low_card_cat_transformer_, low_card_cat_columns),
+                ("high_card_str", self.high_card_cat_transformer_, high_card_cat_columns),
             ]
         # We will now filter this list, by keeping only the ones with:
         # - at least one column
@@ -768,6 +768,7 @@ def transform(self, X) -> np.ndarray:
                 f"array seen during fit. Got {X.shape[1]} "
                 f"columns, expected {len(self.columns_)}"
             )
+        self.Xt_= df_type(X)
         X, y = make_safe_gpu_dataframes(X, None, self.engine_)
         if not isinstance(X, pd.DataFrame) and not 'cudf' in self.Xt_:
             X = pd.DataFrame(X)
@@ -800,7 +801,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]:
         typing.List[str]
             Feature names.
""" - if 'cudf' not in self.Xt_ and not deps.cudf: + if not deps.cudf: if parse_version(sklearn_version) > parse_version("1.0"): ct_feature_names = super().get_feature_names() else: diff --git a/cu_cat/tests/test_table_vectorizer.py b/cu_cat/tests/test_table_vectorizer.py index d0ec36064..47113e626 100644 --- a/cu_cat/tests/test_table_vectorizer.py +++ b/cu_cat/tests/test_table_vectorizer.py @@ -201,15 +201,15 @@ # # Warning: order-dependant # expected_transformers_df = { # "numeric": ["int", "float"], -# "low_cardinarlity": ["str1", "cat1"], -# "high_cardinarlity": ["str2", "cat2"], +# "low_card_cat": ["str1", "cat1"], +# "high_card_cat": ["str2", "cat2"], # } # vectorizer_base.fit_transform(X) # check_same_transformers(expected_transformers_df, vectorizer_base.transformers_) # # Test with higher cardinality threshold and no numeric transformer # expected_transformers_2 = { -# "low_cardinarlity": ["str1", "str2", "cat1", "cat2"], +# "low_card_cat": ["str1", "str2", "cat1", "cat2"], # "numeric": ["int", "float"], # } # vectorizer_default = TableVectorizer() # Using default values @@ -220,8 +220,8 @@ # arr = X.to_numpy() # # Instead of the columns names, we'll have the column indices. # expected_transformers_np_no_cast = { -# "low_cardinarlity": [2, 4], -# "high_cardinarlity": [3, 5], +# "low_card_cat": [2, 4], +# "high_card_cat": [3, 5], # "numeric": [0, 1], # } # vectorizer_base.fit_transform(arr) @@ -231,7 +231,7 @@ # # Test with single column dataframe # expected_transformers_series = { -# "low_cardinarlity": ["cat1"], +# "low_card_cat": ["cat1"], # } # vectorizer_base.fit_transform(X[["cat1"]]) # check_same_transformers(expected_transformers_series, vectorizer_base.transformers_) @@ -246,8 +246,8 @@ # X_str = X.astype("object") # # With pandas # expected_transformers_plain = { -# "high_cardinarlity": ["str2", "cat2"], -# "low_cardinarlity": ["str1", "cat1"], +# "high_card_cat": ["str2", "cat2"], +# "low_card_cat": ["str1", "cat1"], # "numeric": ["int", "float"], # } # vectorizer_cast.fit_transform(X_str) @@ -255,8 +255,8 @@ # # With numpy # expected_transformers_np_cast = { # "numeric": [0, 1], -# "low_cardinarlity": [2, 4], -# "high_cardinarlity": [3, 5], +# "low_card_cat": [2, 4], +# "high_card_cat": [3, 5], # } # vectorizer_cast.fit_transform(X_str.to_numpy()) # check_same_transformers( @@ -357,8 +357,8 @@ # """ # expected_transformers = { # "numeric": [0, 1], -# "low_cardinarlity": [2, 4], -# "high_cardinarlity": [3, 5], +# "low_card_cat": [2, 4], +# "high_card_cat": [3, 5], # } # vectorizer = TableVectorizer( # cardinality_threshold=4, @@ -668,7 +668,7 @@ # # [ # # ("numeric", "passthrough", ["int", "float"]), # # ("minhashencoder", "MinHashEncoder", ["str1", "str2"]), -# # ("low_cardinarlity", "OneHotEncoder", ["cat1", "cat2"]), +# # ("low_card_cat", "OneHotEncoder", ["cat1", "cat2"]), # # ], # # ), # # ( @@ -676,7 +676,7 @@ # # [ # # ("numeric", "passthrough", ["int", "float"]), # # ("mh_cat1", "MinHashEncoder", ["cat1"]), -# # ("low_cardinarlity", "OneHotEncoder", ["str1", "str2", "cat2"]), +# # ("low_card_cat", "OneHotEncoder", ["str1", "str2", "cat2"]), # # ], # # ), # # ], @@ -761,7 +761,7 @@ # table_vec.fit_transform(df) # expected_transformers_df = { # "numeric": ["int_str", "float_str", "int_float"], -# "low_cardinarlity": ["bool_str"], +# "low_card_cat": ["bool_str"], # } # check_same_transformers(expected_transformers_df, table_vec.transformers_) @@ -772,7 +772,7 @@ # table_vec.fit_transform(X) # expected_transformers_array = { # "numeric": [0, 1, 2], -# 
"low_cardinarlity": [3], +# "low_card_cat": [3], # } # check_same_transformers(expected_transformers_array, table_vec.transformers_) diff --git a/examples/01_encodings.py b/examples/01_encodings.py index 1b9c0cacc..a0659c3e1 100644 --- a/examples/01_encodings.py +++ b/examples/01_encodings.py @@ -94,14 +94,14 @@ # - The |OneHotEncoder| for low cardinality string variables, the columns # ``'gender'``, ``'department'``, ``'department_name'`` and ``'assignment_category'``. -tv.named_transformers_["low_cardinarlity"].get_feature_names_out() +tv.named_transformers_["low_card_cat"].get_feature_names_out() ############################################################################### # - The |GapEncoder| for high cardinality string columns, ``'employee_position_title'`` # and ``'division'``. The |GapEncoder| is a powerful encoder that can handle dirty # categorical columns. -tv.named_transformers_["high_cardinarlity"].get_feature_names_out() +tv.named_transformers_["high_card_cat"].get_feature_names_out() ############################################################################### # - The |DatetimeEncoder| to the ``'date_first_hired'`` column. The |DatetimeEncoder| diff --git a/setup.cfg b/setup.cfg index 003274e10..788297572 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,113 +1,113 @@ -; [metadata] -; name = cu_cat -; version = file: cu_cat/VERSION.txt -; description = Prepping tables for machine learning -; long_description = file: README.md -; license = BSD -; license_files = LICENSE.txt -; classifiers = -; Development Status :: 5 - Production/Stable -; Environment :: Console -; Intended Audience :: Science/Research -; License :: OSI Approved :: BSD License -; Operating System :: OS Independent -; Programming Language :: Python :: 3.10 -; Programming Language :: Python :: 3.11 -; Topic :: Scientific/Engineering -; Topic :: Software Development :: Libraries -; project_urls = -; Homepage = http://github.com/graphistry/cu-cat/ -; Source = https://github.com/graphistry/cu-cat +[metadata] +name = cu_cat +version = file: cu_cat/VERSION.txt +description = Prepping tables for machine learning +long_description = file: README.md +license = BSD +license_files = LICENSE.txt +classifiers = + Development Status :: 5 - Production/Stable + Environment :: Console + Intended Audience :: Science/Research + License :: OSI Approved :: BSD License + Operating System :: OS Independent + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Topic :: Scientific/Engineering + Topic :: Software Development :: Libraries +project_urls = + Homepage = http://github.com/graphistry/cu-cat/ + Source = https://github.com/graphistry/cu-cat -; [options] -; include_package_data = True -; packages = find: -; install_requires = -; scikit-learn>=1.2.1 -; numpy>=1.23.5 -; scipy>=1.9.3 -; pandas>=1.5.3 -; packaging>=23.1 -; python_requires = >=3.10 +[options] +include_package_data = True +packages = find: +install_requires = + scikit-learn>=1.2.1 + numpy>=1.23.5 + scipy>=1.9.3 + pandas>=1.5.3 + packaging>=23.1 +python_requires = >=3.10 -; [options.extras_require] -; dev = -; pytest -; pytest-cov -; pytest-xdist==2.5.0 -; pytest-xdist[psutil] -; coverage -; mypy -; numpydoc -; flake8 -; openml -; pre-commit -; pyarrow = -; pyarrow -; polars = -; pyarrow -; polars -; doc = -; pydata-sphinx-theme -; sphinxext-opengraph -; sphinx-copybutton -; matplotlib -; seaborn -; statsmodels -; numpydoc -; jupyterlite-sphinx -; jupyterlite-pyodide-kernel -; pyarrow -; benchmarks = -; numpy -; pandas -; matplotlib -; 
-;     seaborn
-;     tqdm
-;     thefuzz
-;     autofj
-;     pyarrow
-;     loguru
-; min-py310 =
-;     scikit-learn==1.2.1
-;     numpy==1.23.5
-;     scipy==1.9.3
-;     pandas==1.5.3
+[options.extras_require]
+dev =
+    pytest
+    pytest-cov
+    pytest-xdist==2.5.0
+    pytest-xdist[psutil]
+    coverage
+    mypy
+    numpydoc
+    flake8
+    openml
+    pre-commit
+pyarrow =
+    pyarrow
+polars =
+    pyarrow
+    polars
+doc =
+    pydata-sphinx-theme
+    sphinxext-opengraph
+    sphinx-copybutton
+    matplotlib
+    seaborn
+    statsmodels
+    numpydoc
+    jupyterlite-sphinx
+    jupyterlite-pyodide-kernel
+    pyarrow
+benchmarks =
+    numpy
+    pandas
+    matplotlib
+    seaborn
+    tqdm
+    thefuzz
+    autofj
+    pyarrow
+    loguru
+min-py310 =
+    scikit-learn==1.2.1
+    numpy==1.23.5
+    scipy==1.9.3
+    pandas==1.5.3

-; [flake8]
-; max-line-length = 88
-; target-version = ['py310']
-; ignore =
-;     E24,
-;     E121,
-;     E123,
-;     E126,
-;     E203,
-;     E226,
-;     E704,
-;     E731,
-;     E741,
-;     W503,
-;     W504
-; per-file-ignores =
-;     examples/*:E402
-;     doc/conf.py:E402
-; exclude =
-;     .git,
-;     __pycache__,
-;     dist,
-;     build
+[flake8]
+max-line-length = 88
+target-version = ['py310']
+ignore =
+    E24,
+    E121,
+    E123,
+    E126,
+    E203,
+    E226,
+    E704,
+    E731,
+    E741,
+    W503,
+    W504
+per-file-ignores =
+    examples/*:E402
+    doc/conf.py:E402
+exclude =
+    .git,
+    __pycache__,
+    dist,
+    build

-; [mypy]
-; ignore_missing_imports = True
-; allow_redefinition = True
+[mypy]
+ignore_missing_imports = True
+allow_redefinition = True

-; [codespell]
-; skip = ./.git,./.mypy_cache
+[codespell]
+skip = ./.git,./.mypy_cache

-; [egg_info]
-; tag_build =
-; tag_date = 0
+[egg_info]
+tag_build =
+tag_date = 0
 ##github
diff --git a/setup.py b/setup.py
index 7dc0847f2..54b7d194b 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ def unique_flatten_dict(d):
 # if __name__ == "__main__":
 setup(
     name='cu-cat',
-    version='v0.07.09', # versioneer.get_version(),
+    version='v0.07.10', # versioneer.get_version(),
     # cmdclass='0.7.7', # versioneer.get_cmdclass(),
     packages = find_packages(),
     platforms='any',
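Beyond the rename itself, the user-visible effect of this patch is in the keys of `named_transformers_` on a fitted `TableVectorizer`, which is what the example and test updates above track. Below is a minimal sketch of inspecting those keys after the change; it assumes `TableVectorizer` is importable from the top-level `cu_cat` package and that a pandas/CPU fallback is available when cudf is not installed (per the `deps.cudf` checks in this patch). The toy dataframe and column names are illustrative, not taken from the repository.

```python
import pandas as pd

from cu_cat import TableVectorizer  # assumed top-level export

# Illustrative frame: "dept" stays below the cardinality threshold,
# "title" goes above it, "salary" is numeric.
df = pd.DataFrame(
    {
        "dept": ["EMS", "POL", "FIN", "EMS"] * 25,
        "title": [f"Analyst, level {i}" for i in range(100)],
        "salary": [float(i) for i in range(100)],
    }
)

tv = TableVectorizer(cardinality_threshold=10)
tv.fit_transform(df)

# After this patch the per-group transformer names no longer carry the old
# "cardinarlity" misspelling; iterate rather than hard-coding a key.
for name, transformer in tv.named_transformers_.items():
    print(name, type(transformer).__name__)
```

The exact key strings are whatever names `fit_transform` registers in its `all_transformers` list, so any downstream code that indexed `named_transformers_["low_cardinarlity"]` or `["high_cardinarlity"]` needs the corresponding rename.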