lints
dcolinmorgan committed Jan 2, 2024
1 parent 11d814d commit e317e6a
Showing 6 changed files with 132 additions and 131 deletions.
2 changes: 1 addition & 1 deletion cu_cat/VERSION.txt
@@ -1 +1 @@
-0.07.09
+0.07.10
15 changes: 8 additions & 7 deletions cu_cat/_table_vectorizer.py
@@ -635,12 +635,12 @@ def fit_transform(self, X, y=None):
 _nunique_values = {  # Cache results
     col: X[col].nunique() for col in categorical_columns
 }
-low_cardinality_columns = [
+low_card_cat_columns = [
     col
     for col in categorical_columns
     if _nunique_values[col] < self.cardinality_threshold
 ]
-high_cardinality_columns = [
+high_card_cat_columns = [
     col
     for col in categorical_columns
     if _nunique_values[col] >= self.cardinality_threshold
@@ -654,15 +654,15 @@ def fit_transform(self, X, y=None):
 all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [  # type: ignore
     ("numeric", self.numerical_transformer, numeric_columns),
     ("datetime", self.datetime_transformer_, datetime_columns),
-    ("low_cardinarlity", self.low_card_cat_transformer_, low_cardinality_columns),
-    ("high_cardinarlity", self.high_card_cat_transformer_, high_cardinality_columns),
+    ("low_card_str", self.low_card_cat_transformer_, low_card_cat_columns),
+    ("high_card_str", self.high_card_cat_transformer_, high_card_cat_columns),
 ]
 else:
     all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [  # type: ignore
         ("numeric", self.numerical_transformer, numeric_columns),
         # ("datetime", self.datetime_transformer_, datetime_columns), ## commented out if in dt format so pyg can handle
-        ("low_cardinarlity", self.low_card_cat_transformer_, low_cardinality_columns),
-        ("high_cardinarlity", self.high_card_cat_transformer_, high_cardinality_columns),
+        ("low_card_str", self.low_card_cat_transformer_, low_card_cat_columns),
+        ("high_card_str", self.high_card_cat_transformer_, high_card_cat_columns),
     ]
 # We will now filter this list, by keeping only the ones with:
 # - at least one column
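For context on the hunk above: the two renamed buckets come from comparing each categorical column's unique-value count to cardinality_threshold. A minimal, self-contained sketch of that split, with illustrative data and threshold (not code from this commit):

import pandas as pd

cardinality_threshold = 3  # illustrative; cu_cat exposes this as a TableVectorizer parameter

X = pd.DataFrame({
    "color": ["red", "blue", "red", "blue"],     # 2 unique values -> low cardinality
    "title": ["eng", "sales", "ceo", "intern"],  # 4 unique values -> high cardinality
})
categorical_columns = ["color", "title"]

_nunique_values = {  # Cache results, as in the diff
    col: X[col].nunique() for col in categorical_columns
}
low_card_cat_columns = [
    col for col in categorical_columns
    if _nunique_values[col] < cardinality_threshold
]
high_card_cat_columns = [
    col for col in categorical_columns
    if _nunique_values[col] >= cardinality_threshold
]
assert low_card_cat_columns == ["color"]
assert high_card_cat_columns == ["title"]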
@@ -768,6 +768,7 @@ def transform(self, X) -> np.ndarray:
f"array seen during fit. Got {X.shape[1]} "
f"columns, expected {len(self.columns_)}"
)
self.Xt_= df_type(X)
X, y = make_safe_gpu_dataframes(X, None, self.engine_)
if not isinstance(X, pd.DataFrame) and not 'cudf' in self.Xt_:
X = pd.DataFrame(X)
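The added self.Xt_ line records the input's dataframe type before the guard below it wraps non-dataframe input. A hedged sketch of just that fallback path (standalone; make_safe_gpu_dataframes and df_type are cu_cat helpers whose bodies are not shown in this diff, so they are omitted here):

import numpy as np
import pandas as pd

def ensure_dataframe(X):
    # Mirrors the fallback above: raw arrays are wrapped so the
    # column-wise transformers can rely on a DataFrame interface.
    # (The cuDF branch is skipped here; it needs a GPU runtime.)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    return X

X = ensure_dataframe(np.array([[1, "a"], [2, "b"]], dtype=object))
print(X.dtypes)  # object columns, ready for categorical encoding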
@@ -800,7 +801,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]:
 typing.List[str]
     Feature names.
 """
-if 'cudf' not in self.Xt_ and not deps.cudf:
+if not deps.cudf:
     if parse_version(sklearn_version) > parse_version("1.0"):
         ct_feature_names = super().get_feature_names()
     else:
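For reference on the hunk above: feature-name retrieval is version-gated because scikit-learn 1.0 introduced get_feature_names_out() and deprecated the older get_feature_names(). A generic sketch of that compatibility pattern (an illustrative helper, not cu_cat's own code):

from packaging.version import parse as parse_version
from sklearn import __version__ as sklearn_version
from sklearn.compose import ColumnTransformer

def feature_names_compat(fitted_ct: ColumnTransformer):
    # Newer sklearn exposes get_feature_names_out(); very old
    # releases only had get_feature_names().
    if parse_version(sklearn_version) >= parse_version("1.0"):
        return fitted_ct.get_feature_names_out()
    return fitted_ct.get_feature_names()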
32 changes: 16 additions & 16 deletions cu_cat/tests/test_table_vectorizer.py
@@ -201,15 +201,15 @@
 # # Warning: order-dependant
 # expected_transformers_df = {
 #     "numeric": ["int", "float"],
-#     "low_cardinarlity": ["str1", "cat1"],
-#     "high_cardinarlity": ["str2", "cat2"],
+#     "low_card_cat": ["str1", "cat1"],
+#     "high_card_cat": ["str2", "cat2"],
 # }
 # vectorizer_base.fit_transform(X)
 # check_same_transformers(expected_transformers_df, vectorizer_base.transformers_)

 # # Test with higher cardinality threshold and no numeric transformer
 # expected_transformers_2 = {
-#     "low_cardinarlity": ["str1", "str2", "cat1", "cat2"],
+#     "low_card_cat": ["str1", "str2", "cat1", "cat2"],
 #     "numeric": ["int", "float"],
 # }
 # vectorizer_default = TableVectorizer()  # Using default values
@@ -220,8 +220,8 @@
 # arr = X.to_numpy()
 # # Instead of the columns names, we'll have the column indices.
 # expected_transformers_np_no_cast = {
-#     "low_cardinarlity": [2, 4],
-#     "high_cardinarlity": [3, 5],
+#     "low_card_cat": [2, 4],
+#     "high_card_cat": [3, 5],
 #     "numeric": [0, 1],
 # }
 # vectorizer_base.fit_transform(arr)
@@ -231,7 +231,7 @@

 # # Test with single column dataframe
 # expected_transformers_series = {
-#     "low_cardinarlity": ["cat1"],
+#     "low_card_cat": ["cat1"],
 # }
 # vectorizer_base.fit_transform(X[["cat1"]])
 # check_same_transformers(expected_transformers_series, vectorizer_base.transformers_)
@@ -246,17 +246,17 @@
 # X_str = X.astype("object")
 # # With pandas
 # expected_transformers_plain = {
-#     "high_cardinarlity": ["str2", "cat2"],
-#     "low_cardinarlity": ["str1", "cat1"],
+#     "high_card_cat": ["str2", "cat2"],
+#     "low_card_cat": ["str1", "cat1"],
 #     "numeric": ["int", "float"],
 # }
 # vectorizer_cast.fit_transform(X_str)
 # check_same_transformers(expected_transformers_plain, vectorizer_cast.transformers_)
 # # With numpy
 # expected_transformers_np_cast = {
 #     "numeric": [0, 1],
-#     "low_cardinarlity": [2, 4],
-#     "high_cardinarlity": [3, 5],
+#     "low_card_cat": [2, 4],
+#     "high_card_cat": [3, 5],
 # }
 # vectorizer_cast.fit_transform(X_str.to_numpy())
 # check_same_transformers(
@@ -357,8 +357,8 @@
# """
# expected_transformers = {
# "numeric": [0, 1],
# "low_cardinarlity": [2, 4],
# "high_cardinarlity": [3, 5],
# "low_card_cat": [2, 4],
# "high_card_cat": [3, 5],
# }
# vectorizer = TableVectorizer(
# cardinality_threshold=4,
@@ -668,15 +668,15 @@
 # # [
 # #     ("numeric", "passthrough", ["int", "float"]),
 # #     ("minhashencoder", "MinHashEncoder", ["str1", "str2"]),
-# #     ("low_cardinarlity", "OneHotEncoder", ["cat1", "cat2"]),
+# #     ("low_card_cat", "OneHotEncoder", ["cat1", "cat2"]),
 # # ],
 # # ),
 # # (
 # #     ("mh_cat1", MinHashEncoder(), ["cat1"]),
 # #     [
 # #         ("numeric", "passthrough", ["int", "float"]),
 # #         ("mh_cat1", "MinHashEncoder", ["cat1"]),
-# #         ("low_cardinarlity", "OneHotEncoder", ["str1", "str2", "cat2"]),
+# #         ("low_card_cat", "OneHotEncoder", ["str1", "str2", "cat2"]),
 # #     ],
 # # ),
 # # ],
@@ -761,7 +761,7 @@
 # table_vec.fit_transform(df)
 # expected_transformers_df = {
 #     "numeric": ["int_str", "float_str", "int_float"],
-#     "low_cardinarlity": ["bool_str"],
+#     "low_card_cat": ["bool_str"],
 # }
 # check_same_transformers(expected_transformers_df, table_vec.transformers_)

@@ -772,7 +772,7 @@
 # table_vec.fit_transform(X)
 # expected_transformers_array = {
 #     "numeric": [0, 1, 2],
-#     "low_cardinarlity": [3],
+#     "low_card_cat": [3],
 # }
 # check_same_transformers(expected_transformers_array, table_vec.transformers_)

4 changes: 2 additions & 2 deletions examples/01_encodings.py
@@ -94,14 +94,14 @@
 # - The |OneHotEncoder| for low cardinality string variables, the columns
 #   ``'gender'``, ``'department'``, ``'department_name'`` and ``'assignment_category'``.

-tv.named_transformers_["low_cardinarlity"].get_feature_names_out()
+tv.named_transformers_["low_card_cat"].get_feature_names_out()

 ###############################################################################
 # - The |GapEncoder| for high cardinality string columns, ``'employee_position_title'``
 #   and ``'division'``. The |GapEncoder| is a powerful encoder that can handle dirty
 #   categorical columns.

-tv.named_transformers_["high_cardinarlity"].get_feature_names_out()
+tv.named_transformers_["high_card_cat"].get_feature_names_out()

###############################################################################
# - The |DatetimeEncoder| to the ``'date_first_hired'`` column. The |DatetimeEncoder|
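Taken together, the example reads each fitted block back by name, following scikit-learn's ColumnTransformer convention. A hedged continuation of the example script (assumes the fitted tv from earlier in this file; the key names are the ones this example uses, which this commit renames):

ohe = tv.named_transformers_["low_card_cat"]    # the OneHotEncoder block
gap = tv.named_transformers_["high_card_cat"]   # the GapEncoder block
print(len(ohe.get_feature_names_out()), "one-hot features")
print(len(gap.get_feature_names_out()), "gap-encoded features")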
208 changes: 104 additions & 104 deletions setup.cfg
@@ -1,113 +1,113 @@
-; [metadata]
-; name = cu_cat
-; version = file: cu_cat/VERSION.txt
-; description = Prepping tables for machine learning
-; long_description = file: README.md
-; license = BSD
-; license_files = LICENSE.txt
-; classifiers =
-;     Development Status :: 5 - Production/Stable
-;     Environment :: Console
-;     Intended Audience :: Science/Research
-;     License :: OSI Approved :: BSD License
-;     Operating System :: OS Independent
-;     Programming Language :: Python :: 3.10
-;     Programming Language :: Python :: 3.11
-;     Topic :: Scientific/Engineering
-;     Topic :: Software Development :: Libraries
-; project_urls =
-;     Homepage = http://github.com/graphistry/cu-cat/
-;     Source = https://github.com/graphistry/cu-cat
+[metadata]
+name = cu_cat
+version = file: cu_cat/VERSION.txt
+description = Prepping tables for machine learning
+long_description = file: README.md
+license = BSD
+license_files = LICENSE.txt
+classifiers =
+    Development Status :: 5 - Production/Stable
+    Environment :: Console
+    Intended Audience :: Science/Research
+    License :: OSI Approved :: BSD License
+    Operating System :: OS Independent
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Topic :: Scientific/Engineering
+    Topic :: Software Development :: Libraries
+project_urls =
+    Homepage = http://github.com/graphistry/cu-cat/
+    Source = https://github.com/graphistry/cu-cat

-; [options]
-; include_package_data = True
-; packages = find:
-; install_requires =
-;     scikit-learn>=1.2.1
-;     numpy>=1.23.5
-;     scipy>=1.9.3
-;     pandas>=1.5.3
-;     packaging>=23.1
-; python_requires = >=3.10
+[options]
+include_package_data = True
+packages = find:
+install_requires =
+    scikit-learn>=1.2.1
+    numpy>=1.23.5
+    scipy>=1.9.3
+    pandas>=1.5.3
+    packaging>=23.1
+python_requires = >=3.10

-; [options.extras_require]
-; dev =
-;     pytest
-;     pytest-cov
-;     pytest-xdist==2.5.0
-;     pytest-xdist[psutil]
-;     coverage
-;     mypy
-;     numpydoc
-;     flake8
-;     openml
-;     pre-commit
-; pyarrow =
-;     pyarrow
-; polars =
-;     pyarrow
-;     polars
-; doc =
-;     pydata-sphinx-theme
-;     sphinxext-opengraph
-;     sphinx-copybutton
-;     matplotlib
-;     seaborn
-;     statsmodels
-;     numpydoc
-;     jupyterlite-sphinx
-;     jupyterlite-pyodide-kernel
-;     pyarrow
-; benchmarks =
-;     numpy
-;     pandas
-;     matplotlib
-;     seaborn
-;     tqdm
-;     thefuzz
-;     autofj
-;     pyarrow
-;     loguru
-; min-py310 =
-;     scikit-learn==1.2.1
-;     numpy==1.23.5
-;     scipy==1.9.3
-;     pandas==1.5.3
+[options.extras_require]
+dev =
+    pytest
+    pytest-cov
+    pytest-xdist==2.5.0
+    pytest-xdist[psutil]
+    coverage
+    mypy
+    numpydoc
+    flake8
+    openml
+    pre-commit
+pyarrow =
+    pyarrow
+polars =
+    pyarrow
+    polars
+doc =
+    pydata-sphinx-theme
+    sphinxext-opengraph
+    sphinx-copybutton
+    matplotlib
+    seaborn
+    statsmodels
+    numpydoc
+    jupyterlite-sphinx
+    jupyterlite-pyodide-kernel
+    pyarrow
+benchmarks =
+    numpy
+    pandas
+    matplotlib
+    seaborn
+    tqdm
+    thefuzz
+    autofj
+    pyarrow
+    loguru
+min-py310 =
+    scikit-learn==1.2.1
+    numpy==1.23.5
+    scipy==1.9.3
+    pandas==1.5.3
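With these extras now active rather than commented out, a development environment would typically be installed with pip install -e .[dev]; that command is standard setuptools/pip behavior and an assumption here, not something this commit adds.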

-; [flake8]
-; max-line-length = 88
-; target-version = ['py310']
-; ignore =
-;     E24,
-;     E121,
-;     E123,
-;     E126,
-;     E203,
-;     E226,
-;     E704,
-;     E731,
-;     E741,
-;     W503,
-;     W504
-; per-file-ignores =
-;     examples/*:E402
-;     doc/conf.py:E402
-; exclude =
-;     .git,
-;     __pycache__,
-;     dist,
-;     build
+[flake8]
+max-line-length = 88
+target-version = ['py310']
+ignore =
+    E24,
+    E121,
+    E123,
+    E126,
+    E203,
+    E226,
+    E704,
+    E731,
+    E741,
+    W503,
+    W504
+per-file-ignores =
+    examples/*:E402
+    doc/conf.py:E402
+exclude =
+    .git,
+    __pycache__,
+    dist,
+    build

-; [mypy]
-; ignore_missing_imports = True
-; allow_redefinition = True
+[mypy]
+ignore_missing_imports = True
+allow_redefinition = True

-; [codespell]
-; skip = ./.git,./.mypy_cache
+[codespell]
+skip = ./.git,./.mypy_cache

-; [egg_info]
-; tag_build =
-; tag_date = 0
+[egg_info]
+tag_build =
+tag_date = 0


##github
(diff for the sixth changed file not shown)
