lints
dcolinmorgan committed Jan 2, 2024
1 parent 11d814d commit e317e6a
Showing 6 changed files with 132 additions and 131 deletions.
2 changes: 1 addition & 1 deletion cu_cat/VERSION.txt
@@ -1 +1 @@
-0.07.09
+0.07.10
15 changes: 8 additions & 7 deletions cu_cat/_table_vectorizer.py
@@ -635,12 +635,12 @@ def fit_transform(self, X, y=None):
 _nunique_values = {  # Cache results
     col: X[col].nunique() for col in categorical_columns
 }
-low_cardinality_columns = [
+low_card_cat_columns = [
     col
     for col in categorical_columns
     if _nunique_values[col] < self.cardinality_threshold
 ]
-high_cardinality_columns = [
+high_card_cat_columns = [
     col
     for col in categorical_columns
     if _nunique_values[col] >= self.cardinality_threshold
@@ -654,15 +654,15 @@ def fit_transform(self, X, y=None):
 all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [  # type: ignore
     ("numeric", self.numerical_transformer, numeric_columns),
     ("datetime", self.datetime_transformer_, datetime_columns),
-    ("low_cardinarlity", self.low_card_cat_transformer_, low_cardinality_columns),
-    ("high_cardinarlity", self.high_card_cat_transformer_, high_cardinality_columns),
+    ("low_card_str", self.low_card_cat_transformer_, low_card_cat_columns),
+    ("high_card_str", self.high_card_cat_transformer_, high_card_cat_columns),
 ]
 else:
     all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [  # type: ignore
         ("numeric", self.numerical_transformer, numeric_columns),
         # ("datetime", self.datetime_transformer_, datetime_columns), ## commented out if in dt format so pyg can handle
-        ("low_cardinarlity", self.low_card_cat_transformer_, low_cardinality_columns),
-        ("high_cardinarlity", self.high_card_cat_transformer_, high_cardinality_columns),
+        ("low_card_str", self.low_card_cat_transformer_, low_card_cat_columns),
+        ("high_card_str", self.high_card_cat_transformer_, high_card_cat_columns),
     ]
 # We will now filter this list, by keeping only the ones with:
 # - at least one column
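For context on the hunk above: the two renamed buckets come from comparing each categorical column's unique-value count to cardinality_threshold. A minimal, self-contained sketch of that split, with illustrative data and threshold (not code from this commit):

import pandas as pd

cardinality_threshold = 3  # illustrative; cu_cat exposes this as a TableVectorizer parameter

X = pd.DataFrame({
    "color": ["red", "blue", "red", "blue"],     # 2 unique values -> low cardinality
    "title": ["eng", "sales", "ceo", "intern"],  # 4 unique values -> high cardinality
})
categorical_columns = ["color", "title"]

_nunique_values = {  # Cache results, as in the diff
    col: X[col].nunique() for col in categorical_columns
}
low_card_cat_columns = [
    col for col in categorical_columns
    if _nunique_values[col] < cardinality_threshold
]
high_card_cat_columns = [
    col for col in categorical_columns
    if _nunique_values[col] >= cardinality_threshold
]
assert low_card_cat_columns == ["color"]
assert high_card_cat_columns == ["title"]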
@@ -768,6 +768,7 @@ def transform(self, X) -> np.ndarray:
f"array seen during fit. Got {X.shape[1]} "
f"columns, expected {len(self.columns_)}"
)
self.Xt_= df_type(X)
X, y = make_safe_gpu_dataframes(X, None, self.engine_)
if not isinstance(X, pd.DataFrame) and not 'cudf' in self.Xt_:
X = pd.DataFrame(X)
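The added self.Xt_ line records the input's dataframe type before the guard below it wraps non-dataframe input. A hedged sketch of just that fallback path (standalone; make_safe_gpu_dataframes and df_type are cu_cat helpers whose bodies are not shown in this diff, so they are omitted here):

import numpy as np
import pandas as pd

def ensure_dataframe(X):
    # Mirrors the fallback above: raw arrays are wrapped so the
    # column-wise transformers can rely on a DataFrame interface.
    # (The cuDF branch is skipped here; it needs a GPU runtime.)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    return X

X = ensure_dataframe(np.array([[1, "a"], [2, "b"]], dtype=object))
print(X.dtypes)  # object columns, ready for categorical encoding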
@@ -800,7 +801,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]:
 typing.List[str]
     Feature names.
 """
-if 'cudf' not in self.Xt_ and not deps.cudf:
+if not deps.cudf:
     if parse_version(sklearn_version) > parse_version("1.0"):
         ct_feature_names = super().get_feature_names()
     else:
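For reference on the hunk above: feature-name retrieval is version-gated because scikit-learn 1.0 introduced get_feature_names_out() and deprecated the older get_feature_names(). A generic sketch of that compatibility pattern (an illustrative helper, not cu_cat's own code):

from packaging.version import parse as parse_version
from sklearn import __version__ as sklearn_version
from sklearn.compose import ColumnTransformer

def feature_names_compat(fitted_ct: ColumnTransformer):
    # Newer sklearn exposes get_feature_names_out(); very old
    # releases only had get_feature_names().
    if parse_version(sklearn_version) >= parse_version("1.0"):
        return fitted_ct.get_feature_names_out()
    return fitted_ct.get_feature_names()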
32 changes: 16 additions & 16 deletions cu_cat/tests/test_table_vectorizer.py
@@ -201,15 +201,15 @@
 # # Warning: order-dependant
 # expected_transformers_df = {
 #     "numeric": ["int", "float"],
-#     "low_cardinarlity": ["str1", "cat1"],
-#     "high_cardinarlity": ["str2", "cat2"],
+#     "low_card_cat": ["str1", "cat1"],
+#     "high_card_cat": ["str2", "cat2"],
 # }
 # vectorizer_base.fit_transform(X)
 # check_same_transformers(expected_transformers_df, vectorizer_base.transformers_)

 # # Test with higher cardinality threshold and no numeric transformer
 # expected_transformers_2 = {
-#     "low_cardinarlity": ["str1", "str2", "cat1", "cat2"],
+#     "low_card_cat": ["str1", "str2", "cat1", "cat2"],
 #     "numeric": ["int", "float"],
 # }
 # vectorizer_default = TableVectorizer()  # Using default values
@@ -220,8 +220,8 @@
 # arr = X.to_numpy()
 # # Instead of the columns names, we'll have the column indices.
 # expected_transformers_np_no_cast = {
-#     "low_cardinarlity": [2, 4],
-#     "high_cardinarlity": [3, 5],
+#     "low_card_cat": [2, 4],
+#     "high_card_cat": [3, 5],
 #     "numeric": [0, 1],
 # }
 # vectorizer_base.fit_transform(arr)
@@ -231,7 +231,7 @@

 # # Test with single column dataframe
 # expected_transformers_series = {
-#     "low_cardinarlity": ["cat1"],
+#     "low_card_cat": ["cat1"],
 # }
 # vectorizer_base.fit_transform(X[["cat1"]])
 # check_same_transformers(expected_transformers_series, vectorizer_base.transformers_)
@@ -246,17 +246,17 @@
 # X_str = X.astype("object")
 # # With pandas
 # expected_transformers_plain = {
-#     "high_cardinarlity": ["str2", "cat2"],
-#     "low_cardinarlity": ["str1", "cat1"],
+#     "high_card_cat": ["str2", "cat2"],
+#     "low_card_cat": ["str1", "cat1"],
 #     "numeric": ["int", "float"],
 # }
 # vectorizer_cast.fit_transform(X_str)
 # check_same_transformers(expected_transformers_plain, vectorizer_cast.transformers_)
 # # With numpy
 # expected_transformers_np_cast = {
 #     "numeric": [0, 1],
-#     "low_cardinarlity": [2, 4],
-#     "high_cardinarlity": [3, 5],
+#     "low_card_cat": [2, 4],
+#     "high_card_cat": [3, 5],
 # }
 # vectorizer_cast.fit_transform(X_str.to_numpy())
 # check_same_transformers(
@@ -357,8 +357,8 @@
# """
# expected_transformers = {
# "numeric": [0, 1],
# "low_cardinarlity": [2, 4],
# "high_cardinarlity": [3, 5],
# "low_card_cat": [2, 4],
# "high_card_cat": [3, 5],
# }
# vectorizer = TableVectorizer(
# cardinality_threshold=4,
@@ -668,15 +668,15 @@
 # # [
 # #     ("numeric", "passthrough", ["int", "float"]),
 # #     ("minhashencoder", "MinHashEncoder", ["str1", "str2"]),
-# #     ("low_cardinarlity", "OneHotEncoder", ["cat1", "cat2"]),
+# #     ("low_card_cat", "OneHotEncoder", ["cat1", "cat2"]),
 # # ],
 # # ),
 # # (
 # #     ("mh_cat1", MinHashEncoder(), ["cat1"]),
 # #     [
 # #         ("numeric", "passthrough", ["int", "float"]),
 # #         ("mh_cat1", "MinHashEncoder", ["cat1"]),
-# #         ("low_cardinarlity", "OneHotEncoder", ["str1", "str2", "cat2"]),
+# #         ("low_card_cat", "OneHotEncoder", ["str1", "str2", "cat2"]),
 # #     ],
 # # ),
 # # ],
@@ -761,7 +761,7 @@
 # table_vec.fit_transform(df)
 # expected_transformers_df = {
 #     "numeric": ["int_str", "float_str", "int_float"],
-#     "low_cardinarlity": ["bool_str"],
+#     "low_card_cat": ["bool_str"],
 # }
 # check_same_transformers(expected_transformers_df, table_vec.transformers_)

@@ -772,7 +772,7 @@
 # table_vec.fit_transform(X)
 # expected_transformers_array = {
 #     "numeric": [0, 1, 2],
-#     "low_cardinarlity": [3],
+#     "low_card_cat": [3],
 # }
 # check_same_transformers(expected_transformers_array, table_vec.transformers_)

4 changes: 2 additions & 2 deletions examples/01_encodings.py
@@ -94,14 +94,14 @@
 # - The |OneHotEncoder| for low cardinality string variables, the columns
 #   ``'gender'``, ``'department'``, ``'department_name'`` and ``'assignment_category'``.

-tv.named_transformers_["low_cardinarlity"].get_feature_names_out()
+tv.named_transformers_["low_card_cat"].get_feature_names_out()

 ###############################################################################
 # - The |GapEncoder| for high cardinality string columns, ``'employee_position_title'``
 #   and ``'division'``. The |GapEncoder| is a powerful encoder that can handle dirty
 #   categorical columns.

-tv.named_transformers_["high_cardinarlity"].get_feature_names_out()
+tv.named_transformers_["high_card_cat"].get_feature_names_out()

###############################################################################
# - The |DatetimeEncoder| to the ``'date_first_hired'`` column. The |DatetimeEncoder|
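Taken together, the example reads each fitted block back by name, following scikit-learn's ColumnTransformer convention. A hedged continuation of the example script (assumes the fitted tv from earlier in this file; the key names are the ones this example uses, which this commit renames):

ohe = tv.named_transformers_["low_card_cat"]    # the OneHotEncoder block
gap = tv.named_transformers_["high_card_cat"]   # the GapEncoder block
print(len(ohe.get_feature_names_out()), "one-hot features")
print(len(gap.get_feature_names_out()), "gap-encoded features")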
208 changes: 104 additions & 104 deletions setup.cfg
@@ -1,113 +1,113 @@
-; [metadata]
-; name = cu_cat
-; version = file: cu_cat/VERSION.txt
-; description = Prepping tables for machine learning
-; long_description = file: README.md
-; license = BSD
-; license_files = LICENSE.txt
-; classifiers =
-;     Development Status :: 5 - Production/Stable
-;     Environment :: Console
-;     Intended Audience :: Science/Research
-;     License :: OSI Approved :: BSD License
-;     Operating System :: OS Independent
-;     Programming Language :: Python :: 3.10
-;     Programming Language :: Python :: 3.11
-;     Topic :: Scientific/Engineering
-;     Topic :: Software Development :: Libraries
-; project_urls =
-;     Homepage = http://github.com/graphistry/cu-cat/
-;     Source = https://github.com/graphistry/cu-cat
+[metadata]
+name = cu_cat
+version = file: cu_cat/VERSION.txt
+description = Prepping tables for machine learning
+long_description = file: README.md
+license = BSD
+license_files = LICENSE.txt
+classifiers =
+    Development Status :: 5 - Production/Stable
+    Environment :: Console
+    Intended Audience :: Science/Research
+    License :: OSI Approved :: BSD License
+    Operating System :: OS Independent
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Topic :: Scientific/Engineering
+    Topic :: Software Development :: Libraries
+project_urls =
+    Homepage = http://github.com/graphistry/cu-cat/
+    Source = https://github.com/graphistry/cu-cat

-; [options]
-; include_package_data = True
-; packages = find:
-; install_requires =
-;     scikit-learn>=1.2.1
-;     numpy>=1.23.5
-;     scipy>=1.9.3
-;     pandas>=1.5.3
-;     packaging>=23.1
-; python_requires = >=3.10
+[options]
+include_package_data = True
+packages = find:
+install_requires =
+    scikit-learn>=1.2.1
+    numpy>=1.23.5
+    scipy>=1.9.3
+    pandas>=1.5.3
+    packaging>=23.1
+python_requires = >=3.10

-; [options.extras_require]
-; dev =
-;     pytest
-;     pytest-cov
-;     pytest-xdist==2.5.0
-;     pytest-xdist[psutil]
-;     coverage
-;     mypy
-;     numpydoc
-;     flake8
-;     openml
-;     pre-commit
-; pyarrow =
-;     pyarrow
-; polars =
-;     pyarrow
-;     polars
-; doc =
-;     pydata-sphinx-theme
-;     sphinxext-opengraph
-;     sphinx-copybutton
-;     matplotlib
-;     seaborn
-;     statsmodels
-;     numpydoc
-;     jupyterlite-sphinx
-;     jupyterlite-pyodide-kernel
-;     pyarrow
-; benchmarks =
-;     numpy
-;     pandas
-;     matplotlib
-;     seaborn
-;     tqdm
-;     thefuzz
-;     autofj
-;     pyarrow
-;     loguru
-; min-py310 =
-;     scikit-learn==1.2.1
-;     numpy==1.23.5
-;     scipy==1.9.3
-;     pandas==1.5.3
+[options.extras_require]
+dev =
+    pytest
+    pytest-cov
+    pytest-xdist==2.5.0
+    pytest-xdist[psutil]
+    coverage
+    mypy
+    numpydoc
+    flake8
+    openml
+    pre-commit
+pyarrow =
+    pyarrow
+polars =
+    pyarrow
+    polars
+doc =
+    pydata-sphinx-theme
+    sphinxext-opengraph
+    sphinx-copybutton
+    matplotlib
+    seaborn
+    statsmodels
+    numpydoc
+    jupyterlite-sphinx
+    jupyterlite-pyodide-kernel
+    pyarrow
+benchmarks =
+    numpy
+    pandas
+    matplotlib
+    seaborn
+    tqdm
+    thefuzz
+    autofj
+    pyarrow
+    loguru
+min-py310 =
+    scikit-learn==1.2.1
+    numpy==1.23.5
+    scipy==1.9.3
+    pandas==1.5.3
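With these extras now active rather than commented out, a development environment would typically be installed with pip install -e .[dev]; that command is standard setuptools/pip behavior and an assumption here, not something this commit adds.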

-; [flake8]
-; max-line-length = 88
-; target-version = ['py310']
-; ignore =
-;     E24,
-;     E121,
-;     E123,
-;     E126,
-;     E203,
-;     E226,
-;     E704,
-;     E731,
-;     E741,
-;     W503,
-;     W504
-; per-file-ignores =
-;     examples/*:E402
-;     doc/conf.py:E402
-; exclude =
-;     .git,
-;     __pycache__,
-;     dist,
-;     build
+[flake8]
+max-line-length = 88
+target-version = ['py310']
+ignore =
+    E24,
+    E121,
+    E123,
+    E126,
+    E203,
+    E226,
+    E704,
+    E731,
+    E741,
+    W503,
+    W504
+per-file-ignores =
+    examples/*:E402
+    doc/conf.py:E402
+exclude =
+    .git,
+    __pycache__,
+    dist,
+    build

-; [mypy]
-; ignore_missing_imports = True
-; allow_redefinition = True
+[mypy]
+ignore_missing_imports = True
+allow_redefinition = True

-; [codespell]
-; skip = ./.git,./.mypy_cache
+[codespell]
+skip = ./.git,./.mypy_cache

-; [egg_info]
-; tag_build =
-; tag_date = 0
+[egg_info]
+tag_build =
+tag_date = 0


##github
(diff for the sixth changed file not shown)
