Skip to content

Commit

Permalink
Merge pull request #11 from momijiame/feature/support-polars-v1
Browse files Browse the repository at this point in the history
bump up version to v0.0.4
  • Loading branch information
momijiame authored Jul 23, 2024
2 parents 337231f + 11a0a91 commit e9db218
Show file tree
Hide file tree
Showing 12 changed files with 29 additions and 24 deletions.
1 change: 1 addition & 0 deletions .github/workflows/python-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ jobs:
- "3.9"
- "3.10"
- "3.11"
- "3.12"

steps:

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
- "polars>=0.19.16",
+ "polars>=1.0.0",
"scikit-learn",
"scipy",
]
Expand Down
2 changes: 1 addition & 1 deletion shirokumas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ._ordinal import OrdinalEncoder # noqa: F401
from ._target import TargetEncoder # noqa: F401

- __version__ = "0.0.3"
+ __version__ = "0.0.4"
__all__ = [
"AggregateEncoder",
"CountEncoder",
Expand Down
13 changes: 9 additions & 4 deletions shirokumas/_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,21 @@ def _transform(self, X: pl.DataFrame, **transform_params) -> pl.DataFrame:
X_lazy: pl.LazyFrame = X.select(self.mappings.keys()).lazy()

for col, mapping in self.mappings.items():
- col_remappings: dict[str, dict[str | None, float | int]] = defaultdict(
- lambda: {None: missing_value}
- )
+ col_remappings: dict[str, dict[str | None, float | int]] = defaultdict(dict)

for category, *agg_values in mapping.rows():
for agg_name, agg_value in zip(mapping.columns[1:], agg_values):
col_remappings[agg_name][category] = agg_value

+ for agg_name, replace_map in col_remappings.items():
+ value_sample = next(iter(replace_map.values()))
+ cast_type = type(value_sample)
+ col_remappings[agg_name][None] = cast_type(missing_value)

X_lazy = X_lazy.with_columns(
[
pl.col(col)
- .replace(remapping, default=unknown_value)
+ .replace_strict(remapping, default=unknown_value)
.alias(agg_name)
for agg_name, remapping in col_remappings.items()
]
Expand Down
4 changes: 2 additions & 2 deletions shirokumas/_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(
def _fit(self, X: pl.DataFrame, y: pl.Series | None = None, **fit_params):
cols = self.cols or X.columns
for col in cols:
- self.mappings[col] = X.group_by(col).count()
+ self.mappings[col] = X.group_by(col).len()

def _transform(self, X: pl.DataFrame, **transform_params) -> pl.DataFrame:
unknown_value = -1
Expand All @@ -47,7 +47,7 @@ def _transform(self, X: pl.DataFrame, **transform_params) -> pl.DataFrame:
for col, mapping in self.mappings.items():
remapping = {category: count for category, count in mapping.rows()}
remapping[None] = missing_value
- expr = pl.col(col).replace(
+ expr = pl.col(col).replace_strict(
remapping,
default=unknown_value,
)
Expand Down
2 changes: 1 addition & 1 deletion shirokumas/_ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _transform(self, X: pl.DataFrame, **transform_params) -> pl.DataFrame:
for col in self.mappings.keys():
remapping = self.mappings[col]
remapping[None] = missing_value
- expr = pl.col(col).replace(
+ expr = pl.col(col).replace_strict(
remapping,
default=unknown_value,
)
Expand Down
12 changes: 6 additions & 6 deletions shirokumas/_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(
self.encoder: BaseEncoder = encoder_cls(
**(self.smoothing_params or {}),
)
- self.global_mean: float | None = None
+ self.global_mean: pl.PythonLiteral | None = None

def _fit(self, X: pl.DataFrame, y: pl.Series | None = None, **fit_params):
if y is None:
Expand Down Expand Up @@ -109,7 +109,7 @@ def transform(self, X: pl.DataFrame) -> pl.DataFrame:
for category, local_mean in self.mappings[col].rows()
}
remapping[None] = _MISSING_VALUE
- expr = pl.col(col).replace(
+ expr = pl.col(col).replace_strict(
remapping,
default=_UNKNOWN_VALUE,
)
Expand All @@ -119,12 +119,12 @@ def transform(self, X: pl.DataFrame) -> pl.DataFrame:


class _MEstimateStrategy(BaseEstimator, TransformerMixin):
- global_mean: float

def __init__(self, m: float = 1.0):
self.m = m

self.mappings: dict[str, pl.DataFrame] = {}
+ self.global_mean: pl.PythonLiteral | None = None

def fit(self, X: pl.DataFrame, y: pl.Series):
self.global_mean = y.mean()
Expand Down Expand Up @@ -155,7 +155,7 @@ def transform(self, X: pl.DataFrame) -> pl.DataFrame:
for category, local_count, local_sum in self.mappings[col].rows()
}
remapping[None] = _MISSING_VALUE
- expr = pl.col(col).replace(
+ expr = pl.col(col).replace_strict(
remapping,
default=_UNKNOWN_VALUE,
)
Expand All @@ -170,7 +170,7 @@ def __init__(self, k: int = 20, f: int = 10):
self.f = f

self.mappings: dict[str, pl.DataFrame] = {}
- self.global_mean: float | None = None
+ self.global_mean: pl.PythonLiteral | None = None

def fit(self, X: pl.DataFrame, y: pl.Series):
self.global_mean = y.mean()
Expand Down Expand Up @@ -208,7 +208,7 @@ def transform(self, X: pl.DataFrame) -> pl.DataFrame:
for category, local_mean, smoothing_factor in self.mappings[col].rows()
}
remapping[None] = _MISSING_VALUE
- expr = pl.col(col).replace(
+ expr = pl.col(col).replace_strict(
remapping,
default=_UNKNOWN_VALUE,
)
Expand Down
2 changes: 1 addition & 1 deletion tests/shirokumas/test_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test(self):

expected_df = pl.DataFrame(
{
- "fruits_mean": [unknown, missing, 250.0],
+ "fruits_mean": [float(unknown), float(missing), 250.0],
"fruits_max": [unknown, missing, 300],
"vegetables_mean": [100.0, 100.0, 300.0],
"vegetables_max": [100, 100, 300],
Expand Down
4 changes: 2 additions & 2 deletions tests/shirokumas/test_binarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def test(self):
test_df = pl.DataFrame(
{
"fruits": ["unseen", None, "banana"],
- "users": ["alice", "unseen", np.nan],
- }
+ "users": ["alice", "unseen", None],
+ },
)
encoded_df = encoder.transform(test_df)

Expand Down
3 changes: 1 addition & 2 deletions tests/shirokumas/test_count.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pickle
import tempfile

- import numpy as np
import polars as pl
import pytest
from polars.testing import assert_frame_equal
Expand Down Expand Up @@ -33,7 +32,7 @@ def test(self):
test_df = pl.DataFrame(
{
"fruits": ["unseen", None, "banana"],
- "users": ["alice", "unseen", np.nan],
+ "users": ["alice", "unseen", None],
},
)
encoded_df = encoder.transform(test_df)
Expand Down
2 changes: 1 addition & 1 deletion tests/shirokumas/test_null.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test(self):
test_df = pl.DataFrame(
{
"fruits": ["unseen", None, "banana"],
- "prices": [300, 400, np.nan],
+ "prices": [300., 400., np.nan],
},
)
encoded_df = encoder.transform(test_df)
Expand Down
5 changes: 2 additions & 3 deletions tests/shirokumas/test_ordinal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pickle
import tempfile

- import numpy as np
import polars as pl
import pytest
from polars.testing import assert_frame_equal
Expand Down Expand Up @@ -40,8 +39,8 @@ def test(self):
test_df = pl.DataFrame(
{
"fruits": ["unseen", None, "apple"],
- "users": ["alice", "unseen", np.nan],
- }
+ "users": ["alice", "unseen", None],
+ },
)
encoded_df = encoder.transform(test_df)

Expand Down

0 comments on commit e9db218

Please sign in to comment.