-
Notifications
You must be signed in to change notification settings - Fork 86
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Woodwork 0.17.2 compatibility :( #3626
Changes from 11 commits
c41d1de
3b18083
97157e0
6dfd577
5bbf65d
70d038c
43ece37
638705d
4e47f78
a3aa645
7cdeb1d
b50ad59
c0a7566
ca24080
a55ec4f
e294000
c69ec06
263b9a0
54ec0cc
625681b
bbb6820
2a92d38
79cb5c7
3c1ebee
3bc8191
a6db357
573fc6f
e6363aa
35435f8
dc61825
b661aab
d261e71
b95b3d6
33b8a9e
ea97418
7852c11
3ebbafd
0341777
50ccf33
8ebcb48
798d2e7
ee88e89
dd21c9a
e9aec48
0b0cd59
dde9c77
b94bb37
63c3168
7a8ba71
664e4f9
9acd1bc
9db71c0
7d78f9f
dda2b05
2db0718
2a1b8b8
ec47fc2
426ed33
21d987d
3cde5c4
7b81eb3
187d7ff
9c64c4c
9b5d73c
3d6cc07
b42dff0
c23443e
8227596
cd9ff14
2812e99
e920710
e029d15
4f8eb17
654ca9b
a8039a6
1ec5aee
7f27e49
8dbf0d5
3c734b0
c5fc0b5
3904a3c
295e701
6c5bd5d
6132cb2
71a367c
649bc33
82ae057
c325c39
4579127
7325a4d
1b7434e
b95207a
969eba3
dd1346b
860ba3a
ce7af81
e3649ad
dff1c4a
f324962
49f2d5d
2412b2c
64a63ee
022e4f7
8962f70
d8bd307
c86479c
e7d6ff0
57366ea
69156a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -288,10 +288,17 @@ def test_datetime_featurizer_with_inconsistent_date_format(): | |
answer = pd.DataFrame( | ||
{ | ||
"numerical": [0] * len(dates), | ||
"date col_year": [2021.0] * 18 + [np.nan] * 2, | ||
"date col_month": [9.0] * 18 + [np.nan] * 2, | ||
"date col_year": [2021] * 18 + [pd.NA] * 2, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The new pandas nullable types return |
||
"date col_month": [9] * 18 + [pd.NA] * 2, | ||
"date col_day_of_week": expected_dow, | ||
"date col_hour": [0.0] * 18 + [np.nan] * 2, | ||
"date col_hour": [0] * 18 + [pd.NA] * 2, | ||
}, | ||
).astype( | ||
dtype={ | ||
"date col_year": "Int64", | ||
"date col_month": "Int64", | ||
"date col_day_of_week": "Int64", | ||
"date col_hour": "Int64", | ||
}, | ||
) | ||
pd.testing.assert_frame_equal(answer, expected) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,8 +16,7 @@ def test_drop_rows_transformer(): | |
X_expected = pd.DataFrame( | ||
{"a column": [3], "another col": [6]}, | ||
index=[2], | ||
dtype=np.float64, | ||
) | ||
).astype("Int64") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of returning the numpy version of nan, which is a float, woodwork inference returns the new |
||
drop_rows_transformer = DropNaNRowsTransformer() | ||
drop_rows_transformer.fit(X) | ||
transformed_X, _ = drop_rows_transformer.transform(X) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,9 +8,11 @@ | |
from pandas.testing import assert_frame_equal | ||
from woodwork.logical_types import ( | ||
Boolean, | ||
BooleanNullable, | ||
Categorical, | ||
Double, | ||
Integer, | ||
IntegerNullable, | ||
NaturalLanguage, | ||
) | ||
|
||
|
@@ -512,7 +514,7 @@ def test_imputer_all_bool_return_original(data_type, make_data_type): | |
def test_imputer_bool_dtype_object(data_type, make_data_type): | ||
X = pd.DataFrame([True, np.nan, False, np.nan, True] * 4) | ||
y = pd.Series([1, 0, 0, 1, 0] * 4) | ||
X_expected_arr = pd.DataFrame([True, True, False, True, True] * 4, dtype="category") | ||
X_expected_arr = pd.DataFrame([True, True, False, True, True] * 4, dtype="boolean") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the new woodwork behavior we would expect - the returning of the new |
||
X = make_data_type(data_type, X) | ||
y = make_data_type(data_type, y) | ||
imputer = Imputer() | ||
|
@@ -537,7 +539,7 @@ def test_imputer_multitype_with_one_bool(data_type, make_data_type): | |
{ | ||
"bool with nan": pd.Series( | ||
[True, False, False, False, False] * 4, | ||
dtype="category", | ||
dtype="boolean", | ||
), | ||
"bool no nan": pd.Series( | ||
[False, False, False, False, True] * 4, | ||
|
@@ -563,7 +565,9 @@ def test_imputer_int_preserved(): | |
transformed, | ||
pd.DataFrame(pd.Series([1, 2, 11, 14 / 3])), | ||
) | ||
assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {0: Double} | ||
assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { | ||
0: IntegerNullable, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just how the new woodwork inference manifests in terms of woodwork dtypes. Before, none types were the numpy.nan but are now pd.NA and, depending on the other values in the feature, will be the woodwork dtype of IntegerNullable or BooleanNullable. |
||
} | ||
|
||
X = pd.DataFrame(pd.Series([1, 2, 3, np.nan])) | ||
imputer = Imputer(numeric_impute_strategy="mean") | ||
|
@@ -573,7 +577,9 @@ def test_imputer_int_preserved(): | |
pd.DataFrame(pd.Series([1, 2, 3, 2])), | ||
check_dtype=False, | ||
) | ||
assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {0: Double} | ||
assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { | ||
0: IntegerNullable, | ||
} | ||
|
||
X = pd.DataFrame(pd.Series([1, 2, 3, 4], dtype="int")) | ||
imputer = Imputer(numeric_impute_strategy="mean") | ||
|
@@ -595,9 +601,9 @@ def test_imputer_bool_preserved(test_case, null_type): | |
] | ||
X = pd.DataFrame(pd.Series([True, False, True, null_type] * 4)) | ||
expected = pd.DataFrame( | ||
pd.Series([True, False, True, True] * 4, dtype="category"), | ||
pd.Series([True, False, True, True] * 4, dtype="boolean"), | ||
) | ||
expected_ww_dtype = Categorical | ||
expected_ww_dtype = BooleanNullable | ||
check_dtype = True | ||
elif test_case == "boolean_without_null": | ||
X = pd.DataFrame(pd.Series([True, False, True, False] * 4)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -219,15 +219,18 @@ def test_fit_transform_drop_all_nan_columns(): | |
"another_col": {"impute_strategy": "most_frequent"}, | ||
} | ||
transformer = PerColumnImputer(impute_strategies=strategies) | ||
X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]}) | ||
X_expected_arr = pd.DataFrame( | ||
{"some_nan": [0, 1, 0], "another_col": [0, 1, 2]}, | ||
).astype({"some_nan": "Int64"}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The imputer returns the new nullable boolean. |
||
X_t = transformer.fit_transform(X) | ||
assert_frame_equal(X_expected_arr, X_t, check_dtype=False) | ||
# Check that original dataframe remains unchanged | ||
assert_frame_equal( | ||
X, | ||
pd.DataFrame( | ||
{ | ||
"all_nan": [np.nan, np.nan, np.nan], | ||
"some_nan": [0.0, 1.0, 0.0], | ||
"some_nan": [0, 1, 0], | ||
"another_col": [0, 1, 2], | ||
}, | ||
), | ||
|
@@ -259,7 +262,7 @@ def test_transform_drop_all_nan_columns(): | |
pd.DataFrame( | ||
{ | ||
"all_nan": [np.nan, np.nan, np.nan], | ||
"some_nan": [0.0, 1.0, 0.0], | ||
"some_nan": [0, 1, 0], | ||
"another_col": [0, 1, 2], | ||
}, | ||
), | ||
|
@@ -347,8 +350,9 @@ def test_per_column_imputer_column_subset(): | |
) | ||
X_expected.ww.init( | ||
logical_types={ | ||
"all_nan_not_included": "double", | ||
"column_with_nan_included": "double", | ||
"all_nan_not_included": "Double", | ||
"column_with_nan_not_included": "IntegerNullable", | ||
"column_with_nan_included": "IntegerNullable", | ||
}, | ||
) | ||
X.ww.init( | ||
|
@@ -362,11 +366,10 @@ def test_per_column_imputer_column_subset(): | |
{ | ||
"all_nan_not_included": [np.nan, np.nan, np.nan], | ||
"all_nan_included": [np.nan, np.nan, np.nan], | ||
"column_with_nan_not_included": [np.nan, 1, 0], | ||
# Because of https://github.com/alteryx/evalml/issues/2055 | ||
"column_with_nan_included": [0.0, 1.0, 0.0], | ||
"column_with_nan_not_included": [pd.NA, 1, 0], | ||
"column_with_nan_included": [0, 1, 0], | ||
}, | ||
), | ||
).astype({"column_with_nan_not_included": "Int64"}), | ||
) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1982,7 +1982,7 @@ def imputer_test_data(): | |
), | ||
"int col": [0, 1, 2, 0, 3] * 4, | ||
"object col": ["b", "b", "a", "c", "d"] * 4, | ||
"float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, | ||
"float col": [0.1, 1.0, 0.0, -2.0, 5.0] * 4, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems that Woodwork is now is rounding columns that have pseudo-floats and identifying them as actually integers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed this |
||
"bool col": [True, False, False, True, True] * 4, | ||
"categorical with nan": pd.Series( | ||
[np.nan, "1", "0", "0", "3"] * 4, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The failure here was during a time series regression problem, where the predictions came out as integers, rather than floats. Given that it's a time series regression problem, I would expect the predictions to be floats.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This worries me slightly - are there any scenarios where this would cause us issues down the road?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
a little confused here - is
preds
coming out as an integers here and why if so?