From 3c1450b951a980cb0560e6fc28a5ca2446c53be1 Mon Sep 17 00:00:00 2001 From: Lucas Cardozo Date: Wed, 18 Oct 2023 17:18:16 +0200 Subject: [PATCH 1/2] fix: IndexError with columns full of NaNs --- nvtabular/ops/categorify.py | 5 ++++- tests/unit/ops/test_categorify.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index 556e2a005a..ebdbc5e45f 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -1704,7 +1704,10 @@ def _encode( codes = type(df)({"order": dispatch.arange(len(df), like_df=df)}, index=df.index) for cl, cr in zip(selection_l.names, selection_r.names): - if isinstance(df[cl].dropna().iloc[0], (np.ndarray, list)): + column_without_nans = df[cl].dropna() + if len(column_without_nans) and isinstance( + column_without_nans.iloc[0], (np.ndarray, list) + ): ser = df[cl].copy() codes[cl] = dispatch.flatten_list_column_values(ser).astype(value[cr].dtype) else: diff --git a/tests/unit/ops/test_categorify.py b/tests/unit/ops/test_categorify.py index 41a69ef346..2db9ef0f7b 100644 --- a/tests/unit/ops/test_categorify.py +++ b/tests/unit/ops/test_categorify.py @@ -734,3 +734,17 @@ def test_categorify_inference(): output_tensors = inference_op.transform(cats.input_columns, input_tensors) for key in input_tensors: assert output_tensors[key].dtype == np.dtype("int64") + + +def test_categorify_transform_only_nans_column(): + train_df = make_df({"cat_column": ["a", "a", "b", "c", np.nan]}) + cat_features = ["cat_column"] >> nvt.ops.Categorify(max_size=4) + train_dataset = nvt.Dataset(train_df) + + workflow = nvt.Workflow(cat_features) + workflow.fit(train_dataset) + + inference_df = make_df({"cat_column": [np.nan] * 10}) + inference_dataset = nvt.Dataset(inference_df) + + workflow.transform(inference_dataset).compute() From 72ac410de788243d7c532625f74f7f15ac636789 Mon Sep 17 00:00:00 2001 From: Lucas Cardozo Date: Wed, 18 Oct 2023 17:25:05 +0200 Subject: [PATCH 2/2] chore: add assertion --- tests/unit/ops/test_categorify.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/ops/test_categorify.py b/tests/unit/ops/test_categorify.py index 2db9ef0f7b..5092bb7fa7 100644 --- a/tests/unit/ops/test_categorify.py +++ b/tests/unit/ops/test_categorify.py @@ -738,7 +738,7 @@ def test_categorify_inference(): def test_categorify_transform_only_nans_column(): train_df = make_df({"cat_column": ["a", "a", "b", "c", np.nan]}) - cat_features = ["cat_column"] >> nvt.ops.Categorify(max_size=4) + cat_features = ["cat_column"] >> nvt.ops.Categorify() train_dataset = nvt.Dataset(train_df) workflow = nvt.Workflow(cat_features) @@ -747,4 +747,5 @@ def test_categorify_transform_only_nans_column(): inference_df = make_df({"cat_column": [np.nan] * 10}) inference_dataset = nvt.Dataset(inference_df) - workflow.transform(inference_dataset).compute() + output = workflow.transform(inference_dataset).compute() + assert len(output) == len(inference_df)