From 11f74de13bd63dee0eb4f239c2987704965b6191 Mon Sep 17 00:00:00 2001 From: Adam Laiacano Date: Wed, 21 Jun 2023 10:33:48 -0400 Subject: [PATCH 1/6] rename temporary column to something less likely to be used --- nvtabular/ops/categorify.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index 556e2a005a..f82341735c 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -1615,6 +1615,8 @@ def _encode( selection_r = ColumnSelector(name if isinstance(name, list) else [storage_name]) list_col = is_list_col(selection_l, df) + tmp_label_column = "__labels_tmp" + # Find number of oov buckets if buckets and storage_name in buckets: num_oov_buckets = buckets[storage_name] @@ -1642,9 +1644,9 @@ def _encode( cats_only=True, reader=read_pq_func, ) - if len(value) and value["labels"].iloc[0] < OOV_OFFSET + num_oov_buckets: + if len(value) and value[tmp_label_column].iloc[0] < OOV_OFFSET + num_oov_buckets: # See: https://github.com/rapidsai/cudf/issues/12837 - value["labels"] += OOV_OFFSET + num_oov_buckets + value[tmp_label_column] += OOV_OFFSET + num_oov_buckets else: value = read_pq_func( # pylint: disable=unexpected-keyword-arg path, @@ -1652,7 +1654,7 @@ def _encode( **({"split_row_groups": False} if split_out > 1 else {}), ) - value.index = value.index.rename("labels") + value.index = value.index.rename(tmp_label_column) if split_out > 1: value = value.reset_index(drop=False) if type(df).__module__.split(".")[0] == "cudf": @@ -1665,7 +1667,7 @@ def _encode( part_size = file_frag.metadata.num_rows ranges.append((size, size + part_size)) size += part_size - value["labels"] = dd.from_map(lambda r: pd.RangeIndex(*r), ranges) + value[tmp_label_column] = dd.from_map(lambda r: pd.RangeIndex(*r), ranges) else: value.reset_index(drop=False, inplace=True) @@ -1674,7 +1676,7 @@ def _encode( for c in selection_r.names: typ = df[selection_l.names[0]].dtype if len(selection_l.names) == 1 else df[c].dtype value[c] = nullable_series([None], df, typ) - value.index = value.index.rename("labels") + value.index = value.index.rename(tmp_label_column) value.reset_index(drop=False, inplace=True) use_collection = isinstance(value, DaskDataFrame) @@ -1684,7 +1686,7 @@ def _encode( use_collection = False # Determine encoding offsets - null_encoding_offset = value["labels"].head(1).iloc[0] if single_table else NULL_OFFSET + null_encoding_offset = value[tmp_label_column].head(1).iloc[0] if single_table else NULL_OFFSET bucket_encoding_offset = null_encoding_offset + 1 # 2 (if not single_table) distinct_encoding_offset = bucket_encoding_offset + num_oov_buckets @@ -1727,7 +1729,7 @@ def _encode( left_on=selection_l.names, right_on=selection_r.names, how="left", - ).dropna(subset=["labels"]) + ).dropna(subset=[tmp_label_column]) for part in value.partitions ], ignore_index=False, @@ -1741,11 +1743,11 @@ def _encode( if len(merged_df) < len(codes): # Missing nulls labels = df._constructor_sliced(indistinct) - labels.iloc[merged_df["order"]] = merged_df["labels"] + labels.iloc[merged_df["order"]] = merged_df[tmp_label_column] labels = labels.values else: - merged_df["labels"].fillna(df._constructor_sliced(indistinct), inplace=True) - labels = merged_df["labels"].values + merged_df[tmp_label_column].fillna(df._constructor_sliced(indistinct), inplace=True) + labels = merged_df[tmp_label_column].values else: # no hashing if use_collection: @@ -1757,7 +1759,7 @@ def _encode( left_on=selection_l.names, right_on=selection_r.names, how="left", - ).dropna(subset=["labels"]) + ).dropna(subset=[tmp_label_column) for part in value.partitions ], ignore_index=True, @@ -1768,16 +1770,16 @@ def _encode( np.full( len(codes), indistinct, - like=merged_df["labels"].values, + like=merged_df[tmp_column_name].values, ), ) - labels.iloc[merged_df["order"]] = merged_df["labels"] + labels.iloc[merged_df["order"]] = merged_df[tmp_column_name] else: - labels = merged_df.sort_values("order")["labels"].reset_index(drop=True) + labels = merged_df.sort_values("order")[tmp_column_name].reset_index(drop=True) else: labels = codes.merge( value, left_on=selection_l.names, right_on=selection_r.names, how="left" - ).sort_values("order")["labels"] + ).sort_values("order")[tmp_column_name] labels.fillna(indistinct, inplace=True) labels = labels.values else: From 84ae407cea97ac1acc901aa929a2834c368de1b2 Mon Sep 17 00:00:00 2001 From: Adam Laiacano Date: Wed, 21 Jun 2023 10:39:33 -0400 Subject: [PATCH 2/6] fix syntax --- nvtabular/ops/categorify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index f82341735c..b3d0933137 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -1759,7 +1759,7 @@ def _encode( left_on=selection_l.names, right_on=selection_r.names, how="left", - ).dropna(subset=[tmp_label_column) + ).dropna(subset=[tmp_label_column]) for part in value.partitions ], ignore_index=True, From 5d8d4606a523095e49ed4e8b04d205620f530fd6 Mon Sep 17 00:00:00 2001 From: Adam Laiacano Date: Wed, 21 Jun 2023 10:46:19 -0400 Subject: [PATCH 3/6] catch renaming error --- nvtabular/ops/categorify.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index b3d0933137..ba63b72778 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -1644,7 +1644,10 @@ def _encode( cats_only=True, reader=read_pq_func, ) - if len(value) and value[tmp_label_column].iloc[0] < OOV_OFFSET + num_oov_buckets: + if ( + len(value) + and value[tmp_label_column].iloc[0] < OOV_OFFSET + num_oov_buckets + ): # See: https://github.com/rapidsai/cudf/issues/12837 value[tmp_label_column] += OOV_OFFSET + num_oov_buckets else: @@ -1770,16 +1773,16 @@ def _encode( np.full( len(codes), indistinct, - like=merged_df[tmp_column_name].values, + like=merged_df[tmp_label_column].values, ), ) - labels.iloc[merged_df["order"]] = merged_df[tmp_column_name] + labels.iloc[merged_df["order"]] = merged_df[tmp_label_column] else: - labels = merged_df.sort_values("order")[tmp_column_name].reset_index(drop=True) + labels = merged_df.sort_values("order")[tmp_label_column].reset_index(drop=True) else: labels = codes.merge( value, left_on=selection_l.names, right_on=selection_r.names, how="left" - ).sort_values("order")[tmp_column_name] + ).sort_values("order")[tmp_label_column] labels.fillna(indistinct, inplace=True) labels = labels.values else: From 485081ebb2cc9b19566897c772f2276803cd5ace Mon Sep 17 00:00:00 2001 From: Adam Laiacano Date: Wed, 21 Jun 2023 11:53:57 -0400 Subject: [PATCH 4/6] update col name in transform path --- cpp/nvtabular/inference/categorify.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc index e9b50c0cdd..88b93da323 100644 --- a/cpp/nvtabular/inference/categorify.cc +++ b/cpp/nvtabular/inference/categorify.cc @@ -38,7 +38,7 @@ namespace nvtabular py::object pandas = py::module_::import("pandas"); py::object df = pandas.attr("read_parquet")(filename); py::object isnull = pandas.attr("isnull"); - py::array values = df[column_name.c_str()].attr("values"); + py::array values = df[column_name.c_str()].attr("__values_tmp"); auto dtype = values.dtype(); if ((dtype.kind() == 'O') || (dtype.kind() == 'U')) From 5d1f5386119371f02387ab93455c41ea328d5dea Mon Sep 17 00:00:00 2001 From: Adam Laiacano Date: Wed, 21 Jun 2023 12:01:26 -0400 Subject: [PATCH 5/6] fix name --- cpp/nvtabular/inference/categorify.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc index 88b93da323..4fada831a6 100644 --- a/cpp/nvtabular/inference/categorify.cc +++ b/cpp/nvtabular/inference/categorify.cc @@ -38,7 +38,7 @@ namespace nvtabular py::object pandas = py::module_::import("pandas"); py::object df = pandas.attr("read_parquet")(filename); py::object isnull = pandas.attr("isnull"); - py::array values = df[column_name.c_str()].attr("__values_tmp"); + py::array values = df[column_name.c_str()].attr("__labels_tmp"); auto dtype = values.dtype(); if ((dtype.kind() == 'O') || (dtype.kind() == 'U')) From b5a45f6490993024134e471a892059e7fd50352c Mon Sep 17 00:00:00 2001 From: Adam Laiacano Date: Wed, 21 Jun 2023 12:01:51 -0400 Subject: [PATCH 6/6] undo --- cpp/nvtabular/inference/categorify.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc index 4fada831a6..e9b50c0cdd 100644 --- a/cpp/nvtabular/inference/categorify.cc +++ b/cpp/nvtabular/inference/categorify.cc @@ -38,7 +38,7 @@ namespace nvtabular py::object pandas = py::module_::import("pandas"); py::object df = pandas.attr("read_parquet")(filename); py::object isnull = pandas.attr("isnull"); - py::array values = df[column_name.c_str()].attr("__labels_tmp"); + py::array values = df[column_name.c_str()].attr("values"); auto dtype = values.dtype(); if ((dtype.kind() == 'O') || (dtype.kind() == 'U'))