From 6a55cfc6206b13f685b695ca66661925601c5d16 Mon Sep 17 00:00:00 2001 From: xpai Date: Mon, 12 Aug 2024 16:16:15 +0800 Subject: [PATCH] Add support to parquet input, like csv format --- README.md | 4 ++-- fuxictr/preprocess/feature_processor.py | 27 ++++++++++++++++++------- model_zoo/__init__.py | 4 +--- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3612c6b..2d4fb21 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Click-through rate (CTR) prediction is a critical task for various industrial ap | 35 | KDD'21 | [AOANet](./model_zoo/AOANet) | [Architecture and Operation Adaptive Network for Online Recommendations](https://dl.acm.org/doi/10.1145/3447548.3467133) :triangular_flag_on_post:**Didi Chuxing** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/AOANet) | `torch` | | 36 | AAAI'23 | [FinalMLP](./model_zoo/FinalMLP) | [FinalMLP: An Enhanced Two-Stream MLP Model for CTR Prediction](https://arxiv.org/abs/2304.00902) :triangular_flag_on_post:**Huawei** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/FinalMLP) | `torch` | | 37 | SIGIR'23 | [FinalNet](./model_zoo/FinalNet) | [FINAL: Factorized Interaction Layer for CTR Prediction](https://dl.acm.org/doi/10.1145/3539618.3591988) :triangular_flag_on_post:**Huawei** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/FinalNet) | `torch` | -| 38 | SIGIR'23 | [EulerNet](./model_zoo/EulerNet) | [EulerNet: Adaptive Feature Interaction Learning via Euler's Formula for CTR Prediction](https://dl.acm.org/doi/10.1145/3539618.3591681) :triangular_flag_on_post:**Huawei** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/EulerNet) | `torch` | +| 38 | SIGIR'23 | [EulerNet](./model_zoo/EulerNet) | [EulerNet: Adaptive Feature Interaction Learning via Euler's Formula for CTR Prediction](https://dl.acm.org/doi/10.1145/3539618.3591681) :triangular_flag_on_post:**Huawei** | 
[:arrow_upper_right:](https://github.com/Ethan-TZ/EulerNet/tree/main/%23Code4FuxiCTR%23) | `torch` | | 39 | CIKM'23 | [GDCN](./model_zoo/GDCN) | [Towards Deeper, Lighter and Interpretable Cross Network for CTR Prediction](https://dl.acm.org/doi/pdf/10.1145/3583780.3615089) :triangular_flag_on_post:**Microsoft** | | `torch` | | 40 | Arxiv'24 | [DCNv3](./model_zoo/DCNv3) | [DCNv3: Towards Next Generation Deep Cross Network for Click-Through Rate Prediction](https://arxiv.org/abs/2407.13349) :triangular_flag_on_post:**AHU, Huawei** | [:arrow_upper_right:](https://github.com/salmon1802/DCNv3/tree/master/checkpoints) | `torch` | |:open_file_folder: **Behavior Sequence Modeling**| @@ -82,7 +82,7 @@ Click-through rate (CTR) prediction is a critical task for various industrial ap | 45 | AAAI'20 | [DMR](./model_zoo/DMR) | [Deep Match to Rank Model for Personalized Click-Through Rate Prediction](https://ojs.aaai.org/index.php/AAAI/article/view/5346) :triangular_flag_on_post:**Alibaba** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/DMR) | `torch` | | 46 | DLP-KDD'22 | [ETA](./model_zoo/ETA) | [Efficient Long Sequential User Data Modeling for Click-Through Rate Prediction](https://arxiv.org/abs/2209.12212) :triangular_flag_on_post:**Alibaba** | | `torch` | | 47 | CIKM'22 | [SDIM](./model_zoo/SDIM) | [Sampling Is All You Need on Modeling Long-Term User Behaviors for CTR Prediction](https://arxiv.org/abs/2205.10249) :triangular_flag_on_post:**Meituan** | | `torch` | -| 48 | KDD'23 | [TransAct](./model_zoo/TransAct) | [TransAct: Transformer-based Realtime User Action Model for Recommendation at Pinterest](https://arxiv.org/abs/2306.00248) :triangular_flag_on_post:**Pinterest** | | `torch` | +| 48 | KDD'23 | [TransAct](./model_zoo/TransAct) | [TransAct: Transformer-based Realtime User Action Model for Recommendation at Pinterest](https://arxiv.org/abs/2306.00248) :triangular_flag_on_post:**Pinterest** | 
[:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/TransAct) | `torch` | |:open_file_folder: **Dynamic Weight Network**| | 49 | NeurIPS'22 | [APG](./model_zoo/APG) | [APG: Adaptive Parameter Generation Network for Click-Through Rate Prediction](https://arxiv.org/abs/2203.16218) :triangular_flag_on_post:**Alibaba** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/APG) | `torch` | | 50 | KDD'23 | [PPNet](./model_zoo/PEPNet) | [PEPNet: Parameter and Embedding Personalized Network for Infusing with Personalized Prior Information](https://arxiv.org/abs/2302.01115) :triangular_flag_on_post:**KuaiShou** | [:arrow_upper_right:](https://github.com/reczoo/BARS/tree/main/ranking/ctr/PPNet) | `torch` | diff --git a/fuxictr/preprocess/feature_processor.py b/fuxictr/preprocess/feature_processor.py index 26865a1..3bd7868 100644 --- a/fuxictr/preprocess/feature_processor.py +++ b/fuxictr/preprocess/feature_processor.py @@ -76,12 +76,25 @@ def read_data(self, data_path, data_format="csv", sep=",", n_rows=None, **kwargs logging.info("Reading files: " + data_path) file_names = sorted(glob.glob(data_path)) assert len(file_names) > 0, f"Invalid data path: {data_path}" - dfs = [ - pl.scan_csv(source=file_name, separator=sep, dtypes=self.dtype_dict, - low_memory=False, n_rows=n_rows) - for file_name in file_names - ] - ddf = pl.concat(dfs) + if data_format == "csv": + dfs = [ + pl.scan_csv(source=file_name, separator=sep, dtypes=self.dtype_dict, + low_memory=False, n_rows=n_rows) + for file_name in file_names + ] + ddf = pl.concat(dfs) + elif data_format == "parquet": + dfs = [ + pl.scan_parquet(source=file_name, low_memory=False, n_rows=n_rows) + for file_name in file_names + ] + ddf = pl.concat(dfs) + seq_cols = [x for x in ddf.columns if isinstance(ddf.select(x).dtypes[0], pl.List)] + for col in seq_cols: + # Convert list to "^" separated string for the same preprocessing as csv format + ddf = 
ddf.with_columns(pl.col(col).apply(lambda x: "^".join(map(str, x)))) + else: + raise NotImplementedError(f"data_format={data_format} not supported.") return ddf def preprocess(self, ddf): @@ -283,7 +296,7 @@ def fit_sequence_col(self, col, col_series, min_categr_count=1): self.feature_map.features[name]["embedding_dim"] = col["embedding_dim"] if "emb_output_dim" in col: self.feature_map.features[name]["emb_output_dim"] = col["emb_output_dim"] - splitter = col.get("splitter") + splitter = col.get("splitter", "^") na_value = col.get("fill_na", "") max_len = col.get("max_len", 0) padding = col.get("padding", "post") # "post" or "pre" diff --git a/model_zoo/__init__.py b/model_zoo/__init__.py index bb73a70..35e56d6 100644 --- a/model_zoo/__init__.py +++ b/model_zoo/__init__.py @@ -18,7 +18,6 @@ from .DNN.DNN_torch.src import DNN from .DSSM.src import DSSM from .EDCN.src import EDCN -from .ETA.src import ETA from .FFM.src import FFM, FFMv2 from .FGCNN.src import FGCNN from .FiBiNET.src import FiBiNET @@ -39,10 +38,9 @@ from .ONN.ONN_torch.src import ONN, ONNv2 from .PNN.src import PNN from .SAM.src import SAM -from .SDIM.src import SDIM from .WideDeep.WideDeep_torch.src import WideDeep from .xDeepFM.src import xDeepFM from .PEPNet.src import PPNet from .TransAct.src import TransAct from .multitask import SharedBottom, MMoE -from .EulerNet.src import EulerNet \ No newline at end of file +from .EulerNet.src import EulerNet