1. Add embedding_dim to numeric features

2. Update examples in the demo
reczoo · May 20, 2024 · 3610df5 · 3610df5
1 parent 7028d3e
commit 3610df5
Show file tree

Hide file tree

Showing 33 changed files with 339 additions and 112 deletions.
diff --git a/README.md b/README.md
@@ -103,8 +103,8 @@ We have benchmarked FuxiCTR models on a set of open datasets as follows:
 FuxiCTR has the following dependencies:
 
 + python 3.9+
-+ pytorch 1.10+ (required only for Torch models)
-+ tensorflow 2.1+ (required only for TF models)
++ pytorch 1.10+ (required only for torch models)
++ tensorflow 2.1+ (required only for tensorflow models)
 
 Please install other required packages via `pip install -r requirements.txt`.
 
@@ -116,8 +116,8 @@ Please install other required packages via `pip install -r requirements.txt`.
 
    ```
    cd demo
-   python example1_build_dataset_to_h5.py
-   python example2_DeepFM_with_h5_input.py
+   python example1_build_dataset_to_parquet.py
+   python example2_DeepFM_with_parquet_input.py
    ```
 
 2. Run a model on tiny data
@@ -155,7 +155,7 @@ Please install other required packages via `pip install -r requirements.txt`.
 
 If you find our code or benchmarks helpful in your research, please cite the following papers.
 
-+ Jieming Zhu, Jinyang Liu, Shuai Yang, Qi Zhang, Xiuqiang He. [BARS-CTR: Open Benchmarking for Click-Through Rate Prediction](https://arxiv.org/abs/2009.05794). *The 30th ACM International Conference on Information and Knowledge Management (CIKM)*, 2021. [[Bibtex](https://dblp.org/rec/conf/cikm/ZhuLYZH21.html?view=bibtex)]
++ Jieming Zhu, Jinyang Liu, Shuai Yang, Qi Zhang, Xiuqiang He. [Open Benchmarking for Click-Through Rate Prediction](https://arxiv.org/abs/2009.05794). *The 30th ACM International Conference on Information and Knowledge Management (CIKM)*, 2021. [[Bibtex](https://dblp.org/rec/conf/cikm/ZhuLYZH21.html?view=bibtex)]
 + Jieming Zhu, Quanyu Dai, Liangcai Su, Rong Ma, Jinyang Liu, Guohao Cai, Xi Xiao, Rui Zhang. [BARS: Towards Open Benchmarking for Recommender Systems](https://arxiv.org/abs/2205.09626). *The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)*, 2022. [[Bibtex](https://dblp.org/rec/conf/sigir/ZhuDSMLCXZ22.html?view=bibtex)]
 
 ## Discussion

diff --git a/data/README.md b/data/README.md
@@ -1,4 +1,3 @@
 ## Datasets
 
-A list of benchmark datasets for CTR prediction are available at https://openbenchmark.github.io/BARS/datasets/README.html#
-
+A list of benchmark datasets for CTR prediction is available at https://github.com/reczoo/Datasets
diff --git a/data/tiny_parquet/feature_map.json b/data/tiny_parquet/feature_map.json
@@ -0,0 +1,123 @@
+{
+    "dataset_id": "tiny_parquet",
+    "num_fields": 14,
+    "total_features": 485,
+    "input_length": 14,
+    "labels": [
+        "clk"
+    ],
+    "features": [
+        {
+            "userid": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 26
+            }
+        },
+        {
+            "adgroup_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 96
+            }
+        },
+        {
+            "pid": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        },
+        {
+            "cate_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 49
+            }
+        },
+        {
+            "campaign_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 99
+            }
+        },
+        {
+            "customer": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 98
+            }
+        },
+        {
+            "brand": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 67
+            }
+        },
+        {
+            "cms_segid": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 11
+            }
+        },
+        {
+            "cms_group_id": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 11
+            }
+        },
+        {
+            "final_gender_code": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        },
+        {
+            "age_level": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 7
+            }
+        },
+        {
+            "pvalue_level": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        },
+        {
+            "shopping_level": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 5
+            }
+        },
+        {
+            "occupation": {
+                "source": "",
+                "type": "categorical",
+                "padding_idx": 0,
+                "vocab_size": 4
+            }
+        }
+    ]
+}
diff --git a/data/tiny_parquet/test.parquet b/data/tiny_parquet/test.parquet
diff --git a/data/tiny_parquet/train.parquet b/data/tiny_parquet/train.parquet
diff --git a/data/tiny_parquet/valid.parquet b/data/tiny_parquet/valid.parquet
diff --git a/demo/config/example1_config/dataset_config.yaml b/demo/config/example1_config/dataset_config.yaml
@@ -11,4 +11,3 @@ tiny_example1:
                  "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
                  active: True, dtype: str, type: categorical}]
     label_col: {name: clk, dtype: float}
-
diff --git a/demo/config/example2_config/dataset_config.yaml b/demo/config/example2_config/dataset_config.yaml
@@ -1,7 +1,7 @@
 ### Tiny data for demo only
-tiny_npz:
+tiny_parquet:
     data_root: ../data/
-    data_format: npz
-    train_data: ../data/tiny_npz/train.npz
-    valid_data: ../data/tiny_npz/valid.npz
-    test_data: ../data/tiny_npz/test.npz
+    data_format: parquet
+    train_data: ../data/tiny_parquet/train.parquet
+    valid_data: ../data/tiny_parquet/valid.parquet
+    test_data: ../data/tiny_parquet/test.parquet
diff --git a/demo/config/example2_config/model_config.yaml b/demo/config/example2_config/model_config.yaml
@@ -12,9 +12,9 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_npz:
+DeepFM_test_parquet:
     model: DeepFM
-    dataset_id: tiny_npz
+    dataset_id: tiny_parquet
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification

diff --git a/demo/config/example3_config/dataset_config.yaml b/demo/config/example3_config/dataset_config.yaml
@@ -1,14 +1,7 @@
 ### Tiny data for demo only
-tiny_example3:
+tiny_npz:
     data_root: ../data/
-    data_format: csv
-    train_data: ../data/tiny_csv/train_sample.csv
-    valid_data: ../data/tiny_csv/valid_sample.csv
-    test_data: ../data/tiny_csv/test_sample.csv
-    min_categr_count: 1
-    feature_cols:
-        [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
-                 "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
-                 active: True, dtype: str, type: categorical}]
-    label_col: {name: clk, dtype: float}
-
+    data_format: npz
+    train_data: ../data/tiny_npz/train.npz
+    valid_data: ../data/tiny_npz/valid.npz
+    test_data: ../data/tiny_npz/test.npz
diff --git a/demo/config/example3_config/model_config.yaml b/demo/config/example3_config/model_config.yaml
@@ -12,9 +12,9 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_csv:
+DeepFM_test_npz:
     model: DeepFM
-    dataset_id: tiny_example3
+    dataset_id: tiny_npz
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification
@@ -30,6 +30,6 @@ DeepFM_test_csv:
     embedding_dim: 4
     epochs: 1
     shuffle: True
-    seed: 2019
+    seed: 2023
     monitor: 'AUC'
     monitor_mode: 'max'
diff --git a/demo/config/example4_config/dataset_config.yaml b/demo/config/example4_config/dataset_config.yaml
@@ -7,9 +7,7 @@ tiny_example4:
     test_data: ../data/tiny_csv/test_sample.csv
     min_categr_count: 1
     feature_cols:
-        [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz",
-          embedding_dim: 8, freeze_emb: True},
-         {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
+        [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
                  "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
                  active: True, dtype: str, type: categorical}]
     label_col: {name: clk, dtype: float}
diff --git a/demo/config/example4_config/model_config.yaml b/demo/config/example4_config/model_config.yaml
@@ -12,7 +12,7 @@ Base:
     feature_specs: null
     feature_config: null
 
-DeepFM_test_pretrain: 
+DeepFM_test_csv:
     model: DeepFM
     dataset_id: tiny_example4
     loss: 'binary_crossentropy'
@@ -27,10 +27,9 @@ DeepFM_test_pretrain:
     batch_norm: False
     net_dropout: 0
     batch_size: 128
-    embedding_dim: 8
+    embedding_dim: 4
     epochs: 1
     shuffle: True
-    seed: 2023
+    seed: 2019
     monitor: 'AUC'
     monitor_mode: 'max'
-
diff --git a/demo/config/example5_config/dataset_config.yaml b/demo/config/example5_config/dataset_config.yaml
@@ -1,7 +1,15 @@
 ### Tiny data for demo only
-tiny_seq:
+tiny_example5:
     data_root: ../data/
-    data_format: npz
-    train_data: ../data/tiny_seq/train.npz
-    valid_data: ../data/tiny_seq/valid.npz
-    test_data: ../data/tiny_seq/test.npz
+    data_format: csv
+    train_data: ../data/tiny_csv/train_sample.csv
+    valid_data: ../data/tiny_csv/valid_sample.csv
+    test_data: ../data/tiny_csv/test_sample.csv
+    min_categr_count: 1
+    feature_cols:
+        [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz",
+          embedding_dim: 8, freeze_emb: True},
+         {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
+                 "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], 
+                 active: True, dtype: str, type: categorical}]
+    label_col: {name: clk, dtype: float}
diff --git a/demo/config/example5_config/model_config.yaml b/demo/config/example5_config/model_config.yaml
@@ -12,32 +12,25 @@ Base:
     feature_specs: null
     feature_config: null
 
-DIN_test:
-    model: DIN
-    dataset_id: tiny_seq
+DeepFM_test_pretrain: 
+    model: DeepFM
+    dataset_id: tiny_example5
     loss: 'binary_crossentropy'
     metrics: ['logloss', 'AUC']
     task: binary_classification
     optimizer: adam
-    learning_rate: 1.0e-3
-    embedding_regularizer: 0
+    hidden_units: [64, 32]
+    hidden_activations: relu
     net_regularizer: 0
-    batch_size: 128
-    embedding_dim: 4
-    dnn_hidden_units: [64, 32]
-    dnn_activations: relu
-    attention_hidden_units: [64]
-    attention_hidden_activations: "Dice"
-    attention_output_activation: null
-    attention_dropout: 0
-    din_target_field: adgroup_id
-    din_sequence_field: click_sequence
-    feature_specs: [{name: click_sequence, feature_encoder: null}]
-    net_dropout: 0
+    embedding_regularizer: 1.e-8
+    learning_rate: 1.e-3
     batch_norm: False
+    net_dropout: 0
+    batch_size: 128
+    embedding_dim: 8
     epochs: 1
     shuffle: True
-    seed: 2019
+    seed: 2023
     monitor: 'AUC'
     monitor_mode: 'max'
-    
+
diff --git a/demo/config/example6_config/dataset_config.yaml b/demo/config/example6_config/dataset_config.yaml
@@ -1,21 +1,7 @@
 ### Tiny data for demo only
-tiny_example6:
+tiny_seq:
     data_root: ../data/
-    data_format: csv 
-    train_data: ../data/tiny_csv/custom_preprocess_train_sample.csv
-    valid_data: ../data/tiny_csv/custom_preprocess_valid_sample.csv
-    test_data: ../data/tiny_csv/custom_preprocess_test_sample.csv
-    min_categr_count: 1
-    feature_cols:
-        -   active: true
-            dtype: str
-            name: [msno, song_id, source_system_tab, source_screen_name, source_type,
-                city, gender, registered_via, language]
-            type: categorical
-        - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: genre_ids,
-            type: sequence}
-        - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: artist_name,
-            type: sequence}
-        - {active: true, dtype: str, name: isrc, preprocess: extract_country_code, type: categorical}
-        - {active: true, dtype: str, name: bd, preprocess: bucketize_age, type: categorical}
-    label_col: {dtype: float, name: label}
+    data_format: npz
+    train_data: ../data/tiny_seq/train.npz
+    valid_data: ../data/tiny_seq/valid.npz
+    test_data: ../data/tiny_seq/test.npz
Original file line number	Diff line number	Diff line change
Expand Up		@@ -11,4 +11,3 @@ tiny_example1:
		"cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"],
		active: True, dtype: str, type: categorical}]
		label_col: {name: clk, dtype: float}