From 3610df5b21ace4737e272ceb1ebb59325a361f10 Mon Sep 17 00:00:00 2001 From: xpai Date: Mon, 20 May 2024 13:29:34 +0800 Subject: [PATCH] 1. Add embedding_dim to numeric features 2. Update examples in the demo --- README.md | 10 +- data/README.md | 3 +- data/tiny_parquet/feature_map.json | 123 ++++++++++++++++++ data/tiny_parquet/test.parquet | Bin 0 -> 13283 bytes data/tiny_parquet/train.parquet | Bin 0 -> 13279 bytes data/tiny_parquet/valid.parquet | Bin 0 -> 13283 bytes .../example1_config/dataset_config.yaml | 1 - .../example2_config/dataset_config.yaml | 10 +- demo/config/example2_config/model_config.yaml | 4 +- .../example3_config/dataset_config.yaml | 17 +-- demo/config/example3_config/model_config.yaml | 6 +- .../example4_config/dataset_config.yaml | 4 +- demo/config/example4_config/model_config.yaml | 7 +- .../example5_config/dataset_config.yaml | 18 ++- demo/config/example5_config/model_config.yaml | 31 ++--- .../example6_config/dataset_config.yaml | 24 +--- demo/config/example6_config/model_config.yaml | 26 ++-- .../example7_config/dataset_config.yaml | 21 +++ demo/config/example7_config/model_config.yaml | 35 +++++ ...y => example1_build_dataset_to_parquet.py} | 0 demo/example2_DeepFM_with_parquet_input.py | 55 ++++++++ ...t.py => example3_DeepFM_with_npz_input.py} | 4 +- ...t.py => example4_DeepFM_with_csv_input.py} | 4 +- ...=> example5_DeepFM_with_pretrained_emb.py} | 4 +- ... => example6_DIN_with_sequence_feature.py} | 4 +- ...example7_DeepFM_with_custom_preprocess.py} | 19 ++- .../config/DCN_tiny_npz_tuner_config.yaml | 1 - fuxictr/preprocess/build_dataset.py | 4 +- fuxictr/preprocess/feature_processor.py | 6 +- .../layers/embeddings/feature_embedding.py | 5 +- fuxictr/version.py | 2 +- .../DCN/DCN_torch/config/dataset_config.yaml | 1 + setup.py | 2 +- 33 files changed, 339 insertions(+), 112 deletions(-) create mode 100644 data/tiny_parquet/feature_map.json create mode 100644 data/tiny_parquet/test.parquet create mode 100644 data/tiny_parquet/train.parquet create mode 100644 data/tiny_parquet/valid.parquet create mode 100644 demo/config/example7_config/dataset_config.yaml create mode 100644 demo/config/example7_config/model_config.yaml rename demo/{example1_build_dataset_to_npz.py => example1_build_dataset_to_parquet.py} (100%) create mode 100644 demo/example2_DeepFM_with_parquet_input.py rename demo/{example2_DeepFM_with_npz_input.py => example3_DeepFM_with_npz_input.py} (91%) rename demo/{example3_DeepFM_with_csv_input.py => example4_DeepFM_with_csv_input.py} (93%) rename demo/{example4_DeepFM_with_pretrained_emb.py => example5_DeepFM_with_pretrained_emb.py} (93%) rename demo/{example5_DIN_with_sequence_feature.py => example6_DIN_with_sequence_feature.py} (91%) rename demo/{example6_DeepFM_with_custom_preprocess.py => example7_DeepFM_with_custom_preprocess.py} (87%) diff --git a/README.md b/README.md index 7fa7733..f5f414b 100644 --- a/README.md +++ b/README.md @@ -103,8 +103,8 @@ We have benchmarked FuxiCTR models on a set of open datasets as follows: FuxiCTR has the following dependencies: + python 3.9+ -+ pytorch 1.10+ (required only for Torch models) -+ tensorflow 2.1+ (required only for TF models) ++ pytorch 1.10+ (required only for torch models) ++ tensorflow 2.1+ (required only for tensorflow models) Please install other required packages via `pip install -r requirements.txt`. @@ -116,8 +116,8 @@ Please install other required packages via `pip install -r requirements.txt`. ``` cd demo - python example1_build_dataset_to_h5.py - python example2_DeepFM_with_h5_input.py + python example1_build_dataset_to_parquet.py + python example2_DeepFM_with_parquet_input.py ``` 2. Run a model on tiny data @@ -155,7 +155,7 @@ Please install other required packages via `pip install -r requirements.txt`. If you find our code or benchmarks helpful in your research, please cite the following papers. -+ Jieming Zhu, Jinyang Liu, Shuai Yang, Qi Zhang, Xiuqiang He. [BARS-CTR: Open Benchmarking for Click-Through Rate Prediction](https://arxiv.org/abs/2009.05794). *The 30th ACM International Conference on Information and Knowledge Management (CIKM)*, 2021. [[Bibtex](https://dblp.org/rec/conf/cikm/ZhuLYZH21.html?view=bibtex)] ++ Jieming Zhu, Jinyang Liu, Shuai Yang, Qi Zhang, Xiuqiang He. [Open Benchmarking for Click-Through Rate Prediction](https://arxiv.org/abs/2009.05794). *The 30th ACM International Conference on Information and Knowledge Management (CIKM)*, 2021. [[Bibtex](https://dblp.org/rec/conf/cikm/ZhuLYZH21.html?view=bibtex)] + Jieming Zhu, Quanyu Dai, Liangcai Su, Rong Ma, Jinyang Liu, Guohao Cai, Xi Xiao, Rui Zhang. [BARS: Towards Open Benchmarking for Recommender Systems](https://arxiv.org/abs/2205.09626). *The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)*, 2022. [[Bibtex](https://dblp.org/rec/conf/sigir/ZhuDSMLCXZ22.html?view=bibtex)] ## Discussion diff --git a/data/README.md b/data/README.md index 6b64500..ff31374 100644 --- a/data/README.md +++ b/data/README.md @@ -1,4 +1,3 @@ ## Datasets -A list of benchmark datasets for CTR prediction are available at https://openbenchmark.github.io/BARS/datasets/README.html# - +A list of benchmark datasets for CTR prediction https://github.com/reczoo/Datasets diff --git a/data/tiny_parquet/feature_map.json b/data/tiny_parquet/feature_map.json new file mode 100644 index 0000000..38f53a1 --- /dev/null +++ b/data/tiny_parquet/feature_map.json @@ -0,0 +1,123 @@ +{ + "dataset_id": "tiny_parquet", + "num_fields": 14, + "total_features": 485, + "input_length": 14, + "labels": [ + "clk" + ], + "features": [ + { + "userid": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 26 + } + }, + { + "adgroup_id": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 96 + } + }, + { + "pid": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 4 + } + }, + { + "cate_id": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 49 + } + }, + { + "campaign_id": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 99 + } + }, + { + "customer": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 98 + } + }, + { + "brand": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 67 + } + }, + { + "cms_segid": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 11 + } + }, + { + "cms_group_id": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 11 + } + }, + { + "final_gender_code": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 4 + } + }, + { + "age_level": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 7 + } + }, + { + "pvalue_level": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 4 + } + }, + { + "shopping_level": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 5 + } + }, + { + "occupation": { + "source": "", + "type": "categorical", + "padding_idx": 0, + "vocab_size": 4 + } + } + ] +} \ No newline at end of file diff --git a/data/tiny_parquet/test.parquet b/data/tiny_parquet/test.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d893734bd626b54c600a5af6af39f6e9eae6b86d GIT binary patch literal 13283 zcmche4Qw0ddB^WaN}^<0mTVtQS+Y&5^3|0Ui=-sm^yFEjBz`E8EYUK*Y)9gkB+4Yk zh@?IobWj2}c7isr-6U<&E_1y!NrS@Zw#37VB1i`h$TlZPf^1pRU|T(4?V4g!Hmu9G zpxE=jJARyw%34^akiV#V-}ml$-{<)~&%JkaMy+lWr{N4->kxe~TN$RlhmrlP9_JXY zjXT)Ry}@$zTqRf8tk;SoR*&cYVpep_{t~S;jvH0e?>AVMt!LD1gi)(^G8HVt|2x~v z)iie3GnI{cbvS;GQ|aDdb$3|ZFIe8mYkwOfjqR8_r()<=J$;Z-4QJ#I4$*9}B)x`Z zs@WdQ13wsm9~ym#J1SdOZirsXienh8OW8`DO*7KER5oS{$FR(HW|UzJTE;-<@HZ-& zIgPZsnz?W|n+j%Pb4hWLzh-qSD*jV7e?v{nR9<}@n8(y-l`!df?TZS1w}jIZ`K^SW^>Q=ukX{8y_B%dJb#%%xJX zT^Tnt5>$tm5e`1*}xiGdkO6CB@L~U71h|Um!o@LYx z4CPyIf3&mLkUhN6Z^#<#7alcjTVk}AADw(yt8dVM{m6rlZ6kmEu(9B;+HfL0F+D$* zO-;liqN{GIbr;n9k9P1kb>u4L6u^eEwKN+gTt$UY!>GtLa1&exSE)&GRfXh+d*CXj zo;A>2fN}Olyl58Znu!FRP zWF+k-ainh2X3{27BWXYB0BINLAjv@5Luw`Ul6I0BNKK@DBnzpQw1e~o5<>L5q{AdL z=|0kzNO}^5j@eGyO6nmYv|3UZ=@99QB&vzbUQ#`&ophYkOtO-YDJD`K=_pA@DDV^| z{8Wn{$Qs(}yd_T?o75>!8)YjEi>!vF5?a-?vNGAbUtf2r-*!dYs;jnNzmn=PG`Lz! z-x+Vec&p;;@5GPnI?;0F(-qtGu@-CFzM(IUw|&R`ecyfWw|(-+Uqm_sUwiro`?fyc zf9KJ^-j{i7SI3o~|M%qa$y61Y;eW*!kf{jG2a{4-J-$izY!&}fEC10R!bLenq?NL@ zbXu*{a?e%{aj%QGtW;{w0v8sUg%Bfj2+=rckOX{&NC+`pk5ItPK;#6;L2{80Dui{E z1f(WOz{^eo3J8;%1bhZazzV2%Nyz9n(l7~F0I3NQ!t5jgU4$AK0%70>Oamlf2Xs!7 zPLY5lFzF`&Ki~tTfF`g4il`dEYlH-JfM6d9Y2ha!ZBCPb6w<;+8Y2Ngo`f_3)*ce! zCZrP3Mj9by9v~rAkRtb!fIiX*X?TVN>M0>>vKMs{+6kLsWzNVXXX;odGQADi#pxo z)%>px@;}^9q$+0#QkAWxNxfXlT_;liTqN~UrKT}YDk6mlgAha(1p znIH*K0kMcW2nUfM7STsBL^Kg45QGSWaF7fV5p_flq$0|o24o>4Kp)5ib%;13jOZh4 zKt6~@CV_O&0Fsai$O2I9AR!Y#Am{T9war8a(wpdFMV!$3B&fanu4Q$wGiaXSeaHA({cAhwxwmV}IR5v>6VJ$@|WN9m#(lse_K zDqCsTUDNuLPtL!#XN!J&-D3m))bPys!Dohlwr%HABi}pqG54*%N&eZ{f7?+@X83D8 z1zI-*TO&1Nx&*63SgD3rGZ_o;I68NxCewA@uw;c`Ch85 zU<#Zd4&VpzMQp(<*aBCG6Jic7!5vsatPw*n3Z@Zju!fi+cBrCY4eh#6v;;|TnL zOK^htBgWtn+~@cO>);-|5QjZYQV1KQPpV2cSnd`1&9g)5l*d`wO2fRX9_i*;+0<*O z?GI{x%xavM_gDQu)Ayj}_PgzGebRfSb$C2_)^~aM(#;2df2;H2#r+SzsBxz%_AGtn zy7}sNKlrwH<$>6?mk#{o_ul;3PFCO6cIP}@ng83qLTIbP!AvwC*4MPUf3D!KTlv*v z6jtTD5LRVt>9ERPrLfY%RnjMU$H;Zb_T#xOxo?!MbOrqx$u;b@Z`bJmk*zkU57aYN zKfR;npKuh`rcUKHjz4F0t1SQb1N?IXv@Yc}u`Xq6>27wAj5|rl2jl^2A#wtB@C?aL>LDS&4wH}%xF4XNp?;yhdP&He zE)sZ09w6_KH>h{W1D1sQ333nlT0=t4Am>I%sMo09MiO!_KsrQfCk>NKB;+^hEb<$* z9l4Agzn?TfLhe04LjE16eA_FLbh<#4agr5$^OiUEwmdW1_VgzH z4?jp9eQwMC)}2=dU)a|6-M}9OZtQ61rCmVjy1M%XVHdFbmD9=hgd(N2>WpPVjFziBsh~!Kt#fG^ZvdF5LRFeRDc?gK!^$Fp4_A9=0Yf*WVvVMrB7cn6v|c_Hn{4N^1GtRaLLqb zurga!T9z^Q(i}!_TPpWH`Cg!sdpdCX<-Oy+*^6Jg@KM@0cqI1fjSv6#%$MJ`d~CQ- z_g5dcJpaCn9%}hN_7)sonN3IMb4B5m3f+?${+p-x=T4Hdl{0~}m8~V6-S}C3H>0mi zhi9URAh(<0YWAswm$~!;mi{Vw@hLW}c{M0D%3hv{O@)_8;<55wi*#!7I|b>~{Ij!o zEIt^Cf9H3f;$iXaU2MtS6~rUdQSq-LV8sT>p$xsUZftJ+Y|FiUn_Z^2$z^yZvz1~jp<=G9C;U1{_Wq|GhyjJ(?UV1;% z*siZk(R;XHTK`tvLoG4-YG8@J8i;2T$#lz6!|Ai_h8FQ~LgS0xMw2`7G8;!uqp4{ zP_x48no=h79UEwlcK%9JQsV}iDLe-&t;zc~(5Uz&SY zc#O!OU&b9RwH%zEpIe0Sh3I^m{zVL0P3D8fgQga`A+<=Ri#x?WMqg`nyH9wJiVvIm z+q*jOvt%8zTCHOw{M>J~c3ExHR=d?YZS9*D+H5vi{~{U(^8Kzcr+7@ig{uhtE~|CX zO6Sq1a1r_xPNLs_l5NrN?LO!C4O3$moo_qrO2o4Pho{SD9$5%EJ(*B)ICdgt3;9ki zPP-CPTilWgCEW3-!*kv>J2&k$8Pl%5!P$^`$rSQ-i*23hSSVr71WvjR34M-qY^cu~ z3nb$4ki$JysxR!YlWqHXU#Gh}>^MbsF@9;m?;A6Z^toUo8}XVJqaKSf>a)dR+m#5U zLuR{iWOhure#q-F`n~R%h+_crrtwSZDVlq9c`)nunxu0qv;I!cqTlCEh0I;+&1=DL z9x>6Jj6ts@>x$9(leQUJ_e|JSY(Jij^!trMf1=wI^4YqJ=O6TT|8{drj?9HRZHshX z(q~?ZM-ra2c>RWv$C72J4{Q|Kb_QmMkNE#P$5P|U;m3qM8Sq%r z0bd{<64^-vy`FTyZYknGF-FRVk@b92g1_SUo8l3No$|>&12#iRPdenJ+;GZrf#yMe zv|Ew^%DsSZaLRACoTE5^?-FwiP<-uapE(jo&V}O^$}@MoY&<4LGQ@i_oUoX}c1tR- zt}jcDD~CU4L%xxzu-TIgdc_)0vc{|#Q;r<*CkLrstUISzI6?JJ^k-T86!DE1LuOMv z99xeys&}341*)OTW#nsyYUWJP8_D|2@pJ3dI=|VTp_rut=I*InoXZkJy0=gr`8@9F zUgIh2oWhRv{gm%3PmMhn=p0;#_-y(0m&iT3mqeCBbid8Vw;Xdzdd^dAT|aK}{(F#r zrU>>s{?5TT-47Nh=MD+?0Y@BpzZ@0!MT=3deVJ-&r`Acq86J0Khx@vg zTuEVnRhVPKvm7?Z7ecsZpKEDo)|wqYIcT8`-6)ARlC+;2S?sf7=SbVW0JeX9IPOnl zZTA8Fw`me zoBs6X#$)?RKc{lX()rk^7N0LzrTbwhEW1Dpb$HFEa`}^FF;Be`_Lv4N5HRpG&IbR5M z(fNb8kl$>gJ@`mGK=l~MnbE^y{3Ud75{-uj$2GtJ4A-(%uJ*1l8n`ScC1nU5G%VXW}$1b&y;#}m%y>C__rhsUL3iJv|p zD~kA;qbu2D&n(nHp;;sftdw$sX_yl;hn!5SrG7tUjgUO30>VWx#=3-|*0uf1pdRMsG+3=O4^QGxvp4sr!V7WP|h&Y!Dugjr1)e{mZ}K{6FRob}`H- I{qH^h2RTN+RsaA1 literal 0 HcmV?d00001 diff --git a/data/tiny_parquet/train.parquet b/data/tiny_parquet/train.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bd1453c5021d71e109ff4d798a54570d822a1bf3 GIT binary patch literal 13279 zcmche4Qw0ddB^WaN}^<0mTVtGMY2t+_SKa?L{gG%X7Vgj5Yh7UNnqgHoG|RGJ z*z>cVT3Du#zo>iP_wIS$=lMO)y?1m*tR54m;SAiKA^KppGE8F+Bl}rB&M{m& zcjy53D$6x;)m(L}UMr4RJ)ZxIS=KT8%CyorZbVJLUu9Xgkx{czMykROy55qM?m?-2O)QhF`sd9TQqu56{H%P@aR^~e?7KEIYxT{CQFs#J9h z!@sk=xX`AFcq)`QKN(9!W3%TYGtrnx`bCxQQ5FByZTwqX?>g|;SZ->F+b%?Yq$Kh+ zV&s`R`m7~^glZxozbdH`qrxq5%d89qT8Ho>_lL5T1|3@!u1K|8)6!^Gv%8t5MwU@G zF_dq;{jttoL+;32zaeL^UwqWGZGq9QJUae}R^O!m!qJBw+eZHSZgbIJ^^s)e{N(IR zE`2^86xJ$<0>kI8b(E~ft%nmxJpfetEwb7+yhrJ zjjVz00+chTOR6?wr-m_lsJeA}G#;ax(M>YT%%3q#_44?{_!k?iYbVK7D?5v>s)egE zu}Qg~o@nJi*K+rM(sD0ohPaEY0M&{{(@q7H5$eewkZZP)j*{9)$4D%xlVl^og&m|V zBqM1ji6eEBHj_4yno0Xe`$;=Uhe!s}F47)SFX?_#6RCx?mt-N;lkOvZii8mTA?XOo zOgc#V3`tL-&@tOdTS+}6gjP%HA{{1unnX2`*-dIB9UvViwUVqPWQvK@Kze|rBNTXw z5`JpM4`dB(bzYaJjZNy5r;W0ehDBE+QVFeTSzaFR-KTH3+;6+8-J`3uU%#5}F*Lc_ zOkW>6aOqan7v4x5-EpGr>W9m=>!WSf_Ps-&9&7))=Ue`RKW=~jPks>X41WIUZ|~ju zZ2z4{|9)@wu^k;(fA!zv$H&t(WQPBMFCtSFo(-j>w7RlM_e>4{{2u<@U4)BribyME zYx%TVuIHYq9^zgRaapd`oCPi{G7BL_=n$eY(jW==43Q9GxE`T^n}Ns)l7r+TAyf$K z2nk4ylYp0<1QZY^4+;1Tkbo6X@sW_x?WAE6umDo$NeHu(1auK&1pI&xkOG>(3Mis#053NQ=m5b!64D|-LfV`r0V$+~pEODWf;m&wi~W3%E7;MWbh zE4BOwhxqUABT|*K1gXl_@}#cRbJvN~zZFTnT&-y?kcvnl!XN~Z1v!Wuhy!_`7Sw-tgGKFPgqIcIZpPzub2JQ|@n`dXM|^+o``f`O#V1pR9m_`zx+iM+H!b|*odm3Mq5xLeS{~TdD((sO5O+?v5r3%Ckng3+ z3Z}pb;sAaSU&I!?f-P``I3ecX65N3$#2PUKqhK1b25X2JVuvaU))0U2j+h~qd5*vz zxCAGNKVl3X!F`@zunz9Q3vt-fB89L?`lPCKgXO*>zj<~@o$@#v z_5C5u_gRf|Wnay=HGL0hZohfp_4j+P?in77o%OE_U%vU!pKo5QuFMjeB-}1Tmw&(Z%^p9Wr<^8O_z5UJ-U77!fzG7%=BB5-o5Z3Q$b^lVuU$^qB z$0)4Ic_FOI*79MMy-H!Fg{!1b@{W<~lI_RyU2@+jTj>h=Gm>lAZC|O=|BS6QsP{KA zH9x*7EE6*(UA^)z_EB`np=BsRyt7 zu@e9*l=A|QsFkQEED6l{NvLh8BYh;W4VFM3e8_(t9ACxLmeX(0`gP_Isq zz&DsjO+=joyT}F9WAF}sQHQ{}lZ2c=E}$MFFHj55knE%$5_0Sa2|0m#0%{s+7;3DK zgxu*OfqCQtau2zKnulD#{RFv(e61rPXOMGl66!VTw~>V03z80#4v>aPCKB=+bry9U zd5rvikTgI--kl>M_l{Gp?UpDyU8G65Q9UPh%6U?@(y(=E>eSJlD_Ywp>;Q%if8*`M zku&a>e(~EsIzIlJ#x*tdgVvMNZ!i5|=U=^Z@Figf@akw0&bm-k{t)=X2Hi(m{v$X4 zx|0}H&J&C(Tgx-LtmfWfhbYDH*6$ToqbihCV1V)ltbhpI4!nR8umfJe1ULZ`Ac~Lz zVITs;fC9oxDEH(aOE(uo@g&PVBQJaUvY}A6(y+l*mz3L8Er!dcUW1j{s?xHIxtHcJ za@$h9`^mS0)!fs;(=Y5E^G{#;%*A&z#=)cU7jL}tKW9GsBg=b+iw!?~ukG0%yXb+I zf5BIDcy%rlo6Q%5?^NlY)bJ~(_@_^jvz0S}vz4u7o!$I#V>hF(&P1kS$q=`b;p+CP zgcrH=@|FH7dFd%Ota&XcHp*U{iA}{9NaC^bJ&SZ|>H7rf)WUPKcq~2?iGLS%p5kHY ztzB%%-x0(k)KKxS5@5v!$)OCru5NB^{&>s1eUn|GH^~)zgIlpTr&t znHN&`{vAd|9zQGf)KG~n?7iMf@|D>QRNx+{dpAh;0A8#6pqJjwG#}7cr|BJBD5HP9 z;o-J8eJ!v+UkfC1$yBE80mJFD2MlfE;d!Aiwx(4$e7G$YO2%k-8+{|%MrTaKV~Obb zHI1T~^V!9;Fmxg@6UrX$!thisnO-c{k&IW?t_pLD; zPn9)WX#OOl@-GRMhL!yrXuS06I;H8gy&Gt_@a34&s;p}RO)9>oQkpJ1WCN#Ax>HkH zU)#IEhK27@%3CdTZJ?$t(Ep{q`~6rurfB~of~RacvVx*WT9gN z&C$+ZX-aC`Kr_YXV5K#A-v$~Lp9Pf$i|u!5Ot{O<#zNVdS-RCc+?I$F`Lm0-qoo%^ zv$HevFg_QX&CtJwL95Ap$au)qMmMB3$#iL_*vIJWtsc(_-vi>qrvCPh4*V=whpblX zC<#9gTCH7H+oaWQwN6_5CWSVeP1Zk;#(_e=Yt$(o({JG_Lchyuoww3?^eJ3~K82I$ zcYtJ@5BRz-1pLF)*hS~tj<}MET+rd|@|)drVW&46P7TLT#BE{!$@xiFQffKmL6n-@%BU$@xSnTdy!_H6K^>#)%0$i#>GtnpwnkqA3H6Xp6M z4m;VlFZnw?-4VwrvWxKxa{>RT+1=-Yja<}cnvZ!c#+cuhfNfVYmNLg zvq8USisE1lAa)K~lVia$)CV?7Y&(Nf#7E-)onyIimGEQIo(g&`nV>(I2#f3_Lq2aN zXt$Jbpco_NgL^&Sl;N*5{-#9KVW)iZOo7dC%9{y0DL0(5T%dW7AMKV@ka92RADjr7 zEf**b;JeHmgA`wT#&3=$kaLlQh4Rdks2Gn)cb0fhMUoa%#BNCk*Y#!Dah34rblC5n zh?u>pkWZ`uWoyiuF_p-XKx&Zc#kzBfN0L`|m;inWEV51Ud&3bU&D* zoI5Ps2OJ6H{bEeq7tP0f_C>0#onpQP6Ooi#*pp!IGd$+X4fk~|xKhIYsyN4_cQIm4 z%!P5yKG(v~v^6(;a?nB>x)Bm>BxyhAp6|0_=SbVWAhv&fIPTA2Z3nQdOV1aq(tWdpB{I5X>F79$fi}UYYfL&G=o2O-whDh@kQHmD{Vdf6Yc6k3vY*Kx zuQz|J&Ed}HvGs6%ymWq7{&>ClOXru?F{^E9zij@tG3huze_?-KGJiPHHeIOEg8OZw z9djKW)Ime9ajvNu=FZp9Oe=PY!c&I4hp&utX*6at+M*PX=(|=*jd<1(X^YP#;swuQ*TbKx% zy?8cR#Is4zAvk0Pk4_jP`o}sc9q0X1eto`ZVc&{V{Udai5Dt-7Z~pj`*5|aDX-=lX z9upU`_MMW?r*C-8e8i{jH*a*ZNNgZNR)Ri%!aW@%;RyC#}aA vy&)Z%eJB^p9*m^Z2Ze`ZgY*z=5FU?>^erR(tH0mkxe~TN$RlhmrlP9_JXY zjXT)Ry}@$zTqRf8tk;SoR*&cYVpep_{t~S;jvH0e?>AVMt!LD1gi)(^G8HVt|2x~v z)iie3GnI{cbvS;GQ|aDdb$3|ZFIe8mYkwOfjqR8_r()<=J$;Z-4QJ#I4$*9}B)x`Z zs@WdQ13wsm9~ym#J1SdOZirsXienh8OW8`DO*7KER5oS{$FR(HW|UzJTE;-<@HZ-& zIgPZsnz?W|n+j%Pb4hWLzh-qSD*jV7e?v{nR9<}@n8(y-l`!df?TZS1w}jIZ`K^SW^>Q=ukX{8y_B%dJb#%%xJX zT^Tnt5>$tm5e`1*}xiGdkO6CB@L~U71h|Um!o@LYx z4CPyIf3&mLkUhN6Z^#<#7alcjTVk}AADw(yt8dVM{m6rlZ6kmEu(9B;+HfL0F+D$* zO-;liqN{GIbr;n9k9P1kb>u4L6u^eEwKN+gTt$UY!>GtLa1&exSE)&GRfXh+d*CXj zo;A>2fN}Olyl58Znu!FRP zWF+k-ainh2X3{27BWXYB0BINLAjv@5Luw`Ul6I0BNKK@DBnzpQw1e~o5<>L5q{AdL z=|0kzNO}^5j@eGyO6nmYv|3UZ=@99QB&vzbUQ#`&ophYkOtO-YDJD`K=_pA@DDV^| z{8Wn{$Qs(}yd_T?o75>!8)YjEi>!vF5?a-?vNGAbUtf2r-*!dYs;jnNzmn=PG`Lz! z-x+Vec&p;;@5GPnI?;0F(-qtGu@-CFzM(IUw|&R`ecyfWw|(-+Uqm_sUwiro`?fyc zf9KJ^-j{i7SI3o~|M%qa$y61Y;eW*!kf{jG2a{4-J-$izY!&}fEC10R!bLenq?NL@ zbXu*{a?e%{aj%QGtW;{w0v8sUg%Bfj2+=rckOX{&NC+`pk5ItPK;#6;L2{80Dui{E z1f(WOz{^eo3J8;%1bhZazzV2%Nyz9n(l7~F0I3NQ!t5jgU4$AK0%70>Oamlf2Xs!7 zPLY5lFzF`&Ki~tTfF`g4il`dEYlH-JfM6d9Y2ha!ZBCPb6w<;+8Y2Ngo`f_3)*ce! zCZrP3Mj9by9v~rAkRtb!fIiX*X?TVN>M0>>vKMs{+6kLsWzNVXXX;odGQADi#pxo z)%>px@;}^9q$+0#QkAWxNxfXlT_;liTqN~UrKT}YDk6mlgAha(1p znIH*K0kMcW2nUfM7STsBL^Kg45QGSWaF7fV5p_flq$0|o24o>4Kp)5ib%;13jOZh4 zKt6~@CV_O&0Fsai$O2I9AR!Y#Am{T9war8a(wpdFMV!$3B&fanu4Q$wGiaXSeaHA({cAhwxwmV}IR5v>6VJ$@|WN9m#(lse_K zDqCsTUDNuLPtL!#XN!J&-D3m))bPys!Dohlwr%HABi}pqG54*%N&eZ{f7?+@X83D8 z1zI-*TO&1Nx&*63SgD3rGZ_o;I68NxCewA@uw;c`Ch85 zU<#Zd4&VpzMQp(<*aBCG6Jic7!5vsatPw*n3Z@Zju!fi+cBrCY4eh#6v;;|TnL zOK^htBgWtn+~@cO>);-|5QjZYQV1KQPpV2cSnd`1&9g)5l*d`wO2fRX9_i*;+0<*O z?GI{x%xavM_gDQu)Ayj}_PgzGebRfSb$C2_)^~aM(#;2df2;H2#r+SzsBxz%_AGtn zy7}sNKlrwH<$>6?mk#{o_ul;3PFCO6cIP}@ng83qLTIbP!AvwC*4MPUf3D!KTlv*v z6jtTD5LRVt>9ERPrLfY%RnjMU$H;Zb_T#xOxo?!MbOrqx$u;b@Z`bJmk*zkU57aYN zKfR;npKuh`rcUKHjz4F0t1SQb1N?IXv@Yc}u`Xq6>27wAj5|rl2jl^2A#wtB@C?aL>LDS&4wH}%xF4XNp?;yhdP&He zE)sZ09w6_KH>h{W1D1sQ333nlT0=t4Am>I%sMo09MiO!_KsrQfCk>NKB;+^hEb<$* z9l4Agzn?TfLhe04LjE16eA_FLbh<#4agr5$^OiUEwmdW1_VgzH z4?jp9eQwMC)}2=dU)a|6-M}9OZtQ61rCmVjy1M%XVHdFbmD9=hgd(N2>WpPVjFziBsh~!Kt#fG^ZvdF5LRFeRDc?gK!^$Fp4_A9=0Yf*WVvVMrB7cn6v|c_Hn{4N^1GtRaLLqb zurga!T9z^Q(i}!_TPpWH`Cg!sdpdCX<-Oy+*^6Jg@KM@0cqI1fjSv6#%$MJ`d~CQ- z_g5dcJpaCn9%}hN_7)sonN3IMb4B5m3f+?${+p-x=T4Hdl{0~}m8~V6-S}C3H>0mi zhi9URAh(<0YWAswm$~!;mi{Vw@hLW}c{M0D%3hv{O@)_8;<55wi*#!7I|b>~{Ij!o zEIt^Cf9H3f;$iXaU2MtS6~rUdQSq-LV8sT>p$xsUZftJ+Y|FiUn_Z^2$z^yZvz1~jp<=G9C;U1{_Wq|GhyjJ(?UV1;% z*siZk(R;XHTK`tvLoG4-YG8@J8i;2T$#lz6!|Ai_h8FQ~LgS0xMw2`7G8;!uqp4{ zP_x48no=h79UEwlcK%9JQsV}iDLe-&t;zc~(5Uz&SY zc#O!OU&b9RwH%zEpIe0Sh3I^m{zVL0P3D8fgQga`A+<=Ri#x?WMqg`nyH9wJiVvIm z+q*jOvt%8zTCHOw{M>J~c3ExHR=d?YZS9*D+H5vi{~{U(^8Kzcr+7@ig{uhtE~|CX zO6Sq1a1r_xPNLs_l5NrN?LO!C4O3$moo_qrO2o4Pho{SD9$5%EJ(*B)ICdgt3;9ki zPP-CPTilWgCEW3-!*kv>J2&k$8Pl%5!P$^`$rSQ-i*23hSSVr71WvjR34M-qY^cu~ z3nb$4ki$JysxR!YlWqHXU#Gh}>^MbsF@9;m?;A6Z^toUo8}XVJqaKSf>a)dR+m#5U zLuR{iWOhure#q-F`n~R%h+_crrtwSZDVlq9c`)nunxu0qv;I!cqTlCEh0I;+&1=DL z9x>6Jj6ts@>x$9(leQUJ_e|JSY(Jij^!trMf1=wI^4YqJ=O6TT|8{drj?9HRZHshX z(q~?ZM-ra2c>RWv$C72J4{Q|Kb_QmMkNE#P$5P|U;m3qM8Sq%r z0bd{<64^-vy`FTyZYknGF-FRVk@b92g1_SUo8l3No$|>&12#iRPdenJ+;GZrf#yMe zv|Ew^%DsSZaLRACoTE5^?-FwiP<-uapE(jo&V}O^$}@MoY&<4LGQ@i_oUoX}c1tR- zt}jcDD~CU4L%xxzu-TIgdc_)0vc{|#Q;r<*CkLrstUISzI6?JJ^k-T86!DE1LuOMv z99xeys&}341*)OTW#nsyYUWJP8_D|2@pJ3dI=|VTp_rut=I*InoXZkJy0=gr`8@9F zUgIh2oWhRv{gm%3PmMhn=p0;#_-y(0m&iT3mqeCBbid8Vw;Xdzdd^dAT|aK}{(F#r zrU>>s{?5TT-47Nh=MD+?0Y@BpzZ@0!MT=3deVJ-&r`Acq86J0Khx@vg zTuEVnRhVPKvm7?Z7ecsZpKEDo)|wqYIcT8`-6)ARlC+;2S?sf7=SbVW0JeX9IPOnl zZTA8Fw`me zoBs6X#$)?RKc{lX()rk^7N0LzrTbwhEW1Dpb$HFEa`}^FF;Be`_Lv4N5HRpG&IbR5M z(fNb8kl$>gJ@`mGK=l~MnbE^y{3Ud75{-uj$2GtJ4A-(%uJ*1l8n`ScC1nU5G%VXW}$1b&y;#}m%y>C__rhsUL3iJv|p zD~kA;qbu2D&n(nHp;;sftdw$sX_yl;hn!5SrG7tUjgUO30>VWx#=3-|*0uf1pdRMsG+3=O4^QGxvp4sr!V7WP|h&Y!Dugjr1)e{mZ}K{6FRob}`H- I{qH^h2RTN+RsaA1 literal 0 HcmV?d00001 diff --git a/demo/config/example1_config/dataset_config.yaml b/demo/config/example1_config/dataset_config.yaml index 9ab2e8e..c5ffd80 100644 --- a/demo/config/example1_config/dataset_config.yaml +++ b/demo/config/example1_config/dataset_config.yaml @@ -11,4 +11,3 @@ tiny_example1: "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], active: True, dtype: str, type: categorical}] label_col: {name: clk, dtype: float} - diff --git a/demo/config/example2_config/dataset_config.yaml b/demo/config/example2_config/dataset_config.yaml index d856d2a..f0cbf35 100644 --- a/demo/config/example2_config/dataset_config.yaml +++ b/demo/config/example2_config/dataset_config.yaml @@ -1,7 +1,7 @@ ### Tiny data for demo only -tiny_npz: +tiny_parquet: data_root: ../data/ - data_format: npz - train_data: ../data/tiny_npz/train.npz - valid_data: ../data/tiny_npz/valid.npz - test_data: ../data/tiny_npz/test.npz + data_format: parquet + train_data: ../data/tiny_parquet/train.parquet + valid_data: ../data/tiny_parquet/valid.parquet + test_data: ../data/tiny_parquet/test.parquet diff --git a/demo/config/example2_config/model_config.yaml b/demo/config/example2_config/model_config.yaml index e7b4130..5bb557f 100644 --- a/demo/config/example2_config/model_config.yaml +++ b/demo/config/example2_config/model_config.yaml @@ -12,9 +12,9 @@ Base: feature_specs: null feature_config: null -DeepFM_test_npz: +DeepFM_test_parquet: model: DeepFM - dataset_id: tiny_npz + dataset_id: tiny_parquet loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification diff --git a/demo/config/example3_config/dataset_config.yaml b/demo/config/example3_config/dataset_config.yaml index bc6ecb8..d856d2a 100644 --- a/demo/config/example3_config/dataset_config.yaml +++ b/demo/config/example3_config/dataset_config.yaml @@ -1,14 +1,7 @@ ### Tiny data for demo only -tiny_example3: +tiny_npz: data_root: ../data/ - data_format: csv - train_data: ../data/tiny_csv/train_sample.csv - valid_data: ../data/tiny_csv/valid_sample.csv - test_data: ../data/tiny_csv/test_sample.csv - min_categr_count: 1 - feature_cols: - [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", - "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], - active: True, dtype: str, type: categorical}] - label_col: {name: clk, dtype: float} - + data_format: npz + train_data: ../data/tiny_npz/train.npz + valid_data: ../data/tiny_npz/valid.npz + test_data: ../data/tiny_npz/test.npz diff --git a/demo/config/example3_config/model_config.yaml b/demo/config/example3_config/model_config.yaml index 17ca56e..e7b4130 100644 --- a/demo/config/example3_config/model_config.yaml +++ b/demo/config/example3_config/model_config.yaml @@ -12,9 +12,9 @@ Base: feature_specs: null feature_config: null -DeepFM_test_csv: +DeepFM_test_npz: model: DeepFM - dataset_id: tiny_example3 + dataset_id: tiny_npz loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification @@ -30,6 +30,6 @@ DeepFM_test_csv: embedding_dim: 4 epochs: 1 shuffle: True - seed: 2019 + seed: 2023 monitor: 'AUC' monitor_mode: 'max' diff --git a/demo/config/example4_config/dataset_config.yaml b/demo/config/example4_config/dataset_config.yaml index 28dc2e8..f0a7a36 100644 --- a/demo/config/example4_config/dataset_config.yaml +++ b/demo/config/example4_config/dataset_config.yaml @@ -7,9 +7,7 @@ tiny_example4: test_data: ../data/tiny_csv/test_sample.csv min_categr_count: 1 feature_cols: - [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz", - embedding_dim: 8, freeze_emb: True}, - {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", + [{name: ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], active: True, dtype: str, type: categorical}] label_col: {name: clk, dtype: float} diff --git a/demo/config/example4_config/model_config.yaml b/demo/config/example4_config/model_config.yaml index 49609ed..6ec1615 100644 --- a/demo/config/example4_config/model_config.yaml +++ b/demo/config/example4_config/model_config.yaml @@ -12,7 +12,7 @@ Base: feature_specs: null feature_config: null -DeepFM_test_pretrain: +DeepFM_test_csv: model: DeepFM dataset_id: tiny_example4 loss: 'binary_crossentropy' @@ -27,10 +27,9 @@ DeepFM_test_pretrain: batch_norm: False net_dropout: 0 batch_size: 128 - embedding_dim: 8 + embedding_dim: 4 epochs: 1 shuffle: True - seed: 2023 + seed: 2019 monitor: 'AUC' monitor_mode: 'max' - diff --git a/demo/config/example5_config/dataset_config.yaml b/demo/config/example5_config/dataset_config.yaml index 38e48bf..ecfc3d4 100644 --- a/demo/config/example5_config/dataset_config.yaml +++ b/demo/config/example5_config/dataset_config.yaml @@ -1,7 +1,15 @@ ### Tiny data for demo only -tiny_seq: +tiny_example5: data_root: ../data/ - data_format: npz - train_data: ../data/tiny_seq/train.npz - valid_data: ../data/tiny_seq/valid.npz - test_data: ../data/tiny_seq/test.npz + data_format: csv + train_data: ../data/tiny_csv/train_sample.csv + valid_data: ../data/tiny_csv/valid_sample.csv + test_data: ../data/tiny_csv/test_sample.csv + min_categr_count: 1 + feature_cols: + [{name: "userid", active: True, dtype: str, type: categorical, pretrained_emb: "../data/tiny_csv/userid_emb_dim8.npz", + embedding_dim: 8, freeze_emb: True}, + {name: ["adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid", + "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"], + active: True, dtype: str, type: categorical}] + label_col: {name: clk, dtype: float} diff --git a/demo/config/example5_config/model_config.yaml b/demo/config/example5_config/model_config.yaml index a42e947..673c5d9 100644 --- a/demo/config/example5_config/model_config.yaml +++ b/demo/config/example5_config/model_config.yaml @@ -12,32 +12,25 @@ Base: feature_specs: null feature_config: null -DIN_test: - model: DIN - dataset_id: tiny_seq +DeepFM_test_pretrain: + model: DeepFM + dataset_id: tiny_example5 loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification optimizer: adam - learning_rate: 1.0e-3 - embedding_regularizer: 0 + hidden_units: [64, 32] + hidden_activations: relu net_regularizer: 0 - batch_size: 128 - embedding_dim: 4 - dnn_hidden_units: [64, 32] - dnn_activations: relu - attention_hidden_units: [64] - attention_hidden_activations: "Dice" - attention_output_activation: null - attention_dropout: 0 - din_target_field: adgroup_id - din_sequence_field: click_sequence - feature_specs: [{name: click_sequence, feature_encoder: null}] - net_dropout: 0 + embedding_regularizer: 1.e-8 + learning_rate: 1.e-3 batch_norm: False + net_dropout: 0 + batch_size: 128 + embedding_dim: 8 epochs: 1 shuffle: True - seed: 2019 + seed: 2023 monitor: 'AUC' monitor_mode: 'max' - \ No newline at end of file + diff --git a/demo/config/example6_config/dataset_config.yaml b/demo/config/example6_config/dataset_config.yaml index d31dd7f..38e48bf 100644 --- a/demo/config/example6_config/dataset_config.yaml +++ b/demo/config/example6_config/dataset_config.yaml @@ -1,21 +1,7 @@ ### Tiny data for demo only -tiny_example6: +tiny_seq: data_root: ../data/ - data_format: csv - train_data: ../data/tiny_csv/custom_preprocess_train_sample.csv - valid_data: ../data/tiny_csv/custom_preprocess_valid_sample.csv - test_data: ../data/tiny_csv/custom_preprocess_test_sample.csv - min_categr_count: 1 - feature_cols: - - active: true - dtype: str - name: [msno, song_id, source_system_tab, source_screen_name, source_type, - city, gender, registered_via, language] - type: categorical - - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: genre_ids, - type: sequence} - - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: artist_name, - type: sequence} - - {active: true, dtype: str, name: isrc, preprocess: extract_country_code, type: categorical} - - {active: true, dtype: str, name: bd, preprocess: bucketize_age, type: categorical} - label_col: {dtype: float, name: label} + data_format: npz + train_data: ../data/tiny_seq/train.npz + valid_data: ../data/tiny_seq/valid.npz + test_data: ../data/tiny_seq/test.npz diff --git a/demo/config/example6_config/model_config.yaml b/demo/config/example6_config/model_config.yaml index 10e2b11..a42e947 100644 --- a/demo/config/example6_config/model_config.yaml +++ b/demo/config/example6_config/model_config.yaml @@ -12,24 +12,32 @@ Base: feature_specs: null feature_config: null -DeepFM_test_csv: - model: DeepFM - dataset_id: tiny_example6 +DIN_test: + model: DIN + dataset_id: tiny_seq loss: 'binary_crossentropy' metrics: ['logloss', 'AUC'] task: binary_classification optimizer: adam - hidden_units: [64, 32] - hidden_activations: relu + learning_rate: 1.0e-3 + embedding_regularizer: 0 net_regularizer: 0 - embedding_regularizer: 1.e-8 - learning_rate: 1.e-3 - batch_norm: False - net_dropout: 0 batch_size: 128 embedding_dim: 4 + dnn_hidden_units: [64, 32] + dnn_activations: relu + attention_hidden_units: [64] + attention_hidden_activations: "Dice" + attention_output_activation: null + attention_dropout: 0 + din_target_field: adgroup_id + din_sequence_field: click_sequence + feature_specs: [{name: click_sequence, feature_encoder: null}] + net_dropout: 0 + batch_norm: False epochs: 1 shuffle: True seed: 2019 monitor: 'AUC' monitor_mode: 'max' + \ No newline at end of file diff --git a/demo/config/example7_config/dataset_config.yaml b/demo/config/example7_config/dataset_config.yaml new file mode 100644 index 0000000..044c2a5 --- /dev/null +++ b/demo/config/example7_config/dataset_config.yaml @@ -0,0 +1,21 @@ +### Tiny data for demo only +tiny_example7: + data_root: ../data/ + data_format: csv + train_data: ../data/tiny_csv/custom_preprocess_train_sample.csv + valid_data: ../data/tiny_csv/custom_preprocess_valid_sample.csv + test_data: ../data/tiny_csv/custom_preprocess_test_sample.csv + min_categr_count: 1 + feature_cols: + - active: true + dtype: str + name: [msno, song_id, source_system_tab, source_screen_name, source_type, + city, gender, registered_via, language] + type: categorical + - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: genre_ids, + type: sequence} + - {active: true, dtype: str, encoder: MaskedSumPooling, max_len: 3, name: artist_name, + type: sequence} + - {active: true, dtype: str, name: isrc, preprocess: extract_country_code, type: categorical} + - {active: true, dtype: str, name: bd, preprocess: bucketize_age, type: categorical} + label_col: {dtype: float, name: label} diff --git a/demo/config/example7_config/model_config.yaml b/demo/config/example7_config/model_config.yaml new file mode 100644 index 0000000..ab01fe6 --- /dev/null +++ b/demo/config/example7_config/model_config.yaml @@ -0,0 +1,35 @@ +Base: + model_root: './checkpoints/' + num_workers: 3 + verbose: 1 + early_stop_patience: 2 + pickle_feature_encoder: True + save_best_only: True + eval_steps: null + debug_mode: False + group_id: null + use_features: null + feature_specs: null + feature_config: null + +DeepFM_test_csv: + model: DeepFM + dataset_id: tiny_example7 + loss: 'binary_crossentropy' + metrics: ['logloss', 'AUC'] + task: binary_classification + optimizer: adam + hidden_units: [64, 32] + hidden_activations: relu + net_regularizer: 0 + embedding_regularizer: 1.e-8 + learning_rate: 1.e-3 + batch_norm: False + net_dropout: 0 + batch_size: 128 + embedding_dim: 4 + epochs: 1 + shuffle: True + seed: 2019 + monitor: 'AUC' + monitor_mode: 'max' diff --git a/demo/example1_build_dataset_to_npz.py b/demo/example1_build_dataset_to_parquet.py similarity index 100% rename from demo/example1_build_dataset_to_npz.py rename to demo/example1_build_dataset_to_parquet.py diff --git a/demo/example2_DeepFM_with_parquet_input.py b/demo/example2_DeepFM_with_parquet_input.py new file mode 100644 index 0000000..501fb31 --- /dev/null +++ b/demo/example2_DeepFM_with_parquet_input.py @@ -0,0 +1,55 @@ +import sys +sys.path.append('../') +import os +import logging +from fuxictr import datasets +from datetime import datetime +from fuxictr.utils import load_config, set_logger, print_to_json +from fuxictr.features import FeatureMap +from fuxictr.pytorch.torch_utils import seed_everything +from fuxictr.pytorch.dataloaders import RankDataLoader +from model_zoo import DeepFM + + +if __name__ == '__main__': + # Load params from config files + config_dir = './config/example2_config' + experiment_id = 'DeepFM_test_parquet' # corresponds to input `data/tiny_parquet` + params = load_config(config_dir, experiment_id) + + # set up logger and random seed + set_logger(params) + logging.info("Params: " + print_to_json(params)) + seed_everything(seed=params['seed']) + + # Load feature_map from json + data_dir = os.path.join(params['data_root'], params['dataset_id']) + feature_map_json = os.path.join(data_dir, "feature_map.json") + feature_map = FeatureMap(params['dataset_id'], data_dir) + feature_map.load(feature_map_json, params) + logging.info("Feature specs: " + print_to_json(feature_map.features)) + + # Get train and validation data generators + train_gen, valid_gen = RankDataLoader(feature_map, + stage='train', + train_data=params['train_data'], + valid_data=params['valid_data'], + batch_size=params['batch_size'], + data_format=params["data_format"], + shuffle=params['shuffle']).make_iterator() + + # Model initialization and fitting + model = DeepFM(feature_map, **params) + model.fit(train_gen, validation_data=valid_gen, epochs=params['epochs']) + + logging.info('***** Validation evaluation *****') + model.evaluate(valid_gen) + + logging.info('***** Test evaluation *****') + test_gen = RankDataLoader(feature_map, + stage='test', + test_data=params['test_data'], + batch_size=params['batch_size'], + data_format=params["data_format"], + shuffle=False).make_iterator() + model.evaluate(test_gen) diff --git a/demo/example2_DeepFM_with_npz_input.py b/demo/example3_DeepFM_with_npz_input.py similarity index 91% rename from demo/example2_DeepFM_with_npz_input.py rename to demo/example3_DeepFM_with_npz_input.py index e7d6f14..732447b 100644 --- a/demo/example2_DeepFM_with_npz_input.py +++ b/demo/example3_DeepFM_with_npz_input.py @@ -13,7 +13,7 @@ if __name__ == '__main__': # Load params from config files - config_dir = './config/example2_config' + config_dir = './config/example3_config' experiment_id = 'DeepFM_test_npz' # corresponds to input `data/tiny_npz` params = load_config(config_dir, experiment_id) @@ -35,6 +35,7 @@ train_data=params['train_data'], valid_data=params['valid_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=params['shuffle']).make_iterator() # Model initialization and fitting @@ -49,5 +50,6 @@ stage='test', test_data=params['test_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=False).make_iterator() model.evaluate(test_gen) diff --git a/demo/example3_DeepFM_with_csv_input.py b/demo/example4_DeepFM_with_csv_input.py similarity index 93% rename from demo/example3_DeepFM_with_csv_input.py rename to demo/example4_DeepFM_with_csv_input.py index 26fd71f..acd2d71 100644 --- a/demo/example3_DeepFM_with_csv_input.py +++ b/demo/example4_DeepFM_with_csv_input.py @@ -14,7 +14,7 @@ if __name__ == '__main__': # Load params from config files - config_dir = './config/example3_config' + config_dir = './config/example4_config' experiment_id = 'DeepFM_test_csv' # corresponds to input `data/tiny_npz` params = load_config(config_dir, experiment_id) @@ -48,6 +48,7 @@ train_data=params['train_data'], valid_data=params['valid_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=params['shuffle']).make_iterator() # Model initialization and fitting @@ -62,6 +63,7 @@ stage='test', test_data=params['test_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=False).make_iterator() model.evaluate(test_gen) diff --git a/demo/example4_DeepFM_with_pretrained_emb.py b/demo/example5_DeepFM_with_pretrained_emb.py similarity index 93% rename from demo/example4_DeepFM_with_pretrained_emb.py rename to demo/example5_DeepFM_with_pretrained_emb.py index e8c2926..2555ad6 100644 --- a/demo/example4_DeepFM_with_pretrained_emb.py +++ b/demo/example5_DeepFM_with_pretrained_emb.py @@ -14,7 +14,7 @@ if __name__ == '__main__': # Load params from config files - config_dir = './config/example4_config' + config_dir = './config/example5_config' experiment_id = 'DeepFM_test_pretrain' params = load_config(config_dir, experiment_id) @@ -48,6 +48,7 @@ train_data=params['train_data'], valid_data=params['valid_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=params['shuffle']).make_iterator() # Model initialization and fitting @@ -62,6 +63,7 @@ stage='test', test_data=params['test_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=False).make_iterator() model.evaluate(test_gen) diff --git a/demo/example5_DIN_with_sequence_feature.py b/demo/example6_DIN_with_sequence_feature.py similarity index 91% rename from demo/example5_DIN_with_sequence_feature.py rename to demo/example6_DIN_with_sequence_feature.py index 3a0704f..9d11ab0 100644 --- a/demo/example5_DIN_with_sequence_feature.py +++ b/demo/example6_DIN_with_sequence_feature.py @@ -13,7 +13,7 @@ if __name__ == '__main__': # Load params from config files - config_dir = './config/example5_config' + config_dir = './config/example6_config' experiment_id = 'DIN_test' params = load_config(config_dir, experiment_id) @@ -34,6 +34,7 @@ train_data=params['train_data'], valid_data=params['valid_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=params['shuffle']).make_iterator() # Model initialization and fitting @@ -48,5 +49,6 @@ stage='test', test_data=params['test_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=False).make_iterator() model.evaluate(test_gen) diff --git a/demo/example6_DeepFM_with_custom_preprocess.py b/demo/example7_DeepFM_with_custom_preprocess.py similarity index 87% rename from demo/example6_DeepFM_with_custom_preprocess.py rename to demo/example7_DeepFM_with_custom_preprocess.py index 7eeca32..546d4aa 100644 --- a/demo/example6_DeepFM_with_custom_preprocess.py +++ b/demo/example7_DeepFM_with_custom_preprocess.py @@ -62,14 +62,14 @@ def _bucketize(age): use_custom_processor = True if use_custom_processor: feature_encoder = CustomFeatureProcessor(feature_cols=params["feature_cols"], - label_col=params["label_col"], - dataset_id=params["dataset_id"], - data_root=params["data_root"]) + label_col=params["label_col"], + dataset_id=params["dataset_id"], + data_root=params["data_root"]) else: feature_encoder = FeatureProcessor(feature_cols=params["feature_cols"], - label_col=params["label_col"], - dataset_id=params["dataset_id"], - data_root=params["data_root"]) + label_col=params["label_col"], + dataset_id=params["dataset_id"], + data_root=params["data_root"]) # Build dataset from csv to npz, and remap data paths to npz files params["train_data"], params["valid_data"], params["test_data"] = \ @@ -90,6 +90,7 @@ def _bucketize(age): train_data=params['train_data'], valid_data=params['valid_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=params['shuffle']).make_iterator() # Model initialization and fitting @@ -104,10 +105,6 @@ def _bucketize(age): stage='test', test_data=params['test_data'], batch_size=params['batch_size'], + data_format=params["data_format"], shuffle=False).make_iterator() model.evaluate(test_gen) - - - - - diff --git a/experiment/config/DCN_tiny_npz_tuner_config.yaml b/experiment/config/DCN_tiny_npz_tuner_config.yaml index 9797271..1eb5092 100644 --- a/experiment/config/DCN_tiny_npz_tuner_config.yaml +++ b/experiment/config/DCN_tiny_npz_tuner_config.yaml @@ -16,4 +16,3 @@ tuner_space: group_id: user_id metrics: [[gAUC, AUC, logloss]] monitor: {"gAUC": 1, "AUC": 1} - diff --git a/fuxictr/preprocess/build_dataset.py b/fuxictr/preprocess/build_dataset.py index 25f2dc0..288481f 100644 --- a/fuxictr/preprocess/build_dataset.py +++ b/fuxictr/preprocess/build_dataset.py @@ -72,7 +72,7 @@ def transform(feature_encoder, ddf, filename, block_size=0): pool.close() pool.join() else: - transform_block(feature_encoder, ddf, filename) + transform_block(feature_encoder, ddf, filename + ".parquet") def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=None, @@ -122,7 +122,7 @@ def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=N transform(feature_encoder, test_ddf, 'test', block_size=data_block_size) del test_ddf gc.collect() - logging.info("Transform csv data to npz done.") + logging.info("Transform csv data to parquet done.") train_data, valid_data, test_data = ( os.path.join(feature_encoder.data_dir, "train"), \ diff --git a/fuxictr/preprocess/feature_processor.py b/fuxictr/preprocess/feature_processor.py index 1e8498b..b7b3389 100644 --- a/fuxictr/preprocess/feature_processor.py +++ b/fuxictr/preprocess/feature_processor.py @@ -181,9 +181,11 @@ def fit_numeric_col(self, col, col_series): feature_type = col["type"] feature_source = col.get("source", "") self.feature_map.features[name] = {"source": feature_source, - "type": feature_type} + "type": feature_type} if "feature_encoder" in col: self.feature_map.features[name]["feature_encoder"] = col["feature_encoder"] + if "embedding_dim" in col: + self.feature_map.features[name]["embedding_dim"] = col["embedding_dim"] if "normalizer" in col: normalizer = Normalizer(col["normalizer"]) if self.rebuild_dataset: @@ -196,7 +198,7 @@ def fit_categorical_col(self, col, col_series, min_categr_count=1, num_buckets=1 feature_source = col.get("source", "") min_categr_count = col.get("min_categr_count", min_categr_count) self.feature_map.features[name] = {"source": feature_source, - "type": feature_type} + "type": feature_type} if "feature_encoder" in col: self.feature_map.features[name]["feature_encoder"] = col["feature_encoder"] if "embedding_dim" in col: diff --git a/fuxictr/pytorch/layers/embeddings/feature_embedding.py b/fuxictr/pytorch/layers/embeddings/feature_embedding.py index 3475a89..8d5be85 100644 --- a/fuxictr/pytorch/layers/embeddings/feature_embedding.py +++ b/fuxictr/pytorch/layers/embeddings/feature_embedding.py @@ -84,7 +84,10 @@ def __init__(self, continue if feature_spec["type"] == "numeric": - self.embedding_layers[feature] = nn.Linear(1, feat_dim, bias=False) + if feat_dim > 0: + self.embedding_layers[feature] = nn.Linear(1, feat_dim, bias=False) + else: + self.embedding_layers[feature] = nn.Identity() elif feature_spec["type"] in ["categorical", "sequence"]: if use_pretrain and "pretrained_emb" in feature_spec: pretrain_path = os.path.join(feature_map.data_dir, diff --git a/fuxictr/version.py b/fuxictr/version.py index 1108fcc..50dc9be 100644 --- a/fuxictr/version.py +++ b/fuxictr/version.py @@ -1 +1 @@ -__version__="2.3.0" +__version__="2.3.1" diff --git a/model_zoo/DCN/DCN_torch/config/dataset_config.yaml b/model_zoo/DCN/DCN_torch/config/dataset_config.yaml index 2d773e5..96d58bf 100644 --- a/model_zoo/DCN/DCN_torch/config/dataset_config.yaml +++ b/model_zoo/DCN/DCN_torch/config/dataset_config.yaml @@ -5,3 +5,4 @@ tiny_npz: train_data: ../../../data/tiny_npz/train.npz valid_data: ../../../data/tiny_npz/valid.npz test_data: ../../../data/tiny_npz/test.npz + \ No newline at end of file diff --git a/setup.py b/setup.py index 9f141d1..1f07b7c 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="fuxictr", - version="2.3.0", + version="2.3.1", author="RECZOO", author_email="reczoo@users.noreply.github.com", description="A configurable, tunable, and reproducible library for CTR prediction",