From 4646e255ecca98cdf4b03ca0f9d2ea56634dd3d8 Mon Sep 17 00:00:00 2001 From: xpai Date: Tue, 16 Apr 2024 11:51:52 +0800 Subject: [PATCH] Fix issues #84, #85 --- CHANGELOG.md | 6 +++++- fuxictr/preprocess/tokenizer.py | 10 +++++++--- fuxictr/pytorch/dataloaders/npz_block_dataloader.py | 3 +-- fuxictr/version.py | 2 +- setup.py | 2 +- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d83c72..ee86d9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ [Doing] Add support for saving pb file, exporting embeddings [Doing] Add support of NVTabular data +**FuxiCTR v2.2.1, 2024-04-16** ++ [Fix] Fix issue of evaluation not performed at epoch end when streaming=True ([#85](https://github.com/xue-pai/FuxiCTR/issues/85)) ++ [Fix] Fix issue when loading pretrain_emb in npz format ([#84](https://github.com/xue-pai/FuxiCTR/issues/84)) + **FuxiCTR v2.2.0, 2024-02-17** + [Feature] Add support of npz format for pretrained_emb + [Refactor] Change data format from h5 to npz @@ -16,7 +20,7 @@ + [Feature] Add GDCN model + [Refactor] Rename FINAL model to FinalNet + [Refactor] Update RecZoo URLs -+ [Fix] Fix bug #75 ++ [Fix] Fix bug [#75](https://github.com/xue-pai/FuxiCTR/issues/75) + [Fix] Fix h5 file extenstion issue + [Fix] Fix typo in FinalNet diff --git a/fuxictr/preprocess/tokenizer.py b/fuxictr/preprocess/tokenizer.py index 7b9cb19..90aed37 100644 --- a/fuxictr/preprocess/tokenizer.py +++ b/fuxictr/preprocess/tokenizer.py @@ -127,9 +127,13 @@ def encode_sequence(self, texts): return np.array(sequence_list) def load_pretrained_vocab(self, feature_dtype, pretrain_path, expand_vocab=True): - with h5py.File(pretrain_path, 'r') as hf: - keys = hf["key"][:] - keys = keys.astype(feature_dtype) # in case mismatch of dtype between int and str + if pretrain_path.endswith(".h5"): + with h5py.File(pretrain_path, 'r') as hf: + keys = hf["key"][:] + # in case mismatch of dtype between int and str + keys = keys.astype(feature_dtype) + elif pretrain_path.endswith(".npz"): + keys = np.load(pretrain_path)["key"] # Update vocab with pretrained keys in case new tokens appear in validation or test set # Do NOT update OOV index here since it is used in PretrainedEmbedding if expand_vocab: diff --git a/fuxictr/pytorch/dataloaders/npz_block_dataloader.py b/fuxictr/pytorch/dataloaders/npz_block_dataloader.py index 58ac5f6..37bf697 100644 --- a/fuxictr/pytorch/dataloaders/npz_block_dataloader.py +++ b/fuxictr/pytorch/dataloaders/npz_block_dataloader.py @@ -82,9 +82,8 @@ def __len__(self): def count_batches_and_samples(self): num_samples = 0 - num_batches = 0 for block_path in self.data_blocks: block_size = np.load(block_path)[self.feature_map.labels[0]].shape[0] num_samples += block_size - num_batches += int(np.ceil(block_size * 1.0 / self.batch_size)) + num_batches = int(np.ceil(num_samples / self.batch_size)) return num_batches, num_samples diff --git a/fuxictr/version.py b/fuxictr/version.py index 3aef30f..6daf179 100644 --- a/fuxictr/version.py +++ b/fuxictr/version.py @@ -1 +1 @@ -__version__="2.2.0" +__version__="2.2.1" diff --git a/setup.py b/setup.py index 9c1fba3..711fb38 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="fuxictr", - version="2.2.0", + version="2.2.1", author="RECZOO", author_email="reczoo@users.noreply.github.com", description="A configurable, tunable, and reproducible library for CTR prediction",