From 4646e255ecca98cdf4b03ca0f9d2ea56634dd3d8 Mon Sep 17 00:00:00 2001
From: xpai <xpai@users.noreply.github.com>
Date: Tue, 16 Apr 2024 11:51:52 +0800
Subject: [PATCH] Fix issues #84, #85

---
 CHANGELOG.md                                        |  6 +++++-
 fuxictr/preprocess/tokenizer.py                     | 10 +++++++---
 fuxictr/pytorch/dataloaders/npz_block_dataloader.py |  3 +--
 fuxictr/version.py                                  |  2 +-
 setup.py                                            |  2 +-
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3d83c72..ee86d9e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@
 [Doing] Add support for saving pb file, exporting embeddings
 [Doing] Add support of NVTabular data
 
+**FuxiCTR v2.2.1, 2024-04-16**
++ [Fix] Fix evaluation not being performed at epoch end when streaming=True ([#85](https://github.com/xue-pai/FuxiCTR/issues/85))
++ [Fix] Fix issue when loading pretrained_emb in npz format ([#84](https://github.com/xue-pai/FuxiCTR/issues/84))
+
 **FuxiCTR v2.2.0, 2024-02-17**
 + [Feature] Add support of npz format for pretrained_emb
 + [Refactor] Change data format from h5 to npz
@@ -16,7 +20,7 @@
 + [Feature] Add GDCN model
 + [Refactor] Rename FINAL model to FinalNet
 + [Refactor] Update RecZoo URLs
-+ [Fix] Fix bug #75
++ [Fix] Fix bug [#75](https://github.com/xue-pai/FuxiCTR/issues/75)
 + [Fix] Fix h5 file extenstion issue
 + [Fix] Fix typo in FinalNet
  
diff --git a/fuxictr/preprocess/tokenizer.py b/fuxictr/preprocess/tokenizer.py
index 7b9cb19..90aed37 100644
--- a/fuxictr/preprocess/tokenizer.py
+++ b/fuxictr/preprocess/tokenizer.py
@@ -127,9 +127,13 @@ def encode_sequence(self, texts):
         return np.array(sequence_list)
     
     def load_pretrained_vocab(self, feature_dtype, pretrain_path, expand_vocab=True):
-        with h5py.File(pretrain_path, 'r') as hf:
-            keys = hf["key"][:]
-            keys = keys.astype(feature_dtype) # in case mismatch of dtype between int and str
+        if pretrain_path.endswith(".h5"):
+            with h5py.File(pretrain_path, 'r') as hf:
+                keys = hf["key"][:]
+                # in case mismatch of dtype between int and str
+                keys = keys.astype(feature_dtype)
+        elif pretrain_path.endswith(".npz"):
+            keys = np.load(pretrain_path)["key"]
         # Update vocab with pretrained keys in case new tokens appear in validation or test set
         # Do NOT update OOV index here since it is used in PretrainedEmbedding
         if expand_vocab:
diff --git a/fuxictr/pytorch/dataloaders/npz_block_dataloader.py b/fuxictr/pytorch/dataloaders/npz_block_dataloader.py
index 58ac5f6..37bf697 100644
--- a/fuxictr/pytorch/dataloaders/npz_block_dataloader.py
+++ b/fuxictr/pytorch/dataloaders/npz_block_dataloader.py
@@ -82,9 +82,8 @@ def __len__(self):
 
     def count_batches_and_samples(self):
         num_samples = 0
-        num_batches = 0
         for block_path in self.data_blocks:
             block_size = np.load(block_path)[self.feature_map.labels[0]].shape[0]
             num_samples += block_size
-            num_batches += int(np.ceil(block_size * 1.0 / self.batch_size))
+        num_batches = int(np.ceil(num_samples / self.batch_size))
         return num_batches, num_samples
diff --git a/fuxictr/version.py b/fuxictr/version.py
index 3aef30f..6daf179 100644
--- a/fuxictr/version.py
+++ b/fuxictr/version.py
@@ -1 +1 @@
-__version__="2.2.0"
+__version__="2.2.1"
diff --git a/setup.py b/setup.py
index 9c1fba3..711fb38 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="fuxictr",
-    version="2.2.0",
+    version="2.2.1",
     author="RECZOO",
     author_email="reczoo@users.noreply.github.com",
     description="A configurable, tunable, and reproducible library for CTR prediction",