Skip to content

Commit

Permalink
get rid of the pqkmeans dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
avidale committed Oct 14, 2023
1 parent 9829067 commit 916ec48
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_and_deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pycodestyle pytest scikit-learn pqkmeans smart_open[http]
pip install pycodestyle pytest scikit-learn scipy smart_open[http]
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python setup.py develop
- name: Lint with flake8
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ big_model = load_facebook_model('path-to-original-model').wv
small_model = compress_fasttext.prune_ft_freq(big_model, pq=True)
small_model.save('path-to-new-model')
```
To perform this compression, you will need to `pip install gensim==3.8.3 pqkmeans` beforehand.
To perform this compression, you will need to `pip install gensim==3.8.3 scikit-learn` beforehand.

Different compression methods include:
- matrix decomposition (`svd_ft`)
Expand Down
123 changes: 123 additions & 0 deletions compress_fasttext/pq_encoder_light.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# This file is based on the code from the `pqkmeans` package:
# https://github.com/DwangoMediaVillage/pqkmeans/blob/master/pqkmeans/encoder/encoder_base.py
# https://github.com/DwangoMediaVillage/pqkmeans/blob/master/pqkmeans/encoder/pq_encoder.py
# it is refactored to avoid any external dependencies except scipy and numpy
import typing
import numpy
from scipy.cluster.vq import vq, kmeans2
import sklearn


class EncoderBase(sklearn.base.BaseEstimator):
    """Common scaffolding for vector encoders.

    Subclasses implement the generator-based methods; this base class supplies
    the batch `fit`/`transform`/`inverse_transform` wrappers over them.
    """

    def fit_generator(self, x_train):
        # type: (typing.Iterable[typing.Iterator[float]]) -> None
        """Train the encoder from a stream of vectors. Must be overridden."""
        raise NotImplementedError()

    def transform_generator(self, x_test):
        # type: (typing.Iterable[typing.Iterator[float]]) -> typing.Any
        """Lazily encode a stream of vectors. Must be overridden."""
        raise NotImplementedError()

    def inverse_transform_generator(self, x_test):
        # type: (typing.Iterable[typing.Iterator[typing.Any]]) -> typing.Any
        """Lazily decode a stream of codes. Must be overridden."""
        raise NotImplementedError()

    def fit(self, x_train):
        # type: (numpy.ndarray) -> None
        """Train on a 2D array of row vectors."""
        assert len(x_train.shape) == 2
        self.fit_generator(iter(x_train))

    def transform(self, x_test):
        # type: (numpy.ndarray) -> typing.Any
        """Encode a 2D array of row vectors into an array of codes."""
        assert len(x_test.shape) == 2
        return numpy.array(list(self.transform_generator(x_test)))

    def inverse_transform(self, x_test):
        # type: (numpy.ndarray) -> typing.Any
        """Decode a 2D array of codes back into an array of vectors."""
        assert len(x_test.shape) == 2
        return numpy.array(list(self.inverse_transform_generator(x_test)))

    def _buffered_process(self, x_input, process, buffer_size=10000):
        # type: (typing.Iterable[typing.Any], typing.Any, int) -> typing.Any
        """Apply `process` to `x_input` in chunks of up to `buffer_size`
        items, yielding the processed results one at a time."""
        pending = []
        for vector in x_input:
            pending.append(vector)
            if len(pending) == buffer_size:
                for processed in process(pending):
                    yield processed
                pending = []
        if pending:  # flush the final partial chunk
            for processed in process(pending):
                yield processed


class PQEncoder(EncoderBase):
    """Product quantization (PQ) encoder.

    Splits each D-dimensional vector into `num_subdim` (M) contiguous
    sub-vectors of Ds = D / M dimensions, and quantizes each sub-vector
    independently to one of `Ks` centroids learned with k-means. A vector is
    thus encoded as M small integer codes.
    """

    def __init__(self, iteration=20, num_subdim=4, Ks=256):
        # type: (int, int, int) -> None
        assert Ks <= 2 ** 32
        self.iteration = iteration  # number of k-means iterations used by fit()
        self.M, self.Ks, self.Ds = num_subdim, Ks, None  # Ds is computed in fit()
        # Use the smallest unsigned integer dtype able to hold Ks distinct codes.
        self.code_dtype = numpy.uint8 if Ks <= 2 ** 8 else (numpy.uint16 if Ks <= 2 ** 16 else numpy.uint32)
        self.trained_encoder = None

    def fit(self, x_train):
        # type: (numpy.ndarray) -> None
        """Learn Ks centroids per sub-space from a 2D array of training vectors."""
        assert x_train.ndim == 2
        N, D = x_train.shape
        assert self.Ks < N, "the number of training vector should be more than Ks"
        assert D % self.M == 0, "input dimension must be dividable by M"
        self.Ds = int(D / self.M)
        assert self.trained_encoder is None, "fit must be called only once"

        # numpy.float was removed in NumPy 1.24; numpy.float64 is the equivalent dtype.
        codewords = numpy.zeros((self.M, self.Ks, self.Ds), dtype=numpy.float64)
        for m in range(self.M):
            x_train_sub = x_train[:, m * self.Ds: (m + 1) * self.Ds].astype(numpy.float64)
            codewords[m], _ = kmeans2(x_train_sub, self.Ks, iter=self.iteration, minit='points')
        self.trained_encoder = TrainedPQEncoder(codewords, self.code_dtype)

    def transform_generator(self, x_test):
        # type: (typing.Iterable[typing.Iterator[float]]) -> typing.Any
        """Lazily encode vectors into PQ codes; requires a prior fit()."""
        assert self.trained_encoder is not None, "This PQEncoder instance is not fitted yet. " \
                                                 "Call 'fit' with appropriate arguments before using this method."
        return self._buffered_process(x_test, self.trained_encoder.encode_multi)

    def inverse_transform_generator(self, x_test):
        # type: (typing.Iterable[typing.Iterator[int]]) -> typing.Any
        """Lazily decode PQ codes back into approximate vectors; requires a prior fit()."""
        assert self.trained_encoder is not None, "This PQEncoder instance is not fitted yet. " \
                                                 "Call 'fit' with appropriate arguments before using this method."
        return self._buffered_process(x_test, self.trained_encoder.decode_multi)

    @property
    def codewords(self):
        """The learned (M, Ks, Ds) centroid array; requires a prior fit()."""
        assert self.trained_encoder is not None, "This PQEncoder instance is not fitted yet. " \
                                                 "Call 'fit' with appropriate arguments before using this method."
        return self.trained_encoder.codewords


class TrainedPQEncoder(object):
    """Holds learned PQ codewords and performs the actual encode/decode work."""

    def __init__(self, codewords, code_dtype):
        # type: (numpy.ndarray, type) -> None
        # codewords shape is (M, Ks, Ds): M sub-spaces, Ks centroids, Ds dims each.
        self.codewords, self.code_dtype = codewords, code_dtype
        self.M, _, self.Ds = codewords.shape

    def encode_multi(self, data_matrix):
        """Quantize each row of an (N, M*Ds) matrix into an (N, M) matrix of centroid indices."""
        data_matrix = numpy.array(data_matrix)
        N, D = data_matrix.shape
        assert self.Ds * self.M == D, "input dimension must be Ds * M"

        codes = numpy.empty((N, self.M), dtype=self.code_dtype)
        for m in range(self.M):
            # vq assigns each sub-vector to its nearest centroid in sub-space m
            codes[:, m], _ = vq(data_matrix[:, m * self.Ds: (m + 1) * self.Ds], self.codewords[m])
        return codes

    def decode_multi(self, codes):
        """Reconstruct approximate (N, M*Ds) vectors from an (N, M) matrix of codes."""
        codes = numpy.array(codes)
        N, M = codes.shape
        assert M == self.M
        assert codes.dtype == self.code_dtype

        # numpy.float was removed in NumPy 1.24; numpy.float64 is the equivalent dtype.
        decoded = numpy.empty((N, self.Ds * self.M), dtype=numpy.float64)
        for m in range(self.M):
            decoded[:, m * self.Ds: (m + 1) * self.Ds] = self.codewords[m][codes[:, m], :]
        return decoded
10 changes: 2 additions & 8 deletions compress_fasttext/quantization.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
import logging
import numpy as np

try:
import pqkmeans
except ImportError:
pqkmeans = None

from .navec_like import PQ
from .pq_encoder_light import PQEncoder


def quantize(matrix, qdim, centroids, sample=None, iterations=5, verbose=False):
if not pqkmeans:
raise ImportError('You need to install the `pqkmeans` package to perform quantization')
encoder = pqkmeans.encoder.PQEncoder(
encoder = PQEncoder(
iteration=iterations,
num_subdim=qdim,
Ks=centroids
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
gensim>=4.0.0
numpy
pqkmeans
scipy
scikit-learn
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="compress-fasttext",
version="0.1.3",
version="0.1.4",
author="David Dale",
author_email="[email protected]",
description="A set of tools to compress gensim fasttext models",
Expand All @@ -27,7 +27,7 @@
extras_require={
'full': [
'scikit-learn',
'pqkmeans',
'scipy',
],
}
)

0 comments on commit 916ec48

Please sign in to comment.