From a418c4997d3120df5ac16214115d0d34e69be2ae Mon Sep 17 00:00:00 2001
From: vincent d warmerdam
Date: Wed, 7 Oct 2020 12:28:12 +0200
Subject: [PATCH] Convert Deprecation (#240)

* added-use-removed-convert
* docs-update
* modified-test
* typo
* test-fix
* removed-convert-from-tests
* convert-doh-import
* import-bug
* maybe this
* yet-another-fix
* found-the-bug
* multi-lang
---
 docs/api/language/universal_sentence.md     |   1 +
 docs/releases.md                            |   5 +
 mkdocs.yml                                  |   1 +
 tests/test_lang/test_convert_lang.py        |  63 -----------
 .../test_universal_sentence_encoder.py      |  11 ++
 tests/test_sklearn/test_simple_pipeline.py  |   4 -
 whatlies/language/__init__.py               |  12 ++-
 whatlies/language/_convert_lang.py          | 101 ++----------------
 whatlies/language/_sentence_encode_lang.py  |  46 ++++++++
 9 files changed, 78 insertions(+), 166 deletions(-)
 create mode 100644 docs/api/language/universal_sentence.md
 delete mode 100644 tests/test_lang/test_convert_lang.py
 create mode 100644 tests/test_lang/test_universal_sentence_encoder.py
 create mode 100644 whatlies/language/_sentence_encode_lang.py

diff --git a/docs/api/language/universal_sentence.md b/docs/api/language/universal_sentence.md
new file mode 100644
index 00000000..0e5d5f81
--- /dev/null
+++ b/docs/api/language/universal_sentence.md
@@ -0,0 +1 @@
+::: whatlies.language._sentence_encode_lang
diff --git a/docs/releases.md b/docs/releases.md
index 8b4df5a0..1c92fa12 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -1,3 +1,8 @@
+v0.5.3
+
+- Deprecated the `ConveRTLanguage` backend. The original authors removed the embeddings.
+- Added support for the Universal Sentence Encoder.
+
 v0.5.2

 - Fixed the `ConveRTLanguage` backend. The original source changed their download url.
diff --git a/mkdocs.yml b/mkdocs.yml
index 6333918f..c73bf712 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -36,6 +36,7 @@ nav:
       - Gensim: api/language/gensim_lang.md
       - Huggingface: api/language/transformers.md
       - TFHub: api/language/tfhub.md
+      - Universal Sentence Encoder: api/language/universal_sentence.md
   - Examples:
       - Debiasing Projections: examples/lipstick-pig.md
   - Roadmap: roadmap.md
diff --git a/tests/test_lang/test_convert_lang.py b/tests/test_lang/test_convert_lang.py
deleted file mode 100644
index 389a7733..00000000
--- a/tests/test_lang/test_convert_lang.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pytest
-
-from whatlies.language import ConveRTLanguage
-
-
-@pytest.fixture
-def lang(request):
-    lang = ConveRTLanguage(**request.param)
-    return lang
-
-
-@pytest.mark.parametrize(
-    "lang, expected_shape",
-    [
-        ({"model_id": "convert", "signature": "default"}, (1024,)),
-        ({"model_id": "convert", "signature": "encode_context"}, (512,)),
-        ({"model_id": "convert", "signature": "encode_response"}, (512,)),
-        ({"model_id": "convert", "signature": "encode_sequence"}, (512,)),
-        ({"model_id": "convert-multi-context", "signature": "default"}, (1024,)),
-        pytest.param(
-            {"model_id": "convert-multi-context", "signature": "encode_context"},
-            (512,),
-            marks=pytest.mark.xfail(raises=NotImplementedError),
-        ),
-        ({"model_id": "convert-multi-context", "signature": "encode_response"}, (512,)),
-        ({"model_id": "convert-multi-context", "signature": "encode_sequence"}, (512,)),
-        ({"model_id": "convert-ubuntu", "signature": "default"}, (1024,)),
-        pytest.param(
-            {"model_id": "convert-ubuntu", "signature": "encode_context"},
-            (512,),
-            marks=pytest.mark.xfail(raises=NotImplementedError),
-        ),
-        ({"model_id": "convert-ubuntu", "signature": "encode_response"}, (512,)),
-        ({"model_id": "convert-ubuntu", "signature": "encode_sequence"}, (512,)),
-    ],
-    indirect=["lang"],
-)
-def test_basic_usage(lang, expected_shape):
-    embset = lang[["bank", "money on the bank", "bank of the river"]]
-    assert len(embset) == 3
-    assert embset["bank"].vector.shape == expected_shape
-
-
-@pytest.mark.parametrize(
-    "lang",
-    [
-        pytest.param(
-            {"model_id": "convert-context", "signature": "encode_context"},
-            marks=pytest.mark.xfail(raises=ValueError, strict=True),
-        ),
-        pytest.param(
-            {"model_id": "convert", "signature": "encode"},
-            marks=pytest.mark.xfail(raises=ValueError, strict=True),
-        ),
-        pytest.param(
-            {"model_id": "multi-convert", "signature": "encoded_context"},
-            marks=pytest.mark.xfail(raises=ValueError, strict=True),
-        ),
-    ],
-    indirect=["lang"],
-)
-def test_invalid_argument_values_raise_error(lang):
-    pass
diff --git a/tests/test_lang/test_universal_sentence_encoder.py b/tests/test_lang/test_universal_sentence_encoder.py
new file mode 100644
index 00000000..81850f89
--- /dev/null
+++ b/tests/test_lang/test_universal_sentence_encoder.py
@@ -0,0 +1,11 @@
+import numpy as np
+
+from whatlies.language import TFHubLanguage, UniversalSentenceLanguage
+
+
+def test_same_results():
+    use_lang = UniversalSentenceLanguage("multi", 3)
+    tf_lang = TFHubLanguage(
+        "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
+    )
+    assert np.allclose(use_lang["hello world"].vector, tf_lang["hello world"].vector)
diff --git a/tests/test_sklearn/test_simple_pipeline.py b/tests/test_sklearn/test_simple_pipeline.py
index 9036a449..205b1016 100644
--- a/tests/test_sklearn/test_simple_pipeline.py
+++ b/tests/test_sklearn/test_simple_pipeline.py
@@ -8,12 +8,10 @@
 from whatlies.language import (
     FasttextLanguage,
-    CountVectorLanguage,
     SpacyLanguage,
     GensimLanguage,
     BytePairLanguage,
     TFHubLanguage,
-    ConveRTLanguage,
     HFTransformersLanguage,
 )

@@ -21,10 +19,8 @@
 backends = [
     SpacyLanguage("tests/custom_test_lang/"),
     FasttextLanguage("tests/custom_fasttext_model.bin"),
-    CountVectorLanguage(n_components=10),
     BytePairLanguage("en", vs=1000, dim=25, cache_dir="tests/cache"),
     GensimLanguage("tests/cache/custom_gensim_vectors.kv"),
-    ConveRTLanguage(),
     HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"),
     TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"),
 ]
diff --git a/whatlies/language/__init__.py b/whatlies/language/__init__.py
index 93915ca9..9b13fa1c 100644
--- a/whatlies/language/__init__.py
+++ b/whatlies/language/__init__.py
@@ -2,24 +2,26 @@
 from ._fasttext_lang import FasttextLanguage
 from ._countvector_lang import CountVectorLanguage
 from ._bpemblang import BytePairLanguage
-from ._bpemblang import BytePairLanguage as BytePairLang
 from ._gensim_lang import GensimLanguage
+from ._convert_lang import ConveRTLanguage

 from whatlies.error import NotInstalled

 try:
-    from ._convert_lang import ConveRTLanguage
     from ._tfhub_lang import TFHubLanguage
 except ModuleNotFoundError as e:
     TFHubLanguage = NotInstalled("TFHubLanguage", "tfhub")
-    ConveRTLanguage = NotInstalled("ConveRTLanguage", "tfhub")
+
+try:
+    from ._sentence_encode_lang import UniversalSentenceLanguage
+except ModuleNotFoundError as e:
+    UniversalSentenceLanguage = NotInstalled("UniversalSentenceLanguage", "tfhub")

 try:
     from ._hftransformers_lang import HFTransformersLanguage
 except ModuleNotFoundError as e:
     HFTransformersLanguage = NotInstalled("HFTransformersLanguage", "transformers")

-
 try:
     from ._sense2vec_lang import Sense2VecLanguage
 except ModuleNotFoundError as e:
@@ -31,10 +33,10 @@
"Sense2VecLanguage", "FasttextLanguage", "CountVectorLanguage", - "BytePairLang", "BytePairLanguage", "GensimLanguage", "ConveRTLanguage", "TFHubLanguage", "HFTransformersLanguage", + "UniversalSentenceLanguage", ] diff --git a/whatlies/language/_convert_lang.py b/whatlies/language/_convert_lang.py index 18471cc3..0e6236bd 100644 --- a/whatlies/language/_convert_lang.py +++ b/whatlies/language/_convert_lang.py @@ -1,50 +1,7 @@ -from typing import Union, List - -import tensorflow_text # noqa: F401 -import tensorflow as tf -import tensorflow_hub as tfhub - -from whatlies.embedding import Embedding -from whatlies.embeddingset import EmbeddingSet -from whatlies.language._common import SklearnTransformerMixin, HiddenPrints - - -class ConveRTLanguage(SklearnTransformerMixin): +class ConveRTLanguage: """ - This object is used to fetch [Embedding][whatlies.embedding.Embedding]s or - [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]s from a - [ConveRT](https://github.com/PolyAI-LDN/polyai-models) model. - This object is meant for retreival, not plotting. - Important: - This object will automatically download a large file if it is not cached yet. - - This language model does not contain a vocabulary, so it cannot be used - to retreive similar tokens. Use an `EmbeddingSet` instead. - - This language backend might require you to manually install extra dependencies - unless you installed via either; - - ``` - pip install whatlies[tfhub] - pip install whatlies[all] - ``` - - Arguments: - model_id: identifier used for loading the corresponding TFHub module, which could be one of `'convert`, `'convert-multi-context'` or `'convert-ubuntu'`. - Each one of these correspond to a different model as described in [ConveRT manual](https://github.com/PolyAI-LDN/polyai-models#models). - signature: the TFHub signature of the model, which could be one of `'default'`, `'encode_context'`, `'encode_response'` or `'encode_sequence'`. - Note that `'encode_context'` is not currently supported with `'convert-multi-context'` or `'convert-ubuntu'` models. - - **Usage**: - - ```python - > from whatlies.language import ConveRTLanguage - > lang = ConveRTLanguage() - > lang['bank'] - > lang = ConveRTLanguage(model_id='convert-multi-context', signature='encode_sequence') - > lang[['bank of the river', 'money on the bank', 'bank']] - ``` + This model has been deprecated. The original authors took the embeddings down. """ MODEL_URL = { @@ -61,53 +18,9 @@ class ConveRTLanguage(SklearnTransformerMixin): ] def __init__(self, model_id: str = "convert", signature: str = "default") -> None: - if model_id not in self.MODEL_URL: - raise ValueError( - f"The `model_id` value should be one of {list(self.MODEL_URL.keys())}" - ) - if signature not in self.MODEL_SIGNATURES: - raise ValueError( - f"The `signature` value should be one of {self.MODEL_SIGNATURES}" - ) - if signature == "encode_context" and model_id in [ - "convert-multi-context", - "convert-ubuntu", - ]: - raise NotImplementedError( - "Currently 'encode_context' signature is not support with multi-context and ubuntu models." - ) - self.model_id = model_id - self.signature = signature - - with HiddenPrints(): - self.module = tfhub.load(self.MODEL_URL[self.model_id]) - self.model = self.module.signatures[self.signature] - - def __getitem__( - self, query: Union[str, List[str]] - ) -> Union[Embedding, EmbeddingSet]: - """ - Retreive a single embedding or a set of embeddings. 
-
-        Arguments:
-            query: single string or list of strings
-
-        **Usage**
+        pass

-        ```python
-        > from whatlies.language import ConveRTLanguage
-        > lang = ConveRTLanguage()
-        > lang['bank']
-        > lang = ConveRTLanguage()
-        > lang[['bank of the river', 'money on the bank', 'bank']]
-        ```
-        """
-        if isinstance(query, str):
-            query_tensor = tf.convert_to_tensor([query])
-            encoding = self.model(query_tensor)
-            if self.signature == "encode_sequence":
-                vec = encoding["sequence_encoding"].numpy().sum(axis=1)[0]
-            else:
-                vec = encoding["default"].numpy()[0]
-            return Embedding(query, vec)
-        return EmbeddingSet(*[self[tok] for tok in query])
+    def __getitem__(self, item):
+        raise DeprecationWarning(
+            "This model has been deprecated. The original authors took the embeddings down."
+        )
diff --git a/whatlies/language/_sentence_encode_lang.py b/whatlies/language/_sentence_encode_lang.py
new file mode 100644
index 00000000..73872d50
--- /dev/null
+++ b/whatlies/language/_sentence_encode_lang.py
@@ -0,0 +1,46 @@
+from typing import Union
+
+from ._tfhub_lang import TFHubLanguage
+
+
+def UniversalSentenceLanguage(variant: str = "base", version: Union[int, None] = None):
+    """
+    Retrieve a [universal sentence encoder](https://tfhub.dev/google/collections/universal-sentence-encoder/1) model from TFHub.
+
+    You can download specific versions of specific variants. The variants that we support are listed below.
+
+    - `"base"`: the base variant (915MB) [link](https://tfhub.dev/google/universal-sentence-encoder/4)
+    - `"large"`: the large variant (523MB) [link](https://tfhub.dev/google/universal-sentence-encoder-large/5)
+    - `"qa"`: the variant based on question/answer (528MB) [link](https://tfhub.dev/google/universal-sentence-encoder-qa/3)
+    - `"multi"`: the multi-language variant (245MB) [link](https://tfhub.dev/google/universal-sentence-encoder-multilingual/3)
+    - `"multi-large"`: the large multi-language variant (303MB) [link](https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3)
+    - `"multi-qa"`: the multi-language qa variant (310MB) [link](https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3)
+
+    TFHub reports that the multi-language models support Arabic, Chinese-simplified, Chinese-traditional,
+    English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish and Russian.
+
+    Arguments:
+        variant: select a specific variant
+        version: select a specific version; if `None`, the most recent supported version is used
+    """
+    urls = {
+        "base": "https://tfhub.dev/google/universal-sentence-encoder/",
+        "large": "https://tfhub.dev/google/universal-sentence-encoder-large/",
+        "qa": "https://tfhub.dev/google/universal-sentence-encoder-qa/",
+        "multi": "https://tfhub.dev/google/universal-sentence-encoder-multilingual/",
+        "multi-large": "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/",
+        "multi-qa": "https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/",
+    }
+
+    versions = {
+        "base": 4,
+        "large": 5,
+        "qa": 3,
+        "multi": 3,
+        "multi-large": 3,
+        "multi-qa": 3,
+    }
+
+    version = versions[variant] if version is None else version
+    url = urls[variant] + str(version)
+    return TFHubLanguage(url=url)
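A minimal usage sketch of the new backend, not part of the patch itself. It assumes the `tfhub` extras are installed (`pip install whatlies[tfhub]`, which pulls in `tensorflow`, `tensorflow_hub` and `tensorflow_text`) and that the download from tfhub.dev succeeds; the variant names and indexing behaviour follow the diff above.

```python
from whatlies.language import UniversalSentenceLanguage

# "multi" with version=3 resolves to
# https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,
# the same URL the new test pins for TFHubLanguage.
lang = UniversalSentenceLanguage("multi", version=3)

# A single string returns an Embedding, a list returns an EmbeddingSet,
# just like the other TFHub-backed language objects.
emb = lang["hello world"]
print(emb.vector.shape)

embset = lang[["bank of the river", "money on the bank", "bank"]]
print(len(embset))  # 3
```

Note that the deprecated `ConveRTLanguage` now raises a `DeprecationWarning` on any lookup, so downstream pipelines should switch to one of the remaining backends.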