Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

instructor - new secret management #417

Merged
merged 3 commits
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions integrations/instructor_embedders/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ root = "../.."
git_describe_command = 'git describe --tags --match="integrations/instructor_embedders-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = ["pytest", "pytest-cov", "haystack-pydoc-tools"]

dependencies = [
"coverage[toml]>=6.5",
"pytest",
"haystack-pydoc-tools",
]
[tool.hatch.envs.default.scripts]
cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=instructor-embedders --cov=tests"
no-cov = "cov --no-cov"
test = "pytest {args:tests}"
docs = "pydoc-markdown pydoc/config.yml"
test-cov = "coverage run -m pytest {args:tests}"
cov-report = [
"- coverage combine",
"coverage report",
]
cov = [
"test-cov",
"cov-report",
]
docs = [
"pydoc-markdown pydoc/config.yml"
]

[[tool.hatch.envs.test.matrix]]
python = ["38", "39", "310", "311"]
Expand All @@ -86,10 +98,13 @@ fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
all = ["style", "typing"]

[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
parallel = true

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.ruff]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import ClassVar, Dict, List, Optional, Union
from typing import ClassVar, Dict, List, Optional

from haystack.utils.auth import Secret
from InstructorEmbedding import INSTRUCTOR


Expand All @@ -14,16 +15,14 @@ class _InstructorEmbeddingBackendFactory:
_instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {}

@staticmethod
def get_embedding_backend(
model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"
def get_embedding_backend(model_name_or_path: str, device: Optional[str] = None, token: Optional[Secret] = None):
embedding_backend_id = f"{model_name_or_path}{device}{token}"

if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]

embedding_backend = _InstructorEmbeddingBackend(
model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
model_name_or_path=model_name_or_path, device=device, token=token
)
_InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend
Expand All @@ -34,10 +33,12 @@ class _InstructorEmbeddingBackend:
Class to manage INSTRUCTOR embeddings.
"""

def __init__(
self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token)
def __init__(self, model_name_or_path: str, device: Optional[str] = None, token: Optional[Secret] = None):
self.model = INSTRUCTOR(
model_name_or_path=model_name_or_path,
device=device,
use_auth_token=token.resolve_value() if token else None,
)

def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
embeddings = self.model.encode(data, **kwargs).tolist()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's actually a better idea — having those ignore statements inline instead of globally defined 👍🏽

instruction: str = "Represent the document",
batch_size: int = 32,
progress_bar: bool = True,
Expand Down Expand Up @@ -98,7 +99,7 @@ def __init__(
self.model_name_or_path = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -114,7 +115,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -128,6 +129,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -136,7 +138,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, token=self.token
)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -38,7 +39,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the sentence",
batch_size: int = 32,
progress_bar: bool = True,
Expand All @@ -51,9 +52,7 @@ def __init__(
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, checks if a GPU can be used.
:param use_auth_token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param token: The API token used to download private models from Hugging Face.
:param instruction: The instruction string to be used while computing domain-specific embeddings.
The instruction follows the unified template of the form:
"Represent the 'domain' 'text_type' for 'task_objective'", where:
Expand All @@ -70,7 +69,7 @@ def __init__(
self.model_name_or_path = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -84,7 +83,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -96,6 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -104,7 +104,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, token=self.token
)

@component.output_types(embedding=List[float])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

from haystack.utils.auth import Secret
from haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend import (
_InstructorEmbeddingBackendFactory,
)
Expand Down Expand Up @@ -29,10 +30,10 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001
)
def test_model_initialization(mock_instructor):
_InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model_name_or_path="hkunlp/instructor-base", device="cpu", token=Secret.from_token("fake-api-token")
)
mock_instructor.assert_called_once_with(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="fake-api-token"
)
# restore the factory state
_InstructorEmbeddingBackendFactory._instances = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest
from haystack import Document
from haystack.utils import Secret
from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder


Expand All @@ -14,7 +15,7 @@ def test_init_default(self):
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the document"
assert embedder.batch_size == 32
assert embedder.progress_bar is True
Expand All @@ -29,7 +30,7 @@ def test_init_with_parameters(self):
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
token=Secret.from_token("fake-api-token"),
instruction="Represent the 'domain' 'text_type' for 'task_objective'",
batch_size=64,
progress_bar=False,
Expand All @@ -39,7 +40,7 @@ def test_init_with_parameters(self):
)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token == Secret.from_token("fake-api-token")
assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
assert embedder.batch_size == 64
assert embedder.progress_bar is False
Expand All @@ -58,7 +59,7 @@ def test_to_dict(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the document",
"batch_size": 32,
"progress_bar": True,
Expand All @@ -75,7 +76,6 @@ def test_to_dict_with_custom_init_parameters(self):
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
instruction="Represent the financial document for retrieval",
batch_size=64,
progress_bar=False,
Expand All @@ -89,7 +89,7 @@ def test_to_dict_with_custom_init_parameters(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the financial document for retrieval",
"batch_size": 64,
"progress_bar": False,
Expand All @@ -108,7 +108,7 @@ def test_from_dict(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the 'domain' 'text_type' for 'task_objective'",
"batch_size": 32,
"progress_bar": True,
Expand All @@ -120,7 +120,7 @@ def test_from_dict(self):
embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
assert embedder.batch_size == 32
assert embedder.progress_bar is True
Expand All @@ -137,7 +137,7 @@ def test_from_dict_with_custom_init_parameters(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the financial document for retrieval",
"batch_size": 64,
"progress_bar": False,
Expand All @@ -149,7 +149,7 @@ def test_from_dict_with_custom_init_parameters(self):
embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the financial document for retrieval"
assert embedder.batch_size == 64
assert embedder.progress_bar is False
Expand All @@ -168,7 +168,9 @@ def test_warmup(self, mocked_factory):
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
mocked_factory.get_embedding_backend.assert_called_once_with(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token=None
model_name_or_path="hkunlp/instructor-base",
device="cpu",
token=Secret.from_env_var("HF_API_TOKEN", strict=False),
)

@patch(
Expand Down
Loading