Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

model_name_or_path → model #418

Merged
merged 8 commits into the base branch from the source branch
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions integrations/astra/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
p.add_component(instance=DocumentCleaner(), name="cleaner")
p.add_component(instance=DocumentSplitter(split_by="word", split_length=150, split_overlap=30), name="splitter")
p.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")
Expand All @@ -63,7 +63,7 @@
# Create a querying pipeline on the indexed data
q = Pipeline()
q.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
q.add_component("retriever", AstraEmbeddingRetriever(document_store))
Expand Down
4 changes: 2 additions & 2 deletions integrations/astra/examples/pipeline_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
]
p = Pipeline()
p.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")
Expand All @@ -74,7 +74,7 @@
# Construct rag pipeline
rag_pipeline = Pipeline()
rag_pipeline.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
rag_pipeline.add_component(instance=AstraEmbeddingRetriever(document_store=document_store), name="retriever")
Expand Down
25 changes: 20 additions & 5 deletions integrations/instructor_embedders/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ root = "../.."
git_describe_command = 'git describe --tags --match="integrations/instructor_embedders-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = ["pytest", "pytest-cov", "haystack-pydoc-tools"]

dependencies = [
"coverage[toml]>=6.5",
"pytest",
"haystack-pydoc-tools",
]
[tool.hatch.envs.default.scripts]
cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=instructor-embedders --cov=tests"
no-cov = "cov --no-cov"
test = "pytest {args:tests}"
docs = "pydoc-markdown pydoc/config.yml"
test-cov = "coverage run -m pytest {args:tests}"
cov-report = [
"- coverage combine",
"coverage report",
]
cov = [
"test-cov",
"cov-report",
]
docs = [
"pydoc-markdown pydoc/config.yml"
]

[[tool.hatch.envs.test.matrix]]
python = ["38", "39", "310", "311"]
Expand All @@ -86,10 +98,13 @@ fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
all = ["style", "typing"]

[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
parallel = true

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.ruff]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import ClassVar, Dict, List, Optional, Union
from typing import ClassVar, Dict, List, Optional

from haystack.utils.auth import Secret
from InstructorEmbedding import INSTRUCTOR


Expand All @@ -14,17 +15,13 @@ class _InstructorEmbeddingBackendFactory:
_instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {}

@staticmethod
def get_embedding_backend(
model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"
def get_embedding_backend(model: str, device: Optional[str] = None, token: Optional[Secret] = None):
embedding_backend_id = f"{model}{device}{token}"

if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]

embedding_backend = _InstructorEmbeddingBackend(
model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
)
embedding_backend = _InstructorEmbeddingBackend(model=model, device=device, token=token)
_InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend

Expand All @@ -34,10 +31,12 @@ class _InstructorEmbeddingBackend:
Class to manage INSTRUCTOR embeddings.
"""

def __init__(
self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token)
def __init__(self, model: str, device: Optional[str] = None, token: Optional[Secret] = None):
self.model = INSTRUCTOR(
model_name_or_path=model,
device=device,
use_auth_token=token.resolve_value() if token else None,
)

def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
embeddings = self.model.encode(data, **kwargs).tolist()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the document",
batch_size: int = 32,
progress_bar: bool = True,
Expand Down Expand Up @@ -95,10 +96,10 @@ def __init__(
:param embedding_separator: Separator used to concatenate the meta fields to the Document content.
"""

self.model_name_or_path = model
self.model = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -112,9 +113,9 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
model=self.model_name_or_path,
model=self.model,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -128,6 +129,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -136,7 +138,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model=self.model, device=self.device, token=self.token
)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -38,7 +39,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the sentence",
batch_size: int = 32,
progress_bar: bool = True,
Expand All @@ -51,9 +52,7 @@ def __init__(
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, checks if a GPU can be used.
:param use_auth_token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param token: The API token used to download private models from Hugging Face.
:param instruction: The instruction string to be used while computing domain-specific embeddings.
The instruction follows the unified template of the form:
"Represent the 'domain' 'text_type' for 'task_objective'", where:
Expand All @@ -67,10 +66,10 @@ def __init__(
:param normalize_embeddings: If set to true, returned vectors will have the length of 1.
"""

self.model_name_or_path = model
self.model = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -82,9 +81,9 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
model=self.model_name_or_path,
model=self.model,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -96,6 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -104,7 +104,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model=self.model, device=self.device, token=self.token
)

@component.output_types(embedding=List[float])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

from haystack.utils.auth import Secret
from haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend import (
_InstructorEmbeddingBackendFactory,
)
Expand All @@ -10,11 +11,11 @@
)
def test_factory_behavior(mock_instructor): # noqa: ARG001
embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-large", device="cpu"
model="hkunlp/instructor-large", device="cpu"
)
same_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-large", "cpu")
another_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base", device="cpu"
model="hkunlp/instructor-base", device="cpu"
)

assert same_embedding_backend is embedding_backend
Expand All @@ -29,10 +30,10 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001
)
def test_model_initialization(mock_instructor):
_InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model="hkunlp/instructor-base", device="cpu", token=Secret.from_token("fake-api-token")
)
mock_instructor.assert_called_once_with(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="fake-api-token"
)
# restore the factory state
_InstructorEmbeddingBackendFactory._instances = {}
Expand All @@ -42,9 +43,7 @@ def test_model_initialization(mock_instructor):
"haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR"
)
def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001
embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base"
)
embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(model="hkunlp/instructor-base")

data = [["instruction", "sentence1"], ["instruction", "sentence2"]]
embedding_backend.embed(data=data, normalize_embeddings=True)
Expand Down
Loading