Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

instructor - new secret management #417

Merged
merged 3 commits
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions integrations/instructor_embedders/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ root = "../.."
git_describe_command = 'git describe --tags --match="integrations/instructor_embedders-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = ["pytest", "pytest-cov", "haystack-pydoc-tools"]

dependencies = [
"coverage[toml]>=6.5",
"pytest",
"haystack-pydoc-tools",
]
[tool.hatch.envs.default.scripts]
cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=instructor-embedders --cov=tests"
no-cov = "cov --no-cov"
test = "pytest {args:tests}"
docs = "pydoc-markdown pydoc/config.yml"
test-cov = "coverage run -m pytest {args:tests}"
cov-report = [
"- coverage combine",
"coverage report",
]
cov = [
"test-cov",
"cov-report",
]
docs = [
"pydoc-markdown pydoc/config.yml"
]

[[tool.hatch.envs.test.matrix]]
python = ["38", "39", "310", "311"]
Expand All @@ -86,10 +98,13 @@ fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
all = ["style", "typing"]

[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
parallel = true

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.ruff]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import ClassVar, Dict, List, Optional, Union
from typing import ClassVar, Dict, List, Optional

from haystack.utils.auth import Secret
from InstructorEmbedding import INSTRUCTOR


Expand All @@ -14,16 +15,14 @@ class _InstructorEmbeddingBackendFactory:
_instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {}

@staticmethod
def get_embedding_backend(
model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"
def get_embedding_backend(model_name_or_path: str, device: Optional[str] = None, token: Optional[Secret] = None):
embedding_backend_id = f"{model_name_or_path}{device}{token}"

if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]

embedding_backend = _InstructorEmbeddingBackend(
model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
model_name_or_path=model_name_or_path, device=device, token=token
)
_InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend
Expand All @@ -34,10 +33,12 @@ class _InstructorEmbeddingBackend:
Class to manage INSTRUCTOR embeddings.
"""

def __init__(
self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token)
def __init__(self, model_name_or_path: str, device: Optional[str] = None, token: Optional[Secret] = None):
self.model = INSTRUCTOR(
model_name_or_path=model_name_or_path,
device=device,
use_auth_token=token.resolve_value() if token else None,
)

def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
embeddings = self.model.encode(data, **kwargs).tolist()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's actually a better idea — having those ignore statements inline instead of globally defined 👍🏽

instruction: str = "Represent the document",
batch_size: int = 32,
progress_bar: bool = True,
Expand Down Expand Up @@ -98,7 +99,7 @@ def __init__(
self.model_name_or_path = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -114,7 +115,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -128,6 +129,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -136,7 +138,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, token=self.token
)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -38,7 +39,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the sentence",
batch_size: int = 32,
progress_bar: bool = True,
Expand All @@ -51,9 +52,7 @@ def __init__(
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, checks if a GPU can be used.
:param use_auth_token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param token: The API token used to download private models from Hugging Face.
:param instruction: The instruction string to be used while computing domain-specific embeddings.
The instruction follows the unified template of the form:
"Represent the 'domain' 'text_type' for 'task_objective'", where:
Expand All @@ -70,7 +69,7 @@ def __init__(
self.model_name_or_path = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -84,7 +83,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -96,6 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -104,7 +104,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, token=self.token
)

@component.output_types(embedding=List[float])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

from haystack.utils.auth import Secret
from haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend import (
_InstructorEmbeddingBackendFactory,
)
Expand Down Expand Up @@ -29,10 +30,10 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001
)
def test_model_initialization(mock_instructor):
_InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model_name_or_path="hkunlp/instructor-base", device="cpu", token=Secret.from_token("fake-api-token")
)
mock_instructor.assert_called_once_with(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="fake-api-token"
)
# restore the factory state
_InstructorEmbeddingBackendFactory._instances = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest
from haystack import Document
from haystack.utils import Secret
from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder


Expand All @@ -14,7 +15,7 @@ def test_init_default(self):
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the document"
assert embedder.batch_size == 32
assert embedder.progress_bar is True
Expand All @@ -29,7 +30,7 @@ def test_init_with_parameters(self):
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
token=Secret.from_token("fake-api-token"),
instruction="Represent the 'domain' 'text_type' for 'task_objective'",
batch_size=64,
progress_bar=False,
Expand All @@ -39,7 +40,7 @@ def test_init_with_parameters(self):
)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token == Secret.from_token("fake-api-token")
assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
assert embedder.batch_size == 64
assert embedder.progress_bar is False
Expand All @@ -58,7 +59,7 @@ def test_to_dict(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the document",
"batch_size": 32,
"progress_bar": True,
Expand All @@ -75,7 +76,6 @@ def test_to_dict_with_custom_init_parameters(self):
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
instruction="Represent the financial document for retrieval",
batch_size=64,
progress_bar=False,
Expand All @@ -89,7 +89,7 @@ def test_to_dict_with_custom_init_parameters(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the financial document for retrieval",
"batch_size": 64,
"progress_bar": False,
Expand All @@ -108,7 +108,7 @@ def test_from_dict(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the 'domain' 'text_type' for 'task_objective'",
"batch_size": 32,
"progress_bar": True,
Expand All @@ -120,7 +120,7 @@ def test_from_dict(self):
embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
assert embedder.batch_size == 32
assert embedder.progress_bar is True
Expand All @@ -137,7 +137,7 @@ def test_from_dict_with_custom_init_parameters(self):
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the financial document for retrieval",
"batch_size": 64,
"progress_bar": False,
Expand All @@ -149,7 +149,7 @@ def test_from_dict_with_custom_init_parameters(self):
embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the financial document for retrieval"
assert embedder.batch_size == 64
assert embedder.progress_bar is False
Expand All @@ -168,7 +168,9 @@ def test_warmup(self, mocked_factory):
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
mocked_factory.get_embedding_backend.assert_called_once_with(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token=None
model_name_or_path="hkunlp/instructor-base",
device="cpu",
token=Secret.from_env_var("HF_API_TOKEN", strict=False),
)

@patch(
Expand Down
Loading