Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

model_name_or_path → model #418

Merged
merged 8 commits into the base branch from the source branch
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions integrations/astra/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
p.add_component(instance=DocumentCleaner(), name="cleaner")
p.add_component(instance=DocumentSplitter(split_by="word", split_length=150, split_overlap=30), name="splitter")
p.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")
Expand All @@ -63,7 +63,7 @@
# Create a querying pipeline on the indexed data
q = Pipeline()
q.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
q.add_component("retriever", AstraEmbeddingRetriever(document_store))
Expand Down
4 changes: 2 additions & 2 deletions integrations/astra/examples/pipeline_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
]
p = Pipeline()
p.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")
Expand All @@ -74,7 +74,7 @@
# Construct rag pipeline
rag_pipeline = Pipeline()
rag_pipeline.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
rag_pipeline.add_component(instance=AstraEmbeddingRetriever(document_store=document_store), name="retriever")
Expand Down
25 changes: 20 additions & 5 deletions integrations/instructor_embedders/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ root = "../.."
git_describe_command = 'git describe --tags --match="integrations/instructor_embedders-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = ["pytest", "pytest-cov", "haystack-pydoc-tools"]

dependencies = [
"coverage[toml]>=6.5",
"pytest",
"haystack-pydoc-tools",
]
[tool.hatch.envs.default.scripts]
cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=instructor-embedders --cov=tests"
no-cov = "cov --no-cov"
test = "pytest {args:tests}"
docs = "pydoc-markdown pydoc/config.yml"
test-cov = "coverage run -m pytest {args:tests}"
cov-report = [
"- coverage combine",
"coverage report",
]
cov = [
"test-cov",
"cov-report",
]
docs = [
"pydoc-markdown pydoc/config.yml"
]

[[tool.hatch.envs.test.matrix]]
python = ["38", "39", "310", "311"]
Expand All @@ -86,10 +98,13 @@ fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
all = ["style", "typing"]

[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
parallel = true

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.ruff]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import ClassVar, Dict, List, Optional, Union
from typing import ClassVar, Dict, List, Optional

from haystack.utils.auth import Secret
from InstructorEmbedding import INSTRUCTOR


Expand All @@ -14,17 +15,13 @@ class _InstructorEmbeddingBackendFactory:
_instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {}

@staticmethod
def get_embedding_backend(
model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"
def get_embedding_backend(model: str, device: Optional[str] = None, token: Optional[Secret] = None):
embedding_backend_id = f"{model}{device}{token}"

if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]

embedding_backend = _InstructorEmbeddingBackend(
model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
)
embedding_backend = _InstructorEmbeddingBackend(model=model, device=device, token=token)
_InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend

Expand All @@ -34,10 +31,12 @@ class _InstructorEmbeddingBackend:
Class to manage INSTRUCTOR embeddings.
"""

def __init__(
self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token)
def __init__(self, model: str, device: Optional[str] = None, token: Optional[Secret] = None):
self.model = INSTRUCTOR(
model_name_or_path=model,
device=device,
use_auth_token=token.resolve_value() if token else None,
)

def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
embeddings = self.model.encode(data, **kwargs).tolist()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the document",
batch_size: int = 32,
progress_bar: bool = True,
Expand Down Expand Up @@ -95,10 +96,10 @@ def __init__(
:param embedding_separator: Separator used to concatenate the meta fields to the Document content.
"""

self.model_name_or_path = model
self.model = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -112,9 +113,9 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
model=self.model_name_or_path,
model=self.model,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -128,6 +129,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -136,7 +138,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model=self.model, device=self.device, token=self.token
)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from haystack import component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand Down Expand Up @@ -38,7 +39,7 @@ def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the sentence",
batch_size: int = 32,
progress_bar: bool = True,
Expand All @@ -51,9 +52,7 @@ def __init__(
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, checks if a GPU can be used.
:param use_auth_token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param token: The API token used to download private models from Hugging Face.
:param instruction: The instruction string to be used while computing domain-specific embeddings.
The instruction follows the unified template of the form:
"Represent the 'domain' 'text_type' for 'task_objective'", where:
Expand All @@ -67,10 +66,10 @@ def __init__(
:param normalize_embeddings: If set to true, returned vectors will have the length of 1.
"""

self.model_name_or_path = model
self.model = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.instruction = instruction
self.batch_size = batch_size
self.progress_bar = progress_bar
Expand All @@ -82,9 +81,9 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
model=self.model_name_or_path,
model=self.model,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
Expand All @@ -96,6 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

def warm_up(self):
Expand All @@ -104,7 +104,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model=self.model, device=self.device, token=self.token
)

@component.output_types(embedding=List[float])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

from haystack.utils.auth import Secret
from haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend import (
_InstructorEmbeddingBackendFactory,
)
Expand All @@ -10,11 +11,11 @@
)
def test_factory_behavior(mock_instructor): # noqa: ARG001
embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-large", device="cpu"
model="hkunlp/instructor-large", device="cpu"
)
same_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-large", "cpu")
another_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base", device="cpu"
model="hkunlp/instructor-base", device="cpu"
)

assert same_embedding_backend is embedding_backend
Expand All @@ -29,10 +30,10 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001
)
def test_model_initialization(mock_instructor):
_InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model="hkunlp/instructor-base", device="cpu", token=Secret.from_token("fake-api-token")
)
mock_instructor.assert_called_once_with(
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="fake-api-token"
)
# restore the factory state
_InstructorEmbeddingBackendFactory._instances = {}
Expand All @@ -42,9 +43,7 @@ def test_model_initialization(mock_instructor):
"haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR"
)
def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001
embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="hkunlp/instructor-base"
)
embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(model="hkunlp/instructor-base")

data = [["instruction", "sentence1"], ["instruction", "sentence2"]]
embedding_backend.embed(data=data, normalize_embeddings=True)
Expand Down
Loading