From d0ffe8b7c290b52753af8bc7729f1dd666db6a2f Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 15 Feb 2024 11:41:53 +0100 Subject: [PATCH] model_name_or_path > model (#418) * instructor - new secret management * fix coverage * retry coverage * model_name_or_path > model * linting * too much renaming :-) * fix --- integrations/astra/examples/example.py | 4 ++-- integrations/astra/examples/pipeline_example.py | 4 ++-- .../embedding_backend/instructor_backend.py | 12 +++++------- .../instructor_document_embedder.py | 6 +++--- .../instructor_embedders/instructor_text_embedder.py | 6 +++--- .../tests/test_instructor_backend.py | 10 ++++------ .../tests/test_instructor_document_embedder.py | 10 +++++----- .../tests/test_instructor_text_embedder.py | 10 +++++----- integrations/llama_cpp/README.md | 4 ++-- .../llama_cpp/examples/rag_pipeline_example.py | 4 ++-- 10 files changed, 33 insertions(+), 37 deletions(-) diff --git a/integrations/astra/examples/example.py b/integrations/astra/examples/example.py index 8ecb2eef0..6d88f3929 100644 --- a/integrations/astra/examples/example.py +++ b/integrations/astra/examples/example.py @@ -47,7 +47,7 @@ p.add_component(instance=DocumentCleaner(), name="cleaner") p.add_component(instance=DocumentSplitter(split_by="word", split_length=150, split_overlap=30), name="splitter") p.add_component( - instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder", ) p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer") @@ -63,7 +63,7 @@ # Create a querying pipeline on the indexed data q = Pipeline() q.add_component( - instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder", ) q.add_component("retriever", AstraEmbeddingRetriever(document_store)) diff --git a/integrations/astra/examples/pipeline_example.py b/integrations/astra/examples/pipeline_example.py index ac87488d9..09521dd64 100644 --- a/integrations/astra/examples/pipeline_example.py +++ b/integrations/astra/examples/pipeline_example.py @@ -62,7 +62,7 @@ ] p = Pipeline() p.add_component( - instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder", ) p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer") @@ -74,7 +74,7 @@ # Construct rag pipeline rag_pipeline = Pipeline() rag_pipeline.add_component( - instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder", ) rag_pipeline.add_component(instance=AstraEmbeddingRetriever(document_store=document_store), name="retriever") diff --git a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/embedding_backend/instructor_backend.py b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/embedding_backend/instructor_backend.py index cc743fd7c..717534aba 100644 --- a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/embedding_backend/instructor_backend.py +++ b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/embedding_backend/instructor_backend.py @@ -15,15 +15,13 @@ class _InstructorEmbeddingBackendFactory: _instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {} @staticmethod - def get_embedding_backend(model_name_or_path: str, device: Optional[str] = None, token: Optional[Secret] = None): - embedding_backend_id = f"{model_name_or_path}{device}{token}" + def get_embedding_backend(model: str, device: Optional[str] = None, token: Optional[Secret] = None): + embedding_backend_id = f"{model}{device}{token}" if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances: return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id] - embedding_backend = _InstructorEmbeddingBackend( - model_name_or_path=model_name_or_path, device=device, token=token - ) + embedding_backend = _InstructorEmbeddingBackend(model=model, device=device, token=token) _InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend return embedding_backend @@ -33,9 +31,9 @@ class _InstructorEmbeddingBackend: Class to manage INSTRUCTOR embeddings. """ - def __init__(self, model_name_or_path: str, device: Optional[str] = None, token: Optional[Secret] = None): + def __init__(self, model: str, device: Optional[str] = None, token: Optional[Secret] = None): self.model = INSTRUCTOR( - model_name_or_path=model_name_or_path, + model_name_or_path=model, device=device, use_auth_token=token.resolve_value() if token else None, ) diff --git a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py index 78dab9fb6..4246516a7 100644 --- a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py +++ b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py @@ -96,7 +96,7 @@ def __init__( :param embedding_separator: Separator used to concatenate the meta fields to the Document content. """ - self.model_name_or_path = model + self.model = model # TODO: remove device parameter and use Haystack's device management once migrated self.device = device or "cpu" self.token = token @@ -113,7 +113,7 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - model=self.model_name_or_path, + model=self.model, device=self.device, token=self.token.to_dict() if self.token else None, instruction=self.instruction, @@ -138,7 +138,7 @@ def warm_up(self): """ if not hasattr(self, "embedding_backend"): self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model_name_or_path=self.model_name_or_path, device=self.device, token=self.token + model=self.model, device=self.device, token=self.token ) @component.output_types(documents=List[Document]) diff --git a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py index 4383c9327..0299d076a 100644 --- a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py +++ b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py @@ -66,7 +66,7 @@ def __init__( :param normalize_embeddings: If set to true, returned vectors will have the length of 1. """ - self.model_name_or_path = model + self.model = model # TODO: remove device parameter and use Haystack's device management once migrated self.device = device or "cpu" self.token = token @@ -81,7 +81,7 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - model=self.model_name_or_path, + model=self.model, device=self.device, token=self.token.to_dict() if self.token else None, instruction=self.instruction, @@ -104,7 +104,7 @@ def warm_up(self): """ if not hasattr(self, "embedding_backend"): self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model_name_or_path=self.model_name_or_path, device=self.device, token=self.token + model=self.model, device=self.device, token=self.token ) @component.output_types(embedding=List[float]) diff --git a/integrations/instructor_embedders/tests/test_instructor_backend.py b/integrations/instructor_embedders/tests/test_instructor_backend.py index 012a55a36..85c1f012a 100644 --- a/integrations/instructor_embedders/tests/test_instructor_backend.py +++ b/integrations/instructor_embedders/tests/test_instructor_backend.py @@ -11,11 +11,11 @@ ) def test_factory_behavior(mock_instructor): # noqa: ARG001 embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model_name_or_path="hkunlp/instructor-large", device="cpu" + model="hkunlp/instructor-large", device="cpu" ) same_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-large", "cpu") another_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model_name_or_path="hkunlp/instructor-base", device="cpu" + model="hkunlp/instructor-base", device="cpu" ) assert same_embedding_backend is embedding_backend @@ -30,7 +30,7 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001 ) def test_model_initialization(mock_instructor): _InstructorEmbeddingBackendFactory.get_embedding_backend( - model_name_or_path="hkunlp/instructor-base", device="cpu", token=Secret.from_token("fake-api-token") + model="hkunlp/instructor-base", device="cpu", token=Secret.from_token("fake-api-token") ) mock_instructor.assert_called_once_with( model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="fake-api-token" @@ -43,9 +43,7 @@ def test_model_initialization(mock_instructor): "haystack_integrations.components.embedders.instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR" ) def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 - embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model_name_or_path="hkunlp/instructor-base" - ) + embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(model="hkunlp/instructor-base") data = [["instruction", "sentence1"], ["instruction", "sentence2"]] embedding_backend.embed(data=data, normalize_embeddings=True) diff --git a/integrations/instructor_embedders/tests/test_instructor_document_embedder.py b/integrations/instructor_embedders/tests/test_instructor_document_embedder.py index 4c6e008a2..ba1444900 100644 --- a/integrations/instructor_embedders/tests/test_instructor_document_embedder.py +++ b/integrations/instructor_embedders/tests/test_instructor_document_embedder.py @@ -13,7 +13,7 @@ def test_init_default(self): Test default initialization parameters for InstructorDocumentEmbedder. """ embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base") - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cpu" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the document" @@ -38,7 +38,7 @@ def test_init_with_parameters(self): meta_fields_to_embed=["test_field"], embedding_separator=" | ", ) - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cuda" assert embedder.token == Secret.from_token("fake-api-token") assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" @@ -118,7 +118,7 @@ def test_from_dict(self): }, } embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cpu" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" @@ -147,7 +147,7 @@ def test_from_dict_with_custom_init_parameters(self): }, } embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cuda" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the financial document for retrieval" @@ -168,7 +168,7 @@ def test_warmup(self, mocked_factory): mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model_name_or_path="hkunlp/instructor-base", + model="hkunlp/instructor-base", device="cpu", token=Secret.from_env_var("HF_API_TOKEN", strict=False), ) diff --git a/integrations/instructor_embedders/tests/test_instructor_text_embedder.py b/integrations/instructor_embedders/tests/test_instructor_text_embedder.py index b74970e43..d888fef65 100644 --- a/integrations/instructor_embedders/tests/test_instructor_text_embedder.py +++ b/integrations/instructor_embedders/tests/test_instructor_text_embedder.py @@ -12,7 +12,7 @@ def test_init_default(self): Test default initialization parameters for InstructorTextEmbedder. """ embedder = InstructorTextEmbedder(model="hkunlp/instructor-base") - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cpu" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the sentence" @@ -33,7 +33,7 @@ def test_init_with_parameters(self): progress_bar=False, normalize_embeddings=True, ) - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cuda" assert embedder.token == Secret.from_token("fake-api-token") assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" @@ -103,7 +103,7 @@ def test_from_dict(self): }, } embedder = InstructorTextEmbedder.from_dict(embedder_dict) - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cpu" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" @@ -128,7 +128,7 @@ def test_from_dict_with_custom_init_parameters(self): }, } embedder = InstructorTextEmbedder.from_dict(embedder_dict) - assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.model == "hkunlp/instructor-base" assert embedder.device == "cuda" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the financial document for retrieval" @@ -147,7 +147,7 @@ def test_warmup(self, mocked_factory): mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model_name_or_path="hkunlp/instructor-base", + model="hkunlp/instructor-base", device="cpu", token=Secret.from_env_var("HF_API_TOKEN", strict=False), ) diff --git a/integrations/llama_cpp/README.md b/integrations/llama_cpp/README.md index be33a8c4e..cb975f69b 100644 --- a/integrations/llama_cpp/README.md +++ b/integrations/llama_cpp/README.md @@ -161,7 +161,7 @@ Index the documents to the `InMemoryDocumentStore` using the `SentenceTransforme ```python doc_store = InMemoryDocumentStore(embedding_similarity_function="cosine") -doc_embedder = SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2") +doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") # Indexing Pipeline indexing_pipeline = Pipeline() @@ -188,7 +188,7 @@ GPT4 Correct Assistant: rag_pipeline = Pipeline() -text_embedder = SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2") +text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") # Load the LLM using LlamaCppGenerator model_path = "openchat-3.5-1210.Q3_K_S.gguf" diff --git a/integrations/llama_cpp/examples/rag_pipeline_example.py b/integrations/llama_cpp/examples/rag_pipeline_example.py index bb261ef50..76be99d32 100644 --- a/integrations/llama_cpp/examples/rag_pipeline_example.py +++ b/integrations/llama_cpp/examples/rag_pipeline_example.py @@ -23,7 +23,7 @@ ] doc_store = InMemoryDocumentStore(embedding_similarity_function="cosine") -doc_embedder = SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2") +doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") # Indexing Pipeline @@ -47,7 +47,7 @@ """ rag_pipeline = Pipeline() -text_embedder = SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2") +text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") model_path = "openchat-3.5-1210.Q3_K_S.gguf" generator = LlamaCppGenerator(model_path=model_path, n_ctx=4096, n_batch=128)