diff --git a/integrations/unstructured/pyproject.toml b/integrations/unstructured/pyproject.toml index 7366a8adf..298fdb993 100644 --- a/integrations/unstructured/pyproject.toml +++ b/integrations/unstructured/pyproject.toml @@ -156,15 +156,13 @@ ban-relative-imports = "parents" "tests/**/*" = ["PLR2004", "S101", "TID252"] [tool.coverage.run] -source_pkgs = ["src", "tests"] +source = ["haystack_integrations"] branch = true parallel = true -[tool.coverage.paths] -unstructured_fileconverter_haystack = ["src/haystack_integrations", "*/unstructured-fileconverter-haystack/src"] -tests = ["tests", "*/unstructured-fileconverter-haystack/tests"] - [tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing=true exclude_lines = [ "no cov", "if __name__ == .__main__.:", diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 54cbd5559..a4a132437 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -10,6 +10,7 @@ from haystack import Document, component, default_to_dict from haystack.components.converters.utils import normalize_metadata +from haystack.utils import Secret from tqdm import tqdm from unstructured.documents.elements import Element # type: ignore[import] @@ -29,7 +30,7 @@ class UnstructuredFileConverter: def __init__( self, api_url: str = UNSTRUCTURED_HOSTED_API_URL, - api_key: Optional[str] = None, + api_key: Optional[Secret] = Secret.from_env_var("UNSTRUCTURED_API_KEY", strict=False), # noqa: B008 document_creation_mode: Literal[ "one-doc-per-file", "one-doc-per-page", "one-doc-per-element" ] = "one-doc-per-file", @@ -57,6 +58,7 @@ def __init__( """ self.api_url = api_url + self.api_key = api_key self.document_creation_mode = document_creation_mode self.unstructured_kwargs = unstructured_kwargs or {} self.separator = separator @@ -64,17 +66,15 @@ def __init__( is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL - api_key = api_key or os.environ.get("UNSTRUCTURED_API_KEY") # we check whether api_key is None or an empty string - if is_hosted_api and not api_key: + api_key_value = api_key.resolve_value() if api_key else None + if is_hosted_api and not api_key_value: msg = ( "To use the hosted version of Unstructured, you need to set the environment variable " - "UNSTRUCTURED_API_KEY (recommended) or explictly pass the parameter api_key." + "UNSTRUCTURED_API_KEY (recommended) or explicitly pass the parameter api_key." ) raise ValueError(msg) - self.api_key = api_key - def to_dict(self) -> Dict[str, Any]: """ Serialize this component to a dictionary. @@ -84,6 +84,7 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, api_url=self.api_url, + api_key=self.api_key.to_dict() if self.api_key else None, document_creation_mode=self.document_creation_mode, separator=self.separator, unstructured_kwargs=self.unstructured_kwargs, @@ -140,8 +141,8 @@ def run( documents.extend(docs_for_file) return {"documents": documents} + @staticmethod def _create_documents( - self, filepath: Path, elements: List[Element], document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"], @@ -194,7 +195,10 @@ def _partition_file_into_elements(self, filepath: Path) -> List[Element]: elements = [] try: elements = partition_via_api( - filename=str(filepath), api_url=self.api_url, api_key=self.api_key, **self.unstructured_kwargs + filename=str(filepath), + api_url=self.api_url, + api_key=self.api_key.resolve_value() if self.api_key else None, + **self.unstructured_kwargs, ) except Exception as e: logger.warning(f"Unstructured could not process file {filepath}. Error: {e}") diff --git a/integrations/unstructured/tests/conftest.py b/integrations/unstructured/tests/conftest.py new file mode 100644 index 000000000..fa02cc5dd --- /dev/null +++ b/integrations/unstructured/tests/conftest.py @@ -0,0 +1,13 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture +def set_env_variables(monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_API_KEY", "test-api-key") + + +@pytest.fixture +def samples_path(): + return Path(__file__).parent / "samples" diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index e03e2e58e..7a5e135ac 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -1,22 +1,16 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from pathlib import Path - import pytest from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter -@pytest.fixture -def samples_path(): - return Path(__file__).parent / "samples" - - class TestUnstructuredFileConverter: + @pytest.mark.usefixtures("set_env_variables") def test_init_default(self): - converter = UnstructuredFileConverter(api_key="test-api-key") + converter = UnstructuredFileConverter() assert converter.api_url == "https://api.unstructured.io/general/v0/general" - assert converter.api_key == "test-api-key" + assert converter.api_key.resolve_value() == "test-api-key" assert converter.document_creation_mode == "one-doc-per-file" assert converter.separator == "\n\n" assert converter.unstructured_kwargs == {} @@ -31,20 +25,26 @@ def test_init_with_parameters(self): progress_bar=False, ) assert converter.api_url == "http://custom-url:8000/general" - assert converter.api_key is None + assert converter.api_key.resolve_value() is None assert converter.document_creation_mode == "one-doc-per-element" assert converter.separator == "|" assert converter.unstructured_kwargs == {"foo": "bar"} assert not converter.progress_bar + def test_init_hosted_without_api_key_raises_error(self): + with pytest.raises(ValueError): + UnstructuredFileConverter(api_url="https://api.unstructured.io/general/v0/general") + + @pytest.mark.usefixtures("set_env_variables") def test_to_dict(self): - converter = UnstructuredFileConverter(api_key="test-api-key") + converter = UnstructuredFileConverter() converter_dict = converter.to_dict() assert converter_dict == { "type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredFileConverter", "init_parameters": { "api_url": "https://api.unstructured.io/general/v0/general", + "api_key": {"env_vars": ["UNSTRUCTURED_API_KEY"], "strict": False, "type": "env_var"}, "document_creation_mode": "one-doc-per-file", "separator": "\n\n", "unstructured_kwargs": {},