From 46826d66f202118faaf54ec35b3ca4f9f81aaecb Mon Sep 17 00:00:00 2001 From: Ferran Llamas Date: Wed, 22 Jan 2025 10:31:23 +0100 Subject: [PATCH 1/3] Add support for extract strategies (#146) --- CHANGELOG.md | 2 +- nuclia/lib/kb.py | 6 ++++++ nuclia/sdk/upload.py | 20 ++++++++++++++++++++ requirements.txt | 6 +++--- 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d73564..e0508ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## 4.4.6 (unreleased) -- Nothing changed yet. +- Add support for extract strategy on file uploads, link and text fields. ## 4.4.5 (2025-01-16) diff --git a/nuclia/lib/kb.py b/nuclia/lib/kb.py index e8e156a..e51af97 100644 --- a/nuclia/lib/kb.py +++ b/nuclia/lib/kb.py @@ -263,6 +263,7 @@ def start_tus_upload( rid: Optional[str] = None, md5: Optional[str] = None, content_type: str = "application/octet-stream", + extract_strategy: Optional[str] = None, ): if self.writer_session is None: raise Exception("KB not configured") @@ -284,6 +285,8 @@ def start_tus_upload( headers["upload-metadata"] += ( f",md5 {base64.b64encode(md5.encode()).decode()}" ) + if extract_strategy is not None: + headers["x-extract-strategy"] = extract_strategy response: httpx.Response = self.writer_session.post(url, headers=headers) handle_http_errors(response) @@ -605,6 +608,7 @@ async def start_tus_upload( rid: Optional[str] = None, md5: Optional[str] = None, content_type: str = "application/octet-stream", + extract_strategy: Optional[str] = None, ): if self.writer_session is None: raise Exception("KB not configured") @@ -626,6 +630,8 @@ async def start_tus_upload( headers["upload-metadata"] += ( f",md5 {base64.b64encode(md5.encode()).decode()}" ) + if extract_strategy is not None: + headers["x-extract-strategy"] = extract_strategy response = await self.writer_session.post(url, headers=headers) handle_http_errors(response) diff --git a/nuclia/sdk/upload.py b/nuclia/sdk/upload.py index 2451401..7db3351 100644 --- a/nuclia/sdk/upload.py +++ b/nuclia/sdk/upload.py @@ -64,6 +64,7 @@ def file( interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, mimetype: Optional[str] = None, + extract_strategy: Optional[str] = None, **kwargs, ) -> Optional[str]: """Upload a file from filesystem to a Nuclia KnowledgeBox""" @@ -98,6 +99,7 @@ def file( filename=filename, content_type=mimetype, md5=md5_hash.hexdigest(), + extract_strategy=extract_strategy, ) offset = 0 @@ -197,6 +199,9 @@ def text( "format": format, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + texts[field]["extract_strategy"] = extract_strategy rid, is_new_resource = self._get_or_create_resource( texts=texts, icon=icon, @@ -226,6 +231,9 @@ def link( "css_selector": css_selector, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + links[field]["extract_strategy"] = extract_strategy kwargs["icon"] = "application/stf-link" rid, is_new_resource = self._get_or_create_resource( links=links, @@ -248,6 +256,7 @@ def remote( field: Optional[str] = "file", interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, + extract_strategy: Optional[str] = None, **kwargs, ) -> str: """Upload a remote url to a Nuclia KnowledgeBox""" @@ -279,6 +288,7 @@ def remote( size=size, filename=filename, content_type=mimetype, + extract_strategy=extract_strategy, ) offset = 0 for _ in tqdm(range((size // CHUNK_SIZE) + 1)): @@ -375,6 +385,7 @@ async def file( mimetype: Optional[str] = None, interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, + extract_strategy: Optional[str] = None, **kwargs, ) -> str: """Upload a file from filesystem to a Nuclia KnowledgeBox""" @@ -407,6 +418,7 @@ async def file( filename=filename, content_type=mimetype, md5=md5_hash.hexdigest(), + extract_strategy=extract_strategy, ) offset = 0 for _ in tqdm(range((size // CHUNK_SIZE) + 1)): @@ -503,6 +515,9 @@ async def text( "format": format, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + texts[field]["extract_strategy"] = extract_strategy rid, is_new_resource = await self._get_or_create_resource( texts=texts, icon=icon, @@ -530,6 +545,9 @@ async def link( "uri": uri, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + links[field]["extract_strategy"] = extract_strategy kwargs["icon"] = "application/stf-link" rid, is_new_resource = await self._get_or_create_resource( links=links, @@ -552,6 +570,7 @@ async def remote( field: Optional[str] = "file", interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, + extract_strategy: Optional[str] = None, **kwargs, ) -> str: """Upload a remote url to a Nuclia KnowledgeBox""" @@ -578,6 +597,7 @@ async def remote( size=size, filename=filename, content_type=mimetype, + extract_strategy=extract_strategy, ) offset = 0 with tqdm(total=(size // CHUNK_SIZE) + 1) as p_bar: diff --git a/requirements.txt b/requirements.txt index fdecd07..bf9367c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ requests httpx httpcore>=1.0.0 prompt_toolkit -nucliadb_sdk>=6.2.1.post2735,<7 -nucliadb_models>=6.2.1.post2735,<7 -nucliadb_protos>=6.2.1.post2735,<7 +nucliadb_sdk>=6.2.1.post2864,<7 +nucliadb_models>=6.2.1.post2864,<7 +nucliadb_protos>=6.2.1.post2864,<7 nuclia-models>=0.24.3 tqdm aiofiles From 26aa4f4492acdc5f2d05760132a670a598901654 Mon Sep 17 00:00:00 2001 From: Eric BREHAULT Date: Wed, 22 Jan 2025 10:32:12 +0100 Subject: [PATCH 2/3] Preparing release 4.4.6 --- CHANGELOG.md | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0508ba..c5d2054 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 4.4.6 (unreleased) +## 4.4.6 (2025-01-22) - Add support for extract strategy on file uploads, link and text fields. diff --git a/VERSION b/VERSION index 6f9e382..b98ff4c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.4.6.dev0 +4.4.6 From 9071e3436bf1dd6db64e18bcb13cb96133275810 Mon Sep 17 00:00:00 2001 From: Eric BREHAULT Date: Wed, 22 Jan 2025 10:32:25 +0100 Subject: [PATCH 3/3] Back to development: 4.4.7 --- CHANGELOG.md | 6 ++++++ VERSION | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5d2054..8f0009d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 4.4.7 (unreleased) + + +- Nothing changed yet. + + ## 4.4.6 (2025-01-22) diff --git a/VERSION b/VERSION index b98ff4c..2ba3d10 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.4.6 +4.4.7.dev0