Skip to content

Commit

Permalink
Merge branch 'main' into langchain_adapter
Browse files Browse the repository at this point in the history
  • Loading branch information
drf7 authored Jan 22, 2025
2 parents b6fe2cb + 9071e34 commit cf693a5
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 5 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
# Changelog

## 4.4.6 (unreleased)
## 4.4.7 (unreleased)


- Nothing changed yet.


## 4.4.6 (2025-01-22)


- Add support for an extract strategy on file uploads and on link and text fields.


## 4.4.5 (2025-01-16)


Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4.4.6.dev0
4.4.7.dev0
6 changes: 6 additions & 0 deletions nuclia/lib/kb.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ def start_tus_upload(
rid: Optional[str] = None,
md5: Optional[str] = None,
content_type: str = "application/octet-stream",
extract_strategy: Optional[str] = None,
):
if self.writer_session is None:
raise Exception("KB not configured")
Expand All @@ -284,6 +285,8 @@ def start_tus_upload(
headers["upload-metadata"] += (
f",md5 {base64.b64encode(md5.encode()).decode()}"
)
if extract_strategy is not None:
headers["x-extract-strategy"] = extract_strategy

response: httpx.Response = self.writer_session.post(url, headers=headers)
handle_http_errors(response)
Expand Down Expand Up @@ -605,6 +608,7 @@ async def start_tus_upload(
rid: Optional[str] = None,
md5: Optional[str] = None,
content_type: str = "application/octet-stream",
extract_strategy: Optional[str] = None,
):
if self.writer_session is None:
raise Exception("KB not configured")
Expand All @@ -626,6 +630,8 @@ async def start_tus_upload(
headers["upload-metadata"] += (
f",md5 {base64.b64encode(md5.encode()).decode()}"
)
if extract_strategy is not None:
headers["x-extract-strategy"] = extract_strategy

response = await self.writer_session.post(url, headers=headers)
handle_http_errors(response)
Expand Down
20 changes: 20 additions & 0 deletions nuclia/sdk/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def file(
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
mimetype: Optional[str] = None,
extract_strategy: Optional[str] = None,
**kwargs,
) -> Optional[str]:
"""Upload a file from filesystem to a Nuclia KnowledgeBox"""
Expand Down Expand Up @@ -98,6 +99,7 @@ def file(
filename=filename,
content_type=mimetype,
md5=md5_hash.hexdigest(),
extract_strategy=extract_strategy,
)

offset = 0
Expand Down Expand Up @@ -197,6 +199,9 @@ def text(
"format": format,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
texts[field]["extract_strategy"] = extract_strategy
rid, is_new_resource = self._get_or_create_resource(
texts=texts,
icon=icon,
Expand Down Expand Up @@ -226,6 +231,9 @@ def link(
"css_selector": css_selector,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
links[field]["extract_strategy"] = extract_strategy
kwargs["icon"] = "application/stf-link"
rid, is_new_resource = self._get_or_create_resource(
links=links,
Expand All @@ -248,6 +256,7 @@ def remote(
field: Optional[str] = "file",
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
extract_strategy: Optional[str] = None,
**kwargs,
) -> str:
"""Upload a remote url to a Nuclia KnowledgeBox"""
Expand Down Expand Up @@ -279,6 +288,7 @@ def remote(
size=size,
filename=filename,
content_type=mimetype,
extract_strategy=extract_strategy,
)
offset = 0
for _ in tqdm(range((size // CHUNK_SIZE) + 1)):
Expand Down Expand Up @@ -375,6 +385,7 @@ async def file(
mimetype: Optional[str] = None,
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
extract_strategy: Optional[str] = None,
**kwargs,
) -> str:
"""Upload a file from filesystem to a Nuclia KnowledgeBox"""
Expand Down Expand Up @@ -407,6 +418,7 @@ async def file(
filename=filename,
content_type=mimetype,
md5=md5_hash.hexdigest(),
extract_strategy=extract_strategy,
)
offset = 0
for _ in tqdm(range((size // CHUNK_SIZE) + 1)):
Expand Down Expand Up @@ -503,6 +515,9 @@ async def text(
"format": format,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
texts[field]["extract_strategy"] = extract_strategy
rid, is_new_resource = await self._get_or_create_resource(
texts=texts,
icon=icon,
Expand Down Expand Up @@ -530,6 +545,9 @@ async def link(
"uri": uri,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
links[field]["extract_strategy"] = extract_strategy
kwargs["icon"] = "application/stf-link"
rid, is_new_resource = await self._get_or_create_resource(
links=links,
Expand All @@ -552,6 +570,7 @@ async def remote(
field: Optional[str] = "file",
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
extract_strategy: Optional[str] = None,
**kwargs,
) -> str:
"""Upload a remote url to a Nuclia KnowledgeBox"""
Expand All @@ -578,6 +597,7 @@ async def remote(
size=size,
filename=filename,
content_type=mimetype,
extract_strategy=extract_strategy,
)
offset = 0
with tqdm(total=(size // CHUNK_SIZE) + 1) as p_bar:
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ requests
httpx
httpcore>=1.0.0
prompt_toolkit
nucliadb_sdk>=6.2.1.post2735,<7
nucliadb_models>=6.2.1.post2735,<7
nucliadb_protos>=6.2.1.post2735,<7
nucliadb_sdk>=6.2.1.post2864,<7
nucliadb_models>=6.2.1.post2864,<7
nucliadb_protos>=6.2.1.post2864,<7
nuclia-models>=0.24.3
tqdm
aiofiles
Expand Down

0 comments on commit cf693a5

Please sign in to comment.