From d7e47830d0fba3691782d11500c50479ef50954d Mon Sep 17 00:00:00 2001
From: Nick Byrne <55434794+nenb@users.noreply.github.com>
Date: Tue, 7 May 2024 09:42:37 -0300
Subject: [PATCH] Add endpoint for batch uploading document metadata (#404)

---
 ragna/deploy/_api/core.py                | 27 ++++++++
 ragna/deploy/_api/database.py            | 26 +++++++
 tests/deploy/api/test_batch_endpoints.py | 86 ++++++++++++++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 tests/deploy/api/test_batch_endpoints.py

diff --git a/ragna/deploy/_api/core.py b/ragna/deploy/_api/core.py
index 4f42f10d..5346b048 100644
--- a/ragna/deploy/_api/core.py
+++ b/ragna/deploy/_api/core.py
@@ -160,6 +160,33 @@ async def create_document_upload_info(
         )
         return schemas.DocumentUpload(parameters=parameters, document=document)
 
+    # TODO: Add UI support and documentation for this endpoint (#406)
+    @app.post("/documents")
+    async def create_documents_upload_info(
+        user: UserDependency,
+        names: Annotated[list[str], Body(..., embed=True)],
+    ) -> list[schemas.DocumentUpload]:
+        with get_session() as session:
+            document_metadata_collection = []
+            document_upload_collection = []
+            for name in names:
+                document = schemas.Document(name=name)
+                metadata, parameters = await config.document.get_upload_info(
+                    config=config, user=user, id=document.id, name=document.name
+                )
+                document_metadata_collection.append((document, metadata))
+                document_upload_collection.append(
+                    schemas.DocumentUpload(parameters=parameters, document=document)
+                )
+
+            database.add_documents(
+                session,
+                user=user,
+                document_metadata_collection=document_metadata_collection,
+            )
+            return document_upload_collection
+
+    # TODO: Add new endpoint for batch uploading documents (#407)
     @app.put("/document")
     async def upload_document(
         token: Annotated[str, Form()], file: UploadFile
diff --git a/ragna/deploy/_api/database.py b/ragna/deploy/_api/database.py
index cbf388e2..30d62ef9 100644
--- a/ragna/deploy/_api/database.py
+++ b/ragna/deploy/_api/database.py
@@ -55,6 +55,32 @@ def add_document(
     session.commit()
 
 
+def add_documents(
+    session: Session,
+    *,
+    user: str,
+    document_metadata_collection: list[tuple[schemas.Document, dict[str, Any]]],
+) -> None:
+    """
+    Add multiple documents to the database.
+
+    This function allows adding multiple documents at once by calling `add_all`. This is
+    important when there is non-negligible latency attached to each database operation.
+    """
+    user_id = _get_user_id(session, user)
+    documents = [
+        orm.Document(
+            id=document.id,
+            user_id=user_id,
+            name=document.name,
+            metadata_=metadata,
+        )
+        for document, metadata in document_metadata_collection
+    ]
+    session.add_all(documents)
+    session.commit()
+
+
 def _orm_to_schema_document(document: orm.Document) -> schemas.Document:
     return schemas.Document(id=document.id, name=document.name)
 
diff --git a/tests/deploy/api/test_batch_endpoints.py b/tests/deploy/api/test_batch_endpoints.py
new file mode 100644
index 00000000..94740750
--- /dev/null
+++ b/tests/deploy/api/test_batch_endpoints.py
@@ -0,0 +1,86 @@
+from fastapi import status
+from fastapi.testclient import TestClient
+
+from ragna.deploy import Config
+from ragna.deploy._api import app
+
+from .utils import authenticate
+
+
+def test_batch_sequential_upload_equivalence(tmp_local_root):
+    "Check that uploading documents sequentially and in batch gives the same result"
+    config = Config(local_root=tmp_local_root)
+
+    document_root = config.local_root / "documents"
+    document_root.mkdir()
+    document_path1 = document_root / "test1.txt"
+    with open(document_path1, "w") as file:
+        file.write("!\n")
+    document_path2 = document_root / "test2.txt"
+    with open(document_path2, "w") as file:
+        file.write("?\n")
+
+    with TestClient(
+        app(config=config, ignore_unavailable_components=False)
+    ) as client:
+        authenticate(client)
+
+        document1_upload = (
+            client.post("/document", json={"name": document_path1.name})
+            .raise_for_status()
+            .json()
+        )
+        document2_upload = (
+            client.post("/document", json={"name": document_path2.name})
+            .raise_for_status()
+            .json()
+        )
+
+        documents_upload = (
+            client.post(
+                "/documents", json={"names": [document_path1.name, document_path2.name]}
+            )
+            .raise_for_status()
+            .json()
+        )
+
+        assert (
+            document1_upload["parameters"]["url"]
+            == documents_upload[0]["parameters"]["url"]
+        )
+        assert (
+            document2_upload["parameters"]["url"]
+            == documents_upload[1]["parameters"]["url"]
+        )
+
+        assert (
+            document1_upload["document"]["name"]
+            == documents_upload[0]["document"]["name"]
+        )
+        assert (
+            document2_upload["document"]["name"]
+            == documents_upload[1]["document"]["name"]
+        )
+
+        # assuming that if test passes for first document it will also pass for the other
+        with open(document_path1, "rb") as file:
+            response_sequential_upload1 = client.request(
+                document1_upload["parameters"]["method"],
+                document1_upload["parameters"]["url"],
+                data=document1_upload["parameters"]["data"],
+                files={"file": file},
+            )
+            response_batch_upload1 = client.request(
+                documents_upload[0]["parameters"]["method"],
+                documents_upload[0]["parameters"]["url"],
+                data=documents_upload[0]["parameters"]["data"],
+                files={"file": file},
+            )
+
+        assert response_sequential_upload1.status_code == status.HTTP_200_OK
+        assert response_batch_upload1.status_code == status.HTTP_200_OK
+
+        assert (
+            response_sequential_upload1.json()["name"]
+            == response_batch_upload1.json()["name"]
+        )
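
Usage sketch (not part of the patch): the snippet below shows how a client could call the new POST /documents endpoint against a running ragna API and then upload each file with the returned parameters, mirroring what the test above does with the TestClient. The base URL, the token value, and the file names are placeholder assumptions; authentication is unchanged by this patch and is assumed to have already produced a bearer token.

import httpx

API_URL = "http://127.0.0.1:31476"  # placeholder; adjust to where the ragna API is served
TOKEN = "<bearer token from the existing auth flow>"  # placeholder

client = httpx.Client(base_url=API_URL, headers={"Authorization": f"Bearer {TOKEN}"})

# One request returns the upload info for all documents at once.
names = ["report.pdf", "notes.txt"]  # hypothetical file names
uploads = client.post("/documents", json={"names": names}).raise_for_status().json()

# Each document is still uploaded individually with the parameters returned
# for it (method, url, and form data), exactly as with the single-document flow.
for name, upload in zip(names, uploads):
    with open(name, "rb") as file:
        client.request(
            upload["parameters"]["method"],
            upload["parameters"]["url"],
            data=upload["parameters"]["data"],
            files={"file": file},
        ).raise_for_status()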