From 1ee8e5fdba929e6142f6ba51f924b903eeea4e2c Mon Sep 17 00:00:00 2001 From: Anthony LC Date: Fri, 20 Sep 2024 10:43:24 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9A=97=EF=B8=8F(backend)=20function=20to=20e?= =?UTF-8?q?xtract=20text=20from=20base64=20yjs=20document?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function to extract text from base64 yjs document. Can be useful if we need to index the content of the documents. --- CHANGELOG.md | 4 ++++ src/backend/core/tests/test_utils.py | 28 +++++++++++++++++++++++++++- src/backend/core/utils.py | 18 ++++++++++++++++++ src/backend/pyproject.toml | 2 ++ 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3289e0be4..1cbfb68f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to ## [Unreleased] +## Added + +- ⚗️(backend) Extract text from base64 yjs document #270 + ## [1.4.0] - 2024-09-17 diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 288d31971..d25242567 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -10,7 +10,7 @@ import pytest -from core.utils import email_invitation +from core.utils import email_invitation, yjs_base64_to_text pytestmark = pytest.mark.django_db @@ -85,3 +85,29 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail): assert email == "guest@example.com" assert isinstance(exception, smtplib.SMTPException) + + +def test_yjs_base64_to_text(): + """ + Test extract_text_from_saved_yjs_document + This base64 string is an example of what is saved in the database. 
+ This base64 is generated from the blocknote editor, it contains + the text \n# *Hello* \n- w**or**ld + """ + base64_string = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" + ) + + assert yjs_base64_to_text(base64_string) == "Hello world" diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index b3767eafd..e35545d25 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -2,6 +2,7 @@ Utilities for the core app. 
""" +import base64 import smtplib from logging import getLogger @@ -12,6 +13,9 @@ from django.utils.translation import gettext_lazy as _ from django.utils.translation import override +import y_py as Y +from bs4 import BeautifulSoup + logger = getLogger(__name__) @@ -38,3 +42,17 @@ def email_invitation(language, email, document_id): except smtplib.SMTPException as exception: logger.error("invitation to %s was not sent: %s", email, exception) + + +def yjs_base64_to_text(base64_string): + """Extract text from base64 yjs document""" + + decoded_bytes = base64.b64decode(base64_string) + uint8_array = bytearray(decoded_bytes) + + doc = Y.YDoc() # pylint: disable=E1101 + Y.apply_update(doc, uint8_array) # pylint: disable=E1101 + blocknote_structure = str(doc.get_xml_element("document-store")) + + soup = BeautifulSoup(blocknote_structure, "html.parser") + return soup.get_text(separator=" ").strip() diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index ccf044f33..0c8bc504e 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -25,6 +25,7 @@ license = { file = "LICENSE" } readme = "README.md" requires-python = ">=3.10" dependencies = [ + "beautifulsoup4==4.12.3", "boto3==1.35.10", "Brotli==1.1.0", "celery[redis]==5.4.0", @@ -57,6 +58,7 @@ dependencies = [ "WeasyPrint>=60.2", "whitenoise==6.7.0", "mozilla-django-oidc==4.0.1", + "y-py==0.5.5", ] [project.urls]