def test_read_document_yjs_blocknote():
    """
    Example: extract the plain text of a BlockNote document from its base64 Yjs form.

    The base64 payload is decoded, applied as an update to an empty Yjs
    document, the "document-store" XML element is serialized to a string and
    its markup is stripped so that only the text typed by the user is left.

    Relies on the file-level imports ``base64``, ``y_py`` (as ``Y``) and
    ``BeautifulSoup`` (``beautifulsoup4==4.12.3`` and ``y-py==0.5.5`` in the
    backend dependencies).
    """
    # "Hello world" was typed in the BlockNote editor; this is the base64
    # string of the resulting Yjs document as it is saved in Minio.
    base64_string = "ARCymr/3DgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcAspq/9w4AAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4BAwlwYXJhZ3JhcGgHALKav/cOAgYEALKav/cOAwFIKACymr/3DgINdGV4dEFsaWdubWVudAF3BGxlZnQoALKav/cOAQJpZAF3DmluaXRpYWxCbG9ja0lkKACymr/3DgEJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4BD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSHspq/9w4BAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4JAwlwYXJhZ3JhcGgoALKav/cOCg10ZXh0QWxpZ25tZW50AXcEbGVmdCgAspq/9w4JAmlkAXckMTFjYTgzYmEtZGM3OS00N2Q3LTllNzYtNmM4OTQwNzc1ZjE3KACymr/3DgkJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4JD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSEspq/9w4EC2VsbG8gd29ybGQgAA=="
    update = base64.b64decode(base64_string)

    # The decoded bytes are handed to apply_update as-is; wrapping them in a
    # bytearray first would only add a copy.
    ydoc = Y.YDoc()
    Y.apply_update(ydoc, update)
    blocknote = str(ydoc.get_xml_element("document-store"))

    # `blocknote` now holds XML-like markup. Judging by the names embedded in
    # the payload, it should be a blockGroup containing two blockContainer
    # elements, each wrapping a paragraph, with "Hello world" as the text of
    # the first paragraph and the second paragraph empty.
    # NOTE(review): exact tag layout and attribute order come from y_py's
    # serialization - confirm against real output before relying on them.

    # BeautifulSoup is used to drop the markup and keep only the text nodes.
    soup = BeautifulSoup(blocknote, "html.parser")
    extracted_text = soup.get_text(separator=" ").strip()

    assert extracted_text == "Hello world"