Skip to content

Commit

Permalink
⚗️ example about how to extract text from base64 yjs document
Browse files Browse the repository at this point in the history
  • Loading branch information
AntoLC committed Sep 19, 2024
1 parent ac86a4e commit 3122c6a
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 0 deletions.
34 changes: 34 additions & 0 deletions src/backend/core/tests/documents/test_api_documents_retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
Tests for Documents API endpoint in impress's core app: retrieve
"""

import base64
from bs4 import BeautifulSoup
import pytest
import y_py as Y

from rest_framework.test import APIClient

from core import factories, models
Expand Down Expand Up @@ -581,3 +585,33 @@ def test_api_documents_retrieve_authenticated_related_team_owners(
"created_at": document.created_at.isoformat().replace("+00:00", "Z"),
"updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
}


def test_read_document_yjs_blocknote():
    """
    Extract the plain text of a BlockNote document from its base64 Yjs dump.

    The base64 payload is decoded, applied as an update to a fresh Yjs
    document, and the "document-store" XML element is rendered as a string
    from which BeautifulSoup pulls the text content.
    """
    # Base64 string of the Yjs document saved in Minio after writing
    # "Hello world" in the blocknote editor.
    encoded_update = "ARCymr/3DgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcAspq/9w4AAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4BAwlwYXJhZ3JhcGgHALKav/cOAgYEALKav/cOAwFIKACymr/3DgINdGV4dEFsaWdubWVudAF3BGxlZnQoALKav/cOAQJpZAF3DmluaXRpYWxCbG9ja0lkKACymr/3DgEJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4BD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSHspq/9w4BAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4JAwlwYXJhZ3JhcGgoALKav/cOCg10ZXh0QWxpZ25tZW50AXcEbGVmdCgAspq/9w4JAmlkAXckMTFjYTgzYmEtZGM3OS00N2Q3LTllNzYtNmM4OTQwNzc1ZjE3KACymr/3DgkJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4JD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSEspq/9w4EC2VsbG8gd29ybGQgAA=="
    update = bytearray(base64.b64decode(encoded_update))

    # Replay the stored update onto an empty Yjs document.
    ydoc = Y.YDoc()
    Y.apply_update(ydoc, update)
    xml_markup = str(ydoc.get_xml_element("document-store"))

    # xml_markup will look like this:
    # <UNDEFINED>
    #   <blockGroup>
    #     <blockContainer "backgroundColor"="default" "id"="initialBlockId" "textColor"="default">
    #       <paragraph "textAlignment"="left">Hello world </paragraph>
    #     </blockContainer>
    #     <blockContainer "id"="11ca83ba-dc79-47d7-9e76-6c8940775f17" "backgroundColor"="default" "textColor"="default">
    #       <paragraph "textAlignment"="left"></paragraph>
    #     </blockContainer>
    #   </blockGroup>
    # </UNDEFINED>

    # BeautifulSoup extracts the text nodes from the structure above; the
    # surrounding whitespace is stripped before comparing.
    parsed = BeautifulSoup(xml_markup, "html.parser")
    extracted_text = parsed.get_text(separator=" ").strip()

    assert extracted_text == "Hello world"
2 changes: 2 additions & 0 deletions src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"boto3==1.35.10",
"Brotli==1.1.0",
"celery[redis]==5.4.0",
Expand Down Expand Up @@ -57,6 +58,7 @@ dependencies = [
"WeasyPrint>=60.2",
"whitenoise==6.7.0",
"mozilla-django-oidc==4.0.1",
"y-py==0.5.5",
]

[project.urls]
Expand Down

0 comments on commit 3122c6a

Please sign in to comment.