Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
christianhg committed Jan 27, 2025
1 parent 6416dfa commit 3546f53
Show file tree
Hide file tree
Showing 14 changed files with 536 additions and 5 deletions.
13 changes: 8 additions & 5 deletions packages/block-tools/src/HtmlDeserializer/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,14 @@ export default class HtmlDeserializer {
const fragment = parseHtml(html)
const children = Array.from(fragment.childNodes) as HTMLElement[]
// Ensure that there are no blocks within blocks, and trim whitespace
const blocks = trimWhitespace(
flattenNestedBlocks(
ensureRootIsBlocks(this.deserializeElements(children)),
),
)
const deserializedElements = this.deserializeElements(children)
// console.log('deserializedElements', deserializedElements)
const rawBlocks = ensureRootIsBlocks(deserializedElements)
// console.log('rawBlocks', rawBlocks)
const flattenedBlocks = flattenNestedBlocks(rawBlocks)
// console.log('flattenedBlocks', flattenedBlocks)
const blocks = trimWhitespace(flattenedBlocks)
// console.log('blocks', blocks)

if (this._markDefs.length > 0) {
blocks
Expand Down
2 changes: 2 additions & 0 deletions packages/block-tools/src/HtmlDeserializer/rules/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import type {BlockEnabledFeatures, DeserializerRule} from '../../types'
import createGDocsRules from './gdocs'
import createHTMLRules from './html'
import createNotionRules from './notion'
import {createOfficeOnlineRules} from './office-online'
import createWordRules from './word'

export function createRules(
Expand All @@ -13,6 +14,7 @@ export function createRules(
...createWordRules(),
...createNotionRules(blockContentType),
...createGDocsRules(blockContentType, options),
...createOfficeOnlineRules(),
...createHTMLRules(blockContentType, options),
]
}
128 changes: 128 additions & 0 deletions packages/block-tools/src/HtmlDeserializer/rules/office-online.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import {BLOCK_DEFAULT_STYLE, DEFAULT_BLOCK, DEFAULT_SPAN} from '../../constants'
import {DeserializerRule} from '../../types'
import {isElement, tagName} from '../helpers'
import {spanRule} from './span'

/**
 * Creates the deserializer rules for HTML copied from Office Online.
 *
 * Three rules are returned, in this order:
 *
 * 1. Headings – `<p role="heading" aria-level="N">` becomes a block with
 *    style `hN`.
 * 2. Lists – `<li data-aria-level="N">` inside a `<ul>`/`<ol>` becomes a
 *    `bullet`/`number` list item block at level `N`.
 * 3. Spans – `<span class="TextRun …">` becomes a span whose marks are derived
 *    from the inline `style` attribute.
 *
 * Every rule returns `undefined` when it does not recognise the element.
 * NOTE(review): presumably the deserializer then tries the remaining rules
 * for that element – confirm against `HtmlDeserializer`.
 *
 * @returns The Office Online deserializer rules.
 */
export function createOfficeOnlineRules(): Array<DeserializerRule> {
  return [
    {
      // Headings
      deserialize(el, next) {
        if (!isElement(el) || tagName(el) !== 'p') {
          return undefined
        }

        if (el.getAttribute('role') !== 'heading') {
          return undefined
        }

        // A missing attribute is parsed as '' which yields NaN.
        const level = Number.parseInt(el.getAttribute('aria-level') ?? '', 10)

        // Only h1–h6 are meaningful heading styles. Rejecting NaN and
        // out-of-range values avoids emitting styles like `hNaN` or `h7`.
        if (!Number.isInteger(level) || level < 1 || level > 6) {
          return undefined
        }

        return {
          ...DEFAULT_BLOCK,
          style: `h${level}`,
          children: next(el.childNodes),
        }
      },
    },
    {
      // Lists
      deserialize(el, next) {
        if (!isElement(el) || tagName(el) !== 'li') {
          return undefined
        }

        // A missing attribute is parsed as '' which yields NaN.
        const level = Number.parseInt(
          el.getAttribute('data-aria-level') ?? '',
          10,
        )

        // The list type is decided by the direct parent of the `<li>`.
        const parentTag = tagName(el.parentElement)
        const listItem =
          parentTag === 'ul'
            ? 'bullet'
            : parentTag === 'ol'
              ? 'number'
              : undefined

        // Without a known list type and a usable nesting level this is not a
        // list item this rule can describe. NaN must be rejected explicitly,
        // it would otherwise be written to the block as its level.
        if (listItem === undefined || !Number.isInteger(level) || level < 1) {
          return undefined
        }

        return {
          ...DEFAULT_BLOCK,
          listItem,
          level,
          children: next(el.childNodes),
          style: BLOCK_DEFAULT_STYLE,
        }
      },
    },
    {
      // Spans
      deserialize(el) {
        if (!isElement(el) || tagName(el) !== 'span') {
          return undefined
        }

        // `EOP` spans trail each paragraph and, in the test fixtures, hold a
        // single space only. This rule never turns them into text.
        if (el.classList.contains('EOP')) {
          return undefined
        }

        if (!el.classList.contains('TextRun')) {
          return undefined
        }

        const marks: Array<string> = []
        const style = el.getAttribute('style')

        if (style) {
          if (/font-style\s*:\s*italic/.test(style)) {
            marks.push('em')
          }

          if (/font-weight\s*:\s*bold/.test(style)) {
            marks.push('strong')
          }

          // Text runs inside a link carry an underline style as well. That
          // underline is skipped so link text is not marked as underlined.
          if (
            /text-decoration\s*:\s*underline/.test(style) &&
            tagName(el.parentNode) !== 'a'
          ) {
            marks.push('underline')
          }
        }

        // A `TextRun` wraps one or more `NormalTextRun` spans; `textContent`
        // joins them. Runs of two or more whitespace characters collapse into
        // a single space.
        const text = (el.textContent ?? '').replace(/\s\s+/g, ' ')

        return {
          ...DEFAULT_SPAN,
          marks,
          text,
        }
      },
    },
  ]
}

/**
 * Tells whether an element carries one of the Office Online text run classes
 * (`TextRun` or `NormalTextRun`).
 */
function isOfficeOnlineElement(element: Element) {
  return ['TextRun', 'NormalTextRun'].some((className) =>
    element.classList.contains(className),
  )
}
22 changes: 22 additions & 0 deletions packages/block-tools/src/HtmlDeserializer/rules/span.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import {DEFAULT_SPAN} from '../../constants'
import {DeserializerRule} from '../../types'
import {isElement, tagName} from '../helpers'

/**
 * Rule for a `<span>` whose only child is a single text node. The result is a
 * span without marks, with runs of two or more whitespace characters
 * collapsed into a single space. Any other node yields `undefined`.
 */
export const spanRule: DeserializerRule = {
  deserialize: (node, next) => {
    if (!isElement(node) || tagName(node) !== 'span') {
      return undefined
    }

    if (node.childNodes.length !== 1) {
      return undefined
    }

    const onlyChild = node.childNodes[0]

    if (onlyChild.nodeName !== '#text') {
      return undefined
    }

    const text = (onlyChild.textContent ?? '').replace(/\s\s+/g, ' ')

    return {...DEFAULT_SPAN, marks: [], text}
  },
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<meta charset='utf-8'><div class="OutlineElement Ltr SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; clear: both; cursor: text; overflow: visible; position: relative; direction: ltr; color: rgb(0, 0, 0); font-family: &quot;Segoe UI&quot;, &quot;Segoe UI Web&quot;, Arial, Verdana, sans-serif; font-size: 12px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: normal; background-color: rgb(255, 255, 255); text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial;"><p class="Paragraph SCXW104514979 BCX0" role="heading" aria-level="1" xml:lang="EN-US" lang="EN-US" paraid="1941180722" paraeid="{aa52cbe1-de22-42de-8a6b-3bf858441b21}{162}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 24px 0px 5.33333px; padding: 0px; user-select: text; overflow-wrap: break-word; white-space: pre-wrap; font-weight: normal; font-style: normal; vertical-align: baseline; font-kerning: none; background-color: transparent; color: rgb(15, 71, 97); text-align: left; text-indent: 0px;"><span data-contrast="none" xml:lang="EN-US" lang="EN-US" class="TextRun SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-variant-ligatures: none !important; color: rgb(15, 71, 97); font-size: 20pt; font-style: normal; line-height: 36.0375px; font-family: &quot;Aptos Display&quot;, &quot;Aptos Display_EmbeddedFont&quot;, &quot;Aptos Display_MSFontService&quot;, sans-serif;"><span class="NormalTextRun SCXW104514979 BCX0" data-ccp-parastyle="heading 1" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 
0px; user-select: text;">Headin</span><span class="NormalTextRun SCXW104514979 BCX0" data-ccp-parastyle="heading 1" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text;">g 1</span></span><span class="EOP SCXW104514979 BCX0" data-ccp-props="{&quot;134245418&quot;:true,&quot;134245529&quot;:true,&quot;335559738&quot;:360,&quot;335559739&quot;:80}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-size: 20pt; line-height: 36.0375px; font-family: &quot;Aptos Display&quot;, &quot;Aptos Display_EmbeddedFont&quot;, &quot;Aptos Display_MSFontService&quot;, sans-serif; color: rgb(15, 71, 97);"> </span></p></div><div class="OutlineElement Ltr SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; clear: both; cursor: text; overflow: visible; position: relative; direction: ltr; color: rgb(0, 0, 0); font-family: &quot;Segoe UI&quot;, &quot;Segoe UI Web&quot;, Arial, Verdana, sans-serif; font-size: 12px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: normal; background-color: rgb(255, 255, 255); text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial;"><p class="Paragraph SCXW104514979 BCX0" role="heading" aria-level="2" xml:lang="EN-US" lang="EN-US" paraid="2032552764" paraeid="{6b2bb504-0a9f-47cd-a86d-c572a25d51fa}{100}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 10.6667px 0px 5.33333px; padding: 0px; user-select: text; overflow-wrap: break-word; white-space: pre-wrap; font-weight: normal; font-style: normal; vertical-align: baseline; font-kerning: none; 
background-color: transparent; color: rgb(15, 71, 97); text-align: left; text-indent: 0px;"><span data-contrast="none" xml:lang="EN-US" lang="EN-US" class="TextRun SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-variant-ligatures: none !important; color: rgb(15, 71, 97); font-size: 16pt; font-style: normal; line-height: 27.9px; font-family: &quot;Aptos Display&quot;, &quot;Aptos Display_EmbeddedFont&quot;, &quot;Aptos Display_MSFontService&quot;, sans-serif;"><span class="NormalTextRun SCXW104514979 BCX0" data-ccp-parastyle="heading 2" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text;">Heading 2</span></span><span class="EOP SCXW104514979 BCX0" data-ccp-props="{&quot;134245418&quot;:true,&quot;134245529&quot;:true,&quot;335559738&quot;:160,&quot;335559739&quot;:80}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-size: 16pt; line-height: 27.9px; font-family: &quot;Aptos Display&quot;, &quot;Aptos Display_EmbeddedFont&quot;, &quot;Aptos Display_MSFontService&quot;, sans-serif; color: rgb(15, 71, 97);"> </span></p></div><div class="OutlineElement Ltr SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; clear: both; cursor: text; overflow: visible; position: relative; direction: ltr; color: rgb(0, 0, 0); font-family: &quot;Segoe UI&quot;, &quot;Segoe UI Web&quot;, Arial, Verdana, sans-serif; font-size: 12px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: normal; background-color: rgb(255, 255, 255); text-decoration-thickness: 
initial; text-decoration-style: initial; text-decoration-color: initial;"><p class="Paragraph SCXW104514979 BCX0" role="heading" aria-level="3" xml:lang="EN-US" lang="EN-US" paraid="428206664" paraeid="{6b2bb504-0a9f-47cd-a86d-c572a25d51fa}{176}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 10.6667px 0px 5.33333px; padding: 0px; user-select: text; overflow-wrap: break-word; white-space: pre-wrap; font-weight: normal; font-style: normal; vertical-align: baseline; font-kerning: none; background-color: transparent; color: rgb(15, 71, 97); text-align: left; text-indent: 0px;"><span data-contrast="none" xml:lang="EN-US" lang="EN-US" class="TextRun SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-variant-ligatures: none !important; color: rgb(15, 71, 97); font-size: 14pt; font-style: normal; line-height: 25.575px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif;"><span class="NormalTextRun SCXW104514979 BCX0" data-ccp-parastyle="heading 3" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text;">Heading 3</span></span><span class="EOP SCXW104514979 BCX0" data-ccp-props="{&quot;134245418&quot;:true,&quot;134245529&quot;:true,&quot;335559738&quot;:160,&quot;335559739&quot;:80}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-size: 14pt; line-height: 25.575px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif; color: rgb(15, 71, 97);"> </span></p></div><div class="OutlineElement Ltr SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; clear: both; cursor: text; overflow: visible; position: relative; direction: ltr; color: rgb(0, 0, 0); font-family: &quot;Segoe UI&quot;, 
&quot;Segoe UI Web&quot;, Arial, Verdana, sans-serif; font-size: 12px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: normal; background-color: rgb(255, 255, 255); text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial;"><p class="Paragraph SCXW104514979 BCX0" role="heading" aria-level="4" xml:lang="EN-US" lang="EN-US" paraid="915197819" paraeid="{6b2bb504-0a9f-47cd-a86d-c572a25d51fa}{236}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 5.33333px 0px 2.66667px; padding: 0px; user-select: text; overflow-wrap: break-word; white-space: pre-wrap; font-weight: normal; font-style: italic; vertical-align: baseline; font-kerning: none; background-color: transparent; color: rgb(15, 71, 97); text-align: left; text-indent: 0px;"><span data-contrast="none" xml:lang="EN-US" lang="EN-US" class="TextRun SCXW104514979 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-variant-ligatures: none !important; color: rgb(15, 71, 97); font-size: 12pt; font-style: normal; line-height: 20.925px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif;"><span class="NormalTextRun SCXW104514979 BCX0" data-ccp-parastyle="heading 4" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text;">Heading</span><span class="NormalTextRun SCXW104514979 BCX0" data-ccp-parastyle="heading 4" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text;"> 4</span></span><span class="EOP SCXW104514979 BCX0" 
data-ccp-props="{&quot;134245418&quot;:true,&quot;134245529&quot;:true,&quot;335559738&quot;:80,&quot;335559739&quot;:40}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-size: 12pt; line-height: 20.925px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif; color: rgb(15, 71, 97);"> </span></p></div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import fs from 'node:fs'
import path from 'node:path'
import {JSDOM} from 'jsdom'
import {expect, test} from 'vitest'
import {htmlToBlocks} from '../../src'
import defaultSchema from '../fixtures/defaultSchema'
import {createTestKeyGenerator} from '../test-key-generator'

// The block content type is the type of the `body` field on the `blogPost`
// type in the default test schema.
const bodyField = defaultSchema
  .get('blogPost')
  .fields.find((field: any) => field.name === 'body')
const blockContentType = bodyField.type

// HTML fixture stored next to this test file.
const fixturePath = path.resolve(__dirname, 'office-online-headings.html')
const html = fs.readFileSync(fixturePath).toString()

const keyGenerator = createTestKeyGenerator()

/** Builds the expected shape of a block holding one span of text. */
function headingBlock(text: string, style: string) {
  return {
    _type: 'block',
    children: [
      {
        _type: 'span',
        text,
      },
    ],
    style,
  }
}

test(htmlToBlocks.name, () => {
  const blocks = htmlToBlocks(html, blockContentType, {
    parseHtml: (markup) => new JSDOM(markup).window.document,
    keyGenerator,
  })

  // The fixture holds four headings, levels 1 through 4, in document order.
  expect(blocks).toMatchObject([
    headingBlock('Heading 1', 'h1'),
    headingBlock('Heading 2', 'h2'),
    headingBlock('Heading 3', 'h3'),
    headingBlock('Heading 4', 'h4'),
  ])
})
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<meta charset='utf-8'><a class="Hyperlink SCXW204275500 BCX0" href="https://example.com/" target="_blank" rel="noreferrer noopener" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; cursor: text; font-family: &quot;Segoe UI&quot;, &quot;Segoe UI Web&quot;, Arial, Verdana, sans-serif; font-size: 12px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: left; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: pre-wrap; background-color: rgb(255, 255, 255); text-decoration: none; color: inherit;"><span data-contrast="none" xml:lang="EN-US" lang="EN-US" class="TextRun Underlined SCXW204275500 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-variant-ligatures: none !important; color: rgb(70, 120, 134); font-size: 12pt; text-decoration: underline; line-height: 20.925px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif; font-weight: normal;"><span class="NormalTextRun SCXW204275500 BCX0" data-ccp-charstyle="Hyperlink" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text;">a link</span></span></a><span data-contrast="auto" xml:lang="EN-US" lang="EN-US" class="TextRun EmptyTextRun SCXW204275500 BCX0" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; font-variant-ligatures: none !important; color: rgb(0, 0, 0); font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; orphans: 2; text-align: left; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: pre-wrap; background-color: rgb(255, 255, 255); 
text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial; font-size: 12pt; line-height: 20.925px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif;"></span><span class="EOP SCXW204275500 BCX0" data-ccp-props="{}" style="-webkit-user-drag: none; -webkit-tap-highlight-color: transparent; margin: 0px; padding: 0px; user-select: text; color: rgb(0, 0, 0); font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: left; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: pre-wrap; background-color: rgb(255, 255, 255); text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial; font-size: 12pt; line-height: 20.925px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, sans-serif;"> </span>
Loading

0 comments on commit 3546f53

Please sign in to comment.