From 244d6db3b8c7336894bef57ccf89904f07497201 Mon Sep 17 00:00:00 2001 From: alunturner <56027671+alunturner@users.noreply.github.com> Date: Fri, 30 Jun 2023 09:09:39 +0100 Subject: [PATCH] Update web text parsing (#736) * Improve parsing of markdown when building a model --- platforms/web/lib/conversion.test.ts | 376 +++++++++++++++++++++++++-- platforms/web/lib/conversion.ts | 129 +++++++-- 2 files changed, 465 insertions(+), 40 deletions(-) diff --git a/platforms/web/lib/conversion.test.ts b/platforms/web/lib/conversion.test.ts index e470862db..c5dd2a290 100644 --- a/platforms/web/lib/conversion.test.ts +++ b/platforms/web/lib/conversion.test.ts @@ -17,8 +17,8 @@ limitations under the License. import { richToPlain, plainToRich, - plainToMarkdown, markdownToPlain, + plainTextInnerHtmlToMarkdown, } from './conversion'; describe('Rich text <=> plain text', () => { @@ -27,7 +27,6 @@ describe('Rich text <=> plain text', () => { { rich: 'plain', plain: 'plain' }, { rich: 'bold', plain: '__bold__' }, { rich: 'italic', plain: '*italic*' }, - { rich: 'underline', plain: 'underline' }, { rich: 'strike', plain: '~~strike~~' }, ]; const mappedTestCases = testCases.map(({ rich, plain }) => [rich, plain]); @@ -43,43 +42,47 @@ describe('Rich text <=> plain text', () => { }, ); - it('converts linebreaks for display rich => plain', async () => { - const richText = 'multi
line'; - const convertedPlainText = await richToPlain(richText); - const expectedPlainText = `multi\nline`; + it('converts underline case rich => plain', async () => { + // This is the html representation of underlining + const rich = 'underline'; + // When we convert the plain text, we expect the output to be the `rich` string - it + // is then set as `.innerText` in element web so that handles html escaping entities + const expectedPlainText = 'underline'; + + const convertedPlainText = await richToPlain(rich); expect(convertedPlainText).toBe(expectedPlainText); }); - it('converts linebreaks for display plain => rich', async () => { - const plainText = 'multi\nline'; - const convertedRichText = await plainToRich(plainText, false); - const expectedRichText = 'multi
line'; + it('converts underline case plain => rich', async () => { + // When the above is typed by a user in the plain text editor, the innerHTML + // will look like this + const plain = '<u>underline</u>'; + const expectedRichText = 'underline'; + + const convertedRichText = await plainToRich(plain, false); expect(convertedRichText).toBe(expectedRichText); }); -}); -describe('Plain text <=> markdown', () => { - it('converts single linebreak for plain => markdown', () => { - const plain = 'multi\nline'; - const convertedMarkdown = plainToMarkdown(plain); - const expectedMarkdown = `multi
line`; + it('converts linebreaks for display rich => plain', async () => { + const richText = 'multi
line'; + const convertedPlainText = await richToPlain(richText); + const expectedPlainText = `multi\nline`; - expect(convertedMarkdown).toBe(expectedMarkdown); + expect(convertedPlainText).toBe(expectedPlainText); }); - it('converts multiple linebreak for plain => markdown', () => { - // nb for correct display, there will be one br tag less - // than \n at the end - const plain = 'multiple\nline\n\nbreaks\n\n\n'; - const convertedMarkdown = plainToMarkdown(plain); - const expectedMarkdown = - 'multiple
line

breaks

'; + it('converts blockquotes from plain => rich', async () => { + const plainText = '> I am a quote'; + const convertedRichText = await plainToRich(plainText, false); + const expectedRichText = '

I am a quote

'; - expect(convertedMarkdown).toBe(expectedMarkdown); + expect(convertedRichText).toBe(expectedRichText); }); +}); +describe('markdownToPlain', () => { it('converts single linebreak for markdown => plain', () => { const markdown = 'multi\\\nline'; const convertedPlainText = markdownToPlain(markdown); @@ -158,3 +161,326 @@ describe('Mentions', () => { ); }); }); + +// Although a bit clunky, all of these tests simulate a plain text composer by creating a content editable +// div, appending children to it and then reading the composer's innerHTML. This way we can ensure that we +// are giving the conversion function decent input for the tests. +describe('plainTextInnerHtmlToMarkdown', () => { + let mockComposer: HTMLDivElement; + + function createMentionElement(identifier = ''): HTMLAnchorElement { + const mention = document.createElement('a'); + mention.appendChild(document.createTextNode(`inner text${identifier}`)); + mention.setAttribute('href', `testHref${identifier}`); + mention.setAttribute('data-mention-type', `testType${identifier}`); + mention.setAttribute('style', `testStyle${identifier}`); + mention.setAttribute('contenteditable', 'false'); + return mention; + } + + function createPlaceholderDiv(): HTMLDivElement { + const div = document.createElement('div'); + const br = document.createElement('br'); + div.appendChild(br); + return div; + } + + beforeEach(() => { + mockComposer = document.createElement('div'); + mockComposer.setAttribute('contenteditable', 'true'); + }); + + it('handles two lines of text, second line empty, with newline char', () => { + const textNode = document.createTextNode('firstline\n\n'); + mockComposer.appendChild(textNode); + + const expected = 'firstline\n'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles two lines of text, second line empty, with placeholder div', () => { + const textNode = document.createTextNode('firstline'); + + mockComposer.append(textNode, createPlaceholderDiv()); + + const expected = 'firstline\n'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles consecutive newlines between text lines for newline chars', () => { + const textNode = document.createTextNode('first\n\n\n\nlast'); + mockComposer.appendChild(textNode); + + const expected = 'first\n\n\n\nlast'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles consecutive newlines between text lines for placeholder divs', () => { + const firstText = document.createTextNode('first'); + + // after the placeholders have started, text can only be inserted inside divs + const lastDiv = document.createElement('div'); + const lastText = document.createTextNode('last'); + lastDiv.appendChild(lastText); + + const children = [ + firstText, + createPlaceholderDiv(), + createPlaceholderDiv(), + createPlaceholderDiv(), + lastDiv, + ]; + + mockComposer.append(...children); + + const expected = 'first\n\n\n\nlast'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles divs with a line break', () => { + const innerDiv = document.createElement('div'); + const innerBreak = document.createElement('br'); + innerDiv.appendChild(innerBreak); + mockComposer.appendChild(innerDiv); + + const expected = '\n'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles divs with text content', () => { + const innerDiv = document.createElement('div'); + innerDiv.appendChild(document.createTextNode('some text')); + mockComposer.appendChild(innerDiv); + + const expected = 'some text'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles multiple divs with text content', () => { + const firstInnerDiv = document.createElement('div'); + const secondInnerDiv = document.createElement('div'); + firstInnerDiv.appendChild(document.createTextNode('some text')); + secondInnerDiv.appendChild(document.createTextNode('some more text')); + + mockComposer.append(firstInnerDiv, secondInnerDiv); + + const expected = 'some text\nsome more text'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles div following plain text node', () => { + const firstTextNode = 'textnode text'; + const secondDiv = document.createElement('div'); + secondDiv.appendChild(document.createTextNode('some more text')); + + mockComposer.append(firstTextNode, secondDiv); + + const expected = 'textnode text\nsome more text'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles multiple adjacent text nodes at top level', () => { + // this is how chrome structures the child nodes + const strings = [ + 'first string', + '\n', + 'second string', + '\n', + 'third string', + ]; + strings.forEach((s) => + mockComposer.appendChild(document.createTextNode(s)), + ); + + const expected = 'first string\nsecond string\nthird string'; + + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles multiple adjacent text nodes in nested div', () => { + const innerDiv = document.createElement('div'); + // this is how chrome structures the child nodes + const strings = [ + 'first string', + '\n', + 'second string', + '\n', + 'third string', + ]; + strings.forEach((s) => + innerDiv.appendChild(document.createTextNode(s)), + ); + mockComposer.appendChild(innerDiv); + + const expected = 'first string\nsecond string\nthird string'; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles a mention at the top level', () => { + const mention = createMentionElement(); + mockComposer.appendChild(mention); + + // eslint-disable-next-line max-len + const expected = `inner text`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles a mention at the top level inline with textnodes', () => { + const mention = createMentionElement(); + + mockComposer.appendChild(document.createTextNode('preceding ')); + mockComposer.appendChild(mention); + mockComposer.appendChild(document.createTextNode(' following')); + + // eslint-disable-next-line max-len + const expected = `preceding inner text following`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles a nested mention', () => { + const innerDiv = document.createElement('div'); + const mention = createMentionElement(); + innerDiv.appendChild(mention); + mockComposer.appendChild(innerDiv); + + // eslint-disable-next-line max-len + const expected = `inner text`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles a nested mention with nested text nodes', () => { + const innerDiv = document.createElement('div'); + const mention = createMentionElement(); + + innerDiv.appendChild(document.createTextNode('preceding ')); + innerDiv.appendChild(mention); + innerDiv.appendChild(document.createTextNode(' following')); + mockComposer.appendChild(innerDiv); + + // eslint-disable-next-line max-len + const expected = `preceding inner text following`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles a nested mention next to top level text node', () => { + const innerDiv = document.createElement('div'); + const mention = createMentionElement(); + + mockComposer.appendChild(document.createTextNode('preceding')); + innerDiv.appendChild(mention); + mockComposer.appendChild(innerDiv); + + // eslint-disable-next-line max-len + const expected = `preceding\ninner text`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles adjacent top level mentions', () => { + ['1', '2', '3'].forEach((id) => { + const mention = createMentionElement(id); + mockComposer.appendChild(mention); + }); + + // eslint-disable-next-line max-len + const expected = `inner text1inner text2inner text3`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles adjacent nested mentions', () => { + const innerDiv = document.createElement('div'); + ['1', '2', '3'].forEach((id) => { + const mention = createMentionElement(id); + innerDiv.appendChild(mention); + }); + mockComposer.appendChild(innerDiv); + + // eslint-disable-next-line max-len + const expected = `inner text1inner text2inner text3`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('handles adjacent top level and nested mentions', () => { + const topLevelMention = createMentionElement('1'); + const nestedMention = createMentionElement('2'); + + const innerDiv = document.createElement('div'); + innerDiv.appendChild(nestedMention); + + mockComposer.append(topLevelMention, innerDiv); + + // eslint-disable-next-line max-len + const expected = `inner text1\ninner text2`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + }); + + it('outputs to console.debug for unexpected node types', () => { + const debugSpy = vi + .spyOn(console, 'debug') + .mockImplementation(() => {}); + + const unorderedList = document.createElement('ul'); + const listItem = document.createElement('li'); + const textNode = document.createTextNode('list text'); + + listItem.appendChild(textNode); + unorderedList.appendChild(listItem); + mockComposer.appendChild(unorderedList); + + // check that it pulls the inner text out correctly, despite the unexpected tags + const expected = `list text`; + expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe( + expected, + ); + + // check that it has also called console.debug with some useful output + expect(debugSpy.mock.calls).toMatchInlineSnapshot(` + [ + [ + "Converting unexpected node type UL", + ], + [ + "Converting unexpected node type LI", + ], + ] + `); + + // reset the spy + debugSpy.mockRestore(); + }); +}); diff --git a/platforms/web/lib/conversion.ts b/platforms/web/lib/conversion.ts index 2f4becba8..408afd2a6 100644 --- a/platforms/web/lib/conversion.ts +++ b/platforms/web/lib/conversion.ts @@ -21,18 +21,7 @@ import { } from '../generated/wysiwyg.js'; import { initOnce } from './useComposerModel.js'; -// In plain text, due to cursor positioning, ending at a linebreak will -// include an extra \n, so trim that off if required. -// We replace the remaining \n with valid markdown before -// parsing by MarkdownHTMLParser::to_html. -export const plainToMarkdown = (plainText: string) => { - let markdown = plainText; - if (markdown.endsWith('\n')) { - // manually remove the final linebreak - markdown = markdown.slice(0, -1); - } - return markdown.replaceAll(/\n/g, '
'); -}; +const NEWLINE_CHAR = '\n'; // In plain text, markdown newlines (displays '\' character followed by // a newline character) will be represented as \n for display as the @@ -40,9 +29,9 @@ export const plainToMarkdown = (plainText: string) => { // We also may need to manually add a \n to account for trailing newlines. export const markdownToPlain = (markdown: string) => { let plainText = markdown; - if (plainText.endsWith('\n')) { + if (plainText.endsWith(NEWLINE_CHAR)) { // for cursor positioning we need to manually add another linebreak - plainText = `${plainText}\n`; + plainText = `${plainText}${NEWLINE_CHAR}`; } return plainText.replaceAll(/\\/g, ''); }; @@ -78,7 +67,7 @@ export async function plainToRich(plainText: string, inMessageFormat: boolean) { // convert the plain text into markdown so that we can use it to // set the model - const markdown = plainToMarkdown(plainText); + const markdown = plainTextInnerHtmlToMarkdown(plainText); // set the model and return the rich text const model = new_composer_model(); @@ -88,3 +77,113 @@ export async function plainToRich(plainText: string, inMessageFormat: boolean) { ? model.get_content_as_message_html() : model.get_content_as_html(); } + +/* +The reason for requiring this function requires it's own explanation, so here it is. +When manipulating a content editable div in a browser, as we do for the plain text version +of the composer in element web, there is a limited subset of html that the composer can contain. +Currently, the innerHTML of the plain text composer can only contain: + - text with `\n` line separation if `shift + enter` is used to insert the linebreak + - in this case, inserting a newline after a single word will result in `word\n\n`, then + subsequent typing will replace the final `\n` to give `word\nanother word` + - text with
separation if `cmd + enter to send` is enabled and `enter` is used to insert + the linebreak + - in this case, inserting a newline inserts `

`, and then subsequent typing + replaces the
tag with the new content + - mentions (ie tags with special attributes) which can be at the top level, or nested inside + a div +What we need to do is to get this input into a good shape for the markdown parser in the rust model. +Because of some of the intricacies of how text content is parsed when you use `.innerHTML` vs `.innerText` +we do it manually so that we can extract: + - text content from any text nodes exactly as the user has written it, so that there is no escaping + of html entities like < or & + - mentions in their pure html form so that they can be passed through as valid html, as the mentions + in the plain text composer can be parsed into mentions inside the rust model +*/ +export function plainTextInnerHtmlToMarkdown(innerHtml: string): string { + // Parse the innerHtml into a DOM and treat the `body` as the `composer` + const { body: composer } = new DOMParser().parseFromString( + innerHtml, + 'text/html', + ); + + // Create an iterator to allow us to traverse the DOM node by node, excluding the + // text nodes inside mentions + const iterator = document.createNodeIterator( + composer, + undefined, + nodeFilter, + ); + let node = iterator.nextNode(); + + // Use this to store the manually built markdown output + let markdownOutput = ''; + + while (node !== null) { + // TEXT NODES - `node` represents the text node + const isTextNode = node.nodeName === '#text'; + + // MENTION NODES - `node` represents the enclosing tag + const isMention = node.nodeName === 'A'; + + // LINEBREAK DIVS - `node` represents the enclosing
tag + const isDivContainingBreak = + node.nodeName === 'DIV' && + node.childNodes.length === 1 && + node.firstChild?.nodeName === 'BR'; + + // UNEXPECTED NODE - `node` represents an unexpected tag type + const isUnexpectedNode = !expectedNodeNames.includes(node.nodeName); + + if (isDivContainingBreak) { + markdownOutput += NEWLINE_CHAR; + } else if (isTextNode) { + // content is the text itself, unescaped i.e. > is >, not > + let content = node.textContent; + if (shouldAddNewlineCharacter(node)) { + content += NEWLINE_CHAR; + } + markdownOutput += content; + } else if (isMention) { + // content is the html of the mention i.e. text + let content = node.firstChild?.parentElement?.outerHTML ?? ''; + if (shouldAddNewlineCharacter(node)) { + content += NEWLINE_CHAR; + } + markdownOutput += content; + } else if (isUnexpectedNode) { + console.debug(`Converting unexpected node type ${node.nodeName}`); + } + + node = iterator.nextNode(); + } + + // After converting the DOM, we need to trim a single `\n` off the end of the + // output if we have consecutive newlines, as this is a browser placeholder + if (markdownOutput.endsWith(NEWLINE_CHAR.repeat(2))) { + markdownOutput = markdownOutput.slice(0, -1); + } + + return markdownOutput; +} + +const expectedNodeNames = ['#text', 'BR', 'A', 'DIV', 'BODY']; + +// When we parse the nodes, we need to manually add newlines if the node is either +// adjacent to a div or is the last child and it's parent is adjacent to a div +function shouldAddNewlineCharacter(node: Node): boolean { + const nextSibling = node.nextSibling || node.parentElement?.nextSibling; + + return nextSibling?.nodeName === 'DIV'; +} + +// Filter to allow us to skip evaluating the text nodes inside mentions +function nodeFilter(node: Node): number { + if ( + node.nodeName === '#text' && + node.parentElement?.hasAttribute('data-mention-type') + ) { + return NodeFilter.FILTER_REJECT; + } + return NodeFilter.FILTER_ACCEPT; +}