From 244d6db3b8c7336894bef57ccf89904f07497201 Mon Sep 17 00:00:00 2001
From: alunturner <56027671+alunturner@users.noreply.github.com>
Date: Fri, 30 Jun 2023 09:09:39 +0100
Subject: [PATCH] Update web text parsing (#736)

* Improve parsing of markdown when building a model
---
 platforms/web/lib/conversion.test.ts | 376 +++++++++++++++++++++++++--
 platforms/web/lib/conversion.ts      | 129 +++++++--
 2 files changed, 465 insertions(+), 40 deletions(-)
diff --git a/platforms/web/lib/conversion.test.ts b/platforms/web/lib/conversion.test.ts
index e470862db..c5dd2a290 100644
--- a/platforms/web/lib/conversion.test.ts
+++ b/platforms/web/lib/conversion.test.ts
@@ -17,8 +17,8 @@ limitations under the License.
 import {
     richToPlain,
     plainToRich,
-    plainToMarkdown,
     markdownToPlain,
+    plainTextInnerHtmlToMarkdown,
 } from './conversion';
 
 describe('Rich text <=> plain text', () => {
@@ -27,7 +27,6 @@ describe('Rich text <=> plain text', () => {
         { rich: 'plain', plain: 'plain' },
         { rich: '<strong>bold</strong>', plain: '__bold__' },
         { rich: '<em>italic</em>', plain: '*italic*' },
-        { rich: '<u>underline</u>', plain: '<u>underline</u>' },
         { rich: '<del>strike</del>', plain: '~~strike~~' },
     ];
     const mappedTestCases = testCases.map(({ rich, plain }) => [rich, plain]);
@@ -43,43 +42,47 @@ describe('Rich text <=> plain text', () => {
         },
     );
 
-    it('converts linebreaks for display rich => plain', async () => {
-        const richText = 'multi<br />line';
-        const convertedPlainText = await richToPlain(richText);
-        const expectedPlainText = `multi\nline`;
+    it('converts underline case rich => plain', async () => {
+        // This is the html representation of underlining
+        const rich = '<u>underline</u>';
 
+        // When we convert the plain text, we expect the output to be the `rich` string - it
+        // is then set as `.innerText` in element web so that handles html escaping entities
+        const expectedPlainText = '<u>underline</u>';
+
+        const convertedPlainText = await richToPlain(rich);
         expect(convertedPlainText).toBe(expectedPlainText);
     });
 
-    it('converts linebreaks for display plain => rich', async () => {
-        const plainText = 'multi\nline';
-        const convertedRichText = await plainToRich(plainText, false);
-        const expectedRichText = 'multi<br />line';
+    it('converts underline case plain => rich', async () => {
+        // When the above is typed by a user in the plain text editor, the innerHTML
+        // will look like this
+        const plain = '&lt;u&gt;underline&lt;/u&gt;';
 
+        const expectedRichText = '<u>underline</u>';
+
+        const convertedRichText = await plainToRich(plain, false);
         expect(convertedRichText).toBe(expectedRichText);
     });
-});
 
-describe('Plain text <=> markdown', () => {
-    it('converts single linebreak for plain => markdown', () => {
-        const plain = 'multi\nline';
-        const convertedMarkdown = plainToMarkdown(plain);
-        const expectedMarkdown = `multi<br />line`;
+    it('converts linebreaks for display rich => plain', async () => {
+        const richText = 'multi<br />line';
+        const convertedPlainText = await richToPlain(richText);
+        const expectedPlainText = `multi\nline`;
 
-        expect(convertedMarkdown).toBe(expectedMarkdown);
+        expect(convertedPlainText).toBe(expectedPlainText);
     });
 
-    it('converts multiple linebreak for plain => markdown', () => {
-        // nb for correct display, there will be one br tag less
-        // than \n at the end
-        const plain = 'multiple\nline\n\nbreaks\n\n\n';
-        const convertedMarkdown = plainToMarkdown(plain);
-        const expectedMarkdown =
-            'multiple<br />line<br /><br />breaks<br /><br />';
+    it('converts blockquotes from plain => rich', async () => {
+        const plainText = '> I am a quote';
+        const convertedRichText = await plainToRich(plainText, false);
+        const expectedRichText = '<blockquote><p>I am a quote</p></blockquote>';
 
-        expect(convertedMarkdown).toBe(expectedMarkdown);
+        expect(convertedRichText).toBe(expectedRichText);
     });
+});
 
+describe('markdownToPlain', () => {
     it('converts single linebreak for markdown => plain', () => {
         const markdown = 'multi\\\nline';
         const convertedPlainText = markdownToPlain(markdown);
@@ -158,3 +161,326 @@ describe('Mentions', () => {
         );
     });
 });
+
+// Although a bit clunky, all of these tests simulate a plain text composer by creating a content editable
+// div, appending children to it and then reading the composer's innerHTML. This way we can ensure that we
+// are giving the conversion function decent input for the tests.
+describe('plainTextInnerHtmlToMarkdown', () => {
+    let mockComposer: HTMLDivElement;
+
+    function createMentionElement(identifier = ''): HTMLAnchorElement {
+        const mention = document.createElement('a');
+        mention.appendChild(document.createTextNode(`inner text${identifier}`));
+        mention.setAttribute('href', `testHref${identifier}`);
+        mention.setAttribute('data-mention-type', `testType${identifier}`);
+        mention.setAttribute('style', `testStyle${identifier}`);
+        mention.setAttribute('contenteditable', 'false');
+        return mention;
+    }
+
+    function createPlaceholderDiv(): HTMLDivElement {
+        const div = document.createElement('div');
+        const br = document.createElement('br');
+        div.appendChild(br);
+        return div;
+    }
+
+    beforeEach(() => {
+        mockComposer = document.createElement('div');
+        mockComposer.setAttribute('contenteditable', 'true');
+    });
+
+    it('handles two lines of text, second line empty, with newline char', () => {
+        const textNode = document.createTextNode('firstline\n\n');
+        mockComposer.appendChild(textNode);
+
+        const expected = 'firstline\n';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles two lines of text, second line empty, with placeholder div', () => {
+        const textNode = document.createTextNode('firstline');
+
+        mockComposer.append(textNode, createPlaceholderDiv());
+
+        const expected = 'firstline\n';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles consecutive newlines between text lines for newline chars', () => {
+        const textNode = document.createTextNode('first\n\n\n\nlast');
+        mockComposer.appendChild(textNode);
+
+        const expected = 'first\n\n\n\nlast';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles consecutive newlines between text lines for placeholder divs', () => {
+        const firstText = document.createTextNode('first');
+
+        // after the placeholders have started, text can only be inserted inside divs
+        const lastDiv = document.createElement('div');
+        const lastText = document.createTextNode('last');
+        lastDiv.appendChild(lastText);
+
+        const children = [
+            firstText,
+            createPlaceholderDiv(),
+            createPlaceholderDiv(),
+            createPlaceholderDiv(),
+            lastDiv,
+        ];
+
+        mockComposer.append(...children);
+
+        const expected = 'first\n\n\n\nlast';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles divs with a line break', () => {
+        const innerDiv = document.createElement('div');
+        const innerBreak = document.createElement('br');
+        innerDiv.appendChild(innerBreak);
+        mockComposer.appendChild(innerDiv);
+
+        const expected = '\n';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles divs with text content', () => {
+        const innerDiv = document.createElement('div');
+        innerDiv.appendChild(document.createTextNode('some text'));
+        mockComposer.appendChild(innerDiv);
+
+        const expected = 'some text';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles multiple divs with text content', () => {
+        const firstInnerDiv = document.createElement('div');
+        const secondInnerDiv = document.createElement('div');
+        firstInnerDiv.appendChild(document.createTextNode('some text'));
+        secondInnerDiv.appendChild(document.createTextNode('some more text'));
+
+        mockComposer.append(firstInnerDiv, secondInnerDiv);
+
+        const expected = 'some text\nsome more text';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles div following plain text node', () => {
+        const firstTextNode = 'textnode text';
+        const secondDiv = document.createElement('div');
+        secondDiv.appendChild(document.createTextNode('some more text'));
+
+        mockComposer.append(firstTextNode, secondDiv);
+
+        const expected = 'textnode text\nsome more text';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles multiple adjacent text nodes at top level', () => {
+        // this is how chrome structures the child nodes
+        const strings = [
+            'first string',
+            '\n',
+            'second string',
+            '\n',
+            'third string',
+        ];
+        strings.forEach((s) =>
+            mockComposer.appendChild(document.createTextNode(s)),
+        );
+
+        const expected = 'first string\nsecond string\nthird string';
+
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles multiple adjacent text nodes in nested div', () => {
+        const innerDiv = document.createElement('div');
+        // this is how chrome structures the child nodes
+        const strings = [
+            'first string',
+            '\n',
+            'second string',
+            '\n',
+            'third string',
+        ];
+        strings.forEach((s) =>
+            innerDiv.appendChild(document.createTextNode(s)),
+        );
+        mockComposer.appendChild(innerDiv);
+
+        const expected = 'first string\nsecond string\nthird string';
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles a mention at the top level', () => {
+        const mention = createMentionElement();
+        mockComposer.appendChild(mention);
+
+        // eslint-disable-next-line max-len
+        const expected = `<a href="testHref" data-mention-type="testType" style="testStyle" contenteditable="false">inner text</a>`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles a mention at the top level inline with textnodes', () => {
+        const mention = createMentionElement();
+
+        mockComposer.appendChild(document.createTextNode('preceding '));
+        mockComposer.appendChild(mention);
+        mockComposer.appendChild(document.createTextNode(' following'));
+
+        // eslint-disable-next-line max-len
+        const expected = `preceding <a href="testHref" data-mention-type="testType" style="testStyle" contenteditable="false">inner text</a> following`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles a nested mention', () => {
+        const innerDiv = document.createElement('div');
+        const mention = createMentionElement();
+        innerDiv.appendChild(mention);
+        mockComposer.appendChild(innerDiv);
+
+        // eslint-disable-next-line max-len
+        const expected = `<a href="testHref" data-mention-type="testType" style="testStyle" contenteditable="false">inner text</a>`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles a nested mention with nested text nodes', () => {
+        const innerDiv = document.createElement('div');
+        const mention = createMentionElement();
+
+        innerDiv.appendChild(document.createTextNode('preceding '));
+        innerDiv.appendChild(mention);
+        innerDiv.appendChild(document.createTextNode(' following'));
+        mockComposer.appendChild(innerDiv);
+
+        // eslint-disable-next-line max-len
+        const expected = `preceding <a href="testHref" data-mention-type="testType" style="testStyle" contenteditable="false">inner text</a> following`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles a nested mention next to top level text node', () => {
+        const innerDiv = document.createElement('div');
+        const mention = createMentionElement();
+
+        mockComposer.appendChild(document.createTextNode('preceding'));
+        innerDiv.appendChild(mention);
+        mockComposer.appendChild(innerDiv);
+
+        // eslint-disable-next-line max-len
+        const expected = `preceding\n<a href="testHref" data-mention-type="testType" style="testStyle" contenteditable="false">inner text</a>`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles adjacent top level mentions', () => {
+        ['1', '2', '3'].forEach((id) => {
+            const mention = createMentionElement(id);
+            mockComposer.appendChild(mention);
+        });
+
+        // eslint-disable-next-line max-len
+        const expected = `<a href="testHref1" data-mention-type="testType1" style="testStyle1" contenteditable="false">inner text1</a><a href="testHref2" data-mention-type="testType2" style="testStyle2" contenteditable="false">inner text2</a><a href="testHref3" data-mention-type="testType3" style="testStyle3" contenteditable="false">inner text3</a>`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles adjacent nested mentions', () => {
+        const innerDiv = document.createElement('div');
+        ['1', '2', '3'].forEach((id) => {
+            const mention = createMentionElement(id);
+            innerDiv.appendChild(mention);
+        });
+        mockComposer.appendChild(innerDiv);
+
+        // eslint-disable-next-line max-len
+        const expected = `<a href="testHref1" data-mention-type="testType1" style="testStyle1" contenteditable="false">inner text1</a><a href="testHref2" data-mention-type="testType2" style="testStyle2" contenteditable="false">inner text2</a><a href="testHref3" data-mention-type="testType3" style="testStyle3" contenteditable="false">inner text3</a>`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('handles adjacent top level and nested mentions', () => {
+        const topLevelMention = createMentionElement('1');
+        const nestedMention = createMentionElement('2');
+
+        const innerDiv = document.createElement('div');
+        innerDiv.appendChild(nestedMention);
+
+        mockComposer.append(topLevelMention, innerDiv);
+
+        // eslint-disable-next-line max-len
+        const expected = `<a href="testHref1" data-mention-type="testType1" style="testStyle1" contenteditable="false">inner text1</a>\n<a href="testHref2" data-mention-type="testType2" style="testStyle2" contenteditable="false">inner text2</a>`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+    });
+
+    it('outputs to console.debug for unexpected node types', () => {
+        const debugSpy = vi
+            .spyOn(console, 'debug')
+            .mockImplementation(() => {});
+
+        const unorderedList = document.createElement('ul');
+        const listItem = document.createElement('li');
+        const textNode = document.createTextNode('list text');
+
+        listItem.appendChild(textNode);
+        unorderedList.appendChild(listItem);
+        mockComposer.appendChild(unorderedList);
+
+        // check that it pulls the inner text out correctly, despite the unexpected tags
+        const expected = `list text`;
+        expect(plainTextInnerHtmlToMarkdown(mockComposer.innerHTML)).toBe(
+            expected,
+        );
+
+        // check that it has also called console.debug with some useful output
+        expect(debugSpy.mock.calls).toMatchInlineSnapshot(`
+          [
+            [
+              "Converting unexpected node type UL",
+            ],
+            [
+              "Converting unexpected node type LI",
+            ],
+          ]
+        `);
+
+        // reset the spy
+        debugSpy.mockRestore();
+    });
+});
diff --git a/platforms/web/lib/conversion.ts b/platforms/web/lib/conversion.ts
index 2f4becba8..408afd2a6 100644
--- a/platforms/web/lib/conversion.ts
+++ b/platforms/web/lib/conversion.ts
@@ -21,18 +21,7 @@ import {
 } from '../generated/wysiwyg.js';
 import { initOnce } from './useComposerModel.js';
 
-// In plain text, due to cursor positioning, ending at a linebreak will
-// include an extra \n, so trim that off if required.
-// We replace the remaining \n with valid markdown before
-// parsing by MarkdownHTMLParser::to_html.
-export const plainToMarkdown = (plainText: string) => {
-    let markdown = plainText;
-    if (markdown.endsWith('\n')) {
-        // manually remove the final linebreak
-        markdown = markdown.slice(0, -1);
-    }
-    return markdown.replaceAll(/\n/g, '<br />');
-};
+const NEWLINE_CHAR = '\n';
 
 // In plain text, markdown newlines (displays '\' character followed by
 // a newline character) will be represented as \n for display as the
@@ -40,9 +29,9 @@ export const plainToMarkdown = (plainText: string) => {
 // We also may need to manually add a \n to account for trailing newlines.
 export const markdownToPlain = (markdown: string) => {
     let plainText = markdown;
-    if (plainText.endsWith('\n')) {
+    if (plainText.endsWith(NEWLINE_CHAR)) {
         // for cursor positioning we need to manually add another linebreak
-        plainText = `${plainText}\n`;
+        plainText = `${plainText}${NEWLINE_CHAR}`;
     }
     return plainText.replaceAll(/\\/g, '');
 };
@@ -78,7 +67,7 @@ export async function plainToRich(plainText: string, inMessageFormat: boolean) {
 
     // convert the plain text into markdown so that we can use it to
     // set the model
-    const markdown = plainToMarkdown(plainText);
+    const markdown = plainTextInnerHtmlToMarkdown(plainText);
 
     // set the model and return the rich text
     const model = new_composer_model();
@@ -88,3 +77,113 @@ export async function plainToRich(plainText: string, inMessageFormat: boolean) {
         ? model.get_content_as_message_html()
         : model.get_content_as_html();
 }
+
+/*
+The reason for requiring this function requires it's own explanation, so here it is.
+When manipulating a content editable div in a browser, as we do for the plain text version
+of the composer in element web, there is a limited subset of html that the composer can contain.
+Currently, the innerHTML of the plain text composer can only contain:
+  - text with `\n` line separation if `shift + enter` is used to insert the linebreak
+    - in this case, inserting a newline after a single word will result in `word\n\n`, then
+      subsequent typing will replace the final `\n` to give `word\nanother word`
+  - text with <div> separation if `cmd + enter to send` is enabled and `enter` is used to insert
+    the linebreak
+    - in this case, inserting a newline inserts `<div><br></div>`, and then subsequent typing 
+      replaces the <br> tag with the new content
+  - mentions (ie <a> tags with special attributes) which can be at the top level, or nested inside
+    a div 
+What we need to do is to get this input into a good shape for the markdown parser in the rust model. 
+Because of some of the intricacies of how text content is parsed when you use `.innerHTML` vs `.innerText`
+we do it manually so that we can extract:
+  - text content from any text nodes exactly as the user has written it, so that there is no escaping
+    of html entities like < or &
+  - mentions in their pure html form so that they can be passed through as valid html, as the mentions
+    in the plain text composer can be parsed into mentions inside the rust model
+*/
+export function plainTextInnerHtmlToMarkdown(innerHtml: string): string {
+    // Parse the innerHtml into a DOM and treat the `body` as the `composer`
+    const { body: composer } = new DOMParser().parseFromString(
+        innerHtml,
+        'text/html',
+    );
+
+    // Create an iterator to allow us to traverse the DOM node by node, excluding the
+    // text nodes inside mentions
+    const iterator = document.createNodeIterator(
+        composer,
+        undefined,
+        nodeFilter,
+    );
+    let node = iterator.nextNode();
+
+    // Use this to store the manually built markdown output
+    let markdownOutput = '';
+
+    while (node !== null) {
+        // TEXT NODES - `node` represents the text node
+        const isTextNode = node.nodeName === '#text';
+
+        // MENTION NODES - `node` represents the enclosing <a> tag
+        const isMention = node.nodeName === 'A';
+
+        // LINEBREAK DIVS - `node` represents the enclosing <div> tag
+        const isDivContainingBreak =
+            node.nodeName === 'DIV' &&
+            node.childNodes.length === 1 &&
+            node.firstChild?.nodeName === 'BR';
+
+        // UNEXPECTED NODE - `node` represents an unexpected tag type
+        const isUnexpectedNode = !expectedNodeNames.includes(node.nodeName);
+
+        if (isDivContainingBreak) {
+            markdownOutput += NEWLINE_CHAR;
+        } else if (isTextNode) {
+            // content is the text itself, unescaped i.e. > is >, not &gt;
+            let content = node.textContent;
+            if (shouldAddNewlineCharacter(node)) {
+                content += NEWLINE_CHAR;
+            }
+            markdownOutput += content;
+        } else if (isMention) {
+            // content is the html of the mention i.e. <a ...attributes>text</a>
+            let content = node.firstChild?.parentElement?.outerHTML ?? '';
+            if (shouldAddNewlineCharacter(node)) {
+                content += NEWLINE_CHAR;
+            }
+            markdownOutput += content;
+        } else if (isUnexpectedNode) {
+            console.debug(`Converting unexpected node type ${node.nodeName}`);
+        }
+
+        node = iterator.nextNode();
+    }
+
+    // After converting the DOM, we need to trim a single `\n` off the end of the
+    // output if we have consecutive newlines, as this is a browser placeholder
+    if (markdownOutput.endsWith(NEWLINE_CHAR.repeat(2))) {
+        markdownOutput = markdownOutput.slice(0, -1);
+    }
+
+    return markdownOutput;
+}
+
+const expectedNodeNames = ['#text', 'BR', 'A', 'DIV', 'BODY'];
+
+// When we parse the nodes, we need to manually add newlines if the node is either
+// adjacent to a div or is the last child and it's parent is adjacent to a div
+function shouldAddNewlineCharacter(node: Node): boolean {
+    const nextSibling = node.nextSibling || node.parentElement?.nextSibling;
+
+    return nextSibling?.nodeName === 'DIV';
+}
+
+// Filter to allow us to skip evaluating the text nodes inside mentions
+function nodeFilter(node: Node): number {
+    if (
+        node.nodeName === '#text' &&
+        node.parentElement?.hasAttribute('data-mention-type')
+    ) {
+        return NodeFilter.FILTER_REJECT;
+    }
+    return NodeFilter.FILTER_ACCEPT;
+}