Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Determine correct bullet points when calling parseParagraph #61

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f6ef460
fix: Numbered Lists not incremental
buddy-web3 Jan 11, 2024
ee94b23
fix: run script prettier
buddy-web3 Jan 11, 2024
7f705df
fix: update test Context
buddy-web3 Jan 11, 2024
461f097
fix: update test Context
buddy-web3 Jan 11, 2024
9916324
feat: allow multiple lists in one document
buddy-web3 Jan 12, 2024
d780570
starting allParagraphs parsing implementation
jrhender Aug 27, 2024
b04c74c
impl of orderCounting from all paragraphs
jrhender Aug 27, 2024
209b930
test(parseParagraph): adjust tests to pass allParagraphs
jrhender Oct 7, 2024
3f01d6b
test(parseParagraph): remove helper function
jrhender Oct 7, 2024
37a2c2c
feat(parseParagraph): paragraphIDs should be unique
jrhender Oct 7, 2024
5941dd3
test(parseParagraph): test multiple ordered list items
jrhender Oct 7, 2024
9657ba4
test(parseParagraph): test nested list
jrhender Oct 18, 2024
a784cca
feat(parseParagraph): add support for nesting
jrhender Oct 18, 2024
419b913
refactor(parseParagraph): make more bullet ordering more concise
jrhender Oct 18, 2024
109394a
test(parseParagraph): refactor multiitem parsing
jrhender Oct 18, 2024
f7361e8
test(parseParagraph): test multiple ordered lists
jrhender Oct 18, 2024
3fff347
style(parser): prettier fixes
jrhender Oct 18, 2024
cf7c7ed
refactor(parser): move the bullet ordering
jrhender Oct 18, 2024
d4e8aa1
refactor(parse): make bullet order map when constructing doc context
jrhender Oct 18, 2024
9ba5108
refactor(parser): undo changes to make PR easier to review
jrhender Oct 18, 2024
25be9d8
refactor(parser): move makeBulletOrderMap comment
jrhender Oct 18, 2024
6dd7a5c
Merge branch 'main' into stampyui-352-jhender-parseParagraph-all-para…
LeMurphant Nov 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 128 additions & 13 deletions parser/__tests__/parser.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
parsefootnoteReference,
parseElement,
mergeSameElements,
makeBulletOrderMap,
} from "../parser.js";

fetchMock.enableMocks();
Expand Down Expand Up @@ -269,6 +270,23 @@ describe("parseElement", () => {
});

describe("parseParagraph", () => {
// Builds a NORMAL_TEXT paragraph fixture containing `runCount` text runs.
// Each run gets a consecutive startIndex beginning at `startIndex`, and the
// run contents alternate between "Hello, " and "world!" so that two runs
// concatenate to "Hello, world!".
const getParagraph = (startIndex, runCount) => ({
  elements: Array.from({ length: runCount }, (_, i) => ({
    startIndex: startIndex + i,
    textRun: {
      content: i % 2 === 0 ? "Hello, " : "world!",
    },
  })),
  paragraphStyle: { namedStyleType: "NORMAL_TEXT" },
});
const paragraph = getParagraph(1, 2);

const documentContext = {
lists: {
"list-id": {
Expand All @@ -283,14 +301,6 @@ describe("parseParagraph", () => {
},
};

const paragraph = {
elements: [
{ textRun: { content: "Hello, " } },
{ textRun: { content: "world!" } },
],
paragraphStyle: { namedStyleType: "NORMAL_TEXT" },
};

it("should handle empty paragraphs", () => {
const result = parseParagraph(documentContext)({
elements: [
Expand Down Expand Up @@ -354,12 +364,117 @@ describe("parseParagraph", () => {
it("should return an ordered list item", () => {
const listItem = {
...paragraph,
bullet: { nestingLevel: 1, listId: "list-id" },
bullet: { listId: "list-id" },
};
documentContext.lists["list-id"].listProperties.nestingLevels[1].glyphType =
"DECIMAL";
const result = parseParagraph(documentContext)(listItem);
expect(result).toEqual(" 1. Hello, world!");
const context = {
...documentContext,
lists: {
"list-id": {
listProperties: {
nestingLevels: [{ glyphType: "DECIMAL" }],
},
},
},
getBulletOrderNumber: makeBulletOrderMap([listItem]),
};
const result = parseParagraph(context)(listItem);
expect(result).toEqual("1. Hello, world!");
});

it("should parse a list with several items", () => {
const paragraphCount = 2;
const paragraphs = [];
for (let i = 0; i < paragraphCount; i++) {
const runCount = 2;
const paragraph = getParagraph(i * runCount + 1, runCount);
const listItem = {
...paragraph,
bullet: { listId: "list-id" },
};
paragraphs.push(listItem);
}
const context = {
...documentContext,
lists: {
"list-id": {
listProperties: {
nestingLevels: [{ glyphType: "DECIMAL" }],
},
},
},
getBulletOrderNumber: makeBulletOrderMap(paragraphs),
};
const parseWithContext = parseParagraph(context, paragraphs);
const result1 = parseWithContext(paragraphs[0]);
expect(result1).toEqual("1. Hello, world!");
const result2 = parseWithContext(paragraphs[1]);
expect(result2).toEqual("2. Hello, world!");
});

it("should parse multiple lists", () => {
const paragraphCount = 2;
const paragraphs = [];
for (let i = 0; i < paragraphCount; i++) {
const runCount = 2;
const paragraph = getParagraph(i * runCount + 1, runCount);
const listItem = {
...paragraph,
bullet: { listId: "list-id-" + i },
};
paragraphs.push(listItem);
}
const decimalList = {
listProperties: {
nestingLevels: [{ glyphType: "DECIMAL" }],
},
};
const context = {
...documentContext,
lists: {
"list-id-0": decimalList,
"list-id-1": decimalList,
},
getBulletOrderNumber: makeBulletOrderMap(paragraphs),
};
const parseWithContext = parseParagraph(context, paragraphs);
const result1 = parseWithContext(paragraphs[0]);
expect(result1).toEqual("1. Hello, world!");
const result2 = parseWithContext(paragraphs[1]);
expect(result2).toEqual("1. Hello, world!");
});

it("should parse a list with a nested item", () => {
const paragraphCount = 2;
const paragraphs = [];
for (let i = 0; i < paragraphCount; i++) {
const runCount = 2;
const paragraph = getParagraph(i * runCount + 1, runCount);
const listItem = {
...paragraph,
bullet: { listId: "list-id" },
};
if (i >= 1) {
listItem.bullet.nestingLevel = i;
}
paragraphs.push(listItem);
}
const context = {
...documentContext,
lists: {
"list-id": {
listProperties: {
nestingLevels: [{ glyphType: "DECIMAL" }, { glyphType: "DECIMAL" }],
},
},
},
getBulletOrderNumber: makeBulletOrderMap(paragraphs),
};
const parseWithContext = parseParagraph(context, paragraphs);
const result1 = parseWithContext(paragraphs[0]);
expect(result1).toEqual("1. Hello, world!");
const result2 = parseWithContext(paragraphs[1]);
const nestingSpacer = " ";
expect(result2).toEqual(nestingSpacer + "1. Hello, world!");
});
});

Expand Down
71 changes: 62 additions & 9 deletions parser/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ const extractDocParts = (doc) => {
};

export const parseDoc = async (doc, answer) => {
const { paragraphs, relatedAnswerDocIDs, alternativePhrasings, glossary } =
extractDocParts(doc);

// contextual information about the doc that is sometimes useful
// to the parsers of particular elements
const documentContext = {
Expand All @@ -94,18 +97,15 @@ export const parseDoc = async (doc, answer) => {
inlineObjects: doc.inlineObjects,
lists: doc.lists || {},
suggestions: new Map(), // Accumulators for the count and total text length of all suggestions
getBulletOrderNumber: makeBulletOrderMap(paragraphs),
};
const { paragraphs, relatedAnswerDocIDs, alternativePhrasings, glossary } =
extractDocParts(doc);

// If the content is just a link to external content, fetch it and use it as the contents
const tagContent = await fetchExternalContent(paragraphs);
if (tagContent) {
return { md: tagContent, relatedAnswerDocIDs, alternativePhrasings };
}

const body = paragraphs.map(parseParagraph(documentContext)).join("\n\n");

const footnotes = extractFootnotes(documentContext, doc);

const md = body + "\n\n" + footnotes;
Expand Down Expand Up @@ -265,6 +265,11 @@ export const parseParagraph = (documentContext) => (paragraph) => {
const listID = pb.listId;
const list = documentContext.lists[listID];
const currentLevel = list.listProperties.nestingLevels[nestingLevel];
if (!currentLevel) {
throw new Error(
"Level information should be available for all nesting levels. Input json must be incorrect"
);
}

// This check is ugly as sin, but necessary because GDocs doesn't actually clearly say "this is an [un]ordered list" anywhere
// I think this is because internally, all lists are ordered and it just only sometimes uses glyphs which represent that
Expand All @@ -273,12 +278,17 @@ export const parseParagraph = (documentContext) => (paragraph) => {
currentLevel.hasOwnProperty("glyphType") &&
currentLevel.glyphType !== "GLYPH_TYPE_UNSPECIFIED";

// Please forgive me for always using 1. as the sequence number on list items
// It's sorta hard to count them properly so I'm depending on markdown renderers doing the heavy lifting for me.
// Which, in fairness, they're supposed to.
itemMarker = isOrdered ? "1. " : "- ";
const getBulletOrder = (paragraph) => {
const orderNumber = documentContext.getBulletOrderNumber(paragraph);
if (!orderNumber) {
throw new Error(
"Order number should be available for all ordered paragraphs"
);
}
return orderNumber;
};
itemMarker = isOrdered ? getBulletOrder(paragraph) + ". " : "- ";
leadingSpace = new Array(nestingLevel).fill(" ").join("");

return (
leadingSpace +
itemMarker +
Expand All @@ -295,6 +305,49 @@ export const parseParagraph = (documentContext) => (paragraph) => {
}
};

/**
 * Pre-computes the order number of every bulleted paragraph so that the actual
 * parsing can later look the number up, possibly out of order.
 *
 * Counting is done per list (`bullet.listId`) and per nesting level
 * (`bullet.nestingLevel`, treated as 0 when absent). Whenever an item appears at
 * a shallower level, the counters of all deeper levels of that list are
 * discarded, so a nested sub-list under a new parent item starts again at 1.
 *
 * Paragraphs are identified by the `startIndex` of their first element, so every
 * bulleted paragraph is assumed to have at least one element.
 *
 * @param {Array<object>} paragraphs an array of paragraphs where the list items are in the desired order
 * @returns {(paragraph: object) => (number|undefined)} getter returning the
 *   1-based order number of a paragraph, or `undefined` if none was recorded
 *   for it (e.g. the paragraph has no bullet)
 * @throws {Error} if two bulleted paragraphs share the same paragraph id
 */
export const makeBulletOrderMap = (paragraphs) => {
  // Using the startIndex of the first element of the paragraph
  // Assuming that each paragraph has at least one element
  const getParagraphId = (paragraph) => paragraph.elements[0].startIndex;

  // paragraphId -> order number
  const bulletOrderNumbers = new Map();
  // listId -> (nestingLevel -> last order number handed out at that level)
  const listBulletCounters = new Map();

  paragraphs.forEach((paragraph) => {
    const { bullet } = paragraph;
    if (!bullet) return;

    let levelCounters = listBulletCounters.get(bullet.listId);
    if (!levelCounters) {
      levelCounters = new Map();
      listBulletCounters.set(bullet.listId, levelCounters);
    }

    // Each nesting level has a separate count
    const nestingLevel = bullet.nestingLevel ?? 0;

    // Returning to this level closes every deeper sub-list, so their
    // numbering must restart the next time that depth is entered.
    for (const level of [...levelCounters.keys()]) {
      if (level > nestingLevel) levelCounters.delete(level);
    }

    const paragraphOrderNum = (levelCounters.get(nestingLevel) ?? 0) + 1;
    levelCounters.set(nestingLevel, paragraphOrderNum);

    const paragraphId = getParagraphId(paragraph);
    if (bulletOrderNumbers.has(paragraphId)) {
      throw new Error("ParagraphId should be unique for each paragraph");
    }
    bulletOrderNumbers.set(paragraphId, paragraphOrderNum);
  });

  return (paragraph) => bulletOrderNumbers.get(getParagraphId(paragraph));
};

const isGrey = (textStyle) => {
const { red, green, blue } =
textStyle?.foregroundColor?.color?.rgbColor || {};
Expand Down
Loading