Skip to content

Commit

Permalink
Merge pull request #61 from StampyAI/stampyui-352-jhender-parseParagr…
Browse files Browse the repository at this point in the history
…aph-all-paragraphs

Determine correct bullet points when calling parseParagraph
  • Loading branch information
LeMurphant authored Nov 13, 2024
2 parents d5f618b + 6dd7a5c commit b86afb0
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 22 deletions.
141 changes: 128 additions & 13 deletions parser/__tests__/parser.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
parsefootnoteReference,
parseElement,
mergeSameElements,
makeBulletOrderMap,
} from "../parser.js";

fetchMock.enableMocks();
Expand Down Expand Up @@ -269,6 +270,23 @@ describe("parseElement", () => {
});

describe("parseParagraph", () => {
// Builds a minimal paragraph fixture made of `runCount` text runs.
// Each run gets a consecutive `startIndex` (starting at `startIndex`), and the
// run contents alternate between "Hello, " and "world!" so that two runs
// concatenate to "Hello, world!".
const getParagraph = (startIndex, runCount) => {
  const elements = Array.from({ length: runCount }, (_, i) => ({
    startIndex: startIndex + i,
    textRun: {
      content: i % 2 === 0 ? "Hello, " : "world!",
    },
  }));
  return {
    elements,
    paragraphStyle: { namedStyleType: "NORMAL_TEXT" },
  };
};
const paragraph = getParagraph(1, 2);

const documentContext = {
lists: {
"list-id": {
Expand All @@ -283,14 +301,6 @@ describe("parseParagraph", () => {
},
};

const paragraph = {
elements: [
{ textRun: { content: "Hello, " } },
{ textRun: { content: "world!" } },
],
paragraphStyle: { namedStyleType: "NORMAL_TEXT" },
};

it("should handle empty paragraphs", () => {
const result = parseParagraph(documentContext)({
elements: [
Expand Down Expand Up @@ -354,12 +364,117 @@ describe("parseParagraph", () => {
it("should return an ordered list item", () => {
const listItem = {
...paragraph,
bullet: { nestingLevel: 1, listId: "list-id" },
bullet: { listId: "list-id" },
};
documentContext.lists["list-id"].listProperties.nestingLevels[1].glyphType =
"DECIMAL";
const result = parseParagraph(documentContext)(listItem);
expect(result).toEqual(" 1. Hello, world!");
const context = {
...documentContext,
lists: {
"list-id": {
listProperties: {
nestingLevels: [{ glyphType: "DECIMAL" }],
},
},
},
getBulletOrderNumber: makeBulletOrderMap([listItem]),
};
const result = parseParagraph(context)(listItem);
expect(result).toEqual("1. Hello, world!");
});

it("should parse a list with several items", () => {
  // Two consecutive items of the same list, both at the default nesting level.
  const paragraphCount = 2;
  const runCount = 2;
  const paragraphs = Array.from({ length: paragraphCount }, (_, i) => ({
    // Offset the start index so every paragraph has a unique id.
    ...getParagraph(i * runCount + 1, runCount),
    bullet: { listId: "list-id" },
  }));
  const context = {
    ...documentContext,
    lists: {
      "list-id": {
        listProperties: {
          nestingLevels: [{ glyphType: "DECIMAL" }],
        },
      },
    },
    getBulletOrderNumber: makeBulletOrderMap(paragraphs),
  };
  // parseParagraph is curried on the document context only; the order
  // numbers reach it through context.getBulletOrderNumber.
  const parseWithContext = parseParagraph(context);
  const result1 = parseWithContext(paragraphs[0]);
  expect(result1).toEqual("1. Hello, world!");
  const result2 = parseWithContext(paragraphs[1]);
  expect(result2).toEqual("2. Hello, world!");
});

it("should parse multiple lists", () => {
  // Two items that each belong to a different list: both must be numbered 1.
  const paragraphCount = 2;
  const runCount = 2;
  const paragraphs = Array.from({ length: paragraphCount }, (_, i) => ({
    // Offset the start index so every paragraph has a unique id.
    ...getParagraph(i * runCount + 1, runCount),
    bullet: { listId: "list-id-" + i },
  }));
  const decimalList = {
    listProperties: {
      nestingLevels: [{ glyphType: "DECIMAL" }],
    },
  };
  const context = {
    ...documentContext,
    lists: {
      "list-id-0": decimalList,
      "list-id-1": decimalList,
    },
    getBulletOrderNumber: makeBulletOrderMap(paragraphs),
  };
  // parseParagraph is curried on the document context only; the order
  // numbers reach it through context.getBulletOrderNumber.
  const parseWithContext = parseParagraph(context);
  const result1 = parseWithContext(paragraphs[0]);
  expect(result1).toEqual("1. Hello, world!");
  const result2 = parseWithContext(paragraphs[1]);
  expect(result2).toEqual("1. Hello, world!");
});

it("should parse a list with a nested item", () => {
  // First item sits at the top level (no nestingLevel on the bullet),
  // the second one is nested one level deeper in the same list.
  const paragraphCount = 2;
  const runCount = 2;
  const paragraphs = Array.from({ length: paragraphCount }, (_, i) => ({
    // Offset the start index so every paragraph has a unique id.
    ...getParagraph(i * runCount + 1, runCount),
    bullet:
      i >= 1
        ? { listId: "list-id", nestingLevel: i }
        : { listId: "list-id" },
  }));
  const context = {
    ...documentContext,
    lists: {
      "list-id": {
        listProperties: {
          nestingLevels: [{ glyphType: "DECIMAL" }, { glyphType: "DECIMAL" }],
        },
      },
    },
    getBulletOrderNumber: makeBulletOrderMap(paragraphs),
  };
  // parseParagraph is curried on the document context only; the order
  // numbers reach it through context.getBulletOrderNumber.
  const parseWithContext = parseParagraph(context);
  const result1 = parseWithContext(paragraphs[0]);
  expect(result1).toEqual("1. Hello, world!");
  const result2 = parseWithContext(paragraphs[1]);
  // Each nesting level has its own counter, so the nested item is numbered 1.
  const nestingSpacer = " ";
  expect(result2).toEqual(nestingSpacer + "1. Hello, world!");
});
});

Expand Down
71 changes: 62 additions & 9 deletions parser/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ const extractDocParts = (doc) => {
};

export const parseDoc = async (doc, answer) => {
const { paragraphs, relatedAnswerDocIDs, alternativePhrasings, glossary } =
extractDocParts(doc);

// contextual information about the doc that is sometimes useful
// to the parsers of particular elements
const documentContext = {
Expand All @@ -94,18 +97,15 @@ export const parseDoc = async (doc, answer) => {
inlineObjects: doc.inlineObjects,
lists: doc.lists || {},
suggestions: new Map(), // Accumulators for the count and total text length of all suggestions
getBulletOrderNumber: makeBulletOrderMap(paragraphs),
};
const { paragraphs, relatedAnswerDocIDs, alternativePhrasings, glossary } =
extractDocParts(doc);

// If the content is just a link to external content, fetch it and use it as the contents
const tagContent = await fetchExternalContent(paragraphs);
if (tagContent) {
return { md: tagContent, relatedAnswerDocIDs, alternativePhrasings };
}

const body = paragraphs.map(parseParagraph(documentContext)).join("\n\n");

const footnotes = extractFootnotes(documentContext, doc);

const md = body + "\n\n" + footnotes;
Expand Down Expand Up @@ -265,6 +265,11 @@ export const parseParagraph = (documentContext) => (paragraph) => {
const listID = pb.listId;
const list = documentContext.lists[listID];
const currentLevel = list.listProperties.nestingLevels[nestingLevel];
if (!currentLevel) {
throw new Error(
"Level information should be available for all nesting levels. Input json must be incorrect"
);
}

// This check is ugly as sin, but necessary because GDocs doesn't actually clearly say "this is an [un]ordered list" anywhere
// I think this is because internally, all lists are ordered and it just only sometimes uses glyphs which represent that
Expand All @@ -273,12 +278,17 @@ export const parseParagraph = (documentContext) => (paragraph) => {
currentLevel.hasOwnProperty("glyphType") &&
currentLevel.glyphType !== "GLYPH_TYPE_UNSPECIFIED";

// Please forgive me for always using 1. as the sequence number on list items
// It's sorta hard to count them properly so I'm depending on markdown renderers doing the heavy lifting for me.
// Which, in fairness, they're supposed to.
itemMarker = isOrdered ? "1. " : "- ";
const getBulletOrder = (paragraph) => {
const orderNumber = documentContext.getBulletOrderNumber(paragraph);
if (!orderNumber) {
throw new Error(
"Order number should be available for all ordered paragraphs"
);
}
return orderNumber;
};
itemMarker = isOrdered ? getBulletOrder(paragraph) + ". " : "- ";
leadingSpace = new Array(nestingLevel).fill(" ").join("");

return (
leadingSpace +
itemMarker +
Expand All @@ -295,6 +305,49 @@ export const parseParagraph = (documentContext) => (paragraph) => {
}
};

/**
 * Pre-computes the order number of every bulleted paragraph.
 *
 * Numbering depends on the sequence of the list items, so it is computed here
 * in one in-order pass; the returned getter can then be called in any order
 * while parsing.
 *
 * A separate counter is kept per list id and per nesting level. When an item
 * appears at a shallower level, the counters of all deeper levels of that list
 * are dropped, so a nested sub-list under the next parent item starts at 1
 * again instead of continuing the previous sub-list's numbering.
 *
 * Paragraphs are identified by the `startIndex` of their first element, so
 * every paragraph passed in (or to the getter) must have at least one element.
 *
 * @param {Array<object>} paragraphs paragraphs with the list items in the desired order
 * @returns {(paragraph: object) => (number|undefined)} getter for the order number
 *   of a paragraph; undefined when no number was recorded for it
 * @throws {Error} when two bulleted paragraphs resolve to the same paragraph id
 */
export const makeBulletOrderMap = (paragraphs) => {
  // Using the startIndex of the first element of the paragraph
  // Assuming that each paragraph has at least one element
  const getParagraphId = (paragraph) => paragraph.elements[0].startIndex;

  const bulletOrderNumbers = new Map();
  const listBulletCounters = new Map();

  for (const paragraph of paragraphs) {
    const { bullet } = paragraph;
    if (!bullet) continue;

    let levelCounters = listBulletCounters.get(bullet.listId);
    if (!levelCounters) {
      levelCounters = new Map();
      listBulletCounters.set(bullet.listId, levelCounters);
    }

    // The nesting level is omitted for top-level items
    const nestingLevel = bullet.nestingLevel ?? 0;

    // Leaving a nested sub-list ends it: forget the deeper counters so the
    // next sub-list at those levels restarts from 1.
    for (const level of levelCounters.keys()) {
      if (level > nestingLevel) levelCounters.delete(level);
    }

    // Each nesting level should have separate count
    const paragraphOrderNum = (levelCounters.get(nestingLevel) ?? 0) + 1;
    levelCounters.set(nestingLevel, paragraphOrderNum);

    const paragraphId = getParagraphId(paragraph);
    if (bulletOrderNumbers.has(paragraphId)) {
      throw new Error("ParagraphId should be unique for each paragraph");
    }
    bulletOrderNumbers.set(paragraphId, paragraphOrderNum);
  }

  return (paragraph) => bulletOrderNumbers.get(getParagraphId(paragraph));
};

const isGrey = (textStyle) => {
const { red, green, blue } =
textStyle?.foregroundColor?.color?.rgbColor || {};
Expand Down

0 comments on commit b86afb0

Please sign in to comment.