Skip to content

Commit

Permalink
fix: refactor the indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
wolfgangmm committed Dec 4, 2022
1 parent 945f9f1 commit 95f0313
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 95 deletions.
18 changes: 2 additions & 16 deletions .eleventy.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,8 @@ const tpPlugin = require("./index");
module.exports = (eleventyConfig) => {
eleventyConfig.addPassthroughCopy('demo/css/*.css');
eleventyConfig.addPlugin(tpPlugin, {
remote: 'http://localhost:8040/exist/apps/tei-publisher/',
collections: true,
index: {
content: {
"view1": {
selectors: "p,dd,li,h1,h2,h3,h4,h5,h6",
tag: 'guidelines'
}
},
title: {
"breadcrumbs": {
selectors: ".breadcrumb",
allowHtml: true
}
}
}
remote: 'http://localhost:8080/exist/apps/tei-publisher/',
collections: true
});

return {
Expand Down
99 changes: 20 additions & 79 deletions plugin.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
const { JSDOM } = require("jsdom");
const NODE_TYPE = require("jsdom/lib/jsdom/living/node-type");
const axios = require("axios");
const fs = require("fs");
const path = require("path");
Expand Down Expand Up @@ -37,14 +36,7 @@ function warn(input, ...messages) {
*/

/**
* @typedef {Object} IndexConfig - Configuration for a field in the index
* @property {string} [selectors]
* @property {string} [exclude]
* @property {string} [tag]
* @property {boolean} allowHtml=false
*
* @typedef {{string: IndexConfig|Function}} IndexComponent
* @typedef {{string: IndexComponent}} Index - Index configuration options
* @typedef {{string: Function}} Index - Index configuration options
*/


Expand Down Expand Up @@ -277,79 +269,28 @@ class TpPlugin {
}
debug(`Indexing files in ${context.outputDir}...`);
const indexFile = path.join(context.baseDir, 'index.jsonl');
let counter = 1;
let foundFields;
do {
const indexEntry = {};
foundFields = false;
for (const [field, components] of Object.entries(this.config.index)) {
for (const [component, fieldDef] of Object.entries(components)) {
const file = path.join(context.outputDir, `${component}-${counter}.json`);
if (fs.existsSync(file)) {
const json = JSON.parse(fs.readFileSync(file));
const content = [];
if (typeof fieldDef === 'function') {
content.push(fieldDef.call(indexEntry, json, context.outputDir));
} else {
const dom = JSDOM.fragment(json.content);
if (fieldDef.selectors) {
dom.querySelectorAll(fieldDef.selectors).forEach((elem) => {
if (fieldDef.allowHtml) {
content.push(elem.outerHTML);
} else {
this._plainText(elem, content, fieldDef.exclude);
}
});
} else {
if (fieldDef.allowHtml) {
content.push(dom.outerHTML);
} else {
this._plainText(dom, content, fieldDef.exclude, fieldDef.allowHtml);
}
}
}
if (!indexEntry.link) {
indexEntry.link = `${json.doc}?${json.id ? 'id=' : 'root='}${json.id || json.root}`;
for (const [component, indexDef] of Object.entries(this.config.index)) {
let counter = 1;
while (true) {
const file = path.join(context.outputDir, `${component}-${counter}.json`);
if (fs.existsSync(file)) {
const json = JSON.parse(fs.readFileSync(file));
const dom = JSDOM.fragment(json.content);
if (typeof indexDef === 'function') {
let entries = indexDef.call(null, dom, json, context.outputDir);
if (!Array.isArray(entries)) {
entries = [entries];
}
if (fieldDef.tag) {
indexEntry.tag = fieldDef.tag;
}
indexEntry[field] = content.join(' ').replace(/[\n\s]+/g, ' ');
foundFields = true;
}
}
}
if (foundFields) {
fs.writeFileSync(indexFile, JSON.stringify(indexEntry) + '\n', {
flag: 'a'
});
}
counter += 1;
} while (foundFields);
}

/**
* @param {any} node
* @param {string[]} content
* @param {string} exclude
*/
_plainText(node, content, exclude = "style,script") {
for (let i = 0; i < node.childNodes.length; i++) {
const child = node.childNodes[i];
switch (child.nodeType) {
case NODE_TYPE.ELEMENT_NODE:
if (!child.matches(exclude)) {
this._plainText(child, content, exclude);
} else {
debug(`Skipping ${child}`);
entries.forEach(entry => {
fs.writeFileSync(indexFile, JSON.stringify(entry) + '\n', {
flag: 'a'
});
});
}
} else {
break;
case NODE_TYPE.TEXT_NODE:
case NODE_TYPE.CDATA_SECTION_NODE:
content.push(child.textContent);
break;
default:
break;
}
counter += 1;
}
}
}
Expand Down
34 changes: 34 additions & 0 deletions util.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
const NODE_TYPE = require("jsdom/lib/jsdom/living/node-type");

/**
 * Returns the concatenated text content of a DOM node's descendants.
 *
 * Text is gathered by `_extractPlainText` and the collected chunks are
 * joined without any separator. Descendant elements matching `exclude`
 * are left out together with everything they contain.
 *
 * @param {Node} node the DOM node whose descendants are read
 * @param {string} exclude CSS selector for descendant elements to leave out;
 *   defaults to "style,script"
 * @returns {string} the collected plain text
 */
module.exports.extractPlainText = (node, exclude = "style,script") => {
    const chunks = [];
    _extractPlainText(node, chunks, exclude);
    const plainText = chunks.join('');
    return plainText;
};

// Node type codes as fixed by the DOM Standard (Node.nodeType). Declared
// locally so this helper does not depend on a jsdom-internal module path.
const ELEMENT_NODE = 1;
const TEXT_NODE = 3;
const CDATA_SECTION_NODE = 4;

/**
 * Walks the children of `node` depth-first and appends the text of every
 * text and CDATA node to `content`, in the order they are encountered.
 *
 * Element children matching `exclude` are skipped together with their
 * descendants. Children of any other node type (comments, processing
 * instructions, ...) are ignored.
 *
 * @param {Node} node the node whose `childNodes` are traversed
 * @param {string[]} content accumulator the text chunks are pushed onto (mutated)
 * @param {string} [exclude] CSS selector for elements to skip; an empty or
 *   missing selector excludes nothing
 */
function _extractPlainText(node, content, exclude) {
    const children = node.childNodes;
    for (let i = 0; i < children.length; i++) {
        const child = children[i];
        switch (child.nodeType) {
            case ELEMENT_NODE: {
                // Element.matches() throws a SyntaxError on an empty selector,
                // so only consult it when a selector was actually supplied.
                const isExcluded = exclude ? child.matches(exclude) : false;
                if (!isExcluded) {
                    _extractPlainText(child, content, exclude);
                }
                break;
            }
            case TEXT_NODE:
            case CDATA_SECTION_NODE:
                content.push(child.textContent);
                break;
            default:
                break;
        }
    }
}

0 comments on commit 95f0313

Please sign in to comment.