Skip to content

Commit

Permalink
The conversion can be performed, but there is a small problem
Browse files Browse the repository at this point in the history
  • Loading branch information
Zacharia2 committed Jul 12, 2023
1 parent 39b1cca commit 95ab5b6
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 74 deletions.
1 change: 0 additions & 1 deletion epub2twpub/epub-reader.js
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@ class EpubReader {
},
logError: this.logError.bind(this)
});
await textExtractor.initialise();
// Extract each HTML file listed in the spine
for(const spineItem of this.spine) {
const manifestItem = this.manifest[spineItem];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Each stylsheet entry is the text of the stylesheet
*/

exports.getPageText = function(win,doc) {
exports.getStructure = function(win,doc) {
win = win || window;
doc = doc || document;

Expand Down Expand Up @@ -112,7 +112,7 @@ const nonBlankChunks = chunks.outputChunks.filter(chunk => {
return !(chunk.anchorIds.length === 0 && (chunk.nodes.length === 1) && (typeof (chunk.nodes[0]) === "string") && (!(/\S/.test(chunk.nodes[0]))));
})
// Get the expected test results if present
const domExpectedResults = document.getElementsByTagName("script")[0];
const domExpectedResults = doc.getElementsByTagName("script")[0];
var expectedResults;
if(domExpectedResults && domExpectedResults.id === "expectedResults") {
try {
Expand Down
3 changes: 1 addition & 2 deletions epub2twpub/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ async function testPage(filepath) {
console.log("Text extractor error: " + msg)
}
});
await textExtractor.initialise();
// Get the text of the page
// Get the text of the page
const results = await textExtractor.getPageText("index.html");
// Flatten the nodes of the results
for(const chunk of results.chunks) {
Expand Down
82 changes: 32 additions & 50 deletions epub2twpub/text-extractor.js
Original file line number Diff line number Diff line change
@@ -1,59 +1,41 @@
/*
Class representing the Puppeteer-based wrapper for get-page-text.js
Class representing the jsdom wrapper for get-page-text.js
*/

const playwright = require("playwright"),
{getPageText} = require("./injected/get-page-text");

const URL_PREFIX = "https://example.com/";
const { JSDOM } = require("jsdom");
const { getStructure } = require("./get-page-text");

class TextExtractor {

	/**
	 * Wraps get-page-text.js so that it can be run against a file of the book
	 * using jsdom instead of a headless browser.
	 *
	 * @param {object} options
	 * @param {function(string): Promise<{type: string, contents: *}>} options.getFile
	 *        Given an href, resolves to the file's `type` and `contents`.
	 *        A falsy `type` is treated as "file not found".
	 * @param {function(string): void} options.logError
	 *        Called with a message when a requested file is missing.
	 */
	constructor(options) {
		this.getFile = options.getFile;
		this.logError = options.logError;
	}

	/**
	 * No-op kept for backward compatibility: the earlier browser-based
	 * implementation had to be initialised before use, the jsdom-based one
	 * does not. Callers that still `await initialise()` keep working.
	 */
	async initialise() {}

	/**
	 * Reads the file named by `href` through `getFile`, parses its contents
	 * with jsdom and passes the resulting window/document to `getStructure`.
	 *
	 * @param {string} href Name of the file to process.
	 * @returns {Promise<{chunks: Array, stylsheets: Array}>} The structure
	 *          produced by `getStructure`. When the file is missing the error
	 *          is reported through `logError` and an empty structure is
	 *          returned, so callers can always read `.chunks` / `.stylsheets`.
	 */
	async getPageText(href) {
		const { type, contents } = await this.getFile(href);
		if (!type) {
			this.logError(`Missing file \`${href}\``);
			// "stylsheets" (sic) is the key name used in the documented result shape
			return { chunks: [], stylsheets: [] };
		}
		const dom = new JSDOM(contents);
		try {
			return getStructure(dom.window, dom.window.document);
		} finally {
			// Release the resources held by the jsdom window once the page has been processed
			dom.window.close();
		}
	}

	/**
	 * No-op kept for backward compatibility: there is no longer a browser
	 * or page to shut down, but existing callers may still `await close()`.
	 */
	async close() {}

}

Expand Down
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"css": "^3.0.0",
"jszip": "^3.10.1",
"@xmldom/xmldom": "^0.8.7",
"playwright": "1.36.0",
"tiddlywiki": "5.3.0",
"jsdom": "22.1.0"
},
Expand Down
18 changes: 0 additions & 18 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 95ab5b6

Please sign in to comment.