Skip to content

Commit

Permalink
feat(slb-495): html parser updates, lagoon env check and more
Browse files Browse the repository at this point in the history
  • Loading branch information
dspachos committed Nov 26, 2024
1 parent 312ef28 commit 527b817
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 17 deletions.
33 changes: 21 additions & 12 deletions apps/converter/htmlToMarkdown.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,29 @@ import TurndownService from 'turndown';
import { fileURLToPath } from 'url';

// Root directory that converted output folders are written under.
// On Lagoon (detected through the LAGOON environment variable) output goes to
// the fixed files directory below; everywhere else it goes next to this
// module, so the converter works both locally and live.
const LAGOON_OUTPUT_ROOT = '/app/web/sites/default/files/converted';
const isLagoon = Boolean(process.env.LAGOON);
const __filename = fileURLToPath(import.meta.url);
const __dirname = isLagoon ? LAGOON_OUTPUT_ROOT : path.dirname(__filename);
// Alias kept only for code that still refers to the old name; it always
// resolves to the same directory as __dirname.
const lagoon_dirname = __dirname;

/**
 * Extract the primary content markup from an HTML string.
 *
 * The content between the first <body ...> and the first following </body>
 * is isolated with a regular expression; when no such pair exists (for
 * example an HTML fragment) the whole input is used instead. That markup is
 * parsed with JSDOM and the inner HTML of the first <main> element is
 * returned, falling back to the first <article> element.
 *
 * @param {string} htmlString - Raw HTML to search.
 * @returns {Promise<string>} Inner HTML of <main> or <article>, or an empty
 *   string when neither element exists or the input is empty. Never
 *   resolves to undefined or null.
 */
async function extractMainContent(htmlString) {
  const source = htmlString == null ? '' : String(htmlString);
  if (source === '') {
    return '';
  }

  const bodyMatch = source.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  // Without a <body>…</body> pair, parse the input as-is rather than giving
  // up, so fragments still yield their <main>/<article> content.
  const html = bodyMatch ? bodyMatch[1] : source;

  const { document } = new JSDOM(html).window;
  const contentElement =
    document.querySelector('main') ?? document.querySelector('article');

  // Always hand back a string so callers can use the result directly.
  return contentElement ? contentElement.innerHTML : '';
}

async function getImageExtension(buffer) {
Expand Down Expand Up @@ -73,7 +82,7 @@ export async function htmlToMarkdown(url) {
const html = await extractMainContent(fullHtml);
// Generate folder name based on HTML content
const folderName = generateFolderName(html);
const outputDir = path.join(lagoon_dirname, folderName);
const outputDir = path.join(__dirname, folderName);
const imagesDir = path.join(outputDir, 'images');

await fs.ensureDir(outputDir);
Expand Down
10 changes: 5 additions & 5 deletions apps/converter/wordToMarkdown.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ import TurndownService from 'turndown';
import { fileURLToPath } from 'url';

// Root directory that converted output folders are written under.
// On Lagoon (detected through the LAGOON environment variable) output goes to
// the fixed files directory below; everywhere else it goes next to this
// module, so the converter works both locally and live.
const LAGOON_OUTPUT_ROOT = '/app/web/sites/default/files/converted';
const isLagoon = Boolean(process.env.LAGOON);
const __filename = fileURLToPath(import.meta.url);
const __dirname = isLagoon ? LAGOON_OUTPUT_ROOT : path.dirname(__filename);
// Alias kept only for code that still refers to the old name; it always
// resolves to the same directory as __dirname.
const lagoon_dirname = __dirname;

async function getImageExtension(buffer) {
const type = await imageType(buffer);
Expand All @@ -31,7 +31,7 @@ export async function wordToMarkdown(filePath) {

const folderName = generateFolderName(filePath);
// const outputDir = path.join(__dirname, folderName);
const outputDir = path.join(lagoon_dirname, folderName);
const outputDir = path.join(__dirname, folderName);
const imagesDir = path.join(outputDir, 'images');

await fs.ensureDir(outputDir);
Expand Down

0 comments on commit 527b817

Please sign in to comment.