Skip to content

Commit

Permalink
Merge pull request #376 from AmazeeLabs/feature/SLB-495-dev-merge
Browse files Browse the repository at this point in the history
SLB 495 Import content AI
  • Loading branch information
dspachos authored Nov 27, 2024
2 parents 53929a4 + e79453c commit 71d8d01
Show file tree
Hide file tree
Showing 29 changed files with 5,516 additions and 44 deletions.
11 changes: 11 additions & 0 deletions .lagoon/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ RUN --mount=type=cache,target=/tmp/cache pnpm i && \
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/cms" /tmp/.deploy/cms --prod
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/publisher" /tmp/.deploy/publisher --prod
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/preview" /tmp/.deploy/preview --prod
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/converter" /tmp/.deploy/converter --prod

# ====================================================================================================
# CLI IMAGE
Expand Down Expand Up @@ -119,3 +120,13 @@ RUN npm install -g [email protected]
COPY --from=builder /tmp/.deploy/publisher /app

CMD pnpm publisher

# ====================================================================================================
# CONVERTER IMAGE
# ====================================================================================================

# Final stage "convertmd": image that runs the converter app.
FROM uselagoon/node-18 as convertmd

# NOTE(review): the package spec below is obfuscated in this capture
# ("[email protected]"); presumably a pinned pnpm@<version> as CMD uses pnpm — confirm.
RUN npm install -g [email protected]
# Copy the output of `pnpm deploy --filter "@custom/converter"` from the builder stage.
COPY --from=builder /tmp/.deploy/converter /app
# Launch the package's "start" script through pnpm.
CMD pnpm start
170 changes: 170 additions & 0 deletions apps/converter/htmlToMarkdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import crypto from 'crypto';
import fs from 'fs-extra';
import imageType from 'image-type';
import { JSDOM } from 'jsdom';
import fetch from 'node-fetch';
import path from 'path';
import TurndownService from 'turndown';
import { fileURLToPath } from 'url';

// Root directory under which every conversion gets its own output folder.
// @todo Fix this to work locally and live
const __filename = fileURLToPath(import.meta.url);
// Any non-empty LAGOON environment variable switches to the Lagoon location.
const isLagoon = Boolean(process.env.LAGOON);
// On Lagoon the output goes to the fixed directory below; otherwise it is
// written next to this module.
const __dirname = isLagoon
  ? '/app/web/sites/default/files/converted'
  : path.dirname(__filename);

/**
 * Extracts the primary content of an HTML page.
 *
 * The markup between the first `<body ...>` and the following `</body>` is
 * parsed with JSDOM, and the inner HTML of the first element matching
 * `main, article` is returned.
 *
 * @param {string} htmlString - Full HTML source of the page.
 * @returns {Promise<string>} Inner HTML of the first `<main>`/`<article>`
 *   element. An empty string is returned when the input is not a string, when
 *   it has no (or an empty) `<body>`, or when no such element exists.
 */
async function extractMainContent(htmlString) {
  if (typeof htmlString !== 'string') {
    return '';
  }

  const bodyMatch = htmlString.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  const bodyHtml = bodyMatch ? bodyMatch[1] : '';
  // Always hand back a string: callers hash and parse the result, and
  // `undefined` would make them throw.
  if (!bodyHtml) {
    return '';
  }

  const { document } = new JSDOM(bodyHtml).window;
  const container = document.querySelector('main, article');
  return container ? container.innerHTML : '';
}

/**
 * Chooses a file extension for downloaded image data.
 *
 * @param buffer - Image bytes handed to `imageType` for detection.
 * @returns {Promise<string>} The detected extension prefixed with a dot, or
 *   `.png` when `imageType` reports nothing.
 */
async function getImageExtension(buffer) {
  const detected = await imageType(buffer);
  if (!detected) {
    return '.png';
  }
  return `.${detected.ext}`;
}

/**
 * Derives a folder name from content.
 *
 * @param content - Data fed to the MD5 hash.
 * @returns {string} The first 12 hexadecimal characters of the MD5 digest, so
 *   identical content always maps to the same folder name.
 */
function generateFolderName(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  const hexDigest = hasher.digest('hex');
  return hexDigest.slice(0, 12);
}

/**
 * Downloads an image, treating every failure as non-fatal.
 *
 * @param {string} url - Address passed to `fetch`.
 * @returns {Promise<Buffer|null>} The response body as a Buffer, or `null`
 *   (after logging a warning) when the request throws or the response is not OK.
 */
async function downloadImage(url) {
  let imageBytes = null;
  try {
    const reply = await fetch(url);
    if (!reply.ok) {
      throw new Error(`Failed to fetch image: ${reply.statusText}`);
    }
    imageBytes = Buffer.from(await reply.arrayBuffer());
  } catch (error) {
    // Best effort: a missing image must not abort the caller.
    console.warn(
      `Warning: Failed to download image from ${url}:`,
      error.message,
    );
  }
  return imageBytes;
}

/**
 * Reports whether a value can be parsed by the `URL` constructor.
 *
 * @param string - Candidate URL.
 * @returns {boolean} `true` when `new URL(string)` succeeds, `false` when it throws.
 */
function isValidUrl(string) {
  let parses = true;
  try {
    // Only the constructor's success or failure matters; the instance is discarded.
    new URL(string);
  } catch {
    parses = false;
  }
  return parses;
}

/**
 * Builds a pipe-delimited Markdown table from a `<table>` element.
 *
 * The first `<tr>` supplies the header row; every later `<tr>` becomes a body
 * row. Both `<th>` and `<td>` cells are read in every row so row-header cells
 * are not lost, and whitespace inside a cell is collapsed to single spaces so
 * a line break in the cell text cannot split a table row.
 *
 * @param tableNode - The `<table>` DOM node handed over by Turndown.
 * @returns {string} Markdown for the table, or an empty string when the table
 *   has no rows.
 */
function tableToMarkdown(tableNode) {
  const readCells = (row) =>
    Array.from(row.querySelectorAll('th,td')).map((cell) =>
      cell.textContent.replace(/\s+/g, ' ').trim(),
    );

  const rows = Array.from(tableNode.querySelectorAll('tr')).map(readCells);
  if (rows.length === 0) {
    return '';
  }

  const [headerCells, ...bodyRows] = rows;
  const headers = headerCells.join(' | ');
  // One '---' per header cell (at least one), counted from the cells
  // themselves so a literal '|' in header text cannot add columns.
  const separator = (headerCells.length > 0 ? headerCells : [''])
    .map(() => '---')
    .join(' | ');
  const body = bodyRows.map((cells) => cells.join(' | ')).join('\n');

  return `\n${headers}\n${separator}\n${body}\n\n`;
}

/**
 * Downloads one image and stores it in the images directory.
 *
 * @param {string} srcAttribute - Raw `src` value from the `<img>` element.
 * @param {string} pageUrl - URL of the page, used as base for relative sources.
 * @param {string} imagesDir - Directory the image file is written to.
 * @param {string[]} warnings - Collector that receives a message on failure.
 * @returns {Promise<string|null>} Path of the saved file relative to the
 *   output directory, or `null` when the image was skipped.
 */
async function saveImageLocally(srcAttribute, pageUrl, imagesDir, warnings) {
  let absoluteUrl;
  try {
    // Resolve relative URLs to absolute URLs
    absoluteUrl = new URL(srcAttribute, pageUrl).href;
  } catch (error) {
    // A single malformed src must not abort the whole conversion.
    warnings.push(
      `Skipped image with unresolvable src "${srcAttribute}": ${error.message}`,
    );
    return null;
  }

  const imageBuffer = await downloadImage(absoluteUrl);
  if (!imageBuffer) {
    warnings.push(`Failed to download image: ${absoluteUrl}`);
    return null;
  }

  const extension = await getImageExtension(imageBuffer);
  const filename = `image-${crypto.randomBytes(4).toString('hex')}${extension}`;
  await fs.writeFile(path.join(imagesDir, filename), imageBuffer);
  return path.join('images', filename);
}

/**
 * Rewrites every `<img src>` in the document to point at a local copy.
 *
 * Images are processed one after another. Each distinct `src` is downloaded
 * at most once; images that cannot be saved keep their original `src`.
 *
 * @param document - JSDOM document whose images are localized in place.
 * @param {string} pageUrl - URL of the page, used as base for relative sources.
 * @param {string} imagesDir - Directory the image files are written to.
 * @param {string[]} warnings - Collector for per-image failure messages.
 * @returns {Promise<void>}
 */
async function localizeImages(document, pageUrl, imagesDir, warnings) {
  // src attribute -> saved relative path (null when the image was skipped)
  const imageMap = new Map();

  for (const img of document.querySelectorAll('img')) {
    const srcAttribute = img.getAttribute('src');
    if (!srcAttribute) continue;

    if (!imageMap.has(srcAttribute)) {
      imageMap.set(
        srcAttribute,
        await saveImageLocally(srcAttribute, pageUrl, imagesDir, warnings),
      );
    }

    const localPath = imageMap.get(srcAttribute);
    if (localPath) {
      img.setAttribute('src', localPath);
    }
  }
}

/**
 * Fetches a web page and converts its main content to a Markdown file.
 *
 * The content returned by `extractMainContent` is hashed to pick an output
 * folder below `__dirname`; images are downloaded into its `images`
 * sub-folder and the Markdown is written to `content.md`.
 *
 * @param {string} url - Address of the page to convert.
 * @returns {Promise<{markdownPath: string, warnings: string[], outputDir: string}>}
 *   Path of the written Markdown file, non-fatal problems encountered (missing
 *   main content, images that could not be saved) and the output folder.
 * @throws {Error} When `url` is not a valid URL or the page response is not OK.
 */
export async function htmlToMarkdown(url) {
  if (!isValidUrl(url)) {
    throw new Error(`Invalid URL provided: ${url}`);
  }

  // Fetch HTML content
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch page: ${response.statusText}`);
  }
  const fullHtml = await response.text();

  const warnings = [];
  // extractMainContent may yield nothing; normalise to '' so hashing and
  // parsing below cannot throw on undefined.
  const html = (await extractMainContent(fullHtml)) ?? '';
  if (html === '') {
    warnings.push('No <main> or <article> content was found in the page');
  }

  // Generate folder name based on HTML content
  const folderName = generateFolderName(html);
  const outputDir = path.join(__dirname, folderName);
  const imagesDir = path.join(outputDir, 'images');

  await fs.ensureDir(outputDir);
  await fs.ensureDir(imagesDir);

  // Parse HTML using JSDOM
  const { document } = new JSDOM(html).window;

  // Process images before conversion so the Markdown references local files
  await localizeImages(document, url, imagesDir, warnings);

  // Configure Turndown
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    hr: '---',
    bulletListMarker: '-',
    strongDelimiter: '**',
  });

  // Add custom rules
  turndownService.addRule('tables', {
    filter: 'table',
    replacement: (_content, node) => tableToMarkdown(node),
  });

  // Convert to Markdown, collapse runs of blank lines and give images without
  // alt text a placeholder label
  const markdown = turndownService
    .turndown(document.body)
    .replace(/\n\s*\n\s*\n/g, '\n\n')
    .replace(/!\[\]\(/g, '![image](')
    .trim();

  // Save markdown file
  const mdPath = path.join(outputDir, 'content.md');
  await fs.writeFile(mdPath, markdown);

  return {
    markdownPath: mdPath,
    warnings,
    outputDir,
  };
}
172 changes: 172 additions & 0 deletions apps/converter/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import { parse } from '@textlint/markdown-to-ast';
import express from 'express';
import { readFileSync } from 'fs';
import { toHtml } from 'hast-util-to-html';
import { fromMarkdown } from 'mdast-util-from-markdown';
import { toHast } from 'mdast-util-to-hast';

import { htmlToMarkdown } from './htmlToMarkdown.js';
import { wordToMarkdown } from './wordToMarkdown.js';

// Express application on which the /convert and /html-convert routes below are registered.
const app = express();
// Port handed to app.listen() at the end of this file.
const PORT = 3000;

/**
 * Converts a pipe-delimited Markdown table into an HTML `<table>` string.
 *
 * The input may be wrapped in a `<p>…</p>` pair (leading `<p>` on the first
 * line, trailing `</p>` on the last); those tags are stripped. Cell contents
 * are inserted into the markup unchanged.
 *
 * Empty cells inside a row are kept so columns stay aligned, rows shorter than
 * the header are padded with empty cells, and the second line is only skipped
 * when it really is a `---` separator row.
 *
 * @param {string} markdownTable - Table text, one row per line.
 * @returns {string} HTML with a `<thead>` built from the first line and a
 *   `<tbody>` built from the remaining non-empty rows.
 */
function markdownToHtmlTable(markdownTable) {
  const stripParagraphTags = (text) =>
    text.replace(/^<p>/, '').replace(/<\/p>$/, '');

  // Splits one line into trimmed cells; a line without any content yields [].
  const splitRow = (line) => {
    let row = stripParagraphTags(line.trim()).trim();
    // Outer pipes are optional decoration, not cell boundaries.
    if (row.startsWith('|')) row = row.slice(1);
    if (row.endsWith('|')) row = row.slice(0, -1);
    const cells = row
      .split('|')
      .map((cell) => stripParagraphTags(cell.trim()));
    return cells.every((cell) => cell === '') ? [] : cells;
  };

  const isSeparatorRow = (line) => {
    const cells = splitRow(line);
    return cells.length > 0 && cells.every((cell) => /^:?-+:?$/.test(cell));
  };

  const lines = String(markdownTable ?? '')
    .trim()
    .split('\n');

  // Extract headers (first line)
  const headers = splitRow(lines[0]);

  // Skip the separator line (second line with ---) when present
  const firstDataIndex = lines.length > 1 && isSeparatorRow(lines[1]) ? 2 : 1;

  const rows = lines
    .slice(firstDataIndex)
    .map(splitRow)
    .filter((cells) => cells.length > 0)
    .map((cells) => {
      const padded = [...cells];
      while (padded.length < headers.length) {
        padded.push('');
      }
      return padded;
    });

  const headerHtml = headers.map((header) => `\n<th>${header}</th>`).join('');
  const bodyHtml = rows
    .map(
      (cells) =>
        `\n<tr>${cells.map((cell) => `\n<td>${cell}</td>`).join('')}\n</tr>`,
    )
    .join('');

  return `<table>\n<thead>\n<tr>${headerHtml}\n</tr>\n</thead>\n<tbody>${bodyHtml}\n</tbody>\n</table>`;
}

// Express endpoint
app.get('/convert', async (req, res) => {
const filePath = req.query.path;

if (!filePath) {
return res.status(400).json({
error: "Please provide a Word document path as 'path' query parameter",
});
}

try {
// First convert Word to Markdown
const { markdownPath, warnings, outputDir } =
await wordToMarkdown(filePath);

// Then read and process the Markdown
const markdown = readFileSync(markdownPath, 'utf-8');
const mdast = fromMarkdown(markdown);

const md = readFileSync(markdownPath, 'utf-8');
const ast = parse(md);

mdast.children.forEach(async (element, index) => {
const hast = toHast(element, { allowDangerousHtml: true });
const html = toHtml(hast, { allowDangerousHtml: true });
element.htmlValue = html;
element.type = ast.children[index].type;
element.raw = ast.children[index].raw;
if (element.type == 'Table') {
element.htmlValue = markdownToHtmlTable(html);
}

if (ast.children[index].children[0].type == 'Image') {
element.type = 'Image';
element.src = `${outputDir}/${ast.children[index].children[0].url}`;
}
});

// Return the processed content along with conversion info
res.json({
content: mdast.children,
outputDirectory: outputDir,
warnings: warnings,
});
} catch (error) {
if (error.code === 'ENOENT') {
res.status(404).json({ error: `File not found: ${filePath}` });
} else {
res.status(500).json({
error: 'Error processing document',
details: error.message,
});
}
}
});

app.get('/html-convert', async (req, res) => {
const filePath = req.query.path;

if (!filePath) {
return res.status(400).json({
error: "Please provide a URLas 'path' query parameter",
});
}

try {
// First convert Word to Markdown
const { markdownPath, warnings, outputDir } =
await htmlToMarkdown(filePath);

// Then read and process the Markdown
const markdown = readFileSync(markdownPath, 'utf-8');
const mdast = fromMarkdown(markdown);

const md = readFileSync(markdownPath, 'utf-8');
const ast = parse(md);

mdast.children.forEach(async (element, index) => {
const hast = toHast(element, { allowDangerousHtml: true });
const html = toHtml(hast, { allowDangerousHtml: true });
element.htmlValue = html;
element.type = ast.children[index].type;
element.raw = ast.children[index].raw;
if (element.type == 'Table') {
element.htmlValue = markdownToHtmlTable(html);
}

if (ast.children[index].children[0].type == 'Image') {
element.type = 'Image';
element.src = `${outputDir}/${ast.children[index].children[0].url}`;
}
});

// Return the processed content along with conversion info
res.json({
content: mdast.children,
outputDirectory: outputDir,
warnings: warnings,
});
} catch (error) {
if (error.code === 'ENOENT') {
res.status(404).json({ error: `File not found: ${filePath}` });
} else {
res.status(500).json({
error: 'Error processing document',
details: error.message,
});
}
}
});

// Start the HTTP server on PORT; the callback logs the local URL.
app.listen(PORT, () => {
console.log(`Server running on http://localhost:${PORT}`);
});
Loading

0 comments on commit 71d8d01

Please sign in to comment.