Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SLB 495 Import content AI #376

Merged
merged 5 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .lagoon/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ RUN --mount=type=cache,target=/tmp/cache pnpm i && \
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/cms" /tmp/.deploy/cms --prod
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/publisher" /tmp/.deploy/publisher --prod
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/preview" /tmp/.deploy/preview --prod
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/converter" /tmp/.deploy/converter --prod

# ====================================================================================================
# CLI IMAGE
Expand Down Expand Up @@ -119,3 +120,13 @@ RUN npm install -g [email protected]
COPY --from=builder /tmp/.deploy/publisher /app

CMD pnpm publisher

# ====================================================================================================
# CONVERTER IMAGE
# ====================================================================================================

# Runtime stage for the converter service, based on the Lagoon Node 18 image.
FROM uselagoon/node-18 as convertmd

# Install the package manager globally.
# NOTE(review): the package name/version on the next line appears mangled by
# e-mail obfuscation in this view — confirm the intended pinned version.
RUN npm install -g [email protected]
# Copy the converter bundle that the builder stage deployed to /tmp/.deploy/converter.
COPY --from=builder /tmp/.deploy/converter /app
# Launch the service through the package's `start` script.
CMD pnpm start
170 changes: 170 additions & 0 deletions apps/converter/htmlToMarkdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import crypto from 'crypto';
import fs from 'fs-extra';
import imageType from 'image-type';
import { JSDOM } from 'jsdom';
import fetch from 'node-fetch';
import path from 'path';
import TurndownService from 'turndown';
import { fileURLToPath } from 'url';

// @todo Fix this to work locally and live
// Absolute path of this module file (ES modules have no built-in __filename).
const __filename = fileURLToPath(import.meta.url);
// True when the LAGOON environment variable is set to a non-empty value.
const isLagoon = Boolean(process.env.LAGOON);
// Root directory for converted output: a fixed files directory on Lagoon,
// otherwise the directory containing this module.
const __dirname = !isLagoon
  ? path.dirname(__filename)
  : '/app/web/sites/default/files/converted';

/**
 * Extract the primary content markup from an HTML page.
 *
 * The complete document is parsed with JSDOM instead of first slicing out the
 * <body> with a regular expression: the regex stopped at the first literal
 * "</body>" it met (including one inside a script or comment) and made the
 * function return `undefined` for pages without an explicit <body> pair,
 * which crashed callers that hash or parse the result.
 *
 * @param {string} htmlString - Raw HTML of the page.
 * @returns {Promise<string>} Inner HTML of the first <main> or <article>
 *   element, or an empty string when the input is empty/not a string or no
 *   such element exists.
 */
async function extractMainContent(htmlString) {
  if (typeof htmlString !== 'string' || htmlString.trim() === '') {
    return '';
  }

  const dom = new JSDOM(htmlString);
  // First <main> or <article> in document order.
  const mainElement = dom.window.document.querySelector('main, article');
  return mainElement ? mainElement.innerHTML : '';
}

/**
 * Determine the file extension (with leading dot) for an image buffer.
 *
 * @param {Buffer} buffer - Raw image bytes.
 * @returns {Promise<string>} `.<ext>` as reported by `imageType`, or `.png`
 *   when the type cannot be detected.
 */
async function getImageExtension(buffer) {
  const detected = await imageType(buffer);
  if (!detected) {
    return '.png';
  }
  return `.${detected.ext}`;
}

/**
 * Derive a short, deterministic folder name from content.
 *
 * @param {string|Buffer} content - Data to hash.
 * @returns {string} First 12 hex characters of the content's MD5 digest.
 */
function generateFolderName(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  const hexDigest = hasher.digest('hex');
  return hexDigest.slice(0, 12);
}

/**
 * Fetch an image and return its bytes.
 *
 * This is best-effort: any failure (network error or non-OK HTTP status) is
 * logged as a warning and reported to the caller as `null` instead of being
 * thrown.
 *
 * @param {string} url - Absolute URL of the image.
 * @returns {Promise<Buffer|null>} The image bytes, or `null` on failure.
 */
async function downloadImage(url) {
  try {
    const res = await fetch(url);
    if (res.ok) {
      const bytes = await res.arrayBuffer();
      return Buffer.from(bytes);
    }
    throw new Error(`Failed to fetch image: ${res.statusText}`);
  } catch (err) {
    console.warn(
      `Warning: Failed to download image from ${url}:`,
      err.message,
    );
    return null;
  }
}

/**
 * Check whether a string parses as an absolute URL.
 *
 * @param {string} string - Candidate URL.
 * @returns {boolean} `true` when `new URL(string)` succeeds, else `false`.
 */
function isValidUrl(string) {
  let parsesCleanly = true;
  try {
    // The constructor throws on anything that is not an absolute URL.
    new URL(string);
  } catch (_) {
    parsesCleanly = false;
  }
  return parsesCleanly;
}

/**
 * Render a <table> DOM node as a pipe-delimited Markdown table.
 *
 * The first row is treated as the header. Both <th> and <td> cells are read
 * in every row, so row-header cells in the body are not dropped. Cell text is
 * collapsed to a single line and literal pipes are escaped so they cannot be
 * mistaken for column delimiters.
 *
 * @param {Element} node - The <table> element.
 * @returns {string} Markdown table padded with blank lines, or '' when the
 *   table has no rows.
 */
function tableNodeToMarkdown(node) {
  const readCells = (row) =>
    Array.from(row.querySelectorAll('th,td')).map((cell) =>
      cell.textContent.trim().replace(/\s+/g, ' ').replace(/\|/g, '\\|'),
    );

  const rows = Array.from(node.querySelectorAll('tr')).map(readCells);
  if (rows.length === 0) {
    return '';
  }

  const [headerCells, ...bodyRows] = rows;
  const headers = headerCells.join(' | ');
  // One separator per header cell; a header row without cells still gets one.
  const separator = headerCells.map(() => '---').join(' | ') || '---';
  const body = bodyRows.map((cells) => cells.join(' | ')).join('\n');

  return `\n${headers}\n${separator}\n${body}\n\n`;
}

/**
 * Fetch a web page, convert its main content to Markdown and store the result
 * (plus any referenced images) in a content-addressed output directory.
 *
 * Steps: validate the URL, download the page, extract the <main>/<article>
 * markup, download each <img> into `<outputDir>/images`, rewrite the image
 * sources to those local copies, convert to Markdown with Turndown and write
 * `<outputDir>/content.md`.
 *
 * @param {string} url - Absolute URL of the page to convert.
 * @returns {Promise<{markdownPath: string, warnings: string[], outputDir: string}>}
 *   Path of the written Markdown file, non-fatal problems encountered
 *   (missing main content, images that could not be resolved or downloaded),
 *   and the output directory.
 * @throws {Error} When the URL is invalid or the page request is not OK.
 */
export async function htmlToMarkdown(url) {
  if (!isValidUrl(url)) {
    throw new Error(`Invalid URL provided: ${url}`);
  }

  const warnings = [];

  // Fetch HTML content
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch page: ${response.statusText}`);
  }
  const fullHtml = await response.text();

  // Normalise a missing result to '' so hashing and parsing below cannot
  // throw on undefined/null.
  const html = (await extractMainContent(fullHtml)) ?? '';
  if (html.trim() === '') {
    warnings.push(`No <main> or <article> content found at ${url}`);
  }

  // Folder name is derived from the content, so identical content maps to the
  // same output directory.
  const folderName = generateFolderName(html);
  const outputDir = path.join(__dirname, folderName);
  const imagesDir = path.join(outputDir, 'images');

  await fs.ensureDir(outputDir);
  await fs.ensureDir(imagesDir);

  // Parse HTML using JSDOM
  const dom = new JSDOM(html);
  const document = dom.window.document;

  // Absolute image URL -> local relative path, so an image referenced several
  // times is downloaded and stored only once.
  const downloadedImages = new Map();

  for (const img of document.querySelectorAll('img')) {
    const srcAttribute = img.getAttribute('src');
    if (!srcAttribute) continue;

    // Resolve relative URLs against the page URL; a malformed src must not
    // abort the whole conversion.
    let absoluteUrl;
    try {
      absoluteUrl = new URL(srcAttribute, url).href;
    } catch (_) {
      warnings.push(`Skipped image with unresolvable src: ${srcAttribute}`);
      continue;
    }

    let localPath = downloadedImages.get(absoluteUrl);
    if (!localPath) {
      const imageBuffer = await downloadImage(absoluteUrl);
      if (!imageBuffer) {
        warnings.push(`Failed to download image: ${absoluteUrl}`);
        continue;
      }

      const extension = await getImageExtension(imageBuffer);
      const filename = `image-${crypto.randomBytes(4).toString('hex')}${extension}`;
      await fs.writeFile(path.join(imagesDir, filename), imageBuffer);

      // Markdown links always use forward slashes, regardless of platform.
      localPath = path.posix.join('images', filename);
      downloadedImages.set(absoluteUrl, localPath);
    }

    img.setAttribute('src', localPath);
  }

  // Configure Turndown
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    hr: '---',
    bulletListMarker: '-',
    strongDelimiter: '**',
  });

  // Tables are rendered from the DOM node directly rather than from the
  // already-converted child content.
  turndownService.addRule('tables', {
    filter: 'table',
    replacement: (_content, node) => tableNodeToMarkdown(node),
  });

  // Convert to Markdown, collapse runs of blank lines and give images with an
  // empty alt text a generic one.
  const markdown = turndownService
    .turndown(document.body)
    .replace(/\n\s*\n\s*\n/g, '\n\n')
    .replace(/!\[\]\(/g, '![image](')
    .trim();

  // Save markdown file
  const mdPath = path.join(outputDir, 'content.md');
  await fs.writeFile(mdPath, markdown);

  return {
    markdownPath: mdPath,
    warnings,
    outputDir,
  };
}
172 changes: 172 additions & 0 deletions apps/converter/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import { parse } from '@textlint/markdown-to-ast';
import express from 'express';
import { readFileSync } from 'fs';
import { toHtml } from 'hast-util-to-html';
import { fromMarkdown } from 'mdast-util-from-markdown';
import { toHast } from 'mdast-util-to-hast';

import { htmlToMarkdown } from './htmlToMarkdown.js';
import { wordToMarkdown } from './wordToMarkdown.js';

// Port the HTTP server listens on.
const PORT = 3000;
// Express application that the converter routes are registered on.
const app = express();

/**
 * Split one pipe-delimited table line into trimmed cell strings.
 *
 * A leading `<p>` / trailing `</p>` wrapper is removed from each cell. Only
 * the single empty cell produced by an outer pipe at either edge is dropped;
 * empty cells between two pipes are kept so columns stay aligned.
 *
 * @param {string} line - One line of the table.
 * @returns {string[]} Cell contents, possibly empty for a blank line.
 */
function splitTableRow(line) {
  const cells = line.split('|').map((cell) =>
    cell
      .trim()
      .replace(/^<p>/, '')
      .replace(/<\/p>$/, '')
      .trim(),
  );

  if (cells.length > 0 && cells[0] === '') {
    cells.shift();
  }
  if (cells.length > 0 && cells[cells.length - 1] === '') {
    cells.pop();
  }
  return cells;
}

/**
 * Convert a pipe-delimited table (header line, separator line, data lines)
 * into an HTML <table> string.
 *
 * The first line supplies the <th> cells, the second line is assumed to be
 * the `---` separator and is skipped, and every remaining non-blank line
 * becomes a <tr>. Cell contents are inserted as-is (no escaping).
 *
 * @param {string} markdownTable - Table text, optionally wrapped in <p>…</p>.
 * @returns {string} HTML table markup.
 */
function markdownToHtmlTable(markdownTable) {
  const lines = String(markdownTable ?? '')
    .trim()
    .split(/\r?\n/);

  const headers = splitTableRow(lines[0]);
  // Skip the header (index 0) and the separator line (index 1).
  const dataLines = lines.slice(2);

  const parts = ['<table>\n<thead>\n<tr>'];
  for (const header of headers) {
    parts.push(`\n<th>${header}</th>`);
  }
  parts.push('\n</tr>\n</thead>\n<tbody>');

  for (const line of dataLines) {
    const cells = splitTableRow(line);
    if (cells.length === 0) continue;

    parts.push('\n<tr>');
    for (const cell of cells) {
      parts.push(`\n<td>${cell}</td>`);
    }
    parts.push('\n</tr>');
  }
  parts.push('\n</tbody>\n</table>');

  return parts.join('');
}

/**
 * Parse converted Markdown and annotate every top-level node for the client.
 *
 * The Markdown is parsed twice: `fromMarkdown` yields the mdast nodes that
 * are rendered to HTML, and `parse` yields the parallel tree whose node
 * `type` and `raw` source text are copied onto each element by position.
 * Nodes whose type is `Table` get their HTML rebuilt with
 * `markdownToHtmlTable`; nodes whose first child is an `Image` are reported
 * as images with a `src` inside the output directory.
 *
 * The loop is synchronous on purpose. With an `async` callback in `forEach`,
 * a thrown error became a rejected promise that nothing awaited, so the route
 * handler's try/catch never saw it.
 *
 * @param {string} markdown - Markdown source to process.
 * @param {string} outputDir - Directory that image paths are relative to.
 * @returns {object[]} The annotated mdast top-level nodes.
 */
function buildContentNodes(markdown, outputDir) {
  const mdast = fromMarkdown(markdown);
  const ast = parse(markdown);

  mdast.children.forEach((element, index) => {
    const hast = toHast(element, { allowDangerousHtml: true });
    const html = toHtml(hast, { allowDangerousHtml: true });
    element.htmlValue = html;

    // The two parsers are matched by index; skip the extra annotations when
    // the second tree has no node at this position.
    const counterpart = ast.children[index];
    if (!counterpart) {
      return;
    }

    element.type = counterpart.type;
    element.raw = counterpart.raw;
    if (element.type === 'Table') {
      element.htmlValue = markdownToHtmlTable(html);
    }

    // Not every node has children (e.g. a horizontal rule), so guard the
    // lookup instead of indexing blindly.
    const firstChild = counterpart.children?.[0];
    if (firstChild?.type === 'Image') {
      element.type = 'Image';
      element.src = `${outputDir}/${firstChild.url}`;
    }
  });

  return mdast.children;
}

/**
 * Send the JSON payload for a finished conversion.
 *
 * @param {object} res - Express response.
 * @param {{markdownPath: string, warnings: Array, outputDir: string}} result
 *   Result object returned by a converter.
 */
function sendConversionResult(res, { markdownPath, warnings, outputDir }) {
  const markdown = readFileSync(markdownPath, 'utf-8');
  res.json({
    content: buildContentNodes(markdown, outputDir),
    outputDirectory: outputDir,
    warnings,
  });
}

/**
 * Translate a conversion failure into an HTTP error response:
 * 404 for ENOENT, 500 with the error message for everything else.
 *
 * @param {object} res - Express response.
 * @param {Error} error - The error that was caught.
 * @param {string} source - The `path` query value, echoed in the 404 message.
 */
function sendConversionError(res, error, source) {
  if (error.code === 'ENOENT') {
    res.status(404).json({ error: `File not found: ${source}` });
    return;
  }
  res.status(500).json({
    error: 'Error processing document',
    details: error.message,
  });
}

// Express endpoint: convert a Word document (given by file path) to
// structured content.
// NOTE(review): `path` comes straight from the query string and is handed to
// wordToMarkdown unvalidated — confirm this service is only reachable by
// trusted callers.
app.get('/convert', async (req, res) => {
  const filePath = req.query.path;

  if (!filePath) {
    return res.status(400).json({
      error: "Please provide a Word document path as 'path' query parameter",
    });
  }

  try {
    // First convert Word to Markdown, then read and process the Markdown.
    sendConversionResult(res, await wordToMarkdown(filePath));
  } catch (error) {
    sendConversionError(res, error, filePath);
  }
});

// Express endpoint: convert a web page (given by URL) to structured content.
// NOTE(review): the URL is fetched server-side as provided — confirm callers
// are trusted or restrict the allowed hosts.
app.get('/html-convert', async (req, res) => {
  const filePath = req.query.path;

  if (!filePath) {
    return res.status(400).json({
      error: "Please provide a URL as 'path' query parameter",
    });
  }

  try {
    // First convert the page to Markdown, then read and process the Markdown.
    sendConversionResult(res, await htmlToMarkdown(filePath));
  } catch (error) {
    sendConversionError(res, error, filePath);
  }
});

// Start the HTTP server and log its address once it is listening.
app.listen(PORT, function onListening() {
  console.log(`Server running on http://localhost:${PORT}`);
});
Loading