Merge pull request #376 from AmazeeLabs/feature/SLB-495-dev-merge

SLB 495 Import content AI
AmazeeLabs · Nov 27, 2024 · 71d8d01 · 71d8d01
2 parents 53929a4 + e79453c
commit 71d8d01
Show file tree

Hide file tree

Showing 29 changed files with 5,516 additions and 44 deletions.
diff --git a/.lagoon/Dockerfile b/.lagoon/Dockerfile
@@ -51,6 +51,7 @@ RUN --mount=type=cache,target=/tmp/cache pnpm i && \
 RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/cms" /tmp/.deploy/cms --prod
 RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/publisher" /tmp/.deploy/publisher --prod
 RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/preview" /tmp/.deploy/preview --prod
+RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/converter" /tmp/.deploy/converter --prod
 
 # ====================================================================================================
 # CLI IMAGE
@@ -119,3 +120,13 @@ RUN npm install -g [email protected]
 COPY --from=builder /tmp/.deploy/publisher /app
 
 CMD pnpm publisher
+
+# ====================================================================================================
+# CONVERTER IMAGE
+# ====================================================================================================
+# Image for the "@custom/converter" app; the build stage is named "convertmd".
+
+FROM uselagoon/node-18 as convertmd
+
+# NOTE(review): the pnpm version spec below reads "[email protected]" in this
+# view, which looks mangled - confirm the pinned version against the repository.
+RUN npm install -g [email protected]
+# Copy the production deployment that the builder stage wrote to
+# /tmp/.deploy/converter via `pnpm deploy --filter "@custom/converter"`.
+COPY --from=builder /tmp/.deploy/converter /app
+# Start the app through its package "start" script.
+CMD pnpm start
diff --git a/apps/converter/htmlToMarkdown.js b/apps/converter/htmlToMarkdown.js
@@ -0,0 +1,170 @@
+import crypto from 'crypto';
+import fs from 'fs-extra';
+import imageType from 'image-type';
+import { JSDOM } from 'jsdom';
+import fetch from 'node-fetch';
+import path from 'path';
+import TurndownService from 'turndown';
+import { fileURLToPath } from 'url';
+
+// @todo Fix this to work locally and live
+// Base directory for converted output: a fixed path under the site's files
+// directory when the LAGOON environment variable is set, otherwise the
+// directory that contains this module.
+const isLagoon = Boolean(process.env.LAGOON);
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = !isLagoon
+  ? path.dirname(__filename)
+  : '/app/web/sites/default/files/converted';
+
+/**
+ * Extracts the primary content of an HTML page.
+ *
+ * The whole document is parsed with JSDOM and the first `<main>` or
+ * `<article>` element in document order is looked up.
+ *
+ * @param {string} htmlString - Full HTML source of the page.
+ * @returns {Promise<string>} Inner HTML of the matched element, or an empty
+ *   string when the input is empty or contains neither element.
+ */
+async function extractMainContent(htmlString) {
+  if (!htmlString) {
+    return '';
+  }
+
+  // Let the HTML parser find the body instead of a regex: a lazy
+  // `<body>…</body>` match stops at the first literal `</body>` (which may sit
+  // inside a script or comment) and yields nothing for body-less fragments.
+  const { document } = new JSDOM(htmlString).window;
+  const mainElement = document.querySelector('main, article');
+
+  // Always resolve to a string so callers can hash or parse the result.
+  return mainElement ? mainElement.innerHTML : '';
+}
+
+/**
+ * Chooses a file extension for image data.
+ *
+ * @param {*} buffer - Image data, handed to `imageType` unchanged.
+ * @returns {Promise<string>} The detected extension with a leading dot, or
+ *   `.png` when `imageType` reports nothing.
+ */
+async function getImageExtension(buffer) {
+  const detected = await imageType(buffer);
+  if (!detected) {
+    return '.png';
+  }
+  return `.${detected.ext}`;
+}
+
+/**
+ * Derives a folder name from content.
+ *
+ * @param {string|Buffer} content - Data fed to the MD5 hash.
+ * @returns {string} The first 12 hex characters of the MD5 digest.
+ */
+function generateFolderName(content) {
+  const hasher = crypto.createHash('md5');
+  hasher.update(content);
+  return hasher.digest('hex').slice(0, 12);
+}
+
+/**
+ * Fetches an image and returns its bytes.
+ *
+ * Failures (network errors and non-OK responses) are logged with
+ * `console.warn` and reported to the caller as `null` instead of throwing.
+ *
+ * @param {string} url - Address of the image.
+ * @returns {Promise<Buffer|null>} The response body, or `null` on failure.
+ */
+async function downloadImage(url) {
+  let imageBytes = null;
+
+  try {
+    const response = await fetch(url);
+    if (!response.ok) {
+      throw new Error(`Failed to fetch image: ${response.statusText}`);
+    }
+    const body = await response.arrayBuffer();
+    imageBytes = Buffer.from(body);
+  } catch (error) {
+    console.warn(
+      `Warning: Failed to download image from ${url}:`,
+      error.message,
+    );
+  }
+
+  return imageBytes;
+}
+
+/**
+ * Tells whether a value can be parsed as an absolute URL.
+ *
+ * @param {string} string - Candidate URL.
+ * @returns {boolean} `true` when the `URL` constructor accepts the value.
+ */
+function isValidUrl(string) {
+  let parses = true;
+  try {
+    new URL(string);
+  } catch {
+    parses = false;
+  }
+  return parses;
+}
+
+/**
+ * Downloads one image and stores it in `imagesDir` under a random name.
+ *
+ * @param {string} absoluteUrl - Absolute address of the image.
+ * @param {string} imagesDir - Existing directory to write the file into.
+ * @returns {Promise<string|null>} Path of the stored file relative to the
+ *   Markdown document, or `null` when the download failed.
+ */
+async function saveImageLocally(absoluteUrl, imagesDir) {
+  const imageBuffer = await downloadImage(absoluteUrl);
+  if (!imageBuffer) {
+    return null;
+  }
+
+  const extension = await getImageExtension(imageBuffer);
+  const filename = `image-${crypto.randomBytes(4).toString('hex')}${extension}`;
+  await fs.writeFile(path.join(imagesDir, filename), imageBuffer);
+
+  // Markdown/HTML references use forward slashes on every platform.
+  return `images/${filename}`;
+}
+
+/**
+ * Points every `<img>` of `document` at a locally stored copy of its image.
+ *
+ * Images that cannot be resolved or downloaded keep their original `src` and
+ * add an entry to `warnings`.
+ *
+ * @param {Document} document - Parsed document, modified in place.
+ * @param {string} pageUrl - URL of the page, used to resolve relative sources.
+ * @param {string} imagesDir - Existing directory for the downloaded files.
+ * @param {string[]} warnings - Collector for non-fatal problems.
+ * @returns {Promise<void>}
+ */
+async function localizeImages(document, pageUrl, imagesDir, warnings) {
+  // Absolute URL -> local path (or null after a failed download), so an image
+  // referenced several times is fetched and reported only once.
+  const localPaths = new Map();
+
+  for (const img of document.querySelectorAll('img')) {
+    const srcAttribute = img.getAttribute('src');
+    if (!srcAttribute) continue;
+
+    let absoluteUrl;
+    try {
+      // Resolve relative URLs to absolute URLs
+      absoluteUrl = new URL(srcAttribute, pageUrl).href;
+    } catch (_) {
+      // One malformed src must not abort the whole conversion.
+      warnings.push(`Skipped image with invalid src: ${srcAttribute}`);
+      continue;
+    }
+
+    if (!localPaths.has(absoluteUrl)) {
+      const savedPath = await saveImageLocally(absoluteUrl, imagesDir);
+      if (savedPath === null) {
+        warnings.push(`Failed to download image: ${absoluteUrl}`);
+      }
+      localPaths.set(absoluteUrl, savedPath);
+    }
+
+    const localPath = localPaths.get(absoluteUrl);
+    if (localPath) {
+      img.setAttribute('src', localPath);
+    }
+  }
+}
+
+/**
+ * Returns the text of a table cell as a single Markdown-safe line.
+ *
+ * @param {Element} cell - A `<th>` or `<td>` element.
+ * @returns {string} Whitespace-collapsed text with `|` escaped.
+ */
+function tableCellText(cell) {
+  return cell.textContent
+    .replace(/\s+/g, ' ') // a line break inside a cell would end the row
+    .trim()
+    .replace(/\|/g, '\\|'); // a bare pipe would start a new column
+}
+
+/**
+ * Renders a `<table>` element as a pipe table, first row as the header.
+ *
+ * @param {Element} tableNode - The table element.
+ * @returns {string} Markdown for the table, or an empty string when the table
+ *   has no cells.
+ */
+function tableToMarkdown(tableNode) {
+  const rows = Array.from(tableNode.querySelectorAll('tr'))
+    .map((row) => Array.from(row.querySelectorAll('th,td')).map(tableCellText))
+    .filter((cells) => cells.length > 0);
+
+  if (rows.length === 0) {
+    return '';
+  }
+
+  // Outer pipes keep single-column tables from being read as a heading
+  // followed by an underline.
+  const formatRow = (cells) => `| ${cells.join(' | ')} |`;
+  const [headerCells, ...bodyRows] = rows;
+  // One separator cell per header cell, counted from the cells themselves.
+  const separator = formatRow(headerCells.map(() => '---'));
+  const lines = [
+    formatRow(headerCells),
+    separator,
+    ...bodyRows.map(formatRow),
+  ];
+
+  return `\n${lines.join('\n')}\n\n`;
+}
+
+/**
+ * Builds the Turndown converter used for pages, including the table rule.
+ *
+ * @returns {TurndownService} Configured converter.
+ */
+function createTurndownService() {
+  const turndownService = new TurndownService({
+    headingStyle: 'atx',
+    codeBlockStyle: 'fenced',
+    hr: '---',
+    bulletListMarker: '-',
+    strongDelimiter: '**',
+  });
+
+  turndownService.addRule('tables', {
+    filter: 'table',
+    replacement: (_content, node) => tableToMarkdown(node),
+  });
+
+  return turndownService;
+}
+
+/**
+ * Fetches a web page, converts its main content to Markdown and stores the
+ * result, together with the page's images, in a content-addressed folder.
+ *
+ * NOTE(review): `url` is fetched as given. If callers forward untrusted input,
+ * consider restricting the allowed schemes and hosts before calling this.
+ *
+ * @param {string} url - Absolute URL of the page to convert.
+ * @returns {Promise<{markdownPath: string, warnings: string[], outputDir: string}>}
+ *   Path of the written `content.md`, non-fatal problems (skipped or failed
+ *   images) and the folder holding the output.
+ * @throws {Error} When `url` is not a valid URL or the page request does not
+ *   succeed.
+ */
+export async function htmlToMarkdown(url) {
+  if (!isValidUrl(url)) {
+    throw new Error(`Invalid URL provided: ${url}`);
+  }
+
+  // Fetch HTML content
+  const response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`Failed to fetch page: ${response.statusText}`);
+  }
+  const fullHtml = await response.text();
+
+  // Hashing and parsing both need a string, even when nothing was extracted.
+  const html = (await extractMainContent(fullHtml)) ?? '';
+
+  // Generate folder name based on HTML content
+  const folderName = generateFolderName(html);
+  const outputDir = path.join(__dirname, folderName);
+  const imagesDir = path.join(outputDir, 'images');
+
+  // ensureDir creates missing parents, so this creates outputDir as well.
+  await fs.ensureDir(imagesDir);
+
+  // Parse HTML using JSDOM
+  const { document } = new JSDOM(html).window;
+
+  // Process images before conversion
+  const warnings = [];
+  await localizeImages(document, url, imagesDir, warnings);
+
+  // Convert to Markdown, then collapse runs of blank lines and give images
+  // without alt text a placeholder label.
+  const markdown = createTurndownService()
+    .turndown(document.body)
+    .replace(/\n\s*\n\s*\n/g, '\n\n')
+    .replace(/!\[\]\(/g, '![image](')
+    .trim();
+
+  // Save markdown file
+  const mdPath = path.join(outputDir, 'content.md');
+  await fs.writeFile(mdPath, markdown);
+
+  return {
+    markdownPath: mdPath,
+    warnings,
+    outputDir,
+  };
+}
diff --git a/apps/converter/index.js b/apps/converter/index.js
@@ -0,0 +1,172 @@
+import { parse } from '@textlint/markdown-to-ast';
+import express from 'express';
+import { readFileSync } from 'fs';
+import { toHtml } from 'hast-util-to-html';
+import { fromMarkdown } from 'mdast-util-from-markdown';
+import { toHast } from 'mdast-util-to-hast';
+
+import { htmlToMarkdown } from './htmlToMarkdown.js';
+import { wordToMarkdown } from './wordToMarkdown.js';
+
+// Port passed to `app.listen`.
+const PORT = 3000;
+// Express application the conversion routes are registered on.
+const app = express();
+
+/**
+ * Converts a pipe-delimited Markdown table into an HTML `<table>` string.
+ *
+ * The first line is the header row, the second line (the `---` delimiter row)
+ * is skipped and all remaining lines become body rows. The input may still be
+ * wrapped in `<p>…</p>`; those tags are stripped from the cells they touch.
+ * Cell contents are inserted as-is, without escaping.
+ *
+ * Empty cells are kept so that columns stay aligned; only the empty entries
+ * produced by leading/trailing pipes are removed. Rows without any content
+ * are dropped.
+ *
+ * @param {string} markdownTable - Table source, one row per line.
+ * @returns {string} HTML table markup.
+ */
+function markdownToHtmlTable(markdownTable) {
+  const stripParagraphTags = (text) =>
+    text.replace(/^<p>/, '').replace(/<\/p>$/, '');
+
+  // Splits a line on pipes; entries caused by outer pipes are still included.
+  const splitCells = (line) =>
+    line.split('|').map((cell) => stripParagraphTags(cell.trim()).trim());
+
+  // Split the markdown table into lines
+  const lines = String(markdownTable ?? '')
+    .trim()
+    .split('\n');
+
+  // Real delimiter cells are never empty, so an empty first/last entry in the
+  // delimiter row can only come from an outer pipe. Fall back to the header
+  // row when there is no delimiter row.
+  const referenceCells = splitCells(lines.length > 1 ? lines[1] : lines[0]);
+  const hasLeadingPipe = referenceCells.length > 1 && referenceCells[0] === '';
+  const hasTrailingPipe =
+    referenceCells.length > 1 &&
+    referenceCells[referenceCells.length - 1] === '';
+
+  // Removes only the outer-pipe artefacts, never a genuine empty cell.
+  const parseRow = (line) => {
+    const cells = splitCells(line);
+    const start = hasLeadingPipe && cells[0] === '' ? 1 : 0;
+    const dropLast =
+      hasTrailingPipe &&
+      cells.length > start &&
+      cells[cells.length - 1] === '';
+    return cells.slice(start, dropLast ? cells.length - 1 : cells.length);
+  };
+
+  const hasContent = (cells) => cells.some((cell) => cell !== '');
+
+  // Extract headers (first line)
+  const headers = parseRow(lines[0]);
+  const headerCells = hasContent(headers) ? headers : [];
+
+  // Remove separator line (second line with ---)
+  const bodyRows = lines.slice(2).map(parseRow).filter(hasContent);
+
+  // Create HTML table
+  const parts = [
+    '<table>',
+    '<thead>',
+    '<tr>',
+    ...headerCells.map((header) => `<th>${header}</th>`),
+    '</tr>',
+    '</thead>',
+    '<tbody>',
+    ...bodyRows.flatMap((cells) => [
+      '<tr>',
+      ...cells.map((cell) => `<td>${cell}</td>`),
+      '</tr>',
+    ]),
+    '</tbody>',
+    '</table>',
+  ];
+
+  return parts.join('\n');
+}
+
+/**
+ * Parses converted Markdown and returns its top-level nodes, annotated for
+ * the API response.
+ *
+ * Each node gets `htmlValue` (its HTML rendering) plus `type` and `raw` taken
+ * from the node at the same position in the `@textlint/markdown-to-ast` tree.
+ * Tables are re-rendered with `markdownToHtmlTable`; nodes whose first child
+ * is an image become `Image` nodes with a `src` inside `outputDir`.
+ *
+ * @param {string} markdown - Markdown source.
+ * @param {string} outputDir - Directory that image URLs are relative to.
+ * @returns {object[]} The annotated top-level nodes.
+ */
+function buildContentNodes(markdown, outputDir) {
+  const mdast = fromMarkdown(markdown);
+  const ast = parse(markdown);
+
+  // Synchronous callback on purpose: a throw here must reach the caller's
+  // try/catch instead of becoming an unhandled promise rejection.
+  mdast.children.forEach((element, index) => {
+    const hast = toHast(element, { allowDangerousHtml: true });
+    element.htmlValue = toHtml(hast, { allowDangerousHtml: true });
+
+    // NOTE(review): this pairs nodes of two different parsers by position and
+    // assumes both produce the same top-level sequence - confirm. Nodes
+    // without a counterpart keep their own type.
+    const counterpart = ast.children[index];
+    if (!counterpart) {
+      return;
+    }
+
+    element.type = counterpart.type;
+    element.raw = counterpart.raw;
+    if (element.type === 'Table') {
+      element.htmlValue = markdownToHtmlTable(element.htmlValue);
+    }
+
+    // Not every node has children, so probe the first child defensively.
+    const firstChild = counterpart.children?.[0];
+    if (firstChild?.type === 'Image') {
+      element.type = 'Image';
+      element.src = `${outputDir}/${firstChild.url}`;
+    }
+  });
+
+  return mdast.children;
+}
+
+/**
+ * Runs a converter and sends its processed result, or an error, as JSON.
+ *
+ * Responds with 404 when the failure carries the code `ENOENT` and with 500
+ * for every other failure.
+ *
+ * @param {object} res - Express response.
+ * @param {string} source - Value of the `path` query parameter.
+ * @param {Function} convert - Async converter resolving to
+ *   `{ markdownPath, warnings, outputDir }`.
+ * @returns {Promise<void>}
+ */
+async function respondWithConversion(res, source, convert) {
+  try {
+    const { markdownPath, warnings, outputDir } = await convert(source);
+
+    // Read the generated Markdown once and feed it to both parsers.
+    const markdown = readFileSync(markdownPath, 'utf-8');
+
+    // Return the processed content along with conversion info
+    res.json({
+      content: buildContentNodes(markdown, outputDir),
+      outputDirectory: outputDir,
+      warnings,
+    });
+  } catch (error) {
+    if (error?.code === 'ENOENT') {
+      res.status(404).json({ error: `File not found: ${source}` });
+    } else {
+      res.status(500).json({
+        error: 'Error processing document',
+        details: error?.message,
+      });
+    }
+  }
+}
+
+// Converts the Word document at the path given in `?path=`.
+app.get('/convert', async (req, res) => {
+  const filePath = req.query.path;
+
+  // Repeated or nested query keys arrive as arrays/objects; accept only a
+  // non-empty string.
+  if (typeof filePath !== 'string' || filePath === '') {
+    return res.status(400).json({
+      error: "Please provide a Word document path as 'path' query parameter",
+    });
+  }
+
+  // NOTE(review): the path comes straight from the request. If this service
+  // is reachable by untrusted clients, restrict it to an allowed directory.
+  await respondWithConversion(res, filePath, wordToMarkdown);
+});
+
+// Converts the web page at the URL given in `?path=`.
+app.get('/html-convert', async (req, res) => {
+  const pageUrl = req.query.path;
+
+  if (typeof pageUrl !== 'string' || pageUrl === '') {
+    return res.status(400).json({
+      error: "Please provide a URL as 'path' query parameter",
+    });
+  }
+
+  // NOTE(review): the URL comes straight from the request and is fetched by
+  // the server. Consider an allow-list if untrusted clients can call this.
+  await respondWithConversion(res, pageUrl, htmlToMarkdown);
+});
+
+app.listen(PORT, () => {
+  console.log(`Server running on http://localhost:${PORT}`);
+});