-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #376 from AmazeeLabs/feature/SLB-495-dev-merge
SLB 495 Import content AI
- Loading branch information
Showing
29 changed files
with
5,516 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,7 @@ RUN --mount=type=cache,target=/tmp/cache pnpm i && \ | |
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/cms" /tmp/.deploy/cms --prod | ||
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/publisher" /tmp/.deploy/publisher --prod | ||
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/preview" /tmp/.deploy/preview --prod | ||
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/converter" /tmp/.deploy/converter --prod | ||
|
||
# ==================================================================================================== | ||
# CLI IMAGE | ||
|
@@ -119,3 +120,13 @@ RUN npm install -g [email protected] | |
COPY --from=builder /tmp/.deploy/publisher /app | ||
|
||
CMD pnpm publisher | ||
|
||
# ==================================================================================================== | ||
# CONVERTER IMAGE | ||
# ==================================================================================================== | ||
|
||
FROM uselagoon/node-18 as convertmd | ||
|
||
RUN npm install -g [email protected] | ||
COPY --from=builder /tmp/.deploy/converter /app | ||
CMD pnpm start |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
import crypto from 'crypto'; | ||
import fs from 'fs-extra'; | ||
import imageType from 'image-type'; | ||
import { JSDOM } from 'jsdom'; | ||
import fetch from 'node-fetch'; | ||
import path from 'path'; | ||
import TurndownService from 'turndown'; | ||
import { fileURLToPath } from 'url'; | ||
|
||
// @todo Fix this to work locally and live | ||
const isLagoon = !!process.env.LAGOON; | ||
const __filename = fileURLToPath(import.meta.url); | ||
const __dirname = isLagoon | ||
? '/app/web/sites/default/files/converted' | ||
: path.dirname(__filename); | ||
|
||
/**
 * Extract the primary content region from a full HTML document.
 *
 * Looks for the <body> element, then narrows to the first <main> or
 * <article> element inside it.
 *
 * @param {string} htmlString - Full HTML document markup.
 * @returns {Promise<string>} Inner HTML of the main content region, or ''
 *   when no <body> (or no <main>/<article>) is present. Previously the
 *   no-<body> case returned undefined, which crashed downstream hashing.
 */
async function extractMainContent(htmlString) {
  const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/i;
  const match = htmlString.match(bodyRegex);
  // Captured group is the markup between the <body> tags; null when absent.
  const html = match ? match[1] : null;

  if (!html) {
    // No <body> found: return a valid (empty) string instead of undefined
    // so callers (hashing, JSDOM parsing) always receive a string.
    return '';
  }

  // Parse the body fragment and narrow to the first main/article element.
  const dom = new JSDOM(html);
  const mainElement = dom.window.document.querySelector('main, article');
  return mainElement ? mainElement.innerHTML : '';
}
|
||
/**
 * Determine a file extension for an image buffer by sniffing its magic
 * bytes; falls back to '.png' when the format cannot be identified.
 *
 * @param {Buffer} buffer - Raw image bytes.
 * @returns {Promise<string>} Dotted extension, e.g. '.jpg'.
 */
async function getImageExtension(buffer) {
  const detected = await imageType(buffer);
  if (!detected) {
    return '.png';
  }
  return `.${detected.ext}`;
}
|
||
/**
 * Derive a short, stable folder name from content by fingerprinting it.
 *
 * MD5 is used purely for content addressing here, not for security.
 *
 * @param {string} content - Content to fingerprint.
 * @returns {string} First 12 hex characters of the content's MD5 digest.
 */
function generateFolderName(content) {
  const digest = crypto.createHash('md5').update(content).digest('hex');
  return digest.slice(0, 12);
}
|
||
/**
 * Fetch an image over HTTP(S) and return its bytes.
 *
 * Failures (network errors, non-2xx responses) are logged as warnings and
 * reported as null rather than thrown, so one bad image does not abort a
 * whole page conversion.
 *
 * @param {string} url - Absolute image URL.
 * @returns {Promise<Buffer|null>} Image bytes, or null on failure.
 */
async function downloadImage(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch image: ${response.statusText}`);
    }
    const data = await response.arrayBuffer();
    return Buffer.from(data);
  } catch (error) {
    console.warn(
      `Warning: Failed to download image from ${url}:`,
      error.message,
    );
    return null;
  }
}
|
||
/**
 * Check whether a string is a parseable absolute URL.
 *
 * @param {string} candidate - Value to validate.
 * @returns {boolean} True when the URL constructor accepts the value.
 */
function isValidUrl(candidate) {
  try {
    // The URL constructor throws a TypeError on malformed input.
    new URL(candidate);
    return true;
  } catch {
    return false;
  }
}
|
||
/**
 * Fetch a web page, extract its main content, download referenced images,
 * and convert the result to a Markdown file on disk.
 *
 * Output layout: <__dirname>/<content-hash>/content.md with images saved
 * under <__dirname>/<content-hash>/images/.
 *
 * @param {string} url - Absolute URL of the page to convert.
 * @returns {Promise<{markdownPath: string, warnings: string[], outputDir: string}>}
 *   Path to the generated Markdown, warnings for images that failed to
 *   download, and the output directory.
 * @throws {Error} When the URL is invalid, the page cannot be fetched, or
 *   no main content could be extracted.
 */
export async function htmlToMarkdown(url) {
  if (!isValidUrl(url)) {
    throw new Error('Invalid URL provided: ' + url);
  }

  // Fetch HTML content.
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch page: ${response.statusText}`);
  }
  const fullHtml = await response.text();

  const html = await extractMainContent(fullHtml);
  if (!html) {
    // Fail loudly instead of hashing an empty/undefined value and silently
    // emitting an empty document for a page we could not understand.
    throw new Error(`Could not extract main content from ${url}`);
  }

  // Content-addressed output folder so repeated runs of the same page
  // reuse the same location.
  const folderName = generateFolderName(html);
  const outputDir = path.join(__dirname, folderName);
  const imagesDir = path.join(outputDir, 'images');

  await fs.ensureDir(outputDir);
  await fs.ensureDir(imagesDir);

  // Parse the extracted fragment with JSDOM.
  const dom = new JSDOM(html);
  const document = dom.window.document;

  // Download each referenced image and rewrite its src to a local,
  // relative path before the Markdown conversion runs.
  const images = document.querySelectorAll('img');
  const imageMap = new Map();
  const warnings = [];

  for (const img of images) {
    const srcAttribute = img.getAttribute('src');
    if (!srcAttribute) continue;

    // Resolve relative URLs against the page URL.
    const absoluteUrl = new URL(srcAttribute, url).href;

    const imageBuffer = await downloadImage(absoluteUrl);
    if (!imageBuffer) {
      // downloadImage already logged the cause; record it for the caller.
      warnings.push(`Failed to download image: ${absoluteUrl}`);
      continue;
    }

    const extension = await getImageExtension(imageBuffer);
    const filename = `image-${crypto.randomBytes(4).toString('hex')}${extension}`;
    const imagePath = path.join(imagesDir, filename);

    await fs.writeFile(imagePath, imageBuffer);
    imageMap.set(srcAttribute, path.join('images', filename));
    img.setAttribute('src', path.join('images', filename));
  }

  // Configure Turndown.
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    hr: '---',
    bulletListMarker: '-',
    strongDelimiter: '**',
  });

  // Custom rule: render <table> elements as pipe-delimited Markdown rows.
  turndownService.addRule('tables', {
    filter: 'table',
    replacement: function (content, node) {
      const rows = node.querySelectorAll('tr');
      const headers = Array.from(rows[0]?.querySelectorAll('th,td') || [])
        .map((cell) => cell.textContent.trim())
        .join(' | ');

      // One '---' per header column.
      const separator = headers
        .split('|')
        .map(() => '---')
        .join(' | ');

      const body = Array.from(rows)
        .slice(1)
        .map((row) =>
          Array.from(row.querySelectorAll('td'))
            .map((cell) => cell.textContent.trim())
            .join(' | '),
        )
        .join('\n');

      return `\n${headers}\n${separator}\n${body}\n\n`;
    },
  });

  // Convert to Markdown.
  let markdown = turndownService.turndown(document.body);

  // Clean up: collapse runs of blank lines and label alt-less images.
  markdown = markdown
    .replace(/\n\s*\n\s*\n/g, '\n\n')
    .replace(/!\[\]\(/g, '![image](')
    .trim();

  // Save the markdown file.
  const mdPath = path.join(outputDir, 'content.md');
  await fs.writeFile(mdPath, markdown);

  return {
    markdownPath: mdPath,
    warnings,
    outputDir,
  };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
import { parse } from '@textlint/markdown-to-ast'; | ||
import express from 'express'; | ||
import { readFileSync } from 'fs'; | ||
import { toHtml } from 'hast-util-to-html'; | ||
import { fromMarkdown } from 'mdast-util-from-markdown'; | ||
import { toHast } from 'mdast-util-to-hast'; | ||
|
||
import { htmlToMarkdown } from './htmlToMarkdown.js'; | ||
import { wordToMarkdown } from './wordToMarkdown.js'; | ||
|
||
const app = express(); | ||
const PORT = 3000; | ||
|
||
/**
 * Convert a pipe-delimited Markdown table into an HTML <table>.
 *
 * Accepts tables with or without leading/trailing outer pipes. Empty
 * cells are preserved — previously every empty fragment was filtered out,
 * which shifted all following cells one column to the left. Stray <p>
 * wrappers produced by the Markdown-to-HTML step are stripped per cell.
 *
 * @param {string} markdownTable - Markdown table source: header row,
 *   separator row, then zero or more data rows.
 * @returns {string} HTML table markup.
 */
function markdownToHtmlTable(markdownTable) {
  // Split one table row into trimmed cell values. Only the empty
  // fragments created by outer pipes are dropped (by slicing them off
  // before splitting), never genuinely empty interior cells.
  function splitRow(line) {
    let row = line.trim();
    if (row.startsWith('|')) row = row.slice(1);
    if (row.endsWith('|')) row = row.slice(0, -1);
    return row
      .split('|')
      .map((cell) => cell.trim())
      .map((cell) => cell.replace(/^<p>/, '').replace(/<\/p>$/, ''));
  }

  const lines = markdownTable.trim().split('\n');

  // First line holds the headers; the second line is the --- separator.
  const headers = splitRow(lines[0]);
  const dataLines = lines.slice(2);

  let htmlTable = '<table>\n<thead>\n<tr>';

  headers.forEach((header) => {
    htmlTable += `\n<th>${header}</th>`;
  });

  htmlTable += '\n</tr>\n</thead>\n<tbody>';

  dataLines.forEach((line) => {
    // Skip blank lines entirely rather than emitting an empty row.
    if (line.trim() === '') return;
    const cells = splitRow(line);
    htmlTable += '\n<tr>';
    cells.forEach((cell) => {
      htmlTable += `\n<td>${cell}</td>`;
    });
    htmlTable += '\n</tr>';
  });
  htmlTable += '\n</tbody>\n</table>';

  return htmlTable;
}
|
||
// Express endpoint | ||
/**
 * GET /convert?path=<word-document path>
 *
 * Converts a Word document to Markdown, then enriches each top-level
 * Markdown AST node with its rendered HTML, the original raw source, and
 * (for image paragraphs) an absolute file path, returning JSON.
 */
app.get('/convert', async (req, res) => {
  const filePath = req.query.path;

  if (!filePath) {
    return res.status(400).json({
      error: "Please provide a Word document path as 'path' query parameter",
    });
  }

  try {
    // First convert Word to Markdown.
    const { markdownPath, warnings, outputDir } =
      await wordToMarkdown(filePath);

    // Read the generated Markdown once, then build both ASTs from it:
    // mdast (for HTML rendering) and the textlint AST (for type/raw info).
    const markdown = readFileSync(markdownPath, 'utf-8');
    const mdast = fromMarkdown(markdown);
    const ast = parse(markdown);

    // Synchronous callback: the previous async callback turned any throw
    // into an unhandled promise rejection that escaped the catch below.
    mdast.children.forEach((element, index) => {
      const hast = toHast(element, { allowDangerousHtml: true });
      const html = toHtml(hast, { allowDangerousHtml: true });
      element.htmlValue = html;
      element.type = ast.children[index].type;
      element.raw = ast.children[index].raw;
      if (element.type === 'Table') {
        element.htmlValue = markdownToHtmlTable(html);
      }

      // Nodes such as horizontal rules have no children; guard before
      // inspecting the first child (previously this threw a TypeError).
      const firstChild = ast.children[index].children?.[0];
      if (firstChild?.type === 'Image') {
        element.type = 'Image';
        element.src = `${outputDir}/${firstChild.url}`;
      }
    });

    // Return the processed content along with conversion info.
    res.json({
      content: mdast.children,
      outputDirectory: outputDir,
      warnings: warnings,
    });
  } catch (error) {
    if (error.code === 'ENOENT') {
      res.status(404).json({ error: `File not found: ${filePath}` });
    } else {
      res.status(500).json({
        error: 'Error processing document',
        details: error.message,
      });
    }
  }
});
|
||
/**
 * GET /html-convert?path=<page URL>
 *
 * Fetches a web page, converts it to Markdown, then enriches each
 * top-level Markdown AST node with its rendered HTML, the original raw
 * source, and (for image paragraphs) an absolute file path, returning JSON.
 */
app.get('/html-convert', async (req, res) => {
  const filePath = req.query.path;

  if (!filePath) {
    return res.status(400).json({
      error: "Please provide a URL as 'path' query parameter",
    });
  }

  try {
    // First convert the fetched HTML page to Markdown.
    const { markdownPath, warnings, outputDir } =
      await htmlToMarkdown(filePath);

    // Read the generated Markdown once, then build both ASTs from it:
    // mdast (for HTML rendering) and the textlint AST (for type/raw info).
    const markdown = readFileSync(markdownPath, 'utf-8');
    const mdast = fromMarkdown(markdown);
    const ast = parse(markdown);

    // Synchronous callback: the previous async callback turned any throw
    // into an unhandled promise rejection that escaped the catch below.
    mdast.children.forEach((element, index) => {
      const hast = toHast(element, { allowDangerousHtml: true });
      const html = toHtml(hast, { allowDangerousHtml: true });
      element.htmlValue = html;
      element.type = ast.children[index].type;
      element.raw = ast.children[index].raw;
      if (element.type === 'Table') {
        element.htmlValue = markdownToHtmlTable(html);
      }

      // Nodes such as horizontal rules have no children; guard before
      // inspecting the first child (previously this threw a TypeError).
      const firstChild = ast.children[index].children?.[0];
      if (firstChild?.type === 'Image') {
        element.type = 'Image';
        element.src = `${outputDir}/${firstChild.url}`;
      }
    });

    // Return the processed content along with conversion info.
    res.json({
      content: mdast.children,
      outputDirectory: outputDir,
      warnings: warnings,
    });
  } catch (error) {
    if (error.code === 'ENOENT') {
      res.status(404).json({ error: `File not found: ${filePath}` });
    } else {
      res.status(500).json({
        error: 'Error processing document',
        details: error.message,
      });
    }
  }
});
|
||
// Start the HTTP conversion service.
const onReady = () => {
  console.log(`Server running on http://localhost:${PORT}`);
};
app.listen(PORT, onReady);
Oops, something went wrong.