|
1 | 1 | import TurndownService from 'turndown'; |
2 | 2 | import { type ImageType } from '../readFile/type'; |
3 | | -import { matchMdImg } from '@fastgpt/global/common/string/markdown'; |
4 | 3 | import { getNanoid } from '@fastgpt/global/common/string/tools'; |
5 | 4 | // @ts-ignore |
6 | 5 | const turndownPluginGfm = require('joplin-turndown-plugin-gfm'); |
7 | 6 |
|
| 7 | +const MAX_HTML_SIZE = 100 * 1000; // 100k characters limit |
| 8 | + |
8 | 9 | const processBase64Images = (htmlContent: string) => { |
9 | | - const base64Regex = /src="data:([^;]+);base64,([^"]+)"/g; |
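| 10 | + // Optimized regex: |
| 11 | + // 1. Use the exact base64 character set [A-Za-z0-9+/=]+ to avoid backtracking |
| 12 | + // 2. Explicitly capture the MIME type and the base64 data |
| 13 | + // 3. Keep only the capture groups that are actually needed |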
| 14 | + const base64Regex = /src="data:([^;]+);base64,([A-Za-z0-9+/=]+)"/g; |
10 | 15 | const images: ImageType[] = []; |
11 | 16 |
|
12 | | - const processedHtml = htmlContent.replace(base64Regex, (match, mime, base64Data) => { |
| 17 | + const processedHtml = htmlContent.replace(base64Regex, (_match, mime, base64Data) => { |
13 | 18 | const uuid = `IMAGE_${getNanoid(12)}_IMAGE`; |
14 | 19 | images.push({ |
15 | 20 | uuid, |
@@ -63,12 +68,18 @@ export const html2md = ( |
63 | 68 |
|
64 | 69 | // Replace base64 images with ids first; otherwise they occupy memory during the markdown conversion |
65 | 70 | const { processedHtml, images } = processBase64Images(html); |
| 71 | + |
| 72 | + // If the HTML is still too large after base64 extraction, skip the markdown conversion and return the processed HTML as-is with an empty image list |
| 73 | + if (processedHtml.length > MAX_HTML_SIZE) { |
| 74 | + return { rawText: processedHtml, imageList: [] }; |
| 75 | + } |
| 76 | + |
66 | 77 | const md = turndownService.turndown(processedHtml); |
67 | | - const { text, imageList } = matchMdImg(md); |
| 78 | + // const { text, imageList } = matchMdImg(md); |
68 | 79 |
|
69 | 80 | return { |
70 | | - rawText: text, |
71 | | - imageList: [...images, ...imageList] |
| 81 | + rawText: md, |
| 82 | + imageList: images |
72 | 83 | }; |
73 | 84 | } catch (error) { |
74 | 85 | console.log('html 2 markdown error', error); |
|
0 commit comments