Markdown match base64 performance

c121914yu · c121914yu · commit e2ddc054655c · 2025-10-26T00:43:17.000+08:00
diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
@@ -118,4 +118,9 @@ FastGPT 是一个 AI Agent 构建平台,通过 Flow 提供开箱即用的数据
 
 ## 代码规范
 
-- 尽可能使用 type 进行类型声明，而不是 interface。
+- 尽可能使用 type 进行类型声明，而不是 interface。
+
+## Agent 设计规范
+
+1. 对于功能的实现和复杂问题修复，优先进行文档设计，并在用户确认后，再执行修复。
+2. 采用"设计文档-测试示例-代码编写-测试运行-修正代码/文档"的工作模式，以测试为核心来确保设计的正确性。
diff --git a/document/content/docs/upgrading/4-14/4140.mdx b/document/content/docs/upgrading/4-14/4140.mdx
@@ -9,6 +9,7 @@ description: 'FastGPT V4.14.0 更新说明'
 
 ## ⚙️ 优化
 
+1. 优化匹配 Markdown 中 Base64 图片的正则性能。
 
 ## 🐛 修复
 
diff --git a/document/data/doc-last-modified.json b/document/data/doc-last-modified.json
@@ -114,7 +114,7 @@
   "document/content/docs/upgrading/4-13/4130.mdx": "2025-09-30T16:00:10+08:00",
   "document/content/docs/upgrading/4-13/4131.mdx": "2025-09-30T15:47:06+08:00",
   "document/content/docs/upgrading/4-13/4132.mdx": "2025-10-21T11:46:53+08:00",
-  "document/content/docs/upgrading/4-14/4140.mdx": "2025-10-23T19:11:11+08:00",
+  "document/content/docs/upgrading/4-14/4140.mdx": "2025-10-24T16:45:35+08:00",
   "document/content/docs/upgrading/4-8/40.mdx": "2025-08-02T19:38:37+08:00",
   "document/content/docs/upgrading/4-8/41.mdx": "2025-08-02T19:38:37+08:00",
   "document/content/docs/upgrading/4-8/42.mdx": "2025-08-02T19:38:37+08:00",
diff --git a/packages/global/common/string/markdown.ts b/packages/global/common/string/markdown.ts
@@ -173,18 +173,20 @@ export const markdownProcess = async ({
 };
 
 export const matchMdImg = (text: string) => {
-  const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g;
+  // 优化后的正则:
+  // 1. 使用 [^\]]* 匹配 alt 文本(更精确)
+  // 2. 使用 [A-Za-z0-9+/=]+ 匹配 base64 数据(避免回溯)
+  // 3. 明确匹配 data:image/ 前缀
+  const base64Regex = /!\[([^\]]*)\]\((data:image\/([^;]+);base64,([A-Za-z0-9+/=]+))\)/g;
   const imageList: ImageType[] = [];
 
-  text = text.replace(base64Regex, (match, altText, base64Url) => {
+  text = text.replace(base64Regex, (_match, altText, _fullDataUrl, mime, base64Data) => {
     const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
-    const mime = base64Url.split(';')[0].split(':')[1];
-    const base64 = base64Url.split(',')[1];
 
     imageList.push({
       uuid,
-      base64,
-      mime
+      base64: base64Data,
+      mime: `image/${mime}`
     });
 
     // 保持原有的 alt 文本，只替换 base64 部分
diff --git a/packages/service/worker/htmlStr2Md/utils.ts b/packages/service/worker/htmlStr2Md/utils.ts
@@ -1,15 +1,20 @@
 import TurndownService from 'turndown';
 import { type ImageType } from '../readFile/type';
-import { matchMdImg } from '@fastgpt/global/common/string/markdown';
 import { getNanoid } from '@fastgpt/global/common/string/tools';
 // @ts-ignore
 const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
 
+const MAX_HTML_SIZE = 100 * 1000; // 100k characters limit
+
 const processBase64Images = (htmlContent: string) => {
-  const base64Regex = /src="data:([^;]+);base64,([^"]+)"/g;
+  // 优化后的正则:
+  // 1. 使用精确的 base64 字符集 [A-Za-z0-9+/=]+ 避免回溯
+  // 2. 明确捕获 mime 类型和 base64 数据
+  // 3. 减少不必要的捕获组
+  const base64Regex = /src="data:([^;]+);base64,([A-Za-z0-9+/=]+)"/g;
   const images: ImageType[] = [];
 
-  const processedHtml = htmlContent.replace(base64Regex, (match, mime, base64Data) => {
+  const processedHtml = htmlContent.replace(base64Regex, (_match, mime, base64Data) => {
     const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
     images.push({
       uuid,
@@ -63,12 +68,18 @@ export const html2md = (
 
     // Base64 img to id, otherwise it will occupy memory when going to md
     const { processedHtml, images } = processBase64Images(html);
+
+    // if html is too large, skip the markdown conversion and return the processed html (base64 images already replaced by ids) as-is
+    if (processedHtml.length > MAX_HTML_SIZE) {
+      return { rawText: processedHtml, imageList: [] };
+    }
+
     const md = turndownService.turndown(processedHtml);
-    const { text, imageList } = matchMdImg(md);
+    // const { text, imageList } = matchMdImg(md);
 
     return {
-      rawText: text,
-      imageList: [...images, ...imageList]
+      rawText: md,
+      imageList: images
     };
   } catch (error) {
     console.log('html 2 markdown error', error);
diff --git a/test/cases/global/common/string/markdown.test.ts b/test/cases/global/common/string/markdown.test.ts
diff --git a/test/cases/service/worker/htmlStr2Md.test.ts b/test/cases/service/worker/htmlStr2Md.test.ts

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ description: 'FastGPT V4.14.0 更新说明'`
`9`	`9`
`10`	`10`	`## ⚙️ 优化`
`11`	`11`
	`12`	`+1. 优化匹配 Markdown 中 Base64 图片的正则性能。`
`12`	`13`
`13`	`14`	`## 🐛 修复`
`14`	`15`