Skip to content

Commit e2ddc05

Browse files
committed
Markdown match base64 performance
1 parent 5ab2b62 commit e2ddc05

File tree

7 files changed

+896
-14
lines changed

7 files changed

+896
-14
lines changed

CLAUDE.md renamed to .claude/CLAUDE.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,4 +118,9 @@ FastGPT 是一个 AI Agent 构建平台,通过 Flow 提供开箱即用的数据
118118

119119
## 代码规范
120120

121-
- 尽可能使用 type 进行类型声明,而不是 interface。
121+
- 尽可能使用 type 进行类型声明,而不是 interface。
122+
123+
## Agent 设计规范
124+
125+
1. 对于功能的实现和复杂问题修复,优先进行文档设计,并在用户确认后,再执行修复。
126+
2. 采用"设计文档-测试示例-代码编写-测试运行-修正代码/文档"的工作模式,以测试为核心来确保设计的正确性。

document/content/docs/upgrading/4-14/4140.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ description: 'FastGPT V4.14.0 更新说明'
99

1010
## ⚙️ 优化
1111

12+
1. 优化匹配 Markdown 中 Base64 图片的正则性能。
1213

1314
## 🐛 修复
1415

document/data/doc-last-modified.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@
114114
"document/content/docs/upgrading/4-13/4130.mdx": "2025-09-30T16:00:10+08:00",
115115
"document/content/docs/upgrading/4-13/4131.mdx": "2025-09-30T15:47:06+08:00",
116116
"document/content/docs/upgrading/4-13/4132.mdx": "2025-10-21T11:46:53+08:00",
117-
"document/content/docs/upgrading/4-14/4140.mdx": "2025-10-23T19:11:11+08:00",
117+
"document/content/docs/upgrading/4-14/4140.mdx": "2025-10-24T16:45:35+08:00",
118118
"document/content/docs/upgrading/4-8/40.mdx": "2025-08-02T19:38:37+08:00",
119119
"document/content/docs/upgrading/4-8/41.mdx": "2025-08-02T19:38:37+08:00",
120120
"document/content/docs/upgrading/4-8/42.mdx": "2025-08-02T19:38:37+08:00",

packages/global/common/string/markdown.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -173,18 +173,20 @@ export const markdownProcess = async ({
173173
};
174174

175175
export const matchMdImg = (text: string) => {
176-
const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g;
176+
// 优化后的正则:
177+
// 1. 使用 [^\]]* 匹配 alt 文本(更精确)
178+
// 2. 使用 [A-Za-z0-9+/=]+ 匹配 base64 数据(避免回溯)
179+
// 3. 明确匹配 data:image/ 前缀
180+
const base64Regex = /!\[([^\]]*)\]\((data:image\/([^;]+);base64,([A-Za-z0-9+/=]+))\)/g;
177181
const imageList: ImageType[] = [];
178182

179-
text = text.replace(base64Regex, (match, altText, base64Url) => {
183+
text = text.replace(base64Regex, (_match, altText, _fullDataUrl, mime, base64Data) => {
180184
const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
181-
const mime = base64Url.split(';')[0].split(':')[1];
182-
const base64 = base64Url.split(',')[1];
183185

184186
imageList.push({
185187
uuid,
186-
base64,
187-
mime
188+
base64: base64Data,
189+
mime: `image/${mime}`
188190
});
189191

190192
// 保持原有的 alt 文本,只替换 base64 部分

packages/service/worker/htmlStr2Md/utils.ts

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
import TurndownService from 'turndown';
22
import { type ImageType } from '../readFile/type';
3-
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
43
import { getNanoid } from '@fastgpt/global/common/string/tools';
54
// @ts-ignore
65
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
76

7+
const MAX_HTML_SIZE = 100 * 1000; // 100k characters limit
8+
89
const processBase64Images = (htmlContent: string) => {
9-
const base64Regex = /src="data:([^;]+);base64,([^"]+)"/g;
10+
// 优化后的正则:
11+
// 1. 使用精确的 base64 字符集 [A-Za-z0-9+/=]+ 避免回溯
12+
// 2. 明确捕获 mime 类型和 base64 数据
13+
// 3. 减少不必要的捕获组
14+
const base64Regex = /src="data:([^;]+);base64,([A-Za-z0-9+/=]+)"/g;
1015
const images: ImageType[] = [];
1116

12-
const processedHtml = htmlContent.replace(base64Regex, (match, mime, base64Data) => {
17+
const processedHtml = htmlContent.replace(base64Regex, (_match, mime, base64Data) => {
1318
const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
1419
images.push({
1520
uuid,
@@ -63,12 +68,18 @@ export const html2md = (
6368

6469
// Base64 img to id, otherwise it will occupy memory when going to md
6570
const { processedHtml, images } = processBase64Images(html);
71+
72+
// If the processed HTML is too large, skip the Markdown conversion and return the processed HTML as-is
73+
if (processedHtml.length > MAX_HTML_SIZE) {
74+
return { rawText: processedHtml, imageList: [] };
75+
}
76+
6677
const md = turndownService.turndown(processedHtml);
67-
const { text, imageList } = matchMdImg(md);
78+
// const { text, imageList } = matchMdImg(md);
6879

6980
return {
70-
rawText: text,
71-
imageList: [...images, ...imageList]
81+
rawText: md,
82+
imageList: images
7283
};
7384
} catch (error) {
7485
console.log('html 2 markdown error', error);

0 commit comments

Comments
 (0)