From c922e11aab768403eb1c9b3497a655c918f5666a Mon Sep 17 00:00:00 2001 From: Ludwig Kent <124366668+Gavin-WangSC@users.noreply.github.com> Date: Thu, 10 Apr 2025 23:29:52 +0800 Subject: [PATCH 1/5] feat: Added LaTeX error modification using deepseek-reasoner --- writer.py | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 7 deletions(-) diff --git a/writer.py b/writer.py index b8fe6f5..3526ee0 100644 --- a/writer.py +++ b/writer.py @@ -7,6 +7,9 @@ import os import glob import yaml +import re +import subprocess +import tempfile path_to = f'src/content/blog/{datetime.datetime.now().strftime("%Y-%m-%d")}' @@ -84,25 +87,164 @@ def outline(topic): def write_from_outline(outline): global deepseek, existing_posts_text return generate([ - {"role": "system", "content": "你是一位专业技术博客作者。在写作时请遵循以下中文排版规范:1) 中文与英文、数字之间需要有空格;2) 中文标点与英文、数字之间不加空格;3) 使用全角中文标点;4) 专有名词大小写正确;5) 英文、数字使用半角字符;6) 使用直角引号「」。"}, - {"role": "user", "content": f"{outline}\n\n根据这个提纲中关于技术知识的部分,写出一篇技术博客文章。文章中避免出现图片,避免使用列表。每一段出现的代码都进行较为详细的解读。在讲述内容时尽量使用段落的语言,语言风格可以略偏专业,但保持清晰。使用Markdown(要求符合Common Markdown规范)输出,使用LaTeX公式(注意:数学的开闭定界符前后不能有字母或数字字符。像x$a + b = c$或$a + b = c$1将无法渲染为数学公式(所有$会被渲染为$);但x $\\infty$ 1和($\\infty$)会正常渲染),标题尽量只用一级标题 `#` 和二级标题 `##`,不要用分割线。请遵循中文排版规范,确保中英文之间有空格,使用正确的标点符号。直接输出正文。"} + {"role": "system", "content": "你是一位专业技术博客作者。在写作时请遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符;使用直角引号「」。"}, + {"role": "user", "content": f"{outline}\n\n根据这个提纲中关于技术知识的部分,写出一篇技术博客文章。文章中避免出现图片,不能使用任何列表。每一段出现的代码都进行较为详细的解读。在讲述内容时尽量使用段落的语言,语言风格可以略偏专业,但保持清晰。使用Markdown(要求符合Common Markdown规范)输出,使用LaTeX公式(注意:数学的开闭定界符前后不能有字母或数字字符。像x$a + b = c$或$a + b = c$1将无法渲染为数学公式(所有$会被渲染为$);但x $\\infty$ 1和($\\infty$)会正常渲染),标题尽量只用一级标题 `#` 和二级标题 `##`,不要用分割线。请遵循中文排版规范,使用正确的标点符号。直接输出正文。"} ], deepseek, "deepseek-reasoner") def summary(article): global deepseek return generate([ - {"role": "system", "content": "你是一个技术博客简介写作者,简介不一定需要涵盖文章的全部内容,能起到一定的提示作用即可。直接输出简介。遵循以下中文排版规范:1) 中文与英文、数字之间需要有空格;2) 中文标点与英文、数字之间不加空格;3) 使用全角中文标点;4) 专有名词大小写正确;5) 英文、数字使用半角字符。注意简介被作为副标题使用,不是一句句子,不要以句号结尾。"}, + {"role": "system", "content": "你是一个技术博客简介写作者,简介不一定需要涵盖文章的全部内容,能起到一定的提示作用即可。直接输出简介。遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。注意简介被作为副标题使用,不是一句句子,不要以句号结尾。"}, {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"} ], deepseek, "deepseek-chat") +# LaTeX error handling +def remove_latex_comments(latex_str: str) -> str: + lines = latex_str.splitlines() + cleaned_lines = [] + for line in lines: + m = re.search(r'(? 
<!\\)%', line)
+        if m:
+            line = line[:m.start()]
+        cleaned_lines.append(line)
+    return "\n".join(cleaned_lines)
+
+def check_balanced_braces(latex_str: str) -> (bool, list):
+    stack = []
+    errors = []
+    for index, char in enumerate(latex_str):
+        if char == '{':
+            stack.append(index)
+        elif char == '}':
+            if not stack:
+                errors.append(f"位置 {index}: 右大括号 '}}' 没有对应的左大括号")
+            else:
+                stack.pop()
+    if stack:
+        for pos in stack:
+            errors.append(f"位置 {pos}: 左大括号 '{{' 没有对应的右大括号")
+    return (len(errors) == 0), errors
+
+def check_environment_matching(latex_str: str) -> (bool, list):
+    errors = []
+    env_stack = []
+    pattern = re.compile(r'\\(begin|end)\s*{([^}]+)}')
+    for m in pattern.finditer(latex_str):
+        cmd = m.group(1)
+        env = m.group(2).strip()
+        pos = m.start()
+        if cmd == "begin":
+            env_stack.append((env, pos))
+        else:  # cmd == "end"
+            if not env_stack:
+                errors.append(f"位置 {pos}: \\end{{{env}}} 没有对应的 \\begin")
+            else:
+                last_env, last_pos = env_stack.pop()
+                if last_env != env:
+                    errors.append(f"位置 {last_pos} 的 \\begin{{{last_env}}} 与位置 {pos} 的 \\end{{{env}}} 不匹配")
+    if env_stack:
+        for env, pos in env_stack:
+            errors.append(f"位置 {pos}: \\begin{{{env}}} 没有对应的 \\end")
+    return (len(errors) == 0), errors
+
+def run_static_checks(latex_snippet: str) -> list:
+    cleaned = remove_latex_comments(latex_snippet)
+    errors = []
+    ok_braces, brace_errors = check_balanced_braces(cleaned)
+    ok_env, env_errors = check_environment_matching(cleaned)
+    if not ok_braces:
+        errors.extend(["大括号错误: " + err for err in brace_errors])
+    if not ok_env:
+        errors.extend(["环境匹配错误: " + err for err in env_errors])
+    return errors
+
+def check_with_pdflatex(latex_snippet: str) -> list:
+    """
+    call pdflatex for compilation checking and return the error messages detected in the compilation log.
+    """
+    template = r"""
+\documentclass{article}
+\usepackage{amsmath}
+\begin{document}
+%s
+\end{document}
+    """ % latex_snippet
+
+    errors = []
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tex_file = os.path.join(tmpdirname, "temp.tex")
+        with open(tex_file, "w", encoding="utf-8") as f:
+            f.write(template)
+        try:
+            proc = subprocess.run(
+                ["pdflatex", "-interaction=nonstopmode", tex_file],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cwd=tmpdirname, timeout=15
+            )
+            output = proc.stdout.decode("utf-8") + proc.stderr.decode("utf-8")
+            for line in output.splitlines():
+                if line.startswith("!"):
+                    errors.append(line.strip())
+            if proc.returncode != 0 and not errors:
+                errors.append("pdflatex 返回非 0 错误码,编译可能存在问题。")
+        except Exception as e:
+            errors.append(f"调用 pdflatex 编译时出错: {e}")
+    return errors
+
+def extract_latex_segments(markdown_text: str) -> list:
+    """
+    extract latex segments from markdown
+    """
+    segments = []
+    block_pattern = re.compile(r'\$\$([\s\S]+?)\$\$', re.MULTILINE)
+    segments.extend(block_pattern.findall(markdown_text))
+    inline_pattern = re.compile(r'(?<!\$)\$(?!\$)([^\$\n]+?)(?<!\$)\$(?!\$)')
+    segments.extend(inline_pattern.findall(markdown_text))
+    return segments
+
+def latex_errors(markdown_text: str) ->
dict: + segments = extract_latex_segments(markdown_text) + report = {} + for idx, seg in enumerate(segments): + seg = seg.strip() + static_errors = run_static_checks(seg) + pdflatex_errors = check_with_pdflatex(seg) + report[f"公式段 {idx+1}"] = { + "原始内容": seg, + "静态检测错误": static_errors, + "pdflatex 检测错误": pdflatex_errors + } + return report + +def modify_latex(markdown_text: str, error): + global deepseek + return generate([ + {"role": "system", "content": "你是LaTeX校验员。以下是一段Markdown文本,其中的LaTeX代码有错误,请基于报错修正。同时文本要遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。直接在输出中输出文本内容。"}, + {"role": "user", "content": f"<原文>\n{markdown_text}\n\n\n<报错>\n{error}\n"} + ], deepseek, "deepseek-reasoner") + +is_latin = lambda ch: '\u0000' <= ch <= '\u007F' or '\u00A0' <= ch <= '\u024F' +is_nonspace_latin = lambda ch: is_latin(ch) and not ch.isspace() and not ch in """*()[]{}"'/-@#""" +is_nonpunct_cjk = lambda ch: not is_latin(ch) and ch not in "·!¥…()—【】、;:‘’“”,。《》?「」" + +def beautify_string(text): + res = "" + for idx in range(len(text)): + if idx and ( + (is_nonspace_latin(text[idx]) and is_nonpunct_cjk(text[idx - 1])) or + (is_nonspace_latin(text[idx - 1]) and is_nonpunct_cjk(text[idx])) + ): res += " " + res += text[idx] + return res + start = time.time() print(" Generating topic:") -topic = extract_topic(topics_text) +topic = beautify_string(extract_topic(topics_text)) print(f" Determined topic: {topic}; time spent {time.time() - start:.1f} s") start = time.time() print(" Generating outline:") -outline_result = outline(topic) +outline_result = beautify_string(outline(topic)) print(f" Determined outline: time spent {time.time() - start:.1f} s") start = time.time() @@ -110,9 +252,20 @@ def summary(article): article = write_from_outline(outline_result) print(f" Article written: time spent {time.time() - start:.1f} s") +if latex_errors(article): + print(" latex_errors exist") + start = time.time() + article = modify_latex(article, latex_errors(article)) + print(f" LaTeX errors fixed: time spent {time.time() - start:.1f} s") + +start = time.time() +article = beautify_string(article) +print(f" Article beautified: time spent {time.time() - start:.1f} s") + + start = time.time() print(" Generating summary:") -summary_result = summary(article) +summary_result = beautify_string(summary(article)) print(f" Decided Summary: {summary_result}; time spent {time.time() - start:.1f} s") lines = iter(article.split("\n")) @@ -146,4 +299,4 @@ def summary(article): with open(f"{path_to}/index.md", "w", encoding="utf-8") as f: f.write(markdown_file) -print(f" Composed article: {path_to}/index.md") +print(f" Composed article: {path_to}/index.md") \ No newline at end of file From 69ee4f35bd98ab8ef833cbc4091ecc264be76312 Mon Sep 17 00:00:00 2001 From: Ludwig Kent <124366668+Gavin-WangSC@users.noreply.github.com> Date: Fri, 11 Apr 2025 20:39:09 +0800 Subject: [PATCH 2/5] . 
--- .github/workflows/auto-writer.yml | 2 +- writer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/auto-writer.yml b/.github/workflows/auto-writer.yml index 194bc5e..adac53f 100644 --- a/.github/workflows/auto-writer.yml +++ b/.github/workflows/auto-writer.yml @@ -14,7 +14,7 @@ jobs: uses: actions/checkout@v4 - name: Install Python Dependencies - run: pip install openai bs4 requests pyyaml + run: pip install openai bs4 requests pyyaml tempfile subprocess - name: Compose New Article env: diff --git a/writer.py b/writer.py index 3526ee0..b890be9 100644 --- a/writer.py +++ b/writer.py @@ -98,7 +98,7 @@ def summary(article): {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"} ], deepseek, "deepseek-chat") -# LaTeX error handling +# LaTeX error handling def remove_latex_comments(latex_str: str) -> str: lines = latex_str.splitlines() cleaned_lines = [] From 35675ab5c878be16b7e82c42bcbaca7893dd2780 Mon Sep 17 00:00:00 2001 From: Ludwig Kent <124366668+Gavin-WangSC@users.noreply.github.com> Date: Fri, 11 Apr 2025 21:23:37 +0800 Subject: [PATCH 3/5] . --- writer.py | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/writer.py b/writer.py index a772b17..b890be9 100644 --- a/writer.py +++ b/writer.py @@ -98,6 +98,145 @@ def summary(article): {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"} ], deepseek, "deepseek-chat") +# LaTeX error handling +def remove_latex_comments(latex_str: str) -> str: + lines = latex_str.splitlines() + cleaned_lines = [] + for line in lines: + m = re.search(r'(? (bool, list): + stack = [] + errors = [] + for index, char in enumerate(latex_str): + if char == '{': + stack.append(index) + elif char == '}': + if not stack: + errors.append(f"位置 {index}: 右大括号 '}}' 没有对应的左大括号") + else: + stack.pop() + if stack: + for pos in stack: + errors.append(f"位置 {pos}: 左大括号 '{{' 没有对应的右大括号") + return (len(errors) == 0), errors + +def check_environment_matching(latex_str: str) -> (bool, list): + errors = [] + env_stack = [] + pattern = re.compile(r'\\(begin|end)\s*{([^}]+)}') + for m in pattern.finditer(latex_str): + cmd = m.group(1) + env = m.group(2).strip() + pos = m.start() + if cmd == "begin": + env_stack.append((env, pos)) + else: # cmd == "end" + if not env_stack: + errors.append(f"位置 {pos}: \\end{{{env}}} 没有对应的 \\begin") + else: + last_env, last_pos = env_stack.pop() + if last_env != env: + errors.append(f"位置 {last_pos} 的 \\begin{{{last_env}}} 与位置 {pos} 的 \\end{{{env}}} 不匹配") + if env_stack: + for env, pos in env_stack: + errors.append(f"位置 {pos}: \\begin{{{env}}} 没有对应的 \\end") + return (len(errors) == 0), errors + +def run_static_checks(latex_snippet: str) -> list: + cleaned = remove_latex_comments(latex_snippet) + errors = [] + ok_braces, brace_errors = check_balanced_braces(cleaned) + ok_env, env_errors = check_environment_matching(cleaned) + if not ok_braces: + errors.extend(["大括号错误: " + err for err in brace_errors]) + if not ok_env: + errors.extend(["环境匹配错误: " + err for err in env_errors]) + return errors + +def check_with_pdflatex(latex_snippet: str) -> list: + """ + call pdflatex for compilation checking and return the error messages detected in the compilation log. 
+ """ + template = r""" +\documentclass{article} +\usepackage{amsmath} +\begin{document} +%s +\end{document} + """ % latex_snippet + + errors = [] + with tempfile.TemporaryDirectory() as tmpdirname: + tex_file = os.path.join(tmpdirname, "temp.tex") + with open(tex_file, "w", encoding="utf-8") as f: + f.write(template) + try: + proc = subprocess.run( + ["pdflatex", "-interaction=nonstopmode", tex_file], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=tmpdirname, timeout=15 + ) + output = proc.stdout.decode("utf-8") + proc.stderr.decode("utf-8") + for line in output.splitlines(): + if line.startswith("!"): + errors.append(line.strip()) + if proc.returncode != 0 and not errors: + errors.append("pdflatex 返回非 0 错误码,编译可能存在问题。") + except Exception as e: + errors.append(f"调用 pdflatex 编译时出错: {e}") + return errors + +def extract_latex_segments(markdown_text: str) -> list: + """ + extract latex segments from markdown + """ + segments = [] + block_pattern = re.compile(r'\$\$([\s\S]+?)\$\$', re.MULTILINE) + segments.extend(block_pattern.findall(markdown_text)) + inline_pattern = re.compile(r'(? dict: + segments = extract_latex_segments(markdown_text) + report = {} + for idx, seg in enumerate(segments): + seg = seg.strip() + static_errors = run_static_checks(seg) + pdflatex_errors = check_with_pdflatex(seg) + report[f"公式段 {idx+1}"] = { + "原始内容": seg, + "静态检测错误": static_errors, + "pdflatex 检测错误": pdflatex_errors + } + return report + +def modify_latex(markdown_text: str, error): + global deepseek + return generate([ + {"role": "system", "content": "你是LaTeX校验员。以下是一段Markdown文本,其中的LaTeX代码有错误,请基于报错修正。同时文本要遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。直接在输出中输出文本内容。"}, + {"role": "user", "content": f"<原文>\n{markdown_text}\n\n\n<报错>\n{error}\n"} + ], deepseek, "deepseek-reasoner") + +is_latin = lambda ch: '\u0000' <= ch <= '\u007F' or '\u00A0' <= ch <= '\u024F' +is_nonspace_latin = lambda ch: is_latin(ch) and not ch.isspace() and not ch in """*()[]{}"'/-@#""" +is_nonpunct_cjk = lambda ch: not is_latin(ch) and ch not in "·!¥…()—【】、;:‘’“”,。《》?「」" + +def beautify_string(text): + res = "" + for idx in range(len(text)): + if idx and ( + (is_nonspace_latin(text[idx]) and is_nonpunct_cjk(text[idx - 1])) or + (is_nonspace_latin(text[idx - 1]) and is_nonpunct_cjk(text[idx])) + ): res += " " + res += text[idx] + return res + start = time.time() print(" Generating topic:") topic = beautify_string(extract_topic(topics_text)) @@ -110,7 +249,7 @@ def summary(article): start = time.time() print(" Generating article:") -article = beautify_string(write_from_outline(outline_result)) +article = write_from_outline(outline_result) print(f" Article written: time spent {time.time() - start:.1f} s") if latex_errors(article): From 8c517c08d136e19db0cd2d1df0bb8c7245798242 Mon Sep 17 00:00:00 2001 From: Ludwig Kent <124366668+Gavin-WangSC@users.noreply.github.com> Date: Mon, 5 May 2025 01:24:03 +0800 Subject: [PATCH 4/5] feat: extended the latex checks; changed the modificatioin process in to a while loop; only put LaTeX errors in the prompt when requesting a rewriting (the error prompts haven't been tested yet, they are generated by llms); isolated LaTeX when beautifying; --- writer.py | 310 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 184 insertions(+), 126 deletions(-) diff --git a/writer.py b/writer.py index b890be9..3ebc1b2 100644 --- a/writer.py +++ b/writer.py @@ -8,8 +8,7 @@ import glob import yaml import re -import subprocess -import tempfile +from typing import List, Tuple, Dict path_to = 
f'src/content/blog/{datetime.datetime.now().strftime("%Y-%m-%d")}' @@ -99,143 +98,201 @@ def summary(article): ], deepseek, "deepseek-chat") # LaTeX error handling -def remove_latex_comments(latex_str: str) -> str: - lines = latex_str.splitlines() - cleaned_lines = [] - for line in lines: - m = re.search(r'(? (bool, list): - stack = [] - errors = [] - for index, char in enumerate(latex_str): - if char == '{': - stack.append(index) - elif char == '}': +def extract_latex_segments(markdown_text: str) -> List[Tuple[str, int, int]]: + segments: List[Tuple[str,int,int]] = [] + block_pattern = re.compile(r'(\$\$[\s\S]+?\$\$)', re.DOTALL) + for m in block_pattern.finditer(markdown_text): + segments.append((m.group(1), m.start(), m.end())) + + inline_pattern = re.compile(r'(? List[str]: + errors: List[str] = [] + + # 命令后多余空格 (忽略 \tt, \it, \bf) + for m in re.finditer(r"\\([a-zA-Z]+)(\s+)", latex_str): + cmd = m.group(1) + if cmd not in ('tt', 'it', 'bf'): + errors.append(f"命令 '\\{cmd}' 后跟有空格,建议去掉空格。") + + # 引用前多余空格,建议用 '~' + if re.search(r"\s+\\ref\{", latex_str): + errors.append("'\\ref' 前有空格,应使用 '~\\ref{...}' 保持断开。") + + # 省略号 '...' 而非 \dots 或 \ldots + if re.search(r'(? (bool, list): - errors = [] - env_stack = [] - pattern = re.compile(r'\\(begin|end)\s*{([^}]+)}') - for m in pattern.finditer(latex_str): - cmd = m.group(1) - env = m.group(2).strip() + for pos in stack: + errors.append(f"位置 {pos}: 多余 '{{' 。") + + # \begin / \end 匹配(修正 \end raw-string 报错) + env_stack: List[Tuple[str, int]] = [] + for m in re.finditer(r"\\(begin|end)\s*\{([^}]+)\}", latex_str): + cmd, env = m.group(1), m.group(2) pos = m.start() - if cmd == "begin": + if cmd == 'begin': env_stack.append((env, pos)) - else: # cmd == "end" - if not env_stack: - errors.append(f"位置 {pos}: \\end{{{env}}} 没有对应的 \\begin") + else: # cmd == 'end' + if not env_stack or env_stack[-1][0] != env: + # 注意这里用双反斜杠来正确表示 '\end' + errors.append(f"位置 {pos}: '\\end{{{env}}}' 无匹配或顺序错误。") else: - last_env, last_pos = env_stack.pop() - if last_env != env: - errors.append(f"位置 {last_pos} 的 \\begin{{{last_env}}} 与位置 {pos} 的 \\end{{{env}}} 不匹配") - if env_stack: - for env, pos in env_stack: - errors.append(f"位置 {pos}: \\begin{{{env}}} 没有对应的 \\end") - return (len(errors) == 0), errors - -def run_static_checks(latex_snippet: str) -> list: - cleaned = remove_latex_comments(latex_snippet) - errors = [] - ok_braces, brace_errors = check_balanced_braces(cleaned) - ok_env, env_errors = check_environment_matching(cleaned) - if not ok_braces: - errors.extend(["大括号错误: " + err for err in brace_errors]) - if not ok_env: - errors.extend(["环境匹配错误: " + err for err in env_errors]) - return errors + env_stack.pop() + # 剩余未闭合的 begin + for env, pos in env_stack: + errors.append(f"位置 {pos}: '\\begin{{{env}}}' 未关闭。") -def check_with_pdflatex(latex_snippet: str) -> list: - """ - call pdflatex for compilation checking and return the error messages detected in the compilation log. 
- """ - template = r""" -\documentclass{article} -\usepackage{amsmath} -\begin{document} -%s -\end{document} - """ % latex_snippet - - errors = [] - with tempfile.TemporaryDirectory() as tmpdirname: - tex_file = os.path.join(tmpdirname, "temp.tex") - with open(tex_file, "w", encoding="utf-8") as f: - f.write(template) - try: - proc = subprocess.run( - ["pdflatex", "-interaction=nonstopmode", tex_file], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=tmpdirname, timeout=15 - ) - output = proc.stdout.decode("utf-8") + proc.stderr.decode("utf-8") - for line in output.splitlines(): - if line.startswith("!"): - errors.append(line.strip()) - if proc.returncode != 0 and not errors: - errors.append("pdflatex 返回非 0 错误码,编译可能存在问题。") - except Exception as e: - errors.append(f"调用 pdflatex 编译时出错: {e}") - return errors + # 括号前多余空格 + if re.search(r"\s+\(", latex_str): + errors.append("左括号 '(' 前有空格,应去除。") -def extract_latex_segments(markdown_text: str) -> list: - """ - extract latex segments from markdown - """ - segments = [] - block_pattern = re.compile(r'\$\$([\s\S]+?)\$\$', re.MULTILINE) - segments.extend(block_pattern.findall(markdown_text)) - inline_pattern = re.compile(r'(? dict: - segments = extract_latex_segments(markdown_text) + return errors + +def latex_errors(markdown_text: str) -> Dict[Tuple[str, int], List[str]]: report = {} - for idx, seg in enumerate(segments): - seg = seg.strip() - static_errors = run_static_checks(seg) - pdflatex_errors = check_with_pdflatex(seg) - report[f"公式段 {idx+1}"] = { - "原始内容": seg, - "静态检测错误": static_errors, - "pdflatex 检测错误": pdflatex_errors - } + for seg, start_idx, _ in extract_latex_segments(markdown_text): + errs = latex_checks(seg) + if errs: + report[(seg, start_idx)] = errs return report -def modify_latex(markdown_text: str, error): - global deepseek - return generate([ - {"role": "system", "content": "你是LaTeX校验员。以下是一段Markdown文本,其中的LaTeX代码有错误,请基于报错修正。同时文本要遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。直接在输出中输出文本内容。"}, - {"role": "user", "content": f"<原文>\n{markdown_text}\n\n\n<报错>\n{error}\n"} - ], deepseek, "deepseek-reasoner") +def modify_latex(markdown_text: str, error_report: Dict[Tuple[str,int], List[str]]) -> str: + """ + 遍历 error_report,按 start_idx 从大到小替换, + 保证后面的替换不影响前面的 start_idx。 + """ + corrected = markdown_text + items = sorted(error_report.items(), key=lambda x: x[0][1], reverse=True) + + for (seg, start_idx), errs in items: + end_idx = start_idx + len(seg) + context = corrected[max(0, start_idx-50): end_idx+50] + user_msg = ( + f"修正此 LaTeX 片段(包含 $ 定界符):\n{seg}\n\n" + "检测到错误:\n- " + "\n- ".join(errs) + + "\n\n上下文:\n" + context + + "\n\n请只返回修正后的完整片段,不要添加其它标记。" + ) + fixed = generate([ + {"role":"system","content":"你是 LaTeX 专家,负责修正以下代码:"}, + {"role":"user","content":user_msg} + ], deepseek, "deepseek-reasoner").strip() + + # 去掉```,如果不小心生成了 + if fixed.startswith("```") and fixed.endswith("```"): + fixed = "\n".join(fixed.splitlines()[1:-1]).strip() + + # 给重新生成的丢失的加上 $/$$,如果ds忘记了 + if not fixed.startswith('$'): + if seg.startswith('$$') and seg.endswith('$$'): + fixed = '$$' + fixed + '$$' + elif seg.startswith('$') and seg.endswith('$'): + fixed = '$' + fixed + '$' + + # 最终替换 + corrected = corrected[:start_idx] + fixed + corrected[end_idx:] + + return corrected is_latin = lambda ch: '\u0000' <= ch <= '\u007F' or '\u00A0' <= ch <= '\u024F' is_nonspace_latin = lambda ch: is_latin(ch) and not ch.isspace() and not ch in """*()[]{}"'/-@#""" is_nonpunct_cjk = lambda ch: not is_latin(ch) and ch not in "·!¥…()—【】、;:‘’“”,。《》?「」" -def 
beautify_string(text): - res = "" - for idx in range(len(text)): - if idx and ( - (is_nonspace_latin(text[idx]) and is_nonpunct_cjk(text[idx - 1])) or - (is_nonspace_latin(text[idx - 1]) and is_nonpunct_cjk(text[idx])) - ): res += " " - res += text[idx] - return res +# beautify的时候跳过 LaTeX +def beautify_string(text: str) -> str: + segments = extract_latex_segments(text) + segments.sort(key=lambda x: x[1]) + + result_parts = [] + last_end = 0 + + for seg_content, seg_start, seg_end in segments: + non_latex_part = text[last_end:seg_start] + processed_part = "" + for i, char in enumerate(non_latex_part): + if i > 0 and ( + (is_nonspace_latin(char) and is_nonpunct_cjk(non_latex_part[i-1])) or + (is_nonspace_latin(non_latex_part[i-1]) and is_nonpunct_cjk(char)) + ): + processed_part += " " + processed_part += char + result_parts.append(processed_part) + + result_parts.append(seg_content) + last_end = seg_end + + final_part = text[last_end:] + processed_final_part = "" + for i, char in enumerate(final_part): + if i > 0 and ( + (is_nonspace_latin(char) and is_nonpunct_cjk(final_part[i-1])) or + (is_nonspace_latin(final_part[i-1]) and is_nonpunct_cjk(char)) + ): + processed_final_part += " " + processed_final_part += char + result_parts.append(processed_final_part) + + return "".join(result_parts) start = time.time() print(" Generating topic:") @@ -252,11 +309,12 @@ def beautify_string(text): article = write_from_outline(outline_result) print(f" Article written: time spent {time.time() - start:.1f} s") -if latex_errors(article): - print(" latex_errors exist") - start = time.time() +start = time.time() +while latex_errors(article): + print("latex_errors still exist") article = modify_latex(article, latex_errors(article)) - print(f" LaTeX errors fixed: time spent {time.time() - start:.1f} s") + +print(f" LaTeX errors fixed: time spent {time.time() - start:.1f} s") start = time.time() article = beautify_string(article) From f9991d074ecead4eaa4cde181b8c6b43b1970b11 Mon Sep 17 00:00:00 2001 From: Ludwig Kent <124366668+Gavin-WangSC@users.noreply.github.com> Date: Mon, 5 May 2025 09:33:07 +0800 Subject: [PATCH 5/5] unimportant dependencies --- .github/workflows/auto-writer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/auto-writer.yml b/.github/workflows/auto-writer.yml index adac53f..194bc5e 100644 --- a/.github/workflows/auto-writer.yml +++ b/.github/workflows/auto-writer.yml @@ -14,7 +14,7 @@ jobs: uses: actions/checkout@v4 - name: Install Python Dependencies - run: pip install openai bs4 requests pyyaml tempfile subprocess + run: pip install openai bs4 requests pyyaml - name: Compose New Article env: