Skip to content

Commit 57903b4

Browse files
committed
Release v0.1.8
1 parent 3df3054 commit 57903b4

File tree

2 files changed

+110
-62
lines changed

2 files changed

+110
-62
lines changed

editscore/utils.py

Lines changed: 109 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -30,46 +30,49 @@ def format_value(match):
3030

3131
def repair_reasoning_field_robust(json_str: str) -> str:
    """
    Escape bare double quotes inside the "reasoning" value of a JSON string.

    The "reasoning" value is located with a regular expression: it starts
    right after ``"reasoning": "`` and is taken to end at the first double
    quote that is followed by optional whitespace and then ``,`` or ``}``.
    Because the end is found with a lookahead, the field does not have to be
    the last one in the object.

    Quotes that are already escaped (``\\"``) and any other backslash escape
    pair are left untouched, so running this on a value that is partly or
    fully escaped does not double-escape it.

    Limitation: an inner quote that happens to be directly followed by a
    comma or closing brace is indistinguishable from the real closing quote
    and ends the match early.

    Args:
        json_str (str): A possibly malformed JSON string that may contain
            unescaped quotes within the "reasoning" field.

    Returns:
        str: The input with bare double quotes inside every matched
            "reasoning" value escaped. Input without a matching field is
            returned unchanged.
    """
    # re.DOTALL lets '.' span newlines, since reasoning text may be multi-line.
    pattern = re.compile(
        # Group 1: the key, the colon and the opening quote of the value.
        r'("reasoning"\s*:\s*")'
        # Group 2: the value content (non-greedy).
        r'(.*?)'
        # Lookahead: the closing quote, which must be followed by ',' or '}'.
        # It is not consumed, so the suffix stays in place after substitution.
        r'(?="\s*[,}])',
        re.DOTALL,
    )

    # Matches either a backslash escape pair (kept as is) or a bare quote.
    # Consuming escape pairs first is what prevents '\"' from becoming '\\"'.
    escape_or_quote = re.compile(r'\\.|"', re.DOTALL)

    def escape_bare_quote(token):
        text = token.group(0)
        return '\\"' if text == '"' else text

    def replacer(match):
        prefix = match.group(1)   # e.g., '"reasoning": "'
        content = match.group(2)  # e.g., 'Overall building...'
        return prefix + escape_or_quote.sub(escape_bare_quote, content)

    return pattern.sub(replacer, json_str)
7477

7578
def read_file_to_string(file_path):
@@ -202,59 +205,104 @@ def normalize_quotes(s: str) -> str:
202205

203206
# Key matchers: optional quotes around the key, then an ASCII or fullwidth
# (U+FF1A) colon. They are applied to the original string, case-insensitively,
# so that match offsets are always valid slice indices for that string.
_REASONING_KEY_RE = re.compile(r'"?reasoning"?\s*[:\uff1a]', re.IGNORECASE)
_SCORE_KEY_RE = re.compile(r'"?score"?\s*[:\uff1a]', re.IGNORECASE)

# Integer or decimal numeral, optionally negative.
_SCORE_NUMBER_RE = re.compile(r'-?\d+(?:\.\d+)?')

# Curly double/single quotes mapped to their straight equivalents.
_STRAIGHT_QUOTES = str.maketrans({
    '\u201c': '"',
    '\u201d': '"',
    '\u2018': "'",
    '\u2019': "'",
})


def _clean_reasoning_fragment(raw: str) -> str:
    """Strip the outer JSON noise from a raw reasoning slice, keeping inner text."""
    text = raw.strip()
    # Remove leading/trailing separators and brackets left over from the object.
    text = re.sub(r'^[\s,{\[]+', '', text)
    text = re.sub(r'[\s,}\]]+$', '', text)

    # Drop one enclosing quote on each side; json.dumps re-quotes the value.
    if text.startswith(("'", '"')):
        text = text[1:]
    if text.endswith(("'", '"')):
        text = text[:-1]

    # Undo existing quote escapes so json.dumps does not escape them twice.
    text = text.replace('\\"', '"').replace("\\'", "'")
    return text.translate(_STRAIGHT_QUOTES).strip()


def fallback_repair_json(input_str: str) -> str:
    """
    Last-resort JSON repair that tries to preserve the 'reasoning' text
    even when it contains unescaped quotes or other corruption.

    Target output:
        {"reasoning": "<text>", "score": [float, float]}

    Approach:
        1. Locate the 'reasoning' key (case-insensitive, quotes optional).
        2. Take everything after it up to the next 'score' key, or up to the
           end of the string when no 'score' key follows (the key is missing
           or appears before 'reasoning').
        3. Clean only the outer noise (whitespace, commas, braces, brackets,
           one enclosing quote per side), unescape quote escapes and
           normalize curly quotes; inner punctuation is preserved.
        4. Read the first two numerals after the first 'score' key, or from
           the whole string when there is no 'score' key.
        5. Serialize with json.dumps, which handles all escaping.

    Args:
        input_str (str): Possibly malformed JSON string.

    Returns:
        str: A valid JSON string with a "reasoning" string (empty when no
            'reasoning' key is found) and a "score" list of exactly two
            floats (missing values are filled with 0.0).
    """
    s = input_str

    m_reason = _REASONING_KEY_RE.search(s)
    m_score = _SCORE_KEY_RE.search(s)

    reasoning_text = ""
    if m_reason:
        # The reasoning value ends where the next 'score' key begins. Searching
        # from the end of the reasoning key keeps the slice non-empty when the
        # 'score' key comes first in the object.
        boundary = _SCORE_KEY_RE.search(s, m_reason.end())
        end_idx = boundary.start() if boundary else len(s)
        reasoning_text = _clean_reasoning_fragment(s[m_reason.end():end_idx])

    # Numerals are taken after the first 'score' key; without one, fall back
    # to any numerals in the whole string.
    score_source = s[m_score.end():] if m_score else s
    scores = [float(tok) for tok in _SCORE_NUMBER_RE.findall(score_source)[:2]]

    # Ensure we always return two floats.
    scores += [0.0] * (2 - len(scores))

    repaired_obj = {
        "reasoning": reasoning_text,
        "score": scores,
    }
    return json.dumps(repaired_obj, ensure_ascii=False)
259307

260308
def robust_json_fix(s: str):

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "editscore"
7-
version = "0.1.7"
7+
version = "0.1.8"
88
authors = [
99
{ name="Xin Luo", email="[email protected]" },
1010
{ name="Jiahao Wang", email="[email protected]" },

0 commit comments

Comments
 (0)