@@ -30,46 +30,49 @@ def format_value(match):
3030
def repair_reasoning_field_robust(json_str: str) -> str:
    """Escape bare double quotes inside the "reasoning" value of a JSON-like string.

    The value is located with a regex: it starts right after the opening quote
    of the "reasoning" key and ends at the first double quote that is followed
    (ignoring whitespace) by a comma or a closing brace. The closing quote is
    matched with a lookahead only, so it is never consumed or rewritten, and
    the field does not have to be the last one in the object.

    Only quotes that are not already part of a backslash escape are escaped,
    so input whose "reasoning" value is already valid is returned unchanged.

    Args:
        json_str: A possibly malformed JSON string that may contain unescaped
            double quotes within the "reasoning" field.

    Returns:
        The input string with bare double quotes inside the "reasoning" value
        backslash-escaped. Strings without a "reasoning" field are returned
        as-is.
    """
    # re.DOTALL lets '.' span newlines, since the reasoning text may be multi-line.
    pattern = re.compile(
        # Group 1: the key, the colon and the opening quote of the value.
        r'("reasoning"\s*:\s*")'
        # Group 2: the value content (non-greedy).
        r'(.*?)'
        # Lookahead: the closing quote must be followed by ',' or '}'.
        r'(?="\s*[,}])',
        re.DOTALL,
    )

    def escape_bare_quotes(content: str) -> str:
        # Walk the text token by token: a backslash plus the character it
        # escapes is kept verbatim, while a quote that is reached on its own
        # is a bare quote and gets escaped. This avoids double-escaping.
        return re.sub(
            r'\\.|"',
            lambda m: '\\"' if m.group(0) == '"' else m.group(0),
            content,
            flags=re.DOTALL,
        )

    def replacer(match):
        # The suffix (closing quote and what follows) was not consumed by the
        # pattern, so only prefix + repaired content has to be returned.
        return match.group(1) + escape_bare_quotes(match.group(2))

    return pattern.sub(replacer, json_str)
7477
7578def read_file_to_string (file_path ):
@@ -202,59 +205,104 @@ def normalize_quotes(s: str) -> str:
202205
# Key patterns accept an optional surrounding quote and either an ASCII colon
# or a full-width colon (U+FF1A); matching is case-insensitive.
_REASONING_KEY_RE = re.compile(r'"?reasoning"?\s*[:\uFF1A]', re.IGNORECASE)
_SCORE_KEY_RE = re.compile(r'"?score"?\s*[:\uFF1A]', re.IGNORECASE)
_LIST_START_RE = re.compile(r'\s*\[')
_NUMBER_RE = re.compile(r'-?\d+(?:\.\d+)?')


def _select_score_key(text, reasoning_end):
    """Return the most plausible "score" key match in *text*, or None."""
    matches = list(_SCORE_KEY_RE.finditer(text))
    # A key whose value starts with '[' matches the target structure; a bare
    # "score:" mention inside the reasoning text usually does not.
    with_list = [m for m in matches if _LIST_START_RE.match(text, m.end())]
    for pool in (with_list, matches):
        if not pool:
            continue
        if reasoning_end is not None:
            # Prefer the last candidate after the reasoning key, so that
            # mentions inside the reasoning text are not taken as the key.
            trailing = [m for m in pool if m.start() >= reasoning_end]
            if trailing:
                return trailing[-1]
        return pool[0]
    return None


def _clean_reasoning_value(raw):
    """Strip outer JSON noise from a raw reasoning slice and normalize quotes."""
    # Remove structural characters around the value; inner punctuation stays.
    text = re.sub(r'^[\s,{\[]+', '', raw)
    text = re.sub(r'[\s,}\]]+$', '', text)

    # Drop one wrapping quote on each side; json.dumps re-quotes the value.
    if text.startswith(("'", '"')):
        text = text[1:]
    if text.endswith(("'", '"')):
        text = text[:-1]

    # Undo existing quote escapes so json.dumps does not escape them twice.
    text = text.replace('\\"', '"').replace("\\'", "'")
    # Replace typographic quotes with straight ones.
    text = re.sub(r'[“”]', '"', text)
    text = re.sub(r"[‘’]", "'", text)
    return text.strip()


def fallback_repair_json(input_str: str) -> str:
    """Rebuild a valid JSON object from a heavily corrupted JSON-like string.

    Last-resort repair that tries to preserve the 'reasoning' text even when
    it contains unescaped quotes or other corruption.

    Target output:
        {"reasoning": "<text>", "score": [float, float]}

    Approach:
        1. Locate the 'reasoning' key and the most plausible 'score' key
           directly in the original string (case-insensitive), so that match
           positions are always valid slice indices.
        2. Take the reasoning value as the text between the reasoning key and
           the score key, or up to the end of the string when the score key
           is missing or comes first.
        3. Clean only the outer noise (whitespace, commas, braces, brackets,
           one wrapping quote per side); inner punctuation is preserved.
        4. Unescape already-escaped quotes and normalize typographic quotes.
        5. Take the first two numbers after the score key, or from the whole
           string if no score key exists, padding with 0.0.
        6. Serialize with json.dumps, which handles all escaping.

    Args:
        input_str: Possibly malformed JSON string.

    Returns:
        A valid JSON string with a "reasoning" string and a two-element
        "score" list of floats.
    """
    reason_key = _REASONING_KEY_RE.search(input_str)
    score_key = _select_score_key(
        input_str, reason_key.end() if reason_key else None
    )

    reasoning_text = ""
    if reason_key:
        value_start = reason_key.end()
        if score_key and score_key.start() >= value_start:
            value_end = score_key.start()
        else:
            # No score key after the reasoning: the value runs to the end.
            value_end = len(input_str)
        reasoning_text = _clean_reasoning_value(input_str[value_start:value_end])

    # Without a score key, fall back to any numbers found in the whole string.
    number_source = input_str[score_key.end():] if score_key else input_str
    scores = [float(n) for n in _NUMBER_RE.findall(number_source)[:2]]
    # Always return exactly two floats.
    scores += [0.0] * (2 - len(scores))

    repaired_obj = {
        "reasoning": reasoning_text,
        "score": scores,
    }
    return json.dumps(repaired_obj, ensure_ascii=False)
259307
260308def robust_json_fix (s : str ):
0 commit comments