5
5
"""
6
6
7
7
import difflib
import operator
10
10
11
-
12
- def get_errors_for_diff_length (corrected_text , origin_text ):
11
+ def get_errors (corrected_text , origin_text ):
13
12
"""Get errors between corrected text and origin text"""
14
- new_corrected_text = ""
15
13
errors = []
16
- i , j = 0 , 0
17
14
unk_tokens = [' ' , '“' , '”' , '‘' , '’' , '琊' , '\n ' , '…' , '擤' , '\t ' , '玕' , '' ]
18
15
19
- while i < len (origin_text ) and j < len (corrected_text ):
20
- if origin_text [i ] in unk_tokens :
21
- new_corrected_text += origin_text [i ]
22
- i += 1
23
- elif corrected_text [j ] in unk_tokens :
24
- new_corrected_text += corrected_text [j ]
25
- j += 1
26
- # Deal with Chinese characters
27
- elif is_chinese_char (origin_text [i ]) and is_chinese_char (corrected_text [j ]):
28
- # If the two characters are the same, then the two pointers move forward together
29
- if origin_text [i ] == corrected_text [j ]:
16
+ s = difflib .SequenceMatcher (None , origin_text , corrected_text )
17
+ new_corrected_text = ""
18
+ for tag , i1 , i2 , j1 , j2 in s .get_opcodes ():
19
+ if tag == 'replace' :
20
+ for i , j in zip (range (i1 , i2 ), range (j1 , j2 )):
21
+ if origin_text [i ] not in unk_tokens and corrected_text [j ] not in unk_tokens :
22
+ errors .append ((origin_text [i ], corrected_text [j ], i ))
30
23
new_corrected_text += corrected_text [j ]
31
- i += 1
32
- j += 1
33
- else :
34
- # Check for insertion errors
35
- if j + 1 < len (corrected_text ) and origin_text [i ] == corrected_text [j + 1 ]:
36
- errors .append (('' , corrected_text [j ], j ))
37
- new_corrected_text += corrected_text [j ]
38
- j += 1
39
- # Check for deletion errors
40
- elif i + 1 < len (origin_text ) and origin_text [i + 1 ] == corrected_text [j ]:
24
+ elif tag == 'delete' :
25
+ for i in range (i1 , i2 ):
26
+ if origin_text [i ] not in unk_tokens :
41
27
errors .append ((origin_text [i ], '' , i ))
42
- i += 1
43
- # Check for replacement errors
44
- else :
45
- errors .append ((origin_text [i ], corrected_text [j ], i ))
46
- new_corrected_text += corrected_text [j ]
47
- i += 1
48
- j += 1
49
- else :
50
- new_corrected_text += origin_text [i ]
51
- if origin_text [i ] == corrected_text [j ]:
52
- j += 1
53
- i += 1
54
- errors = sorted (errors , key = operator .itemgetter (2 ))
55
- return corrected_text , errors
56
-
57
-
58
- def get_errors_for_same_length (corrected_text , origin_text ):
59
- """Get new corrected text and errors between corrected text and origin text"""
60
- errors = []
61
- unk_tokens = [' ' , '“' , '”' , '‘' , '’' , '琊' , '\n ' , '…' , '擤' , '\t ' , '玕' , '' ]
28
+ new_corrected_text += origin_text [i ]
29
+ elif tag == 'insert' :
30
+ for j in range (j1 , j2 ):
31
+ if corrected_text [j ] not in unk_tokens :
32
+ errors .append (('' , corrected_text [j ], j ))
33
+ new_corrected_text += corrected_text [j ]
34
+ elif tag == 'equal' :
35
+ new_corrected_text += origin_text [i1 :i2 ]
62
36
63
- for i , ori_char in enumerate (origin_text ):
64
- if i >= len (corrected_text ):
65
- continue
66
- if ori_char in unk_tokens :
67
- # deal with unk word
68
- corrected_text = corrected_text [:i ] + ori_char + corrected_text [i :]
69
- continue
70
- if ori_char != corrected_text [i ]:
71
- if not is_chinese_char (ori_char ):
72
- # pass not chinese char
73
- corrected_text = corrected_text [:i ] + ori_char + corrected_text [i + 1 :]
74
- continue
75
- if not is_chinese_char (corrected_text [i ]):
76
- corrected_text = corrected_text [:i ] + corrected_text [i + 1 :]
77
- continue
78
- errors .append ((ori_char , corrected_text [i ], i ))
79
- errors = sorted (errors , key = operator .itemgetter (2 ))
80
- return corrected_text , errors
37
+ errors = sorted (errors , key = lambda x : x [2 ])
38
+ return new_corrected_text , errors
81
39
82
40
83
41
if __name__ == '__main__' :
@@ -100,5 +58,6 @@ def get_errors_for_same_length(corrected_text, origin_text):
100
58
('我喜欢吃鸡,公鸡、母鸡、白切鸡、乌鸡、紫燕鸡' , '我喜欢吃鸡,公鸡、母鸡、切鸡、乌鸡、紫燕鸡' ), # 少字
101
59
]
102
60
for pair in sentence_pairs :
103
- new_corrected_text , errors = get_errors_for_same_length (pair [0 ], pair [1 ])
61
+ new_corrected_text , errors = get_errors (pair [0 ], pair [1 ])
104
62
print (f"{ new_corrected_text } { errors } " )
63
+ print ('--' * 42 + '\n ' )
0 commit comments