Merge pull request zyddnys#747 from popcion/main

Fix the bug where unfiltered content after translation is incorrectly displayed as filtered in log, and improve the filtering logic
moeflow-com · Nov 23, 2024 · 661fd5e · 661fd5e
2 parents cc89fe6 + e5b3794
commit 661fd5e
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -451,7 +451,7 @@ Colorizer: **mc2**
 --save-text-file SAVE_TEXT_FILE              Like --save-text but with a specified file path.
 --filter-text FILTER_TEXT                    Filter regions by their text with a regex. Example
                                              usage: --filter-text ".*badtext.*"
---pre-dict FILe_PATH                         Path to the pre-translation dictionary file. One entry per line,
+--pre-dict FILE_PATH                         Path to the pre-translation dictionary file. One entry per line,
                                              Comments can be added with `#` and `//`.
                                              usage: //Example
                                                     dog cat #Example

diff --git a/README_CN.md b/README_CN.md
@@ -205,15 +205,13 @@ FIL: Filipino (Tagalog)
 --save-text-file SAVE_TEXT_FILE              Like --save-text but with a specified file path.
 --filter-text FILTER_TEXT                    Filter regions by their text with a regex. Example
                                              usage: --filter-text ".*badtext.*"
---filter-text FILTER_TEXT                    Filter regions by their text with a regex. Example
-                                             usage: --text-filter ".*badtext.*"
---pre-dict FILe_PATH                         Path to the pre-translation dictionary file. One entry per line,
+--pre-dict FILE_PATH                         Path to the pre-translation dictionary file. One entry per line,
                                              Comments can be added with `#` and `//`.
                                              usage: //Example
                                                     dog cat #Example
                                                     abc def
                                                     abc
---post-dict                                  file_path Path to the post-translation dictionary file. Same as above.
+--post-dict FILE_PATH                        Path to the post-translation dictionary file. Same as above.
 --skip-lang                                  Skip translation if source image is one of the provided languages,
                                              use comma to separate multiple languages. Example: JPN,ENG
 --prep-manual                                Prepare for manual typesetting by outputting blank,

diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py
@@ -620,28 +620,38 @@ async def _run_text_translation(self, ctx: Context):
             diff_target_regions = []    # Target language regions with different translation  
             same_non_target_regions = []  # Non-target language regions with identical translation  
             diff_non_target_regions = []  # Non-target language regions with different translation  
+            has_target_lang_in_translation_regions = []
 
             for region in ctx.text_regions:  
                 text_equal = region.text.lower().strip() == region.translation.lower().strip()  
                 has_target_lang = False  
+                has_target_lang_in_translation = False
 
                 # Target language detection  
                 if ctx.target_lang in ['CHS', 'CHT']:  # Chinese  
-                    has_target_lang = bool(re.search('[\u4e00-\u9fff]', region.text))  
+                    has_target_lang = bool(re.search('[\u4e00-\u9fff]', region.text)) 
+                    has_target_lang_in_translation = bool(re.search('[\u4e00-\u9fff]', region.translation))
                 elif ctx.target_lang == 'JPN':  # Japanese  
                     has_target_lang = bool(re.search('[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]', region.text))  
+                    has_target_lang_in_translation = bool(re.search('[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]', region.translation)) 
                 elif ctx.target_lang == 'KOR':  # Korean  
                     has_target_lang = bool(re.search('[\uac00-\ud7af\u1100-\u11ff]', region.text))  
+                    has_target_lang_in_translation = bool(re.search('[\uac00-\ud7af\u1100-\u11ff]', region.translation))  
                 elif ctx.target_lang == 'ARA':  # Arabic  
                     has_target_lang = bool(re.search('[\u0600-\u06ff]', region.text))  
+                    has_target_lang_in_translation = bool(re.search('[\u0600-\u06ff]', region.translation))  
                 elif ctx.target_lang == 'THA':  # Thai  
                     has_target_lang = bool(re.search('[\u0e00-\u0e7f]', region.text))  
+                    has_target_lang_in_translation = bool(re.search('[\u0e00-\u0e7f]', region.translation))  
                 elif ctx.target_lang == 'RUS':  # Russian  
                     has_target_lang = bool(re.search('[\u0400-\u04ff]', region.text))  
+                    has_target_lang_in_translation = bool(re.search('[\u0400-\u04ff]', region.translation))  
                 elif ctx.target_lang == 'UKR':  # Ukrainian  
                     has_target_lang = bool(re.search('[\u0400-\u04ff]', region.text))  
+                    has_target_lang_in_translation = bool(re.search('[\u0400-\u04ff]', region.translation)) 
                 elif ctx.target_lang == 'IND':  # Indonesian  
                     has_target_lang = bool(re.search('[A-Za-z]', region.text))
+                    has_target_lang_in_translation = bool(re.search('[A-Za-z]', region.translation))
 
                 # Skip numeric translations and filtered text  
                 if region.translation.isnumeric():  
@@ -655,27 +665,43 @@ async def _run_text_translation(self, ctx: Context):
                     continue  
 
                 if has_target_lang:  
-                    if text_equal:  
-                        logger.info(f'Filtered out: {region.translation}')  
-                        logger.info('Reason: Translation identical to original')  
+                    if text_equal:    
                         same_target_regions.append(region)  
                     else:  
                         diff_target_regions.append(region)  
                 else:  
                     if text_equal:  
-                        logger.info(f'Filtered out: {region.translation}')  
-                        logger.info('Reason: Translation identical to original')  
                         same_non_target_regions.append(region)  
                     else:  
                         diff_non_target_regions.append(region)  
-
+
+                if has_target_lang_in_translation:
+                        has_target_lang_in_translation_regions.append(region)
+
             # If any different translations exist, retain all target language regions  
             if diff_target_regions or diff_non_target_regions:  
                 new_text_regions.extend(same_target_regions)  
                 new_text_regions.extend(diff_target_regions)  
+
+            # Keep all non_target_lang regions with different translations (if translation contains target language characters)  
+            for region in diff_non_target_regions:  
+                if region in has_target_lang_in_translation_regions:  
+                    new_text_regions.append(region)   
+                else:  
+                    logger.info(f'Filtered out: {region.translation}')  
+                    logger.info('Reason: Translation does not contain target language characters')  
 
-            # Retain all non-target language regions with different translations (It appears empty, it clears all contents.) 
-            new_text_regions.extend(diff_non_target_regions)  
+            # No different translations exist, clear all content.  
+            if not (diff_target_regions or diff_non_target_regions):  
+                for region in same_target_regions:  
+                    logger.info(f'Filtered out: {region.translation}')  
+                    logger.info('Reason: Translation identical to original -the whole page-')  
+
+            # Clear non_target_lang_regions with identical translations.  
+            for region in same_non_target_regions:  
+                logger.info(f'Filtered out: {region.translation}')  
+                logger.info('Reason: Translation identical to original -one textine-')
+
 
         else:  
             # Process non-special language scenarios using original logic