optimise algorithm

YaoFANGUK · Apr 13, 2021 · 1545ed8 · 1545ed8
1 parent 646fa5f
commit 1545ed8
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,9 @@
 简体中文 | [English](README_en.md)
 
 ## 项目简介
+![License](https://img.shields.io/badge/License-Apache%202-red.svg)
+![python version](https://img.shields.io/badge/Python-3.7+-blue.svg)
+![support os](https://img.shields.io/badge/OS-Windows/macOS/Linux-green.svg)
 
 video-subtitle-extractor是一款将视频中的硬字幕提取为外挂字幕文件(srt格式)的软件。
 主要实现了以下功能：
@@ -30,7 +33,7 @@ video-subtitle-extractor是一款将视频中的硬字幕提取为外挂字幕
 - GUI版：
 
 <div align="center">
-  <img src="demo.gif"/>
+  <img src="design/demo.gif"/>
 </div>
 
 - CLI版：
@@ -39,23 +42,18 @@ video-subtitle-extractor是一款将视频中的硬字幕提取为外挂字幕
 
 
 ## 在线运行
-1. <a href="https://colab.research.google.com/" target="_blank">点击登陆Colab</a>
 
-2. 点击"文件"，"打开笔记本"，"GitHub"，复制以下链接:
+- 使用**Google Colab Notebook**(免费GPU): <a href="https://colab.research.google.com/github/YaoFANGUK/video-subtitle-extractor/blob/main/google_colab.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
 
-https://github.com/YaoFANGUK/video-subtitle-extractor/blob/main/google_colab.ipynb
-
-<img src="https://z3.ax1x.com/2021/03/30/ciG7Ps.png">   
-
-> PS: google Colab只能运行CLI版本
+> PS: Google Colab只能运行CLI版本
 
 ## 使用说明
 
-#### 1. (可选) 下载安装Anaconda 
+#### 1. (推荐) 下载安装Anaconda 
 
 <a href="https://www.anaconda.com/products/individual">https://www.anaconda.com/products/individual#Downloads</a>
 
-#### 2. (可选) 使用conda创建项目虚拟环境并激活环境 (建议创建虚拟环境运行，也可以不用conda)
+#### 2. (推荐) 使用conda创建项目虚拟环境并激活环境 (建议创建虚拟环境运行，以免后续出现问题)
 
 ```shell
 conda create --name videoEnv python=3.7

diff --git a/README_en.md b/README_en.md
@@ -1,6 +1,9 @@
 [简体中文](README.md) | English
 
 ## Introduction
+![License](https://img.shields.io/badge/license-Apache%202-red.svg)
+![python version](https://img.shields.io/badge/python-3.7+-blue.svg)
+![support os](https://img.shields.io/badge/OS-Windows/macOS/Linux-green.svg)
 
 video-subtitle-extractor is used to extract hard-coded subtitles from a video and generate an **srt** file. It includes the following features:
 
@@ -29,7 +32,7 @@ video-subtitle-extractor is used to extract hard-coded subtitles and generate **
 <img src="https://z3.ax1x.com/2021/04/09/cNrA1A.png">
 
 <div align="center">
-  <img src="demo.gif"/>
+  <img src="design/demo.gif"/>
 </div>
 
 - Command Line Interface: 
@@ -85,14 +88,7 @@ python main.py
 
 ## Run on Google Colab
 
-1. <a href="https://colab.research.google.com/" target="_blank">Login Colab</a>
-2. Click "File", "Open Notebook", "GitHub" and then copy the following link:
-
-https://github.com/YaoFANGUK/video-subtitle-extractor/blob/main/google_colab.ipynb
-
-<img src="https://z3.ax1x.com/2021/03/30/ciG7Ps.png">   
-
-
+- **Google Colab Notebook with free GPU**: <a href="https://colab.research.google.com/github/YaoFANGUK/video-subtitle-extractor/blob/main/google_colab.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
 
 ## Debug
 

diff --git a/config.py b/config.py
@@ -109,7 +109,11 @@ class BackgroundColor(Enum):
 
 # 文本相似度阈值
 # 用于去重时判断两行字幕是不是同一行
-TEXT_SIMILARITY_THRESHOLD = 0.92
+# 采用动态算法实现相似度阈值判断: 对于短文本要求较低的阈值，对于长文本要求较高的阈值
+THRESHOLD_TEXT_SIMILARITY = 0.8
+
+# 字幕提取中置信度低于0.8的不要
+DROP_SCORE = 0.8
 # --------------------- 请根据自己的实际情况改 end-----------------------------
 
 os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
diff --git a/demo.gif → design/demo.gif b/demo.gif → design/demo.gif
diff --git a/main.py b/main.py
@@ -245,7 +245,7 @@ def extract_subtitles(self):
             # 获取文本坐标
             coordinates = self.__get_coordinates(dt_box)
             # 将结果写入txt文本中
-            text_res = [res[0] for res in rec_res]
+            text_res = [(res[0], res[1]) for res in rec_res]
             for content, coordinate in zip(text_res, coordinates):
                 if self.sub_area is not None:
                     s_ymin = self.sub_area[0]
@@ -257,14 +257,15 @@ def extract_subtitles(self):
                     ymin = coordinate[2]
                     ymax = coordinate[3]
                     if s_xmin <= xmin and xmax <= s_xmax and s_ymin <= ymin and ymax <= s_ymax:
-                        print(content)
-                        f.write(f'{os.path.splitext(frame)[0]}\t'
-                                f'{coordinate}\t'
-                                f'{content}\n')
+                        print(content[0], content[1])
+                        if content[1] > config.DROP_SCORE:
+                            f.write(f'{os.path.splitext(frame)[0]}\t'
+                                    f'{coordinate}\t'
+                                    f'{content[0]}\n')
                 else:
                     f.write(f'{os.path.splitext(frame)[0]}\t'
                             f'{coordinate}\t'
-                            f'{content}\n')
+                            f'{content[0]}\n')
         # 关闭文件
         f.close()
 
@@ -366,19 +367,24 @@ def generate_subtitle_file(self):
         subtitle_content = self._remove_duplicate_subtitle()
         print(os.path.splitext(self.video_path)[0])
         srt_filename = os.path.join(os.path.splitext(self.video_path)[0] + '.srt')
+        # 保存持续时间不足1秒的字幕行，用于后续处理
+        post_process_subtitle = []
         with open(srt_filename, mode='w', encoding='utf-8') as f:
             for index, content in enumerate(subtitle_content):
                 line_code = index + 1
                 frame_start = self._frame_to_timecode(int(content[0]))
                 # 比较起始帧号与结束帧号， 如果字幕持续时间不足1秒，则将显示时间设为1s
                 if abs(int(content[1]) - int(content[0])) < self.fps:
                     frame_end = self._frame_to_timecode(int(int(content[0]) + self.fps))
+                    post_process_subtitle.append(line_code)
                 else:
                     frame_end = self._frame_to_timecode(int(content[1]))
                 frame_content = content[2]
                 subtitle_line = f'{line_code}\n{frame_start} --> {frame_end}\n{frame_content}\n'
                 f.write(subtitle_line)
         print(f'字幕文件生成位置：{srt_filename}')
+        # 返回持续时间低于1s的字幕行
+        return post_process_subtitle
 
     def _analyse_subtitle_frame(self):
         """
@@ -616,17 +622,35 @@ def _remove_duplicate_subtitle(self):
             for j in content_list[index:]:
                 # 计算当前行与下一行的Levenshtein相似度(ratio，值越大表示越相似)
                 distance = ratio(i[1], j[1])
-                if distance < config.TEXT_SIMILARITY_THRESHOLD or j == content_list[-1]:
+                if distance < config.THRESHOLD_TEXT_SIMILARITY or j == content_list[-1]:
                     # 定义字幕结束帧帧号
                     end_frame = content_list[content_list.index(j) - 1][0]
                     if end_frame == start_frame:
                         end_frame = j[0]
+                    # 如果是第一行字幕，直接添加进列表
                     if len(unique_subtitle_list) < 1:
                         unique_subtitle_list.append((start_frame, end_frame, i[1]))
                     else:
-                        if ratio(unique_subtitle_list[-1][2].replace(' ', ''),
-                                 i[1].replace(' ', '')) < config.TEXT_SIMILARITY_THRESHOLD:
+                        string_a = unique_subtitle_list[-1][2].replace(' ', '')
+                        string_b = i[1].replace(' ', '')
+                        similarity_ratio = ratio(string_a, string_b)
+                        # 打印相似度
+                        # print(f'{similarity_ratio}: {unique_subtitle_list[-1][2]} vs {i[1]}')
+                        # 如果相似度小于阈值，说明该两行字幕不一样
+                        if similarity_ratio < config.THRESHOLD_TEXT_SIMILARITY:
                             unique_subtitle_list.append((start_frame, end_frame, i[1]))
+                        else:
+                            # 如果大于阈值，但又不完全相同，说明两行字幕相似
+                            # 可能出现以下情况: "但如何进人并接管上海" vs "但如何进入并接管上海"
+                            # OCR识别出现了错误识别
+                            if similarity_ratio < 1:
+                                # TODO:
+                                # 1) 取出两行字幕的并集
+                                # 2) 纠错
+                                # print(f'{round(similarity_ratio, 2)}, 需要手动纠错:\n {string_a} vs\n {string_b}')
+                                # 保存较长的
+                                if len(string_a) < len(string_b):
+                                    unique_subtitle_list[-1] = (start_frame, end_frame, i[1])
                     index += 1
                     break
                 else:
@@ -777,4 +801,3 @@ def __delete_frame_cache(self):
     se = SubtitleExtractor(video_path)
     # 开始提取字幕
     se.run()
-
diff --git a/test.flv b/test.flv