Commit 1c807388 authored by 翟艳秋(20软)

Fix errors in detect_with_ocr and reorganize the resource directory layout (speaker audio files and other assets now live under res/)

parent 0c8c815e
 __pycache__
 .vscode
 .idea
+chineseocr_lite
+exp
+chineseocr_usage.py
+easyOCR_usage.py
+dist
+build
@@ -83,7 +83,7 @@ class Assemble_Dialog(QDialog, Ui_Dialog):
 if __name__ == '__main__':
     app = QApplication(sys.argv)
-    app.setWindowIcon(QIcon("./images/eagle_2.ico"))
+    app.setWindowIcon(QIcon("./res/images/eagle_2.ico"))
     dialog = Assemble_Dialog()
     dialog.show()
     sys.exit(app.exec_())
\ No newline at end of file
@@ -33,4 +33,4 @@ dir_path = os.path.dirname(os.path.abspath(__file__))
 class Pathes:
-    speaker_conf_path = os.path.join(dir_path, "speakers.json")
+    speaker_conf_path = os.path.join(dir_path, "res/speakers.json")
@@ -23,6 +23,7 @@ class Detect_Dialog(QDialog, Ui_Dialog):
         self.pushButton_2.clicked.connect(self.openTableFile)
         self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).clicked.connect(self.start_detect)
         self.prompt_dialog = Prompt_Dialog()
+
     def init_self(self):
         self.lineEdit.setText(self.projectContext.video_path)
         self.lineEdit_2.setText(self.projectContext.excel_path)
@@ -59,7 +60,7 @@ class Detect_Dialog(QDialog, Ui_Dialog):
 if __name__ == '__main__':
     app = QApplication(sys.argv)
-    app.setWindowIcon(QIcon("./images/eagle_2.ico"))
+    app.setWindowIcon(QIcon("./res/images/eagle_2.ico"))
     dialog = Detect_Dialog()
     dialog.show()
     sys.exit(app.exec_())
\ No newline at end of file
@@ -19,8 +19,10 @@ import os
 import cv2
 import numpy as np
 from paddleocr import PaddleOCR
+# from easyOCR_usage import EasyOCR
+# from chineseocr_usage import ChineseOCR
 import sys
-print("PaddleOCR load path:", os.path.abspath(sys.modules[PaddleOCR.__module__].__file__))
+# print("PaddleOCR load path:", os.path.abspath(sys.modules[PaddleOCR.__module__].__file__))
 import difflib
 import re
@@ -33,6 +35,8 @@ up_b, down_b = 0, 0
 # Initialize the OCR tool
 ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False)
+# ocr = EasyOCR()
+# ocr = ChineseOCR()
 # Normal speech rate is 4 characters per second
 normal_speed = 4
@@ -65,7 +69,8 @@ def get_position(video_path: str, start_time: float) -> Tuple[float, float]:
         # print("img:", img)
         # gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         # cv2.imshow('img', gray)
-        # cv2.imshow(img)
+        # cv2.waitKey(0)
+        # cv2.destroyAllWindows()
         cnt += 1
         if img is None or cnt > 10000:
             break
@@ -83,42 +88,43 @@ def get_position(video_path: str, start_time: float) -> Tuple[float, float]:
         for x in res:
             # print("x:", x)
             rect, (txt, confidence) = x
+            [x1, y1], [x2, y2], [x3, y3], [x4, y4] = rect
             # font_size = rect[2][1] - rect[0][1]
-            mid = (rect[0][0] + rect[1][0]) / 2
-            gradient = np.arctan(abs((rect[1][1] - rect[0][1]) / (rect[1][0] - rect[0][0])))
+            mid = (x1 + x2) / 2
+            gradient = np.arctan(abs((y2 - y1) / (x2 - x1)))
             # Text that is likely a subtitle
-            if confidence > 0.9 and 0.4 * img.shape[1] < mid < 0.6 * img.shape[1] and gradient < 0.1:
+            conf_thred = 0.9
+            # conf_thred = 0.8
+            if confidence > conf_thred and 0.4 * img.shape[1] < mid < 0.6 * img.shape[1] and gradient < 0.1:
                 if bottom_position is None:
-                    bottom_position = rect[0][1]
+                    bottom_position = y1
                 # Check whether this text equals the previous one (i.e. the same subtitle);
                 # only for a new subtitle do we record its top/bottom bounds
                 keys = subtitle_position.keys()
-                if abs(rect[0][1] - bottom_position) < 10:
+                if abs(y1 - bottom_position) < 10:
                     if pre_txt is None or pre_txt != txt:
                         txt_cnt += 1
                         pre_txt = txt
-                    if (rect[0][0], rect[2][1]) in keys:
-                        subtitle_position[(rect[0][1], rect[2][1])] += 1
+                    if (y1, y3) in keys:
+                        subtitle_position[(y1, y3)] += 1
                     else:
                         replace = False
                         for k in keys:
                             # Widen the key to the widest top/bottom bounds
-                            if abs(rect[0][1] - k[0]) + abs(rect[2][1] - k[1]) < 10:
-                                new_k = min(k[0], rect[0][1]), max(k[1], rect[2][1])
+                            if abs(y1 - k[0]) + abs(y3 - k[1]) < 10:
+                                subtitle_position[k] += 1
+                                new_k = min(k[0], y1), max(k[1], y3)
                                 if new_k != k:
                                     subtitle_position[new_k] = subtitle_position[k]
-                                    subtitle_position[new_k] += 1
                                     subtitle_position.pop(k)
-                                else:
-                                    subtitle_position[k] += 1
                                 replace = True
                                 break
                         if not replace:
-                            subtitle_position[(rect[0][1], rect[2][1])] = 1
+                            subtitle_position[(y1, y3)] = 1
             if txt_cnt == 3:
                 break
     print(subtitle_position)
     up_bounding, down_bounding = max(subtitle_position, key=subtitle_position.get)
-    return up_bounding + height, down_bounding + height
+    return int(up_bounding + height), int(down_bounding + height)

 def erasePunc(txt: str) -> str:
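A note on the voting logic in the hunk above: each accepted OCR line votes for a (top, bottom) subtitle band, a band within 10 px of an existing one is merged into the widest bounds with the votes pooled, and the most-voted band wins. A minimal standalone sketch of that dictionary update; vote_band and the sample coordinates are illustrative, not part of this repository:

def vote_band(bands: dict, top: float, bottom: float) -> None:
    # Merge into a nearby band (within 10 px) and pool its votes,
    # widening the key to the widest bounds seen so far.
    for k in list(bands):  # list() so the dict can be mutated below
        if abs(top - k[0]) + abs(bottom - k[1]) < 10:
            bands[k] += 1
            new_k = (min(k[0], top), max(k[1], bottom))
            if new_k != k:
                bands[new_k] = bands.pop(k)  # move the pooled count
            return
    bands[(top, bottom)] = 1  # first vote for a brand-new band

bands = {}
for top, bottom in [(640.0, 672.0), (642.0, 670.0), (100.0, 130.0)]:
    vote_band(bands, top, bottom)
print(max(bands, key=bands.get))  # (640.0, 672.0) -- the subtitle band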
@@ -167,10 +173,11 @@ def normalize(text: str) -> str:
     text = text.translate(table)
     text = text.strip(' ,。、【】_·:-@‘[;')
     # Make the opening/closing parentheses match at the ends of the text
-    if text[-1] == ')' and text[0] != '(':
-        text = '(' + text
-    elif text[-1] != ')' and text[0] == '(':
-        text = text + ')'
+    if len(text) > 0:
+        if text[-1] == ')' and text[0] != '(':
+            text = '(' + text
+        elif text[-1] != ')' and text[0] == '(':
+            text = text + ')'
     return text
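The len(text) > 0 guard added above matters because the preceding strip() can leave an empty string, on which text[-1] raises IndexError. An isolated illustration of the parenthesis-balancing step; balance_parens is a hypothetical helper, not the project's normalize:

def balance_parens(text: str) -> str:
    # Guard first: text may be empty after stripping, and ''[-1]
    # raises IndexError -- the failure this commit fixes.
    if len(text) > 0:
        if text[-1] == ')' and text[0] != '(':
            text = '(' + text
        elif text[-1] != ')' and text[0] == '(':
            text = text + ')'
    return text

print(balance_parens('旁白)'))  # (旁白)
print(balance_parens('(旁白'))  # (旁白)
print(balance_parens(''))       # '' instead of an IndexError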
@@ -191,9 +198,12 @@ def detect_subtitle(img: np.ndarray) -> Union[str, None]:
     img = cv2.resize(img, (int(img.shape[1] * 1.5), int(img.shape[0] * 1.5)))
     res = ocr.ocr(img, cls=True)
     sorted(res, key=lambda text: text[0][0][1])
+    sorted(res, key=lambda text: text[0][0][0])
     if len(res) == 0:
-        return None
+        return None, 0
     possible_txt = []
+    conf = 0
+    print(res)
     for x in res:
         # cv2.imshow("cut", img)
         # cv2.waitKey(0)
@@ -204,18 +214,35 @@ def detect_subtitle(img: np.ndarray) -> Union[str, None]:
         gradient = np.arctan(abs((rect[1][1] - rect[0][1]) / (rect[1][0] - rect[0][0])))
         # log.append("文本:{},置信度:{},中心点:{},斜率:{},字体大小:{}".format(txt, confidence, mid / img.shape[1], gradient,
-        # font_size)) confidence>0.7 & slope<0.1 & subtitle offset<=25 & subtitle center within 0.4-0.6 of the frame width
-        if confidence > 0.7 and gradient < 0.1 and 0.4 < mid / img.shape[1] < 0.6 and \
-                abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30) <= 25:
+        # font_size))
+        # print("文本:{},置信度:{},中心点:{},斜率:{},字体大小:{}".format(txt, confidence, mid / img.shape[1], gradient, font_size))
+        # print("差距:{}".format(abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30)))
+        conf_thred1 = 0.7
+        conf_thred2 = 0.85
+        # conf_thred1 = 0.1
+        # conf_thred2 = 0.4
+        # conf_thred1 = 0.5
+        # conf_thred2 = 0.7
+        if confidence > conf_thred1 and gradient < 0.1 and 0.4 < mid / img.shape[1] < 0.6 and \
+                abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30) <= font_size - 10:
             subTitle += txt
+            conf = max(conf, confidence)
+            # possible_txt.append([txt, mid/img.shape[1]])
+            possible_txt.append(txt)
         # The subtitle may be split into two (or more) dialog texts on one line
-        elif confidence > 0.85 and gradient < 0.1:
+        elif confidence > conf_thred2 and gradient < 0.1:
             if 0.3 < mid / img.shape[1] < 0.4 or 0.6 < mid / img.shape[1] < 0.7:
+                # possible_txt.append([txt, mid/img.shape[1]])
                 possible_txt.append(txt)
+                conf = max(conf, confidence)
+    # sorted(possible_txt, key=lambda pos : pos[1])
+    # print(possible_txt)
     if len(possible_txt) >= 2:
-        subTitle = ''.join(possible_txt)
+        # subTitle = ' '.join([x[0] for x in possible_txt])
+        subTitle = ' '.join(possible_txt)
+    print(subTitle, conf)
     if len(subTitle) > 0:
-        return subTitle
+        return subTitle, conf
-    return None
+    return None, 0
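After this change detect_subtitle always returns a (text, confidence) pair, so a miss is (None, 0) rather than a bare None, and callers can compare readings by OCR confidence. A simplified sketch of the selection rule; pick_subtitle, its hit tuples, and the sample strings are illustrative only, while the real function works on PaddleOCR boxes:

from typing import List, Optional, Tuple

def pick_subtitle(hits: List[Tuple[str, float, float]],
                  conf_thred1: float = 0.7,
                  conf_thred2: float = 0.85) -> Tuple[Optional[str], float]:
    parts, conf = [], 0.0
    for txt, confidence, rel_mid in hits:  # rel_mid = center x / frame width
        # Centered line: lower threshold is enough.
        if confidence > conf_thred1 and 0.4 < rel_mid < 0.6:
            parts.append(txt)
            conf = max(conf, confidence)
        # Dialog half left/right of center: demand higher confidence.
        elif confidence > conf_thred2 and (0.3 < rel_mid < 0.4 or 0.6 < rel_mid < 0.7):
            parts.append(txt)
            conf = max(conf, confidence)
    if parts:
        return ' '.join(parts), conf
    return None, 0.0

print(pick_subtitle([('你好', 0.95, 0.35), ('再见', 0.92, 0.65)]))
# -> ('你好 再见', 0.95)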
 def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
@@ -243,6 +270,7 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
     video = cv2.VideoCapture(video_path)
     fps = video.get(cv2.CAP_PROP_FPS)
     lastSubTitle = None
+    lastConf = 0
     # res holds the subtitles gathered while traversing the video, with no narration analysis mixed in
     res = []
     cnt = 0
@@ -279,9 +307,11 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
             mainWindow.projectContext.nd_process = state[0]
             mainWindow.projectContext.last_time = cur_time
-        subTitle = detect_subtitle(frame)
+        subTitle, conf = detect_subtitle(frame)
         if subTitle is not None:
             subTitle = normalize(subTitle)
+            if len(subTitle) == 0:
+                subTitle = None
         # Subtitle found for the first time
         if lastSubTitle is None and subTitle is not None:
             start_time = cur_time
@@ -315,10 +345,11 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
                 add_to_list(mainWindow, "字幕", [round(start_time, 3), round(end_time, 3), lastSubTitle, ''])
                 start_time = end_time
             else:
-                lastSubTitle = subTitle if len(subTitle) > len(lastSubTitle) else lastSubTitle
+                lastSubTitle = subTitle if conf > lastConf else lastSubTitle
             continue
         # The current subtitle differs from the previous one
         lastSubTitle = subTitle
+        lastConf = conf
 def add_to_list(mainWindow: MainWindow, element_type: str, li: list):
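The process_video change above swaps the tie-break for repeated detections of the same subtitle: the reading with the higher OCR confidence now wins, where the longer string used to. A toy sketch of that policy; merge_readings and the sample data are invented, and using difflib (which the module imports) for the "same subtitle" test is this sketch's assumption:

from difflib import SequenceMatcher

def merge_readings(readings):  # [(text, conf), ...] for one on-screen subtitle
    best_txt, best_conf = readings[0]
    for txt, conf in readings[1:]:
        same = SequenceMatcher(None, txt, best_txt).ratio() > 0.8
        if same and conf > best_conf:  # higher confidence wins, not length
            best_txt, best_conf = txt, conf
    return best_txt, best_conf

print(merge_readings([('明天见吧', 0.81), ('明天见面吧', 0.97)]))
# -> ('明天见面吧', 0.97): confidence, not string length, decides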