1. [modified] 将 detect_with_ocr.py 的 process_video 中“当前帧是否已超出限定时间段”的判断，从循环末尾提前到抽帧处理（每秒约取 4 帧）之前；

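2. [modified] 修改 detect_with_asr 流程中旁白区间的写入判断（位于 PaddlePaddle_DeepSpeech2/infer_path.py 的 predict_long_audio_with_paddle）：以“是否已检测到字幕”代替“是否为第一段音频”作为判断条件，避免在视频开头处插入的旁白推荐字数出错。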

1. [modified] 将 detect_with_ocr.py 的 process_video 中“当前帧是否已超出限定时间段”的判断，从循环末尾提前到抽帧处理（每秒约取 4 帧）之前；
9671a120 · 翟艳秋（20软） · 6cd70d8a · 9671a120 · 9671a120 · 9671a120
Commit 9671a120 authored Mar 31, 2022 by 翟艳秋（20软）
Showing 4 changed files with 31 additions and 29 deletions

infer_path.py PaddlePaddle_DeepSpeech2/infer_path.py +7 -2

detect_with_ocr.py detect_with_ocr.py +15 -19

main_gui.py main_gui.py +5 -5

speech_synthesis.py speech_synthesis.py +4 -3

No files found.
--- a/PaddlePaddle_DeepSpeech2/infer_path.py
+++ b/PaddlePaddle_DeepSpeech2/infer_path.py
@@ -53,6 +53,8 @@ def predict_long_audio_with_paddle(wav_path, pre_time, book_name, sheet_name, st
    texts = ''
    narratages = []
    last_time = 0
+    # 已检测到字幕
+    subtitle_detected = False
    # 执行识别
    for i, audio_path in enumerate(audios_path):
        print("{}开始处理{}".format(paddle.get_device(), audio_path))
@@ -68,8 +70,10 @@ def predict_long_audio_with_paddle(wav_path, pre_time, book_name, sheet_name, st
            device=paddle.get_device()
        )
        if text:
-            if i == 0 or (i > 0 and time_stamps[i][0] - last_time >= 1):
+            if not subtitle_detected or (subtitle_detected and time_stamps[i][0] - last_time >= 1):
-                recommend_lens = int((time_stamps[i][0] - last_time) * normal_speed)
+                recommend_lens = int((time_stamps[i][0] - last_time) * normal_speed) if subtitle_detected else int(
+                    (time_stamps[i][0] + pre_time) * normal_speed)
+                print("插入旁白，推荐字数为%d" % recommend_lens)
                # narratages.append(["", "", "", "插入旁白，推荐字数为%d" % recommend_lens])
                write_to_sheet(book_name, sheet_name, ["", "", "", "插入旁白，推荐字数为%d" % recommend_lens])
            # narratages.append([round(time_stamps[i][0] + pre_time, 2), round(time_stamps[i][1] + pre_time, 2),
@@ -77,6 +81,7 @@ def predict_long_audio_with_paddle(wav_path, pre_time, book_name, sheet_name, st
            write_to_sheet(book_name, sheet_name,
                           [round(time_stamps[i][0] + pre_time, 2), round(time_stamps[i][1] + pre_time, 2), text, ''])
            last_time = time_stamps[i][1]
+            subtitle_detected = True
        print(
            "第%d个分割音频 对应时间为%.2f-%.2f 识别结果: %s" % (i, time_stamps[i][0] + pre_time, time_stamps[i][1] + pre_time, text))
        state[0] = float((i + 1) / len(audios_path)) if state[0] is None or state[0] < 0.99 else 0.99

--- a/detect_with_ocr.py
+++ b/detect_with_ocr.py
@@ -14,7 +14,7 @@ from detect_with_asr import create_sheet, write_to_sheet
 up_b, down_b = 0, 0
 # 初始化ocr工具
-ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False)
 # 正常语速为4字/秒
 normal_speed = 4
@@ -45,7 +45,7 @@ def get_position(video_path, start_time):
            continue
        img = img[height:]
        res = ocr.ocr(img, cls=True)
-        sorted(res, key=lambda x: x[0][0][1])
+        sorted(res, key=lambda text: text[0][0][1])
        bottom_position = None
        if len(res) == 0:
            continue
@@ -86,8 +86,8 @@ def get_position(video_path, start_time):
        if txt_cnt == 3:
            break
    print(subtitle_position)
-    up_b, down_b = max(subtitle_position, key=subtitle_position.get)
+    up_bounding, down_bounding = max(subtitle_position, key=subtitle_position.get)
-    return up_b + height, down_b + height
+    return up_bounding + height, down_bounding + height
 def erasePunc(txt):
@@ -144,11 +144,9 @@ def detect_subtitle(img):
    img = img[int(up_b) - 30:int(down_b) + 30]
    # img = cv2.resize(img, (int(img.shape[1] * 0.5), int(img.shape[0] * 0.5)))
    res = ocr.ocr(img, cls=True)
-    sorted(res, key=lambda x: x[0][0][1])
+    sorted(res, key=lambda text: text[0][0][1])
-    bottom_position = None
    if len(res) == 0:
        return None
-    # log = []
    possible_txt = []
    for x in res:
        rect, (txt, confidence) = x
@@ -196,6 +194,16 @@ def process_video(video_path, begin, end, book_path, sheet_name, state):
        if frame is None:
            break
        cnt += 1
+        # 判断当前帧是否已超限制
+        if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 > end:
+            if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time > 1:
+                print('--------------------------------------------------')
+                recommend_lens = int((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time) * normal_speed)
+                write_to_sheet(book_path, sheet_name, ['', '', '', '插入旁白，推荐字数为%d' % recommend_lens])
+                # 判断当前是否有字幕需要被保存下来
+            if end_time < start_time:
+                write_to_sheet(book_path, sheet_name, [round(start_time, 2), round(end, 2), lastSubTitle, ''])
+            break
        # 每秒取4帧画面左右
        if cnt % int(fps / 4) == 0:
            state[0] = float((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - begin) / (end - begin)) \
@@ -239,18 +247,6 @@ def process_video(video_path, begin, end, book_path, sheet_name, state):
                    continue
            # 当前字幕与上一段字幕不一样
            lastSubTitle = subTitle
-        if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 > end:
-            if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time > 1:
-                print('--------------------------------------------------')
-                # 还没有字幕被分析出来
-                # if len(res) == 0:
-                recommend_lens = int((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time) * normal_speed)
-                # else:
-                #     recommend_lens = int(res[-1][0] * normal_speed) if len(res) == 1 else int(
-                #         (res[-1][0] - res[-2][1]) * normal_speed)
-                # narratage_recommend.append(['', '', '', '插入旁白，推荐字数为%d' % recommend_lens])
-                write_to_sheet(book_path, sheet_name, ['', '', '', '插入旁白，推荐字数为%d' % recommend_lens])
-                break
 def detect_with_ocr(video_path, book_path, start_time, end_time, state):

--- a/main_gui.py
+++ b/main_gui.py
@@ -33,7 +33,7 @@ def create_detail_day() -> str:
    return daytime
-def make_print_to_file(path='./'):
+def make_print_to_file(path: str = './'):
    """将print的内容输出到log文件夹中
    :param path:设置的log文件夹路径
@@ -132,7 +132,7 @@ def find_save_file():
    outputFilePath.set(book_path)
-def trans_to_seconds(timePoint):
+def trans_to_seconds(timePoint: str) -> float:
    """将用户输入的时间字符串转换为秒数
    :param timePoint: 时间字符串
@@ -147,7 +147,7 @@ def trans_to_seconds(timePoint):
    return time_in_seconds
-def check_timePoint(timePoint) -> bool:
+def check_timePoint(timePoint: str) -> bool:
    """检查时间字符串格式是否正确
    :param timePoint: 时间字符串
@@ -179,7 +179,7 @@ def check_timePoint(timePoint) -> bool:
    return False
-def start_process(p, p_label, state, intervals=100):
+def start_process(p, p_label, state: list, intervals: int = 100):
    """启动进度条
    :param p: 进度条组件
@@ -433,7 +433,7 @@ def start_synthesis():
        messagebox.showwarning("警告", "请选择音频存放路径")
        return
    elif not os.path.exists(audio_dir):
-        messagebox.showwarning("警告", "当前音频存放路径有误，请检查一遍。")
+        messagebox.showwarning("警告", "当前音频存放路径有误，请检查一遍")
        return
    if len(caption_path) == 0:
        messagebox.showwarning("警告", "请选择字幕文件存放路径")

--- a/speech_synthesis.py
+++ b/speech_synthesis.py
@@ -3,7 +3,7 @@ import os
 import argparse
 import time
-from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, ResultReason
+from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, ResultReason
 from azure.cognitiveservices.speech.audio import AudioOutputConfig
 import openpyxl
@@ -259,8 +259,6 @@ def ss_and_export(video_path, sheet_path, output_dir, speed, caption_file, state
    adjust_volume(origin_wav_path, start_timestamp, end_timestamp)
    # 将旁白混入原音频
    mix_speech(adjusted_wav_path, narratage_paths, start_timestamp)
-    if state is not None:
-        state[0] = 1.00
    # 删除临时语音文件、提取出来的原视频音频以及调整后的视频音频
    if os.path.exists(tmp_file):
@@ -269,6 +267,9 @@ def ss_and_export(video_path, sheet_path, output_dir, speed, caption_file, state
    os.remove(origin_wav_path)
    os.remove(adjusted_wav_path)
+    if state is not None:
+        state[0] = 1.00
 if __name__ == '__main__':
    pass