Commit 9671a120 authored by 翟艳秋(20软)

1. [modified] In detect_with_ocr, move the check that decides whether the current frame is past the specified time range so that it runs before the frame is sampled;

2. [modified] Adjust the condition in detect_with_asr that writes narration intervals, so that a narration recommendation with an incorrect character count is no longer inserted at the beginning of the video (a simplified sketch of this logic is included below, after the commit metadata).
parent 6cd70d8a
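Both fixes concern when a narration-recommendation row ("插入旁白,推荐字数为…") is written and how the recommended character count is derived from the silent gap. The sketch below is not project code; it condenses the ASR-path logic under these assumptions: `normal_speed` is 4 characters per second (as in the diff), the recognized segments are hypothetical `(start, end, text)` tuples given in seconds relative to the analysed clip, and the helper name `narration_rows` and parameter `min_gap` are invented for illustration. `pre_time`, `last_time`, `subtitle_detected` and `recommend_lens` mirror the names used in the diff.

```python
# Minimal sketch (not the project's module): how the silent gap before a
# recognized subtitle is turned into a recommended narration length.
# `segments` are hypothetical (start, end, text) tuples, in seconds relative
# to the analysed clip; `pre_time` is the clip's offset inside the full video.
NORMAL_SPEED = 4  # assumed: 4 characters per second, as in the diff


def narration_rows(segments, pre_time, min_gap=1.0):
    rows = []
    last_time = 0.0
    subtitle_detected = False  # mirrors the new flag in predict_long_audio_with_paddle
    for start, end, text in segments:
        if not text:
            continue
        if not subtitle_detected or start - last_time >= min_gap:
            if subtitle_detected:
                # gap measured from the end of the previous subtitle
                recommend_lens = int((start - last_time) * NORMAL_SPEED)
            else:
                # first subtitle: measure from the real start of the video,
                # i.e. include pre_time (the old code effectively used last_time == 0)
                recommend_lens = int((start + pre_time) * NORMAL_SPEED)
            rows.append(["", "", "", "插入旁白,推荐字数为%d" % recommend_lens])
        rows.append([round(start + pre_time, 2), round(end + pre_time, 2), text, ""])
        last_time = end
        subtitle_detected = True
    return rows


if __name__ == "__main__":
    # hypothetical clip that starts 10 s into the video, first subtitle at 2 s
    for row in narration_rows([(2.0, 3.5, "你好"), (8.0, 9.0, "再见")], pre_time=10.0):
        print(row)
```

Replacing the old `i == 0` test with `subtitle_detected` means the first recommendation is measured from the true start of the video (`start + pre_time`) rather than from `last_time == 0`, which is the wrong character count described in the commit message. In detect_with_ocr the analogous row is written when the frame position passes `end`; moving that check ahead of the frame-sampling step ensures the closing recommendation and any pending subtitle are flushed before the loop breaks.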
@@ -53,6 +53,8 @@ def predict_long_audio_with_paddle(wav_path, pre_time, book_name, sheet_name, st
texts = ''
narratages = []
last_time = 0
# whether a subtitle has been detected yet
subtitle_detected = False
# run recognition on each segment
for i, audio_path in enumerate(audios_path):
print("{}开始处理{}".format(paddle.get_device(), audio_path))
@@ -68,8 +70,10 @@ def predict_long_audio_with_paddle(wav_path, pre_time, book_name, sheet_name, st
device=paddle.get_device()
)
if text:
if i == 0 or (i > 0 and time_stamps[i][0] - last_time >= 1):
recommend_lens = int((time_stamps[i][0] - last_time) * normal_speed)
if not subtitle_detected or (subtitle_detected and time_stamps[i][0] - last_time >= 1):
recommend_lens = int((time_stamps[i][0] - last_time) * normal_speed) if subtitle_detected else int(
(time_stamps[i][0] + pre_time) * normal_speed)
print("插入旁白,推荐字数为%d" % recommend_lens)
# narratages.append(["", "", "", "插入旁白,推荐字数为%d" % recommend_lens])
write_to_sheet(book_name, sheet_name, ["", "", "", "插入旁白,推荐字数为%d" % recommend_lens])
# narratages.append([round(time_stamps[i][0] + pre_time, 2), round(time_stamps[i][1] + pre_time, 2),
@@ -77,6 +81,7 @@ def predict_long_audio_with_paddle(wav_path, pre_time, book_name, sheet_name, st
write_to_sheet(book_name, sheet_name,
[round(time_stamps[i][0] + pre_time, 2), round(time_stamps[i][1] + pre_time, 2), text, ''])
last_time = time_stamps[i][1]
subtitle_detected = True
print(
"第%d个分割音频 对应时间为%.2f-%.2f 识别结果: %s" % (i, time_stamps[i][0] + pre_time, time_stamps[i][1] + pre_time, text))
state[0] = float((i + 1) / len(audios_path)) if state[0] is None or state[0] < 0.99 else 0.99
@@ -14,7 +14,7 @@ from detect_with_asr import create_sheet, write_to_sheet
up_b, down_b = 0, 0
# initialize the OCR tool
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False)
# normal speaking speed is 4 characters per second
normal_speed = 4
@@ -45,7 +45,7 @@ def get_position(video_path, start_time):
continue
img = img[height:]
res = ocr.ocr(img, cls=True)
sorted(res, key=lambda x: x[0][0][1])
sorted(res, key=lambda text: text[0][0][1])
bottom_position = None
if len(res) == 0:
continue
@@ -86,8 +86,8 @@ def get_position(video_path, start_time):
if txt_cnt == 3:
break
print(subtitle_position)
up_b, down_b = max(subtitle_position, key=subtitle_position.get)
return up_b + height, down_b + height
up_bounding, down_bounding = max(subtitle_position, key=subtitle_position.get)
return up_bounding + height, down_bounding + height
def erasePunc(txt):
@@ -144,11 +144,9 @@ def detect_subtitle(img):
img = img[int(up_b) - 30:int(down_b) + 30]
# img = cv2.resize(img, (int(img.shape[1] * 0.5), int(img.shape[0] * 0.5)))
res = ocr.ocr(img, cls=True)
sorted(res, key=lambda x: x[0][0][1])
bottom_position = None
sorted(res, key=lambda text: text[0][0][1])
if len(res) == 0:
return None
# log = []
possible_txt = []
for x in res:
rect, (txt, confidence) = x
@@ -196,6 +194,16 @@ def process_video(video_path, begin, end, book_path, sheet_name, state):
if frame is None:
break
cnt += 1
# check whether the current frame is already past the time limit
if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 > end:
if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time > 1:
print('--------------------------------------------------')
recommend_lens = int((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time) * normal_speed)
write_to_sheet(book_path, sheet_name, ['', '', '', '插入旁白,推荐字数为%d' % recommend_lens])
# check whether there is a subtitle that still needs to be saved
if end_time < start_time:
write_to_sheet(book_path, sheet_name, [round(start_time, 2), round(end, 2), lastSubTitle, ''])
break
# sample roughly 4 frames per second
if cnt % int(fps / 4) == 0:
state[0] = float((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - begin) / (end - begin)) \
@@ -239,18 +247,6 @@ def process_video(video_path, begin, end, book_path, sheet_name, state):
continue
# the current subtitle differs from the previous one
lastSubTitle = subTitle
if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 > end:
if video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time > 1:
print('--------------------------------------------------')
# no subtitle has been detected yet
# if len(res) == 0:
recommend_lens = int((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - end_time) * normal_speed)
# else:
# recommend_lens = int(res[-1][0] * normal_speed) if len(res) == 1 else int(
# (res[-1][0] - res[-2][1]) * normal_speed)
# narratage_recommend.append(['', '', '', '插入旁白,推荐字数为%d' % recommend_lens])
write_to_sheet(book_path, sheet_name, ['', '', '', '插入旁白,推荐字数为%d' % recommend_lens])
break
def detect_with_ocr(video_path, book_path, start_time, end_time, state):
@@ -33,7 +33,7 @@ def create_detail_day() -> str:
return daytime
def make_print_to_file(path='./'):
def make_print_to_file(path: str = './'):
"""将print的内容输出到log文件夹中
:param path:设置的log文件夹路径
@@ -132,7 +132,7 @@ def find_save_file():
outputFilePath.set(book_path)
def trans_to_seconds(timePoint):
def trans_to_seconds(timePoint: str) -> float:
"""将用户输入的时间字符串转换为秒数
:param timePoint: 时间字符串
@@ -147,7 +147,7 @@ def trans_to_seconds(timePoint):
return time_in_seconds
def check_timePoint(timePoint) -> bool:
def check_timePoint(timePoint: str) -> bool:
"""检查时间字符串格式是否正确
:param timePoint: 时间字符串
@@ -179,7 +179,7 @@ def check_timePoint(timePoint) -> bool:
return False
def start_process(p, p_label, state, intervals=100):
def start_process(p, p_label, state: list, intervals: int = 100):
"""启动进度条
:param p: 进度条组件
@@ -433,7 +433,7 @@ def start_synthesis():
messagebox.showwarning("警告", "请选择音频存放路径")
return
elif not os.path.exists(audio_dir):
messagebox.showwarning("警告", "当前音频存放路径有误,请检查一遍")
messagebox.showwarning("警告", "当前音频存放路径有误,请检查一遍")
return
if len(caption_path) == 0:
messagebox.showwarning("警告", "请选择字幕文件存放路径")
@@ -3,7 +3,7 @@ import os
import argparse
import time
from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, ResultReason
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, ResultReason
from azure.cognitiveservices.speech.audio import AudioOutputConfig
import openpyxl
@@ -259,8 +259,6 @@ def ss_and_export(video_path, sheet_path, output_dir, speed, caption_file, state
adjust_volume(origin_wav_path, start_timestamp, end_timestamp)
# mix the narration into the original audio
mix_speech(adjusted_wav_path, narratage_paths, start_timestamp)
if state is not None:
state[0] = 1.00
# delete the temporary speech files, the audio extracted from the original video, and the adjusted audio
if os.path.exists(tmp_file):
@@ -269,6 +267,9 @@ def ss_and_export(video_path, sheet_path, output_dir, speed, caption_file, state
os.remove(origin_wav_path)
os.remove(adjusted_wav_path)
if state is not None:
state[0] = 1.00
if __name__ == '__main__':
pass