Commit 453f190c authored by 翟艳秋(20软)

initial project

import os
import sys
import shutil
import time

import openpyxl
from openpyxl.styles import PatternFill, Alignment

from split_wav import *
def create_sheet(path, sheet_name, value):
    """
    Initialize a workbook with the given header row.
    :param path: str, where the workbook (.xlsx) is saved
    :param sheet_name: str, name of the sheet
    :param value: list, header rows, e.g. [['起始时间', '终止时间', '字幕', '建议', '旁白解说脚本']]
    :return: None
    """
    index = len(value)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    # Widen the column that holds the subtitles
    sheet.column_dimensions['C'].width = 50
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.cell(row=i + 1, column=j + 1, value=str(value[i][j]))
    workbook.save(path)
def write_to_sheet(path, sheet_name, value):
    """
    Append rows to an existing workbook.
    :param path: str, path of the workbook
    :param sheet_name: str, name of the sheet to write to
    :param value: list, rows to append
    :return: None
    """
    index = len(value)
    workbook = openpyxl.load_workbook(path)
    sheet = workbook[sheet_name]
    cur_row = sheet.max_row
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.cell(row=cur_row + i + 1, column=j + 1, value=str(value[i][j]))
            # Highlight empty cells and suggested narration insertion points ('插入旁白') in yellow
            if value[i][j] == '' or value[i][j] == '插入旁白':
                sheet.cell(row=cur_row + i + 1, column=j + 1).fill = PatternFill(fill_type='solid', fgColor='ffff00')
            # Wrap text in the subtitle column
            if j == 2:
                sheet.cell(row=cur_row + i + 1, column=j + 1).alignment = Alignment(wrapText=True)
    workbook.save(path)
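# A minimal usage sketch for the two helpers above (paths and cell values are
# hypothetical), assuming the header/content layout produced by detect_with_asr below:
# create the workbook once, then append rows to it.
def _example_sheet_usage():
    demo_book = 'demo.xlsx'            # hypothetical output path
    demo_sheet = '旁白插入位置建议'
    create_sheet(demo_book, demo_sheet, [['起始时间', '终止时间', '字幕', '建议', '解说脚本']])
    write_to_sheet(demo_book, demo_sheet, [['12.3', '15.0', '某句台词', '插入旁白', '']])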
def trans_to_mono(wav_path):
    """
    Convert an audio file to a single channel (mono).
    :param wav_path: str, path of the audio file to convert
    :return: new_wav_path: str, path of the converted audio file
    """
    new_wav_path = wav_path[:-4] + "_1.wav"
    command = 'ffmpeg -i {} -ac 1 -y {}'.format(wav_path, new_wav_path)
    os.system(command)
    return new_wav_path
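# An alternative sketch of the same conversion using subprocess instead of os.system,
# which avoids shell quoting problems when the path contains spaces (same ffmpeg flags
# as above; ffmpeg is still assumed to be on PATH).
def _trans_to_mono_subprocess(wav_path):
    import subprocess
    new_wav_path = wav_path[:-4] + "_1.wav"
    # '-ac 1' forces a single audio channel, '-y' overwrites any existing output file
    subprocess.run(['ffmpeg', '-i', wav_path, '-ac', '1', '-y', new_wav_path], check=True)
    return new_wav_path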
def concat_wav(root):
    """
    Concatenate the vocal tracks found in each segment directory under `root`
    into a single total.wav, using ffmpeg's concat demuxer.
    """
    txt_path = os.path.join(root, 'list.txt')
    with open(txt_path, 'w', encoding='utf-8') as f:
        for file_name in os.listdir(root):
            if os.path.isdir(os.path.join(root, file_name)):
                wav_path = os.path.join(root, file_name) + "/vocal.wav"
                f.write("file '" + wav_path + "'\n")
    output_file = os.path.join(root, 'total.wav')
    command = 'ffmpeg -f concat -safe 0 -i {} -y {}'.format(txt_path, output_file)
    os.system(command)
    return output_file
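# The generated list.txt follows ffmpeg's concat demuxer format, one entry per segment
# directory, e.g. (hypothetical segment names):
#
#   file './tmp/seg_000/vocal.wav'
#   file './tmp/seg_001/vocal.wav'
#
# ffmpeg then stitches the listed files together in order into total.wav.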
def detect_with_asr(video_path, book_path, start_time=0, end_time=-1):
    """
    Run speech recognition on the video's audio track and write the suggested
    narration insertion points into an Excel workbook.
    """
    # Folder for the various intermediate files
    tmp_root = './tmp'
    if not os.path.exists(tmp_root):
        os.mkdir(tmp_root)
    if not os.path.exists(video_path):
        print("The video path you entered is invalid, please double-check it")
        return
    # Extract the audio from the video; the commented-out steps below would split it
    # and keep only the vocal part
    audio_path = extract_audio(video_path, tmp_root, start_time, end_time)
    # root = split_audio()
    # extrac_speech()
    #
    # # Concatenate the extracted vocals and convert the audio to a single channel
    # total_wav_path = concat_wav(root)
    # audio_path = trans_to_mono(total_wav_path)

    # The sheet inside the workbook is named "旁白插入位置建议" (suggested narration positions)
    book_name_xlsx = book_path
    sheet_name_xlsx = "旁白插入位置建议"
    # If no workbook with the video's name exists yet, create one to hold the output
    if not os.path.exists(book_name_xlsx):
        table_head = [["起始时间", "终止时间", "字幕", '建议', '解说脚本']]
        create_sheet(book_name_xlsx, sheet_name_xlsx, table_head)
    sys.path.append("./PaddlePaddle_DeepSpeech2")
    from infer_path import predict_long_audio_with_paddle
    table_content = predict_long_audio_with_paddle(audio_path, book_name_xlsx, start_time)
    write_to_sheet(book_name_xlsx, sheet_name_xlsx, table_content)
    # Delete the intermediate files
    # shutil.rmtree(tmp_root)
if __name__ == '__main__':
    start_time = time.time()
    # Path of the video to process
    video_path = 'D:/heelo/zhanlang.rmvb'
    detect_with_asr(video_path, "zhanlang.xlsx", 50, 5154)
    print("Processing video {} took {:.1f} s".format(os.path.basename(video_path), time.time() - start_time))
# ---- next file in this commit: judge_subtitle.py (name inferred from the import in the main script below) ----
import random
import time
import cv2
import numpy as np
from paddleocr import PaddleOCR
from collections import Counter
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)
def random_int_list(start, stop, length):
    """
    Draw `length` distinct random integers from an interval.
    :param start: lower bound of the interval
    :param stop: upper bound of the interval
    :param length: number of random integers to draw
    :return: list of random integers
    """
    start, stop = (int(start), int(stop)) if start <= stop else (int(stop), int(start))
    length = int(abs(length)) if length else 0
    random_list = []
    while True:
        tmp = random.randint(start, stop)
        if tmp not in random_list:
            random_list.append(tmp)
        if len(random_list) == length:
            break
    return random_list
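# A more direct sketch of the same sampling using only the standard library:
# random.sample draws distinct values in one call and raises ValueError (instead of
# looping forever) if more values are requested than the interval contains.
def _random_int_list_via_sample(start, stop, length):
    start, stop = sorted((int(start), int(stop)))
    return random.sample(range(start, stop + 1), int(abs(length)))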
def detect_subtitle(frame):
    """
    Check whether the frame contains a subtitle.
    :param frame: one frame of the video
    :return: True or False
    """
    # Only the bottom 30% of the frame is searched, since subtitles sit near the bottom
    frame = frame[int(frame.shape[0] * 0.7):]
    subtitle = ocr.ocr(frame, cls=True)
    print(subtitle)
    for x in subtitle:
        position, (txt, confidence) = x
        height = position[2][1] - position[0][1]
        mid = (position[0][0] + position[1][0]) / 2
        print(height, txt)
        # Slope of the text box; arctan2 avoids a division by zero for vertical boxes
        gradient = np.arctan2(abs(position[1][1] - position[0][1]), abs(position[1][0] - position[0][0]))
        print(gradient)
        # Accept confident, roughly horizontal text centred in the frame
        if confidence > 0.7 and 0.4 * frame.shape[1] < mid < 0.6 * frame.shape[1] \
                and gradient < 0.1:
            return True
    return False
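# The loop above assumes the flat result layout of older PaddleOCR releases, i.e. one
# [box, (text, confidence)] entry per detected line, roughly:
#
#   [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ('字幕文本', 0.95)]
#
# Newer PaddleOCR versions wrap this in one extra list per input image, so the result
# may first need to be unpacked with `subtitle = subtitle[0]`.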
def detect_movie(video_path, start, interval):
    """
    Sample random frames from the whole video to decide whether it ships with subtitles.
    :param video_path: path of the video
    :param start: start time (in seconds) of the first sampling window
    :param interval: length of each sampling window, in seconds
    :return: True or False (whether the video contains subtitles)
    """
    video = cv2.VideoCapture(video_path)
    fps = np.ceil(video.get(cv2.CAP_PROP_FPS))
    start = start * fps
    interval = interval * fps
    random_number = 50
    ans = [False] * 3
    print(ans)
    # Sample three consecutive windows and take the majority vote
    for i in range(3):
        random_list = random_int_list(start, start + interval, random_number)
        start = start + interval
        for random_point in random_list:
            video.set(cv2.CAP_PROP_POS_FRAMES, float(random_point))
            if video.isOpened():
                success, frame = video.read()
                if not success:
                    break
                ans[i] = detect_subtitle(frame)
                if ans[i]:
                    print(random_point)
                    break
    video.release()
    print(ans)
    return Counter(ans).most_common(1)[0][0]
if __name__ == '__main__':
    video_path = r'D:\heelo\hysxm.mp4'
    start_time = time.time()
    start = 90
    interval = 120
    print(detect_movie(video_path, start, interval))
    print(time.time() - start_time)
# encoding=utf8
import os.path
import argparse
import time
from judge_subtitle import detect_movie
from detect_with_asr import detect_with_asr
from detect_with_ocr import detect_with_ocr
def trans_to_seconds(timepoint):
    """Convert an 'hh:mm:ss' (or 'mm:ss', or plain seconds) string into seconds."""
    time_in_seconds = 0
    timepoint = timepoint.split(':')
    units = 1
    # Walk from the last field (seconds) to the first, multiplying the unit by 60 each step
    for i in range(len(timepoint) - 1, -1, -1):
        time_in_seconds += units * float(timepoint[i])
        units *= 60
    return time_in_seconds
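# A worked example of the conversion, assuming the 'hh:mm:ss' input format described above:
# '01:02:03' -> 3*1 + 2*60 + 1*3600 = 3723.0, while a bare '90' simply becomes 90.0.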
def detect(video_path, start_time, end_time, book_path):
    # Default the workbook name to the video's base name if none was given
    if book_path is None:
        book_path = os.path.basename(video_path).split('.')[0] + ".xlsx"
    start_time = trans_to_seconds(start_time)
    end_time = trans_to_seconds(end_time)
    # If the video already carries subtitles, use OCR; otherwise fall back to ASR
    has_subtitle = detect_movie(video_path, start_time, 60)
    if has_subtitle:
        detect_with_ocr(video_path, book_path, start_time, end_time)
    else:
        detect_with_asr(video_path, book_path, start_time, end_time)
if __name__ == '__main__':
    # Command line arguments
    parser = argparse.ArgumentParser(description='Speech synthesis guidance')
    parser.add_argument("--video_path", required=True, type=str, help="Path of the video to process")
    parser.add_argument("--start_time", required=True, type=str,
                        help="Point where the film actually starts (after the opening animation), "
                             "formatted as 'hh:mm:ss' or given directly in seconds")
    parser.add_argument("--end_time", required=True, type=str,
                        help="Point where the film actually ends (before the credits), "
                             "formatted as 'hh:mm:ss' or given directly in seconds")
    parser.add_argument("--book_path", type=str,
                        help=r'Path of the narration workbook, including the file name, e.g. "D:\AddCaption\hysxm.xlsx"')
    args = parser.parse_args()
    detect(args.video_path, args.start_time, args.end_time, args.book_path)
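# Example invocation of this entry point (the script name detect.py is an assumption;
# substitute the actual file name):
#
#   python detect.py --video_path D:/heelo/hysxm.mp4 --start_time 00:01:30 \
#       --end_time 01:55:00 --book_path hysxm.xlsx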
# coding=utf-8
import os
import argparse
from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer
from azure.cognitiveservices.speech.audio import AudioOutputConfig
import openpyxl
tmp_file = 'tmp.wav'
def speech_synthesis(text, output_file, speed):
    """
    Synthesize one narration line and write it to an audio file.
    :param text: narration text
    :param output_file: output file path
    :param speed: requested speaking rate, 1.0 by default
    :return: None
    """
    # Synthesize into tmp.wav first when the speed has to be changed afterwards
    if float(speed) != 1.0:
        audio_path = tmp_file
    else:
        audio_path = output_file
    speech_config = SpeechConfig(subscription="ffa331815f0f4c7fa418bb6c2e1c4e17", region="eastus")
    speech_config.speech_synthesis_language = "zh-CN"
    speech_config.speech_synthesis_voice_name = 'zh-CN-XiaomoNeural'
    audio_config = AudioOutputConfig(filename=audio_path)
    synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    # Block until synthesis finishes so the file is complete before any speed change
    synthesizer.speak_text_async(text).get()
    if float(speed) != 1.0:
        change_speed(output_file, speed)
def change_speed(wav_path, speed=1.5):
    """
    Adjust the speaking rate of the synthesized audio with ffmpeg's atempo filter.
    :param wav_path: path of the speed-adjusted output audio
    :param speed: target speed factor
    :return: None
    """
    cmd_line = 'ffmpeg -y -i {} -filter:a "atempo={}" {}'.format(tmp_file, speed, wav_path)
    os.system(cmd_line)
    # Remove the temporary file
    os.remove(tmp_file)
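# Note: older ffmpeg builds cap a single atempo instance at factors between 0.5 and 2.0,
# so a sketch for larger factors (an assumption, not needed for the speeds used here)
# chains several instances:
#
#   ffmpeg -y -i tmp.wav -filter:a "atempo=2.0,atempo=1.5" out.wav   # 3x overall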
def read_sheet(book_path, sheet_name=None):
    """
    Read the whole sheet into a dict. The layout is fixed: the first row is the header
    (起始时间 | 终止时间 | 字幕 | 建议 | 解说脚本).
    :param book_path: path of the workbook
    :param sheet_name: name of the sheet to read (optional; defaults to the active sheet)
    :return: sheet_content (dict) with one list per column
    """
    workbook = openpyxl.load_workbook(book_path)
    sheet = workbook[sheet_name] if sheet_name else workbook.active
    rows = sheet.max_row
    cols = sheet.max_column
    sheet_content = {}
    # Read every column; the first row provides the dict keys
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if i == 1:
                sheet_content[sheet.cell(1, j).value] = []
            else:
                sheet_content[sheet.cell(1, j).value].append(sheet.cell(i, j).value)
    return sheet_content
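# The returned structure groups the spreadsheet by column, keyed by the Chinese header
# names listed above, e.g. (cell values are hypothetical):
#
#   {'起始时间': [12.3, 40.0], '终止时间': [15.0, 42.5], '字幕': ['某句台词', None],
#    '建议': ['插入旁白', None], '解说脚本': [None, '翻译']}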
def get_narratage_text(sheet_content, speed=1.0):
    """
    Derive the narration lines and their start times from the sheet content.
    :param sheet_content: dict, keys = ["起始时间", "终止时间", "字幕", "建议", "解说脚本"]
    :param speed: speaking rate used to estimate how long each narration line lasts
    :return: narratage_text: list, narration lines
             narratage_start_time: list, start time of each narration line
    """
    narratage = sheet_content['解说脚本']
    subtitle = sheet_content['字幕']
    start_time = sheet_content['起始时间']
    end_time = sheet_content['终止时间']
    narratage_start_time = []
    narratage_text = []
    for i, text in enumerate(narratage):
        if text is not None:
            if text == '翻译':
                # '翻译' means: read the subtitle itself, right after it appears
                narratage_text.append(subtitle[i])
                narratage_start_time.append(float(start_time[i]) + 0.1)
            else:
                # If the narration contains line breaks, split it into parts spaced 0.5 s apart
                text_split = text.split('\n')
                cur_start = float(end_time[i - 1]) + 0.1 if i > 0 else 0
                for x in text_split:
                    narratage_text.append(x)
                    narratage_start_time.append(cur_start)
                    # Budget roughly 4.5 characters per second, scaled by the speaking rate
                    cur_start = cur_start + len(x) / (4.5 * speed) + 0.5
    return narratage_text, narratage_start_time
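# A quick worked example of the pacing arithmetic above: a 9-character narration part at
# speed 1.0 is budgeted 9 / (4.5 * 1.0) + 0.5 = 2.5 s before the next split part starts.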
def second_to_str(seconds):
    """Format a time in seconds as an SRT timestamp, hh:mm:ss,mmm."""
    seconds = float(seconds)
    hour = int(seconds / 3600)
    minute = int((seconds - hour * 3600) / 60)
    second = int(seconds - hour * 3600 - minute * 60)
    ms = int((seconds - second - minute * 60 - hour * 3600) * 1000)
    time_str = "%02d:%02d:%02d,%03d" % (hour, minute, second, ms)
    return time_str
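# Worked example: second_to_str(3723.5) -> '01:02:03,500'
# (1 hour, 2 minutes, 3 seconds, and 0.5 s expressed as 500 milliseconds).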
def export_caption(sheet_content, caption_file):
    """
    Export the user-corrected subtitles as an SRT subtitle file.
    :param sheet_content: sheet content after the user's corrections
    :param caption_file: path of the SRT file to write
    :return: None
    """
    caption = sheet_content["字幕"]
    start_time = sheet_content['起始时间']
    end_time = sheet_content['终止时间']
    cnt = 0
    with open(caption_file, "w", encoding="utf-8") as f:
        for i, x in enumerate(caption):
            if x is not None:
                start, end = second_to_str(start_time[i]), second_to_str(end_time[i])
                cnt += 1
                f.write(str(cnt) + "\n")
                f.write(start + " --> " + end + "\n")
                f.write(x + "\n\n")
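# Each subtitle becomes one SRT block: a running index, the time range, the text, and a
# blank line, e.g. (hypothetical values):
#
#   1
#   00:00:12,300 --> 00:00:15,000
#   某句台词
#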
def ss_and_export(root_path, book_path, caption_file, speed):
    """
    Read the narration sheet, export the subtitle file, and synthesize every narration line.
    :param root_path: output directory for the synthesized audio
    :param book_path: path of the narration workbook
    :param caption_file: path of the subtitle file to write
    :param speed: speaking rate
    :return: None
    """
    # Create the output folder if it does not exist yet
    if not os.path.exists(root_path):
        os.mkdir(root_path)
    # Read the sheet and derive the narration lines and their insertion times
    sheet_content = read_sheet(book_path)
    narratages, start_timepoint = get_narratage_text(sheet_content, speed)
    export_caption(sheet_content, caption_file)
    # Synthesize one audio file per narration line, named after its start time
    for i, text in enumerate(narratages):
        wav_path = os.path.join(root_path, '%.2f.wav' % start_timepoint[i])
        speech_synthesis(text, wav_path, speed)
if __name__ == '__main__':
    # Command line arguments
    parser = argparse.ArgumentParser(description='Speech synthesis guidance')
    parser.add_argument("--output_dir", required=True, type=str, help="Output directory for the synthesized audio")
    parser.add_argument("--sheet_path", required=True, type=str, help='Path of the narration workbook')
    parser.add_argument("--caption_file", required=True, type=str, help="Path of the subtitle file to write")
    parser.add_argument("--speed", type=float, default=1.0, help="Speaking rate, 1.0 by default")
    args = parser.parse_args()
    # Entry point
    ss_and_export(args.output_dir, args.sheet_path, args.caption_file, args.speed)
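# Example invocation of this script (the script name synthesis.py is an assumption;
# substitute the actual file name):
#
#   python synthesis.py --output_dir ./narration_wavs --sheet_path hysxm.xlsx \
#       --caption_file hysxm.srt --speed 1.2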