Commit 453f190c authored by 翟艳秋(20软)

initial project

import os
import sys
import shutil
import time

import openpyxl
from openpyxl.styles import PatternFill, Alignment

from split_wav import *
def create_sheet(path, sheet_name, value):
    """
    Initialize a workbook with the given header row.
    :param path: str, where the workbook (.xlsx) is saved
    :param sheet_name: str, name of the sheet
    :param value: list, header rows, e.g. [['起始时间', '终止时间', '字幕', '建议', '旁白解说脚本']]
    :return: None
    """
    index = len(value)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    # Widen the column that holds the subtitles
    sheet.column_dimensions['C'].width = 50
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.cell(row=i + 1, column=j + 1, value=str(value[i][j]))
    workbook.save(path)
def write_to_sheet(path, sheet_name, value):
    """
    Append rows to an existing workbook.
    :param path: str, path of the workbook
    :param sheet_name: str, name of the sheet to write to
    :param value: list, rows to append
    :return: None
    """
    index = len(value)
    workbook = openpyxl.load_workbook(path)
    sheet = workbook[sheet_name]
    cur_row = sheet.max_row
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.cell(row=cur_row + i + 1, column=j + 1, value=str(value[i][j]))
            # Highlight empty cells and suggested narration insertion points ('插入旁白') in yellow
            if value[i][j] == '' or value[i][j] == '插入旁白':
                sheet.cell(row=cur_row + i + 1, column=j + 1).fill = PatternFill(fill_type='solid', fgColor='ffff00')
            # Wrap text in the subtitle column
            if j == 2:
                sheet.cell(row=cur_row + i + 1, column=j + 1).alignment = Alignment(wrapText=True)
    workbook.save(path)
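# A minimal usage sketch for the two helpers above (paths and cell values are
# hypothetical), assuming the header/content layout produced by detect_with_asr below:
# create the workbook once, then append rows to it.
def _example_sheet_usage():
    demo_book = 'demo.xlsx'            # hypothetical output path
    demo_sheet = '旁白插入位置建议'
    create_sheet(demo_book, demo_sheet, [['起始时间', '终止时间', '字幕', '建议', '解说脚本']])
    write_to_sheet(demo_book, demo_sheet, [['12.3', '15.0', '某句台词', '插入旁白', '']])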
def trans_to_mono(wav_path):
    """
    Convert an audio file to a single channel (mono).
    :param wav_path: str, path of the audio file to convert
    :return: new_wav_path: str, path of the converted audio file
    """
    new_wav_path = wav_path[:-4] + "_1.wav"
    command = 'ffmpeg -i {} -ac 1 -y {}'.format(wav_path, new_wav_path)
    os.system(command)
    return new_wav_path
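# An alternative sketch of the same conversion using subprocess instead of os.system,
# which avoids shell quoting problems when the path contains spaces (same ffmpeg flags
# as above; ffmpeg is still assumed to be on PATH).
def _trans_to_mono_subprocess(wav_path):
    import subprocess
    new_wav_path = wav_path[:-4] + "_1.wav"
    # '-ac 1' forces a single audio channel, '-y' overwrites any existing output file
    subprocess.run(['ffmpeg', '-i', wav_path, '-ac', '1', '-y', new_wav_path], check=True)
    return new_wav_path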
def concat_wav(root):
    """
    Concatenate the vocal tracks found in each segment directory under `root`
    into a single total.wav, using ffmpeg's concat demuxer.
    """
    txt_path = os.path.join(root, 'list.txt')
    with open(txt_path, 'w', encoding='utf-8') as f:
        for file_name in os.listdir(root):
            if os.path.isdir(os.path.join(root, file_name)):
                wav_path = os.path.join(root, file_name) + "/vocal.wav"
                f.write("file '" + wav_path + "'\n")
    output_file = os.path.join(root, 'total.wav')
    command = 'ffmpeg -f concat -safe 0 -i {} -y {}'.format(txt_path, output_file)
    os.system(command)
    return output_file
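# The generated list.txt follows ffmpeg's concat demuxer format, one entry per segment
# directory, e.g. (hypothetical segment names):
#
#   file './tmp/seg_000/vocal.wav'
#   file './tmp/seg_001/vocal.wav'
#
# ffmpeg then stitches the listed files together in order into total.wav.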
def detect_with_asr(video_path, book_path, start_time=0, end_time=-1):
    """
    Run speech recognition on the video's audio track and write the suggested
    narration insertion points into an Excel workbook.
    """
    # Folder for the various intermediate files
    tmp_root = './tmp'
    if not os.path.exists(tmp_root):
        os.mkdir(tmp_root)
    if not os.path.exists(video_path):
        print("The video path you entered is invalid, please double-check it")
        return
    # Extract the audio from the video; the commented-out steps below would split it
    # and keep only the vocal part
    audio_path = extract_audio(video_path, tmp_root, start_time, end_time)
    # root = split_audio()
    # extrac_speech()
    #
    # # Concatenate the extracted vocals and convert the audio to a single channel
    # total_wav_path = concat_wav(root)
    # audio_path = trans_to_mono(total_wav_path)

    # The sheet inside the workbook is named "旁白插入位置建议" (suggested narration positions)
    book_name_xlsx = book_path
    sheet_name_xlsx = "旁白插入位置建议"
    # If no workbook with the video's name exists yet, create one to hold the output
    if not os.path.exists(book_name_xlsx):
        table_head = [["起始时间", "终止时间", "字幕", '建议', '解说脚本']]
        create_sheet(book_name_xlsx, sheet_name_xlsx, table_head)
    sys.path.append("./PaddlePaddle_DeepSpeech2")
    from infer_path import predict_long_audio_with_paddle
    table_content = predict_long_audio_with_paddle(audio_path, book_name_xlsx, start_time)
    write_to_sheet(book_name_xlsx, sheet_name_xlsx, table_content)
    # Delete the intermediate files
    # shutil.rmtree(tmp_root)
if __name__ == '__main__':
    start_time = time.time()
    # Path of the video to process
    video_path = 'D:/heelo/zhanlang.rmvb'
    detect_with_asr(video_path, "zhanlang.xlsx", 50, 5154)
    print("Processing video {} took {:.1f} s".format(os.path.basename(video_path), time.time() - start_time))
# ---- next file in this commit: judge_subtitle.py (name inferred from the import in the main script below) ----
import random
import time
import cv2
import numpy as np
from paddleocr import PaddleOCR
from collections import Counter
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)
def random_int_list(start, stop, length):
    """
    Draw `length` distinct random integers from an interval.
    :param start: lower bound of the interval
    :param stop: upper bound of the interval
    :param length: number of random integers to draw
    :return: list of random integers
    """
    start, stop = (int(start), int(stop)) if start <= stop else (int(stop), int(start))
    length = int(abs(length)) if length else 0
    random_list = []
    while True:
        tmp = random.randint(start, stop)
        if tmp not in random_list:
            random_list.append(tmp)
        if len(random_list) == length:
            break
    return random_list
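# A more direct sketch of the same sampling using only the standard library:
# random.sample draws distinct values in one call and raises ValueError (instead of
# looping forever) if more values are requested than the interval contains.
def _random_int_list_via_sample(start, stop, length):
    start, stop = sorted((int(start), int(stop)))
    return random.sample(range(start, stop + 1), int(abs(length)))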
def detect_subtitle(frame):
    """
    Check whether the frame contains a subtitle.
    :param frame: one frame of the video
    :return: True or False
    """
    # Only the bottom 30% of the frame is searched, since subtitles sit near the bottom
    frame = frame[int(frame.shape[0] * 0.7):]
    subtitle = ocr.ocr(frame, cls=True)
    print(subtitle)
    for x in subtitle:
        position, (txt, confidence) = x
        height = position[2][1] - position[0][1]
        mid = (position[0][0] + position[1][0]) / 2
        print(height, txt)
        # Slope of the text box; arctan2 avoids a division by zero for vertical boxes
        gradient = np.arctan2(abs(position[1][1] - position[0][1]), abs(position[1][0] - position[0][0]))
        print(gradient)
        # Accept confident, roughly horizontal text centred in the frame
        if confidence > 0.7 and 0.4 * frame.shape[1] < mid < 0.6 * frame.shape[1] \
                and gradient < 0.1:
            return True
    return False
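# The loop above assumes the flat result layout of older PaddleOCR releases, i.e. one
# [box, (text, confidence)] entry per detected line, roughly:
#
#   [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ('字幕文本', 0.95)]
#
# Newer PaddleOCR versions wrap this in one extra list per input image, so the result
# may first need to be unpacked with `subtitle = subtitle[0]`.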
def detect_movie(video_path, start, interval):
    """
    Sample random frames from the whole video to decide whether it ships with subtitles.
    :param video_path: path of the video
    :param start: start time (in seconds) of the first sampling window
    :param interval: length of each sampling window, in seconds
    :return: True or False (whether the video contains subtitles)
    """
    video = cv2.VideoCapture(video_path)
    fps = np.ceil(video.get(cv2.CAP_PROP_FPS))
    start = start * fps
    interval = interval * fps
    random_number = 50
    ans = [False] * 3
    print(ans)
    # Sample three consecutive windows and take the majority vote
    for i in range(3):
        random_list = random_int_list(start, start + interval, random_number)
        start = start + interval
        for random_point in random_list:
            video.set(cv2.CAP_PROP_POS_FRAMES, float(random_point))
            if video.isOpened():
                success, frame = video.read()
                if not success:
                    break
                ans[i] = detect_subtitle(frame)
                if ans[i]:
                    print(random_point)
                    break
    video.release()
    print(ans)
    return Counter(ans).most_common(1)[0][0]
if __name__ == '__main__':
    video_path = r'D:\heelo\hysxm.mp4'
    start_time = time.time()
    start = 90
    interval = 120
    print(detect_movie(video_path, start, interval))
    print(time.time() - start_time)
# encoding=utf8
import os.path
import argparse
import time
from judge_subtitle import detect_movie
from detect_with_asr import detect_with_asr
from detect_with_ocr import detect_with_ocr
def trans_to_seconds(timepoint):
    """Convert an 'hh:mm:ss' (or 'mm:ss', or plain seconds) string into seconds."""
    time_in_seconds = 0
    timepoint = timepoint.split(':')
    units = 1
    # Walk from the last field (seconds) to the first, multiplying the unit by 60 each step
    for i in range(len(timepoint) - 1, -1, -1):
        time_in_seconds += units * float(timepoint[i])
        units *= 60
    return time_in_seconds
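# A worked example of the conversion, assuming the 'hh:mm:ss' input format described above:
# '01:02:03' -> 3*1 + 2*60 + 1*3600 = 3723.0, while a bare '90' simply becomes 90.0.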
def detect(video_path, start_time, end_time, book_path):
    # Default the workbook name to the video's base name if none was given
    if book_path is None:
        book_path = os.path.basename(video_path).split('.')[0] + ".xlsx"
    start_time = trans_to_seconds(start_time)
    end_time = trans_to_seconds(end_time)
    # If the video already carries subtitles, use OCR; otherwise fall back to ASR
    has_subtitle = detect_movie(video_path, start_time, 60)
    if has_subtitle:
        detect_with_ocr(video_path, book_path, start_time, end_time)
    else:
        detect_with_asr(video_path, book_path, start_time, end_time)
if __name__ == '__main__':
    # Command line arguments
    parser = argparse.ArgumentParser(description='Speech synthesis guidance')
    parser.add_argument("--video_path", required=True, type=str, help="Path of the video to process")
    parser.add_argument("--start_time", required=True, type=str,
                        help="Point where the film actually starts (after the opening animation), "
                             "formatted as 'hh:mm:ss' or given directly in seconds")
    parser.add_argument("--end_time", required=True, type=str,
                        help="Point where the film actually ends (before the credits), "
                             "formatted as 'hh:mm:ss' or given directly in seconds")
    parser.add_argument("--book_path", type=str,
                        help=r'Path of the narration workbook, including the file name, e.g. "D:\AddCaption\hysxm.xlsx"')
    args = parser.parse_args()
    detect(args.video_path, args.start_time, args.end_time, args.book_path)
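# Example invocation of this entry point (the script name detect.py is an assumption;
# substitute the actual file name):
#
#   python detect.py --video_path D:/heelo/hysxm.mp4 --start_time 00:01:30 \
#       --end_time 01:55:00 --book_path hysxm.xlsx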
# coding=utf-8
import os
import argparse
from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer
from azure.cognitiveservices.speech.audio import AudioOutputConfig
import openpyxl
tmp_file = 'tmp.wav'
def speech_synthesis(text, output_file, speed):
    """
    Synthesize one narration line and write it to an audio file.
    :param text: narration text
    :param output_file: output file path
    :param speed: requested speaking rate, 1.0 by default
    :return: None
    """
    # Synthesize into tmp.wav first when the speed has to be changed afterwards
    if float(speed) != 1.0:
        audio_path = tmp_file
    else:
        audio_path = output_file
    speech_config = SpeechConfig(subscription="ffa331815f0f4c7fa418bb6c2e1c4e17", region="eastus")
    speech_config.speech_synthesis_language = "zh-CN"
    speech_config.speech_synthesis_voice_name = 'zh-CN-XiaomoNeural'
    audio_config = AudioOutputConfig(filename=audio_path)
    synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    # Block until synthesis finishes so the file is complete before any speed change
    synthesizer.speak_text_async(text).get()
    if float(speed) != 1.0:
        change_speed(output_file, speed)
def change_speed(wav_path, speed=1.5):
    """
    Adjust the speaking rate of the synthesized audio with ffmpeg's atempo filter.
    :param wav_path: path of the speed-adjusted output audio
    :param speed: target speed factor
    :return: None
    """
    cmd_line = 'ffmpeg -y -i {} -filter:a "atempo={}" {}'.format(tmp_file, speed, wav_path)
    os.system(cmd_line)
    # Remove the temporary file
    os.remove(tmp_file)
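# Note: older ffmpeg builds cap a single atempo instance at factors between 0.5 and 2.0,
# so a sketch for larger factors (an assumption, not needed for the speeds used here)
# chains several instances:
#
#   ffmpeg -y -i tmp.wav -filter:a "atempo=2.0,atempo=1.5" out.wav   # 3x overall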
def read_sheet(book_path, sheet_name=None):
    """
    Read the whole sheet into a dict. The layout is fixed: the first row is the header
    (起始时间 | 终止时间 | 字幕 | 建议 | 解说脚本).
    :param book_path: path of the workbook
    :param sheet_name: name of the sheet to read (optional; defaults to the active sheet)
    :return: sheet_content (dict) with one list per column
    """
    workbook = openpyxl.load_workbook(book_path)
    sheet = workbook[sheet_name] if sheet_name else workbook.active
    rows = sheet.max_row
    cols = sheet.max_column
    sheet_content = {}
    # Read every column; the first row provides the dict keys
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if i == 1:
                sheet_content[sheet.cell(1, j).value] = []
            else:
                sheet_content[sheet.cell(1, j).value].append(sheet.cell(i, j).value)
    return sheet_content
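# The returned structure groups the spreadsheet by column, keyed by the Chinese header
# names listed above, e.g. (cell values are hypothetical):
#
#   {'起始时间': [12.3, 40.0], '终止时间': [15.0, 42.5], '字幕': ['某句台词', None],
#    '建议': ['插入旁白', None], '解说脚本': [None, '翻译']}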
def get_narratage_text(sheet_content, speed=1.0):
    """
    Derive the narration lines and their start times from the sheet content.
    :param sheet_content: dict, keys = ["起始时间", "终止时间", "字幕", "建议", "解说脚本"]
    :param speed: speaking rate used to estimate how long each narration line lasts
    :return: narratage_text: list, narration lines
             narratage_start_time: list, start time of each narration line
    """
    narratage = sheet_content['解说脚本']
    subtitle = sheet_content['字幕']
    start_time = sheet_content['起始时间']
    end_time = sheet_content['终止时间']
    narratage_start_time = []
    narratage_text = []
    for i, text in enumerate(narratage):
        if text is not None:
            if text == '翻译':
                # '翻译' means: read the subtitle itself, right after it appears
                narratage_text.append(subtitle[i])
                narratage_start_time.append(float(start_time[i]) + 0.1)
            else:
                # If the narration contains line breaks, split it into parts spaced 0.5 s apart
                text_split = text.split('\n')
                cur_start = float(end_time[i - 1]) + 0.1 if i > 0 else 0
                for x in text_split:
                    narratage_text.append(x)
                    narratage_start_time.append(cur_start)
                    # Budget roughly 4.5 characters per second, scaled by the speaking rate
                    cur_start = cur_start + len(x) / (4.5 * speed) + 0.5
    return narratage_text, narratage_start_time
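# A quick worked example of the pacing arithmetic above: a 9-character narration part at
# speed 1.0 is budgeted 9 / (4.5 * 1.0) + 0.5 = 2.5 s before the next split part starts.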
def second_to_str(seconds):
    """Format a time in seconds as an SRT timestamp, hh:mm:ss,mmm."""
    seconds = float(seconds)
    hour = int(seconds / 3600)
    minute = int((seconds - hour * 3600) / 60)
    second = int(seconds - hour * 3600 - minute * 60)
    ms = int((seconds - second - minute * 60 - hour * 3600) * 1000)
    time_str = "%02d:%02d:%02d,%03d" % (hour, minute, second, ms)
    return time_str
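# Worked example: second_to_str(3723.5) -> '01:02:03,500'
# (1 hour, 2 minutes, 3 seconds, and 0.5 s expressed as 500 milliseconds).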
def export_caption(sheet_content, caption_file):
    """
    Export the user-corrected subtitles as an SRT subtitle file.
    :param sheet_content: sheet content after the user's corrections
    :param caption_file: path of the SRT file to write
    :return: None
    """
    caption = sheet_content["字幕"]
    start_time = sheet_content['起始时间']
    end_time = sheet_content['终止时间']
    cnt = 0
    with open(caption_file, "w", encoding="utf-8") as f:
        for i, x in enumerate(caption):
            if x is not None:
                start, end = second_to_str(start_time[i]), second_to_str(end_time[i])
                cnt += 1
                f.write(str(cnt) + "\n")
                f.write(start + " --> " + end + "\n")
                f.write(x + "\n\n")
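# Each subtitle becomes one SRT block: a running index, the time range, the text, and a
# blank line, e.g. (hypothetical values):
#
#   1
#   00:00:12,300 --> 00:00:15,000
#   某句台词
#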
def ss_and_export(root_path, book_path, caption_file, speed):
    """
    Read the narration sheet, export the subtitle file, and synthesize every narration line.
    :param root_path: output directory for the synthesized audio
    :param book_path: path of the narration workbook
    :param caption_file: path of the subtitle file to write
    :param speed: speaking rate
    :return: None
    """
    # Create the output folder if it does not exist yet
    if not os.path.exists(root_path):
        os.mkdir(root_path)
    # Read the sheet and derive the narration lines and their insertion times
    sheet_content = read_sheet(book_path)
    narratages, start_timepoint = get_narratage_text(sheet_content, speed)
    export_caption(sheet_content, caption_file)
    # Synthesize one audio file per narration line, named after its start time
    for i, text in enumerate(narratages):
        wav_path = os.path.join(root_path, '%.2f.wav' % start_timepoint[i])
        speech_synthesis(text, wav_path, speed)
if __name__ == '__main__':
    # Command line arguments
    parser = argparse.ArgumentParser(description='Speech synthesis guidance')
    parser.add_argument("--output_dir", required=True, type=str, help="Output directory for the synthesized audio")
    parser.add_argument("--sheet_path", required=True, type=str, help='Path of the narration workbook')
    parser.add_argument("--caption_file", required=True, type=str, help="Path of the subtitle file to write")
    parser.add_argument("--speed", type=float, default=1.0, help="Speaking rate, 1.0 by default")
    args = parser.parse_args()
    # Entry point
    ss_and_export(args.output_dir, args.sheet_path, args.caption_file, args.speed)
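# Example invocation of this script (the script name synthesis.py is an assumption;
# substitute the actual file name):
#
#   python synthesis.py --output_dir ./narration_wavs --sheet_path hysxm.xlsx \
#       --caption_file hysxm.srt --speed 1.2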