合成音频使用用户选中的语速

4514a70e · 翟艳秋（20软） · 5d3fc565 · 4514a70e · 4514a70e · 4514a70e
Commit 4514a70e authored Nov 16, 2022 by 翟艳秋（20软）
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 48 additions and 42 deletions

assemble_dialog.py +4 -3

main_window.py +6 -7

speech_synthesis.py +38 -32

No files found.
--- a/assemble_dialog.py
+++ b/assemble_dialog.py
@@ -65,12 +65,13 @@ class Assemble_Dialog(QDialog, Ui_Dialog):
        print("start_assemble")
        video_path = self.lineEdit.text()
        # 默认 输出的音频是工程目录+/output
-        audio_dir = self.projectContext.project_base_dir+"output/"
+        audio_dir = os.path.join(self.projectContext.project_base_dir, "output")
        sheet_path = self.lineEdit_2.text()
        speaker_info = self.lineEdit_3.text()
        speed_info =  self.lineEdit_4.text()
        #todo 后续变成常量存起来，或者做成配置
-        caption_path = replace_path_suffix(self.lineEdit.text(), ".srt")
+        # caption_path = replace_path_suffix(self.lineEdit.text(), ".srt")
+        caption_path = os.path.join(audio_dir, os.path.basename(video_path).split('.')[0] + ".srt")
        print("video_path: ",video_path)
        print("audio_dir: ",audio_dir)
@@ -78,7 +79,7 @@ class Assemble_Dialog(QDialog, Ui_Dialog):
        print("speed_info: ",speed_info)
        print("caption_path: ",caption_path)
        print("speaker_info: ",speaker_info)
-        self.start_assemble_signal.emit([video_path,audio_dir, sheet_path,speed_info, caption_path, speaker_info])
+        self.start_assemble_signal.emit([video_path, audio_dir, sheet_path,speed_info, caption_path, speaker_info])
 if __name__ == '__main__':
    app = QApplication(sys.argv)

--- a/main_window.py
+++ b/main_window.py
@@ -508,16 +508,15 @@ class MainWindow(QMainWindow, Ui_MainWindow):
                self.export_timer.stop()
            print("===已有线程结束了 in %s ===" % (type))
-            self.statusbarLabel.setText("  %s完成" % (type))
-            self.progressBar.setValue(100)
-            self.progressLabel.setText(f"100%")
-            self.projectContext.nd_process = 1
            for t in self.threads:
                if t.exitcode != 0:
                    print("Exception in", t.getName())
                    self.show_warning_msg_box("运行出错，请联系开发者处理")
-                    print("当前已有的检测结果", self.projectContext.all_elements)
                    return
+            self.statusbarLabel.setText("  %s完成" % (type))
+            self.progressBar.setValue(100)
+            self.progressLabel.setText(f"100%")
+            self.projectContext.nd_process = 1
    def deal_synthesis_callback_slot(self, threads, state):
        self.statusbarLabel.setText("  准备合成：")
@@ -1165,8 +1164,8 @@ class MainWindow(QMainWindow, Ui_MainWindow):
        self.projectContext.save_project(False)
    def export_all(self):
-        # 暂时存放音频的文件夹被命名为tmp
+        # 存放合成音频的文件夹被命名为output
-        output_dir = os.path.join(self.projectContext.project_base_dir, "tmp")
+        output_dir = os.path.join(self.projectContext.project_base_dir, "output")
        if os.path.exists(output_dir) and len(os.listdir(output_dir)) > 0:
            self.export.export_slot(self.projectContext.video_path, output_dir)
        else:

--- a/speech_synthesis.py
+++ b/speech_synthesis.py
@@ -89,7 +89,6 @@ def speech_synthesis(text: str, output_file: str, speaker: Speaker, speed: float
        speaker (Speaker): 说话人
        speed (float, optional): 指定的音频语速. Defaults to 1.0.
    """
-    audio_path = tmp_file
    speech_config = SpeechConfig(
        subscription="db34d38d2d3447d482e0f977c66bd624",
        region="eastus"
@@ -102,7 +101,6 @@ def speech_synthesis(text: str, output_file: str, speaker: Speaker, speed: float
    if not os.path.exists(os.path.dirname(output_file)):  # 如果路径不存在
        print("output_file路径不存在，创建:", os.path.dirname(output_file))
        os.makedirs(os.path.dirname(output_file))
-    audio_config = AudioOutputConfig(filename=audio_path)
    synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    ssml_string = f"""
    <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{speech_config.speech_synthesis_language}">
@@ -169,7 +167,7 @@ def read_sheet(book_path: str, sheet_name: str = "") -> dict:
    return sheet_content
-def get_narratage_text(sheet_content: dict, speed: float) -> Tuple[list, list, list]:
+def get_narratage_text(sheet_content: dict) -> Tuple[list, list, list]:
    """获取旁白解说文本及起止时间
    Args:
@@ -183,40 +181,49 @@ def get_narratage_text(sheet_content: dict, speed: float) -> Tuple[list, list, l
    subtitle = sheet_content['字幕']
    start_time = sheet_content['起始时间']
    end_time = sheet_content['终止时间']
+    speeds = sheet_content["语速"]
    narratage_start_time = []
    narratage_end_time = []
    narratage_text = []
+    narratage_speed = []
    for i, text in enumerate(narratage):
+        # 这里的speed是x.x倍速
+        speed = float(speeds[i].split('(')[0])
        if text is not None:
            if text == '翻译':
                narratage_text.append(subtitle[i])
-                narratage_start_time.append(float(start_time[i]))
-                narratage_end_time.append(float(end_time[i]))
            else:
-                # 如果旁白中有换行符，即分为n段，则按照换行符进行分割，并间隔0.5s
+                narratage_text.append(text)
-                text_split = text.split('\n')
+                """以下为之前自动根据表格生成旁白对应起始时间和终止时间的方法，目前不需要了
-                # 如果旁白有对应的时间戳（是这段大旁白里的特定位置）
+                """
-                if start_time[i] is not None and end_time[i] is not None:
+                # # 如果旁白中有换行符，即分为n段，则按照换行符进行分割，并间隔0.5s
-                    cur_start = float(start_time[i])
+                # text_split = text.split('\n')
-                    cur_end = float(end_time[i])
+                # # 如果旁白有对应的时间戳（是这段大旁白里的特定位置）
-                elif subtitle[i] is None:
+                # if start_time[i] is not None and end_time[i] is not None:
-                    cur_start = float(end_time[i - 1]) + 0.1 if i > 0 else 0
+                #     cur_start = float(start_time[i])
-                    # 如果是最后一句旁白，后面没有字幕及时间戳了，就先把cur_end置为-1
+                #     cur_end = float(end_time[i])
-                    cur_end = float(
+                # elif subtitle[i] is None:
-                        start_time[i + 1]) if i + 1 < len(start_time) else -1
+                #     # 上一个字幕/旁白的终止时间后0.1s
-                else:
+                #     cur_start = float(end_time[i - 1]) + normal_interval if i > 0 else 0
-                    # 有字幕，可覆盖字幕
+                #     # 如果是最后一句旁白，后面没有字幕及时间戳了，就先把cur_end置为-1
-                    cur_start = float(start_time[i])
+                #     cur_end = float(
-                    cur_end = float(end_time[i])
+                #         start_time[i + 1]) if i + 1 < len(start_time) else -1
-                for x in text_split:
+                # else:
-                    if len(x) == 0:
+                #     # 有字幕，可覆盖字幕
-                        continue
+                #     cur_start = float(start_time[i])
-                    cur_end = max(cur_end, cur_start + (len(x) / normal_speed + normal_interval) / speed)
+                #     cur_end = float(end_time[i])
-                    narratage_text.append(x)
+                # for x in text_split:
-                    narratage_start_time.append(cur_start)
+                #     if len(x) == 0:
-                    narratage_end_time.append(cur_end)
+                #         continue
-                    cur_start = cur_start + (len(x) / normal_speed + normal_interval) / speed
+                #     cur_end = max(cur_end, cur_start + (len(x) / (normal_speed * speed) + normal_interval))
-    return narratage_text, narratage_start_time, narratage_end_time
+                #     narratage_text.append(x)
+                #     narratage_start_time.append(cur_start)
+                #     narratage_end_time.append(cur_end)
+                #     cur_start = cur_start + (len(x) / normal_speed + normal_interval) / speed
+            narratage_start_time.append(float(start_time[i]))
+            narratage_end_time.append(float(end_time[i]))
+            narratage_speed.append(speed)
+    return narratage_text, narratage_start_time, narratage_end_time, narratage_speed
 def second_to_str(seconds: float) -> str:
@@ -331,8 +338,7 @@ def ss_and_export(video_path: str, sheet_path: str, output_dir: str, speed: floa
    # print("read sheet at time: ", datetime.datetime.now())
    sheet_content = read_sheet(book_path)
    # print("get narratage text at time: ", datetime.datetime.now())
-    narratages, start_timestamp, end_timestamp = get_narratage_text(
+    narratages, start_timestamp, end_timestamp, cur_speed = get_narratage_text(sheet_content)
-        sheet_content, speed)
    # print("export caption at time: ", datetime.datetime.now())
    export_caption(sheet_content, caption_file)
    print("已导出字幕文件")
@@ -343,7 +349,7 @@ def ss_and_export(video_path: str, sheet_path: str, output_dir: str, speed: floa
    for i, text in enumerate(narratages):
        wav_path = root_path + '/%.2f.wav' % start_timestamp[i]
        narratage_paths.append(wav_path)
-        speech_synthesis(text, wav_path, chosen_speaker, speed)
+        speech_synthesis(text, wav_path, chosen_speaker, cur_speed[i])
        print("目前正在处理{}".format(wav_path))
        if state is not None:
            state[0] = float((i + 1) / len(narratages)) * 0.97