Commit 1c807388 authored by 翟艳秋(20软)

Fix errors in detect_with_ocr and reorganize the resource directory layout (speaker audio files and other assets now live under res/)

parent 0c8c815e
 __pycache__
 .vscode
 .idea
+chineseocr_lite
+exp
+chineseocr_usage.py
+easyOCR_usage.py
+dist
+build
@@ -83,7 +83,7 @@ class Assemble_Dialog(QDialog, Ui_Dialog):
 if __name__ == '__main__':
     app = QApplication(sys.argv)
-    app.setWindowIcon(QIcon("./images/eagle_2.ico"))
+    app.setWindowIcon(QIcon("./res/images/eagle_2.ico"))
     dialog = Assemble_Dialog()
     dialog.show()
     sys.exit(app.exec_())
\ No newline at end of file
@@ -33,4 +33,4 @@ dir_path = os.path.dirname(os.path.abspath(__file__))
 class Pathes:
-    speaker_conf_path = os.path.join(dir_path, "speakers.json")
+    speaker_conf_path = os.path.join(dir_path, "res/speakers.json")
@@ -23,6 +23,7 @@ class Detect_Dialog(QDialog, Ui_Dialog):
         self.pushButton_2.clicked.connect(self.openTableFile)
         self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).clicked.connect(self.start_detect)
         self.prompt_dialog = Prompt_Dialog()
+
     def init_self(self):
         self.lineEdit.setText(self.projectContext.video_path)
         self.lineEdit_2.setText(self.projectContext.excel_path)
@@ -59,7 +60,7 @@ class Detect_Dialog(QDialog, Ui_Dialog):
 if __name__ == '__main__':
     app = QApplication(sys.argv)
-    app.setWindowIcon(QIcon("./images/eagle_2.ico"))
+    app.setWindowIcon(QIcon("./res/images/eagle_2.ico"))
     dialog = Detect_Dialog()
     dialog.show()
     sys.exit(app.exec_())
\ No newline at end of file
@@ -19,8 +19,10 @@ import os
 import cv2
 import numpy as np
 from paddleocr import PaddleOCR
+# from easyOCR_usage import EasyOCR
+# from chineseocr_usage import ChineseOCR
 import sys
-print("PaddleOCR load path:", os.path.abspath(sys.modules[PaddleOCR.__module__].__file__))
+# print("PaddleOCR load path:", os.path.abspath(sys.modules[PaddleOCR.__module__].__file__))
 import difflib
 import re
@@ -33,6 +35,8 @@ up_b, down_b = 0, 0
 # Initialize the OCR tool
 ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False)
+# ocr = EasyOCR()
+# ocr = ChineseOCR()
 # Normal speech rate is 4 characters per second
 normal_speed = 4
@@ -65,7 +69,8 @@ def get_position(video_path: str, start_time: float) -> Tuple[float, float]:
         # print("img:", img)
         # gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         # cv2.imshow('img', gray)
-        # cv2.imshow(img)
+        # cv2.waitKey(0)
+        # cv2.destroyAllWindows()
         cnt += 1
         if img is None or cnt > 10000:
             break
@@ -83,42 +88,43 @@ def get_position(video_path: str, start_time: float) -> Tuple[float, float]:
         for x in res:
             # print("x:", x)
             rect, (txt, confidence) = x
+            [x1, y1], [x2, y2], [x3, y3], [x4, y4] = rect
             # font_size = rect[2][1] - rect[0][1]
-            mid = (rect[0][0] + rect[1][0]) / 2
-            gradient = np.arctan(abs((rect[1][1] - rect[0][1]) / (rect[1][0] - rect[0][0])))
+            mid = (x1 + x2) / 2
+            gradient = np.arctan(abs((y2 - y1) / (x2 - x1)))
             # Text that is likely a subtitle
-            if confidence > 0.9 and 0.4 * img.shape[1] < mid < 0.6 * img.shape[1] and gradient < 0.1:
+            conf_thred = 0.9
+            # conf_thred = 0.8
+            if confidence > conf_thred and 0.4 * img.shape[1] < mid < 0.6 * img.shape[1] and gradient < 0.1:
                 if bottom_position is None:
-                    bottom_position = rect[0][1]
+                    bottom_position = y1
                 # Check whether this text equals the previous one (i.e. the same subtitle);
                 # only for a new subtitle do we record its top/bottom bounds
                 keys = subtitle_position.keys()
-                if abs(rect[0][1] - bottom_position) < 10:
+                if abs(y1 - bottom_position) < 10:
                     if pre_txt is None or pre_txt != txt:
                         txt_cnt += 1
                         pre_txt = txt
-                    if (rect[0][0], rect[2][1]) in keys:
-                        subtitle_position[(rect[0][1], rect[2][1])] += 1
+                    if (y1, y3) in keys:
+                        subtitle_position[(y1, y3)] += 1
                     else:
                         replace = False
                         for k in keys:
                             # Widen the key to the widest top/bottom bounds
-                            if abs(rect[0][1] - k[0]) + abs(rect[2][1] - k[1]) < 10:
-                                new_k = min(k[0], rect[0][1]), max(k[1], rect[2][1])
+                            if abs(y1 - k[0]) + abs(y3 - k[1]) < 10:
+                                subtitle_position[k] += 1
+                                new_k = min(k[0], y1), max(k[1], y3)
                                 if new_k != k:
                                     subtitle_position[new_k] = subtitle_position[k]
-                                    subtitle_position[new_k] += 1
                                     subtitle_position.pop(k)
-                                else:
-                                    subtitle_position[k] += 1
                                 replace = True
                                 break
                         if not replace:
-                            subtitle_position[(rect[0][1], rect[2][1])] = 1
+                            subtitle_position[(y1, y3)] = 1
             if txt_cnt == 3:
                 break
     print(subtitle_position)
     up_bounding, down_bounding = max(subtitle_position, key=subtitle_position.get)
-    return up_bounding + height, down_bounding + height
+    return int(up_bounding + height), int(down_bounding + height)

 def erasePunc(txt: str) -> str:
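A note on the voting logic in the hunk above: each accepted OCR line votes for a (top, bottom) subtitle band, a band within 10 px of an existing one is merged into the widest bounds with the votes pooled, and the most-voted band wins. A minimal standalone sketch of that dictionary update; vote_band and the sample coordinates are illustrative, not part of this repository:

def vote_band(bands: dict, top: float, bottom: float) -> None:
    # Merge into a nearby band (within 10 px) and pool its votes,
    # widening the key to the widest bounds seen so far.
    for k in list(bands):  # list() so the dict can be mutated below
        if abs(top - k[0]) + abs(bottom - k[1]) < 10:
            bands[k] += 1
            new_k = (min(k[0], top), max(k[1], bottom))
            if new_k != k:
                bands[new_k] = bands.pop(k)  # move the pooled count
            return
    bands[(top, bottom)] = 1  # first vote for a brand-new band

bands = {}
for top, bottom in [(640.0, 672.0), (642.0, 670.0), (100.0, 130.0)]:
    vote_band(bands, top, bottom)
print(max(bands, key=bands.get))  # (640.0, 672.0) -- the subtitle band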
@@ -167,10 +173,11 @@ def normalize(text: str) -> str:
     text = text.translate(table)
     text = text.strip(' ,。、【】_·:-@‘[;')
     # Make the opening/closing parentheses match at the ends of the text
-    if text[-1] == ')' and text[0] != '(':
-        text = '(' + text
-    elif text[-1] != ')' and text[0] == '(':
-        text = text + ')'
+    if len(text) > 0:
+        if text[-1] == ')' and text[0] != '(':
+            text = '(' + text
+        elif text[-1] != ')' and text[0] == '(':
+            text = text + ')'
     return text
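The len(text) > 0 guard added above matters because the preceding strip() can leave an empty string, on which text[-1] raises IndexError. An isolated illustration of the parenthesis-balancing step; balance_parens is a hypothetical helper, not the project's normalize:

def balance_parens(text: str) -> str:
    # Guard first: text may be empty after stripping, and ''[-1]
    # raises IndexError -- the failure this commit fixes.
    if len(text) > 0:
        if text[-1] == ')' and text[0] != '(':
            text = '(' + text
        elif text[-1] != ')' and text[0] == '(':
            text = text + ')'
    return text

print(balance_parens('旁白)'))  # (旁白)
print(balance_parens('(旁白'))  # (旁白)
print(balance_parens(''))       # '' instead of an IndexError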
@@ -191,9 +198,12 @@ def detect_subtitle(img: np.ndarray) -> Union[str, None]:
     img = cv2.resize(img, (int(img.shape[1] * 1.5), int(img.shape[0] * 1.5)))
     res = ocr.ocr(img, cls=True)
     sorted(res, key=lambda text: text[0][0][1])
+    sorted(res, key=lambda text: text[0][0][0])
     if len(res) == 0:
-        return None
+        return None, 0
     possible_txt = []
+    conf = 0
+    print(res)
     for x in res:
         # cv2.imshow("cut", img)
         # cv2.waitKey(0)
@@ -204,18 +214,35 @@ def detect_subtitle(img: np.ndarray) -> Union[str, None]:
         gradient = np.arctan(abs((rect[1][1] - rect[0][1]) / (rect[1][0] - rect[0][0])))
         # log.append("文本:{},置信度:{},中心点:{},斜率:{},字体大小:{}".format(txt, confidence, mid / img.shape[1], gradient,
-        # font_size)) confidence>0.7 & slope<0.1 & subtitle offset<=25 & subtitle center within 0.4-0.6 of the frame width
-        if confidence > 0.7 and gradient < 0.1 and 0.4 < mid / img.shape[1] < 0.6 and \
-                abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30) <= 25:
+        # font_size))
+        # print("文本:{},置信度:{},中心点:{},斜率:{},字体大小:{}".format(txt, confidence, mid / img.shape[1], gradient, font_size))
+        # print("差距:{}".format(abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30)))
+        conf_thred1 = 0.7
+        conf_thred2 = 0.85
+        # conf_thred1 = 0.1
+        # conf_thred2 = 0.4
+        # conf_thred1 = 0.5
+        # conf_thred2 = 0.7
+        if confidence > conf_thred1 and gradient < 0.1 and 0.4 < mid / img.shape[1] < 0.6 and \
+                abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30) <= font_size - 10:
             subTitle += txt
+            conf = max(conf, confidence)
+            # possible_txt.append([txt, mid/img.shape[1]])
+            possible_txt.append(txt)
         # The subtitle may be split into two (or more) dialog texts on one line
-        elif confidence > 0.85 and gradient < 0.1:
+        elif confidence > conf_thred2 and gradient < 0.1:
             if 0.3 < mid / img.shape[1] < 0.4 or 0.6 < mid / img.shape[1] < 0.7:
+                # possible_txt.append([txt, mid/img.shape[1]])
                 possible_txt.append(txt)
+                conf = max(conf, confidence)
+    # sorted(possible_txt, key=lambda pos : pos[1])
+    # print(possible_txt)
     if len(possible_txt) >= 2:
-        subTitle = ''.join(possible_txt)
+        # subTitle = ' '.join([x[0] for x in possible_txt])
+        subTitle = ' '.join(possible_txt)
+    print(subTitle, conf)
     if len(subTitle) > 0:
-        return subTitle
+        return subTitle, conf
-    return None
+    return None, 0
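After this change detect_subtitle always returns a (text, confidence) pair, so a miss is (None, 0) rather than a bare None, and callers can compare readings by OCR confidence. A simplified sketch of the selection rule; pick_subtitle, its hit tuples, and the sample strings are illustrative only, while the real function works on PaddleOCR boxes:

from typing import List, Optional, Tuple

def pick_subtitle(hits: List[Tuple[str, float, float]],
                  conf_thred1: float = 0.7,
                  conf_thred2: float = 0.85) -> Tuple[Optional[str], float]:
    parts, conf = [], 0.0
    for txt, confidence, rel_mid in hits:  # rel_mid = center x / frame width
        # Centered line: lower threshold is enough.
        if confidence > conf_thred1 and 0.4 < rel_mid < 0.6:
            parts.append(txt)
            conf = max(conf, confidence)
        # Dialog half left/right of center: demand higher confidence.
        elif confidence > conf_thred2 and (0.3 < rel_mid < 0.4 or 0.6 < rel_mid < 0.7):
            parts.append(txt)
            conf = max(conf, confidence)
    if parts:
        return ' '.join(parts), conf
    return None, 0.0

print(pick_subtitle([('你好', 0.95, 0.35), ('再见', 0.92, 0.65)]))
# -> ('你好 再见', 0.95)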
 def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
@@ -243,6 +270,7 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
     video = cv2.VideoCapture(video_path)
     fps = video.get(cv2.CAP_PROP_FPS)
     lastSubTitle = None
+    lastConf = 0
     # res holds the subtitles gathered while traversing the video, with no narration analysis mixed in
     res = []
     cnt = 0
@@ -279,9 +307,11 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
             mainWindow.projectContext.nd_process = state[0]
             mainWindow.projectContext.last_time = cur_time
-        subTitle = detect_subtitle(frame)
+        subTitle, conf = detect_subtitle(frame)
         if subTitle is not None:
             subTitle = normalize(subTitle)
+            if len(subTitle) == 0:
+                subTitle = None
         # Subtitle found for the first time
         if lastSubTitle is None and subTitle is not None:
             start_time = cur_time
@@ -315,10 +345,11 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
                 add_to_list(mainWindow, "字幕", [round(start_time, 3), round(end_time, 3), lastSubTitle, ''])
                 start_time = end_time
             else:
-                lastSubTitle = subTitle if len(subTitle) > len(lastSubTitle) else lastSubTitle
+                lastSubTitle = subTitle if conf > lastConf else lastSubTitle
             continue
         # The current subtitle differs from the previous one
         lastSubTitle = subTitle
+        lastConf = conf
 def add_to_list(mainWindow: MainWindow, element_type: str, li: list):
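The process_video change above swaps the tie-break for repeated detections of the same subtitle: the reading with the higher OCR confidence now wins, where the longer string used to. A toy sketch of that policy; merge_readings and the sample data are invented, and using difflib (which the module imports) for the "same subtitle" test is this sketch's assumption:

from difflib import SequenceMatcher

def merge_readings(readings):  # [(text, conf), ...] for one on-screen subtitle
    best_txt, best_conf = readings[0]
    for txt, conf in readings[1:]:
        same = SequenceMatcher(None, txt, best_txt).ratio() > 0.8
        if same and conf > best_conf:  # higher confidence wins, not length
            best_txt, best_conf = txt, conf
    return best_txt, best_conf

print(merge_readings([('明天见吧', 0.81), ('明天见面吧', 0.97)]))
# -> ('明天见面吧', 0.97): confidence, not string length, decides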