Commit 1c807388 authored by 翟艳秋(20软)

fixed the errors in detect_with_ocr and changed the directory layout of resources such as speaker audios

parent 0c8c815e
__pycache__
.vscode
.idea
chineseocr_lite
exp
chineseocr_usage.py
easyOCR_usage.py
dist
build
@@ -83,7 +83,7 @@ class Assemble_Dialog(QDialog, Ui_Dialog):
if __name__ == '__main__':
app = QApplication(sys.argv)
app.setWindowIcon(QIcon("./images/eagle_2.ico"))
app.setWindowIcon(QIcon("./res/images/eagle_2.ico"))
dialog = Assemble_Dialog()
dialog.show()
sys.exit(app.exec_())
\ No newline at end of file
@@ -33,4 +33,4 @@ dir_path = os.path.dirname(os.path.abspath(__file__))
class Pathes:
speaker_conf_path = os.path.join(dir_path, "speakers.json")
speaker_conf_path = os.path.join(dir_path, "res/speakers.json")
@@ -23,6 +23,7 @@ class Detect_Dialog(QDialog, Ui_Dialog):
self.pushButton_2.clicked.connect(self.openTableFile)
self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).clicked.connect(self.start_detect)
self.prompt_dialog = Prompt_Dialog()
def init_self(self):
self.lineEdit.setText(self.projectContext.video_path)
self.lineEdit_2.setText(self.projectContext.excel_path)
@@ -59,7 +60,7 @@ class Detect_Dialog(QDialog, Ui_Dialog):
if __name__ == '__main__':
app = QApplication(sys.argv)
app.setWindowIcon(QIcon("./images/eagle_2.ico"))
app.setWindowIcon(QIcon("./res/images/eagle_2.ico"))
dialog = Detect_Dialog()
dialog.show()
sys.exit(app.exec_())
\ No newline at end of file
@@ -19,8 +19,10 @@ import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
# from easyOCR_usage import EasyOCR
# from chineseocr_usage import ChineseOCR
import sys
print("PaddleOCR load path:", os.path.abspath(sys.modules[PaddleOCR.__module__].__file__))
# print("PaddleOCR load path:", os.path.abspath(sys.modules[PaddleOCR.__module__].__file__))
import difflib
import re
@@ -33,6 +35,8 @@ up_b, down_b = 0, 0
# initialize the OCR tool
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False)
# ocr = EasyOCR()
# ocr = ChineseOCR()
# normal speaking rate is 4 characters/second
normal_speed = 4
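For orientation, here is a minimal, self-contained sketch of how this module drives PaddleOCR, mirroring the constructor above and the `rect, (txt, confidence) = x` unpacking used in the hunks below; the frame path is hypothetical, and the flat result list assumes the paddleocr version this project was written against:

```python
from paddleocr import PaddleOCR
import cv2

# same constructor arguments as in the hunk above
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False)

img = cv2.imread("frame.png")        # hypothetical sample frame
res = ocr.ocr(img, cls=True)         # list of detected text lines
for x in res:
    # rect: four corner points of the text box; txt/confidence: recognition result
    rect, (txt, confidence) = x
    print(txt, confidence)
```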
@@ -65,7 +69,8 @@ def get_position(video_path: str, start_time: float) -> Tuple[float, float]:
# print("img:", img)
# gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# cv2.imshow('img', gray)
# cv2.imshow(img)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
cnt += 1
if img is None or cnt > 10000:
break
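The loop above caps frame reads at 10000 so a video without detectable subtitles still terminates. A rough sketch of the surrounding read loop, under the assumption that `start_time` is applied by seeking with `CAP_PROP_POS_MSEC` (the hunk does not show the seek itself):

```python
import cv2

video = cv2.VideoCapture("example.mp4")       # hypothetical path
start_time = 5.0                              # seconds; assumed parameter
video.set(cv2.CAP_PROP_POS_MSEC, start_time * 1000)
cnt = 0
while True:
    ok, img = video.read()
    cnt += 1
    if not ok or img is None or cnt > 10000:  # bail out on EOF or after 10000 frames
        break
    # ... run OCR on img and accumulate subtitle_position votes here ...
video.release()
```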
@@ -83,42 +88,43 @@ def get_position(video_path: str, start_time: float) -> Tuple[float, float]:
for x in res:
# print("x:", x)
rect, (txt, confidence) = x
[x1,y1],[x2,y2],[x3,y3],[x4,y4] = rect
# font_size = rect[2][1] - rect[0][1]
mid = (rect[0][0] + rect[1][0]) / 2
gradient = np.arctan(abs((rect[1][1] - rect[0][1]) / (rect[1][0] - rect[0][0])))
mid = (x1 + x2) / 2
gradient = np.arctan(abs((y2 - y1) / (x2 - x1)))
# text that may be a subtitle
if confidence > 0.9 and 0.4 * img.shape[1] < mid < 0.6 * img.shape[1] and gradient < 0.1:
conf_thred = 0.9
# conf_thred = 0.8
if confidence > conf_thred and 0.4 * img.shape[1] < mid < 0.6 * img.shape[1] and gradient < 0.1:
if bottom_position is None:
bottom_position = rect[0][1]
bottom_position = y1
# check whether this text equals the previous one (i.e. whether it is the same subtitle); if it is not the same subtitle, take the corresponding upper and lower bounds
keys = subtitle_position.keys()
if abs(rect[0][1] - bottom_position) < 10:
if abs(y1 - bottom_position) < 10:
if pre_txt is None or pre_txt != txt:
txt_cnt += 1
pre_txt = txt
if (rect[0][0], rect[2][1]) in keys:
subtitle_position[(rect[0][1], rect[2][1])] += 1
if (y1, y3) in keys:
subtitle_position[(y1, y3)] += 1
else:
replace = False
for k in keys:
# update the key to the widest upper/lower bounds
if abs(rect[0][1] - k[0]) + abs(rect[2][1] - k[1]) < 10:
new_k = min(k[0], rect[0][1]), max(k[1], rect[2][1])
if abs(y1 - k[0]) + abs(y3 - k[1]) < 10:
subtitle_position[k] += 1
new_k = min(k[0], y1), max(k[1], y3)
if new_k != k:
subtitle_position[new_k] = subtitle_position[k]
subtitle_position[new_k] += 1
subtitle_position.pop(k)
else:
subtitle_position[k] += 1
replace = True
break
if not replace:
subtitle_position[(rect[0][1], rect[2][1])] = 1
subtitle_position[(y1, y3)] = 1
if txt_cnt == 3:
break
print(subtitle_position)
up_bounding, down_bounding = max(subtitle_position, key=subtitle_position.get)
return up_bounding + height, down_bounding + height
return int(up_bounding + height), int(down_bounding + height)
def erasePunc(txt: str) -> str:
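To summarize the hunk above: each confident, centered, near-horizontal detection votes for a (top, bottom) pixel band; bands within 10 px of an existing key are merged into the widest covering bounds, and the most-voted band wins via `max(subtitle_position, key=subtitle_position.get)`. A toy, self-contained illustration of that voting (the pixel values are invented):

```python
subtitle_position = {}
detections = [(652, 690), (651, 691), (400, 430), (652, 689)]  # (y1, y3) per OCR box
for y1, y3 in detections:
    for k in list(subtitle_position):          # list() so we can pop while iterating
        if abs(y1 - k[0]) + abs(y3 - k[1]) < 10:
            new_k = (min(k[0], y1), max(k[1], y3))             # widen to cover both boxes
            subtitle_position[new_k] = subtitle_position.pop(k) + 1
            break
    else:
        subtitle_position[(y1, y3)] = 1        # no nearby band: start a new one
print(max(subtitle_position, key=subtitle_position.get))       # -> (651, 691)
```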
@@ -167,10 +173,11 @@ def normalize(text: str) -> str:
text = text.translate(table)
text = text.strip(' ,。、【】_·:-@‘[;')
# make leading/trailing parentheses match
if text[-1] == ')' and text[0] != '(':
text = '(' + text
elif text[-1] != ')' and text[0] == '(':
text = text + ')'
if len(text) > 0:
if text[-1] == ')' and text[0] != '(':
text = '(' + text
elif text[-1] != ')' and text[0] == '(':
text = text + ')'
return text
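The new `len(text) > 0` guard matters because the preceding `strip` can consume the whole string, after which the old code's unconditional `text[-1]` raised an `IndexError`; for example:

```python
text = ",。、"
text = text.strip(' ,。、【】_·:-@‘[;')       # every character is stripped -> ''
if len(text) > 0:                            # guard added in this commit
    if text[-1] == ')' and text[0] != '(':
        text = '(' + text
    elif text[-1] != ')' and text[0] == '(':
        text = text + ')'
print(repr(text))                            # '' instead of an IndexError
```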
@@ -191,9 +198,12 @@ def detect_subtitle(img: np.ndarray) -> Union[str, None]:
img = cv2.resize(img, (int(img.shape[1] * 1.5), int(img.shape[0] * 1.5)))
res = ocr.ocr(img, cls=True)
sorted(res, key=lambda text: text[0][0][1])
sorted(res, key=lambda text: text[0][0][0])
if len(res) == 0:
return None
return None, 0
possible_txt = []
conf = 0
print(res)
for x in res:
# cv2.imshow("cut", img)
# cv2.waitKey(0)
@@ -204,18 +214,35 @@ def detect_subtitle(img: np.ndarray) -> Union[str, None]:
gradient = np.arctan(abs((rect[1][1] - rect[0][1]) / (rect[1][0] - rect[0][0])))
# log.append("文本:{},置信度:{},中心点:{},斜率:{},字体大小:{}".format(txt, confidence, mid / img.shape[1], gradient,
# font_size)) 置信度>0.7 & 斜率<0.1 & 字幕偏移量<=25 & 字幕中心在画面宽的0.4-0.6之间
if confidence > 0.7 and gradient < 0.1 and 0.4 < mid / img.shape[1] < 0.6 and \
abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30) <= 25:
# print("文本:{},置信度:{},中心点:{},斜率:{},字体大小:{}".format(txt, confidence, mid / img.shape[1], gradient, font_size))
# print("差距:{}".format(abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30)))
conf_thred1 = 0.7
conf_thred2 = 0.85
# conf_thred1 = 0.1
# conf_thred2 = 0.4
# conf_thred1 = 0.5
# conf_thred2 = 0.7
if confidence > conf_thred1 and gradient < 0.1 and 0.4 < mid / img.shape[1] < 0.6 and \
abs(rect[0][1] - 30) + abs(img.shape[0] - rect[2][1] - 30) <= font_size - 10:
subTitle += txt
conf = max(conf,confidence)
# possible_txt.append([txt, mid/img.shape[1]])
possible_txt.append(txt)
# if the subtitle line is split into two (or more) dialogue texts
elif confidence > 0.85 and gradient < 0.1:
elif confidence > conf_thred2 and gradient < 0.1:
if 0.3 < mid / img.shape[1] < 0.4 or 0.6 < mid / img.shape[1] < 0.7:
# possible_txt.append([txt, mid/img.shape[1]])
possible_txt.append(txt)
conf = max(conf, confidence)
# sorted(possible_txt, key=lambda pos : pos[1])
# print(possible_txt)
if len(possible_txt) >= 2:
subTitle = ''.join(possible_txt)
# subTitle = ' '.join([x[0] for x in possible_txt])
subTitle = ' '.join(possible_txt)
print(subTitle, conf)
if len(subTitle) > 0:
return subTitle
return None
return subTitle, conf
return None, 0
def process_video(video_path: str, begin: float, end: float, book_path: str, sheet_name: str, state=None, mainWindow: MainWindow=None):
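After this hunk, `detect_subtitle` returns a `(text, confidence)` pair rather than a bare string, and one-line dialogues split across two boxes are joined with a space instead of concatenated. A hedged sketch of the new caller contract and the join change (the frame variable and fragments are invented):

```python
subTitle, conf = detect_subtitle(frame)      # frame: BGR np.ndarray from cv2
if subTitle is not None:
    subTitle = normalize(subTitle)

possible_txt = ["-你去哪", "-回家"]           # two boxes on one subtitle line
print(' '.join(possible_txt))                # new: "-你去哪 -回家"
print(''.join(possible_txt))                 # old: "-你去哪-回家"
```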
@@ -243,6 +270,7 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, she
video = cv2.VideoCapture(video_path)
fps = video.get(cv2.CAP_PROP_FPS)
lastSubTitle = None
lastConf = 0
# res holds the subtitles collected while traversing the video, with no narration analysis mixed in
res = []
cnt = 0
@@ -279,9 +307,11 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, she
mainWindow.projectContext.nd_process = state[0]
mainWindow.projectContext.last_time = cur_time
subTitle = detect_subtitle(frame)
subTitle, conf = detect_subtitle(frame)
if subTitle is not None:
subTitle = normalize(subTitle)
if len(subTitle) == 0:
subTitle = None
# the first time a subtitle is found
if lastSubTitle is None and subTitle is not None:
start_time = cur_time
@@ -315,10 +345,11 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, she
add_to_list(mainWindow, "字幕", [round(start_time, 3), round(end_time, 3), lastSubTitle, ''])
start_time = end_time
else:
lastSubTitle = subTitle if len(subTitle) > len(lastSubTitle) else lastSubTitle
lastSubTitle = subTitle if conf > lastConf else lastSubTitle
continue
# the current subtitle differs from the previous one
lastSubTitle = subTitle
lastConf = conf
def add_to_list(mainWindow: MainWindow, element_type: str, li: list):
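With confidences threaded through `process_video`, the "same subtitle read twice" case now keeps the higher-confidence reading instead of the longer string. Roughly, the rule this hunk implements (`pick_reading` is a hypothetical name for illustration; the real code inlines this logic):

```python
def pick_reading(lastSubTitle: str, lastConf: float, subTitle: str, conf: float):
    # old rule: keep whichever string is longer
    # new rule: keep whichever reading the OCR was more confident about
    if conf > lastConf:
        return subTitle, conf
    return lastSubTitle, lastConf
```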