Commit 5f39d7a7 authored by 翟艳秋(20软)

优化ocr的结果

parent 71880733
...@@ -6,6 +6,7 @@ from paddleocr import PaddleOCR ...@@ -6,6 +6,7 @@ from paddleocr import PaddleOCR
import difflib import difflib
import openpyxl import openpyxl
from openpyxl.styles import PatternFill, Alignment from openpyxl.styles import PatternFill, Alignment
import re
# 字幕的上下边界 # 字幕的上下边界
up_b, down_b = 0, 0 up_b, down_b = 0, 0
...@@ -86,6 +87,12 @@ def get_position(video_path, start_time): ...@@ -86,6 +87,12 @@ def get_position(video_path, start_time):
return up_b + height, down_b + height return up_b + height, down_b + height
# Matches every character outside the \u4e00-\u9fa5 range (the Chinese
# characters this module cares about). Compiled once at import time so the
# per-frame similarity check does not rebuild it.
_NON_CHINESE_RE = re.compile(r'[^\u4e00-\u9fa5]')


def erasePunc(txt):
    """Remove every character that is not in the range \\u4e00-\\u9fa5.

    Punctuation, digits, Latin letters and whitespace are all dropped, so
    only the Chinese characters of ``txt`` remain, in their original order.

    :param txt: the string to filter
    :return: ``txt`` with all characters outside \\u4e00-\\u9fa5 removed
    """
    return _NON_CHINESE_RE.sub('', txt)
def string_similar(s1, s2): def string_similar(s1, s2):
""" """
比较字符串s1和s2的相似度,主要用于减少输出文件中相似字幕的重复 比较字符串s1和s2的相似度,主要用于减少输出文件中相似字幕的重复
...@@ -93,9 +100,30 @@ def string_similar(s1, s2): ...@@ -93,9 +100,30 @@ def string_similar(s1, s2):
:param s2: :param s2:
:return: 字符串间的相似度 :return: 字符串间的相似度
""" """
# 去除非中文字符后,再比较相似度
s1 = erasePunc(s1)
s2 = erasePunc(s2)
return difflib.SequenceMatcher(None, s1, s2).quick_ratio() return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
def normalize(text):
    """Normalize some of the punctuation in a piece of text.

    Steps, in order:

    1. Replace each character of the "English" punctuation list with the
       character at the same index of the "Chinese" punctuation list.
    2. Strip noise characters (spaces and assorted punctuation) from both
       ends of the text.
    3. If the text ends with ``)`` but does not start with ``(`` (or the
       reverse), add the missing parenthesis so the two ends match.

    :param text: the string to normalize
    :return: the normalized string; an empty string if nothing is left
        after stripping
    """
    # NOTE(review): as written, most pairs below map a character to itself;
    # only '.', '[' and ']' are actually changed. Confirm whether full-width
    # forms were intended for the other targets.
    english_punct = ',.!?()[]:;'
    chinese_punct = ',。!?()【】:;'
    text = text.translate(str.maketrans(english_punct, chinese_punct))
    text = text.strip(' ,。、【】_·:-@‘[;')

    # A string made only of strippable characters is now empty; indexing it
    # below would raise IndexError, so return it unchanged.
    if not text:
        return text

    # Make the outermost parentheses match at both ends.
    starts_open = text[0] == '('
    ends_closed = text[-1] == ')'
    if ends_closed and not starts_open:
        text = '(' + text
    elif starts_open and not ends_closed:
        text = text + ')'
    return text
def detect_subtitle(img): def detect_subtitle(img):
""" """
检测当前画面得到字幕信息 检测当前画面得到字幕信息
...@@ -162,6 +190,8 @@ def process_video(video_path, begin, end, state): ...@@ -162,6 +190,8 @@ def process_video(video_path, begin, end, state):
state[0] = float((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - begin) / (end - begin)) \ state[0] = float((video.get(cv2.CAP_PROP_POS_MSEC) / 1000 - begin) / (end - begin)) \
if state[0] is None or state[0] < 0.99 else 0.99 if state[0] is None or state[0] < 0.99 else 0.99
subTitle = detect_subtitle(frame) subTitle = detect_subtitle(frame)
if subTitle is not None:
subTitle = normalize(subTitle)
# 第一次找到字幕 # 第一次找到字幕
if lastSubTitle is None and subTitle is not None: if lastSubTitle is None and subTitle is not None:
start_time = video.get(cv2.CAP_PROP_POS_MSEC) / 1000 start_time = video.get(cv2.CAP_PROP_POS_MSEC) / 1000
......
...@@ -416,7 +416,7 @@ tabControl = ttk.Notebook(window) ...@@ -416,7 +416,7 @@ tabControl = ttk.Notebook(window)
tab1 = ttk.Frame(tabControl) tab1 = ttk.Frame(tabControl)
tabControl.add(tab1, text="旁白位置推荐") tabControl.add(tab1, text="旁白位置推荐")
tab2 = ttk.Frame(tabControl) tab2 = ttk.Frame(tabControl)
tabControl.add(tab2, text="旁白语音合成及字幕导出") tabControl.add(tab2, text="旁白及字幕导出")
tabControl.pack(expand=1, fill="both") tabControl.pack(expand=1, fill="both")
""" """
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment