Commit 7296e8d4 authored by 陈晓勇(工程师)

bugfix for win7 error

parents 5ea02ea4 ec38f0a0
......@@ -14,3 +14,30 @@ res/ffmpeg-4.3.1/bin/qiji_local.mp4
venv/
venv37/
shenming_test
cap.png
requirements3.8.txt
venv3.8-new/
webrtcvad-2.0.10-cp38-abi3-win_amd64.whl
xlsx-resource/
deal_ocr.csv
deal_srt.csv
new.srt
shenhai1.xlsx
shenhai2.xlsx
test,py
"\346\267\261\346\265\267\347\237\255\347\211\2072.xlsx"
"\346\267\261\346\265\267\347\237\255\347\211\207origin.xlsx"
11.py
222.py
cap/
cap1597.png
cap831.png
deal.py
deal_movie.py
movie_1.txt
movie_pro.txt
res/.paddleocr/2.3.0.1/ocr/paddleocr/
script1.py
test/
\ No newline at end of file
......@@ -45,6 +45,11 @@ cur_det_model_dir = paddle_dir + "det/ch/ch_PP-OCRv2_det_infer"
cur_rec_model_dir = paddle_dir + "rec/ch/ch_PP-OCRv2_rec_infer"
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False, cls_model_dir=cur_cls_model_dir, det_model_dir=cur_det_model_dir, rec_model_dir=cur_rec_model_dir)
# paddle_dir = "res/.paddleocr/2.3.0.1/ocr/paddleocr/"
# cur_det_model_dir = paddle_dir + "ch_PP-OCRv4_det_infer"
# cur_rec_model_dir = paddle_dir + "ch_PP-OCRv4_rec_infer"
# ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False, use_gpu=False, det_model_dir=cur_det_model_dir, rec_model_dir=cur_rec_model_dir)
# Normal speaking speed is 4 characters per second
normal_speed = 4
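A minimal sketch of how this pacing constant could be used to estimate narration length (the helper name is hypothetical, not part of this commit):
def estimated_read_seconds(text: str, speed: int = normal_speed) -> float:
    # At the assumed 4-characters-per-second pace, a 6-character line takes 1.5 s.
    return len(text) / speed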
......@@ -251,6 +256,7 @@ def normalize(text: str) -> str:
text = text + ')'
return text
def resize_img(img):
resize_height = 152
height, width = img.shape[:2]
......@@ -274,6 +280,14 @@ def resize_img(img):
smaller_image = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)
return smaller_image
def extract_white_prior(img, threshold=200):
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Apply a threshold so that non-white regions become black in the binary image
ret, binary_image = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
return binary_image
index = 0
t = 140
......@@ -343,7 +357,7 @@ def detect_subtitle(org_img: np.ndarray, lastSubTitle, last_confidence) -> Tuple
# cv2.imwrite(f'./test2.png', img)
index = index + 1
# index = index + 1
# img = img[int(up_b - height*0.2):int(down_b + height*0.2)]
# Upscale the image for low-frame-rate videos
print(height)
......@@ -352,9 +366,13 @@ def detect_subtitle(org_img: np.ndarray, lastSubTitle, last_confidence) -> Tuple
print(img.shape)
if img.shape[1] < 1000:
img = cv2.resize(img, (int(img.shape[1] * 1.5), int(img.shape[0] * 1.5)))
# global index
# cv2.imwrite(f'./cap/cap{index}.png', img)
# index = index + 1
# img = extract_white_prior(img)
cv2.imwrite(f'./cap/cap{index}.png', img)
index = index + 1
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>new log" + str(index - 1))
res = ocr.ocr(img, cls=True)
print('--------> res', res)
......@@ -367,6 +385,7 @@ def detect_subtitle(org_img: np.ndarray, lastSubTitle, last_confidence) -> Tuple
subTitle = ''
conf = 0
print('res --------->', res)
res.sort(key=lambda rect: rect[0][0][0] + rect[0][1][0])  # sort by box center point
for x in res:
# cv2.imshow("cut", img)
# cv2.waitKey(0)
......@@ -471,6 +490,7 @@ def process_video(video_path: str, begin: float, end: float, book_path: str, she
cur_time = video.get(cv2.CAP_PROP_POS_MSEC) / 1000
# Check whether the current frame has passed the limit
# end is mainly used to check for going out of range
if cur_time > end:
if cur_time - end_time > 1:
print('--------------------------------------------------')
......@@ -695,7 +715,7 @@ def add_to_list(mainWindow: MainWindow, element_type: str, li: list, ocr_h : int
mainWindow.last_aside_index = len(mainWindow.projectContext.all_elements) - 1
# end_time is mainly used to check for going out of range
def detect_with_ocr(video_path: str, book_path: str, start_time: float, end_time: float, state=None, mainWindow: MainWindow=None):
"""使用ocr检测视频获取字幕并输出旁白推荐
......
......@@ -711,6 +711,7 @@ class MainWindow(QMainWindow, Ui_MainWindow):
y1,y2 = evaluate_position(path.path()[1:], 0)
video_h = self.wgt_video.height()
self.widget.setY(int(video_h * y1) + 6)
time.sleep(1)
self.widget_bottom.setY(int(video_h * y2) + 10)
print("y1:%d,y2:%d" %(y1,y2))
......
import re
import sys
import csv
import jieba
import argparse
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
title = ['起始时间(转换后)', '终止时间(转换后)', '字幕']
from tqdm import tqdm
# title = ['起始时间(转换后)', '终止时间(转换后)', '字幕']
title = ['起始时间', '终止时间', '字幕']
def init():
# Load the list of Chinese stopwords
......@@ -22,13 +27,32 @@ def change_to_second(time_str):
time_obj.second + time_obj.microsecond / 1000000
return seconds
# Segment a Chinese sentence, avoiding the case where every segment is a stopword
def words_segment(str):
tmp = ','.join(jieba.cut(str))
# If segmentation only breaks the sentence into single characters, keep it unsegmented as well
if is_all_stopwords(tmp) or len(list(jieba.cut(str))) == len(str) :
return str
return tmp
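For illustration (the exact split depends on jieba's dictionary, so the output shown is indicative only):
print(words_segment("今天天气很好"))  # typically "今天,天气,很,好"; an all-stopword sentence is returned unchanged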
# Decide whether to extract the Chinese part from bilingual (Chinese/English) subtitles
def extract_info(str, has_english=False):
if not has_english:
return str
chinese_text = re.findall(r'[\u4e00-\u9fff]+', str)
return ' '.join(chinese_text)
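With bilingual subtitles this keeps only the Chinese runs, for example:
# extract_info("深海 The Deep", has_english=True)  -> "深海"
# extract_info("深海 The Deep", has_english=False) -> "深海 The Deep" (returned unchanged)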
# Compute the similarity between two subtitles
def calculate_similarity(str1, str2, method='cosine'):
if method == 'cosine':
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([str1, str2])
str1, str2 = words_segment(str1), words_segment(str2)
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform([str1, str2]) # shape=[2, N]
# print(np.array(tfidf_matrix.toarray()).shape, type(tfidf_matrix), tfidf_matrix.toarray())
similarity_matrix = cosine_similarity(tfidf_matrix)
return similarity_matrix[0][1]
elif method == 'distance':
return -String_edit_distance(str1, str2)
else :
return SequenceMatcher(None, str1, str2).ratio()
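A self-contained sanity check of the cosine branch (strings invented here; the exact score depends on jieba and TfidfVectorizer tokenization, so only its range is meaningful):
s1, s2 = "今天天气很好", "今天天气不错"
print(calculate_similarity(s1, s2, method='cosine'))  # a value in (0, 1]; identical non-stopword strings give 1.0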
......@@ -37,23 +61,62 @@ def calculate_time_difference(time1, time2):
return abs(time2 - time1)
def calculate_weight(x, y):
# weight = e^(-alpha * time_diff)
# a 1 s difference gives a coefficient of 0.9
alpha = 0.11
return 1 / (alpha * (x + y) + 1)
# # weight = e^(-alpha * time_diff)
# # a 1 s difference gives a coefficient of 0.9
# alpha = 0.11
# return 1 / (alpha * (x + y) + 1)
return 1.0 # the time weighting is disabled for now
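For reference, the retired formula does match its 0.9-at-one-second note:
# 1 / (0.11 * (x + y) + 1) with x + y = 1  ->  1 / 1.11 ≈ 0.901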
# Check whether every word in the sentence is a stopword
def is_all_stopwords(sentence):
sentence = sentence.replace(' ', '')
return all(word in stop_words for word in sentence)
# Edit distance algorithm (has issues!!!)
def String_edit_distance(str1, str2):
n, m = len(str1), len(str2)
dp = [[0 for _ in range(m+1)] for _ in range(n+1)]
for i in range(n+1):
dp[i][0] = i
for i in range(m+1):
dp[0][i] = i
dp[0][0] = 0
for i in range(1, n+1):
for j in range(1, m+1):
if str1[i-1] == str2[j-1]:
dp[i][j] = dp[i-1][j-1]
else :
dp[i][j] = min(dp[i-1][j-1], min(dp[i][j-1], dp[i-1][j])) + 1
# print(dp[n][m], n, m)
return 1.0 * dp[n][m] / max(n, m)
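Despite the warning above, the table is a standard Levenshtein DP and the return value is the distance normalized by the longer string (note it raises ZeroDivisionError when both strings are empty). For example:
# String_edit_distance("abc", "abd") -> 0.333...  (one substitution over length 3)
# String_edit_distance("abc", "abc") -> 0.0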
### If the text contains a '-' sign, Excel may automatically prepend '=' and turn it into a formula when the file is opened; reading it programmatically is unaffected
def read_srt_to_csv(path_srt, path_output):
try:
with open(path_srt, 'r', encoding='utf-8-sig') as f:
srt_content = f.read() # str
except UnicodeDecodeError:
print(f"编码错误,已经切换到utf-16编码")
try:
with open(path_srt, 'r', encoding='utf-16') as f:
srt_content = f.read() # str
except:
print(f"请选择utf-8或utf-16编码形式的srt文件")
sys.exit(1)
# Match timecodes and subtitle text with a regular expression
pattern = re.compile(r'(\d+)\n([\d:,]+) --> ([\d:,]+)\n(.+?)(?=\n\d+\n|$)', re.DOTALL)
matches = pattern.findall(srt_content)
has_english = []
for i in range(5):
idx = np.random.randint(len(matches))
pattern = re.compile(r'[a-zA-Z]')
has_english.append(bool(pattern.search(matches[idx][3])))
has_english = all(has_english)
print('!'*20, has_english)
# Write the csv file
with open(path_output, 'w', newline='', encoding='utf-8') as f:
csv_writer = csv.writer(f)
......@@ -61,7 +124,7 @@ def read_srt_to_csv(path_srt, path_output):
for _, start, end, subtitle in matches: # 都是str格式
subtitle = re.sub(r'\{[^}]*\}', '', subtitle)  # strip formatting tags (bold, etc.) from the srt subtitle
csv_writer.writerow([start, end, subtitle.strip()])
csv_writer.writerow([start, end, extract_info(subtitle.strip(), has_english)])
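To see what the timecode regex above captures, a small self-contained check (sample srt text invented here; re is already imported at the top of the file):
sample = ("1\n00:00:01,000 --> 00:00:03,000\n深海 The Deep\n\n"
          "2\n00:00:04,000 --> 00:00:05,500\n再见 Bye\n")
pattern = re.compile(r'(\d+)\n([\d:,]+) --> ([\d:,]+)\n(.+?)(?=\n\d+\n|$)', re.DOTALL)
for _, start, end, text in pattern.findall(sample):
    print(start, end, text.strip())
# 00:00:01,000 00:00:03,000 深海 The Deep
# 00:00:04,000 00:00:05,500 再见 Bye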
def read_from_xlsx(path_xlsx='output.xlsx', path_output='deal.csv'):
data = pd.read_excel(path_xlsx)
......@@ -70,20 +133,19 @@ def read_from_xlsx(path_xlsx='output.xlsx', path_output='deal.csv'):
csv_writer.writerow(title)
for _, data1 in data.iterrows():
start, end, subtitle = data1[1], data1[3], data1[4]
# print(data1[1])
start, end, subtitle = data1[0], data1[1], data1[2]
if isinstance(subtitle, float) and np.isnan(subtitle):
continue
# keep the time format consistent with the srt file
start = start.replace('.', ',')
end = end.replace('.', ',')
# print(start, end, subtitle,)
# print(type(start), type(end), type(subtitle))
csv_writer.writerow([start, end, subtitle.strip()])
### For each srt subtitle, compute a similarity score: find OCR subtitles whose timestamp difference is <= time_t,
### then compute the similarity between the subtitles and take the maximum. Both start and end times are matched.
# time_threshold sets the threshold used to decide whether the time difference is acceptable
def measure_score(path_srt, path_ocr, time_threshold=5.0, method='cosine'):
def measure_score(path_srt, path_ocr, time_threshold=5.0, time_threshold_re=False, method='cosine'):
data_srt, data_ocr = [], []
with open(path_srt, 'r', encoding='utf-8') as file:
csv_reader = csv.reader(file)
......@@ -103,22 +165,36 @@ def measure_score(path_srt, path_ocr, time_threshold=5.0, method='cosine'):
# Compute the similarity
total_similarity = 0.0
total_weight = 0.0
for sub in data_srt:
max_similarity = 0.0
txt1 = []
for sub in tqdm(data_srt, desc="Processing", ncols=100):
max_similarity = 0.0 if method != 'distance' else -1.0
# Skip srt subtitles that consist entirely of stopwords
if is_all_stopwords(sub[2]):
continue
subb = ""
for sub1 in data_ocr:
x, y = abs(sub[0] - sub1[0]), abs(sub[1] - sub1[1])
if min(x, y) <= time_threshold:
# print(sub[2], sub1[2])
score = calculate_similarity(sub[2], sub1[2], 'cosine')
if time_threshold_re:
time_threshold_tmp = time_threshold
else :
time_threshold_tmp = (sub[1] - sub[0]) * 0.3  # a 10 s subtitle allows a 3 s deviation
if min(x, y) <= time_threshold_tmp:
score = calculate_similarity(sub[2], sub1[2], method)
if max_similarity <= score * calculate_weight(x, y):
subb = sub1[2]
max_similarity = max(max_similarity, score * calculate_weight(x, y))
if max_similarity <= -0.5:
# print(max_similarity, sub[2], subb, sub[0])
txt1.append(' !!! '.join([str(max_similarity), sub[2], subb, str(sub[0])]))
total_similarity += max_similarity
total_weight += 1
if method == 'distance':
total_similarity = total_weight + total_similarity
with open('movie_pro.txt', 'w', encoding='utf-8') as f:
for i in txt1:
f.write(i + '\n')
# print(total_similarity, total_similarity / len(data_srt), total_similarity / total_weight)
return total_similarity / len(data_srt), total_similarity / total_weight
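A worked example of the distance-branch conversion above (numbers invented): with two subtitles whose normalized edit distances are 0.25 and 0.10, the loop accumulates total_similarity = -0.35 and total_weight = 2, so total_weight + total_similarity = 2 - 0.35 = 1.65 and the returned mean is 1.65 / 2 = 0.825, i.e. back on a 0-to-1 similarity scale.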
if __name__ == '__main__':
......@@ -128,13 +204,23 @@ if __name__ == '__main__':
# Add command-line arguments
parser.add_argument("--path_srt", required=True, type=str, help="Path of srt file, format is srt")
parser.add_argument("--path_ocr", required=True, type=str, help="Path of ocr file, format is xlsx")
parser.add_argument("--method", type=str, default='cosine', help="Select evaluation method")
parser.add_argument("--time_threshold", type=float,default=5.0, help="Allowable time frame")
parser.add_argument("--time_threshold", type=float, default=5.0, help="Allowable time frame")
parser.add_argument("--method", type=str, default='distance',choices=['cosine', 'distance', 'sequence']
, help="Select evaluation method")
parser.add_argument("--time_threshold_re", type=bool, default=True, help="Specify whether \
time threshold is required")
args = parser.parse_args()
output_file_srt = 'deal_srt.csv'
output_file_ocr = 'deal_ocr.csv'
read_srt_to_csv(args.path_srt, output_file_srt)
read_from_xlsx(args.path_ocr, output_file_ocr)
score = measure_score(output_file_srt, output_file_ocr, args.time_threshold, args.method)
print(f'Evaluation score: {score[1]:.5f}')
\ No newline at end of file
score = measure_score(output_file_srt, output_file_ocr, args.time_threshold,
                      args.time_threshold_re, args.method)
print(f'Evaluation score: {100 * score[1]:.3f}')
# python ocr_metric.py --path_srt test/new/movie_1.srt --path_ocr ../测试/the-swan-v3/The.Swan-zimu.xlsx --time_threshold 10
\ No newline at end of file