Commit a6dd2f51 authored by szr712

调整数据集:韵母对应汉字,数学

parent 1cd8367b
......@@ -116,7 +116,8 @@ def align2(sent):
char = sent[i+x]
p = pnyns[i]
if '\u4e00' <= char <= '\u9fa5':
hanzis.extend([char] + ["_"] * (len(p) - 1))
# hanzis.extend([char] + ["_"] * (len(p) - 1))
hanzis.extend(["_"] * (len(p) - 1) + [char])
else:
for q in p:
hanzis.append(q)
......@@ -214,9 +215,9 @@ if __name__ == "__main__":
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus]
ori_dir="./data/Chinese/train/ori"
hanzi_dir="./data/Chinese/train/hanzi"
pinyin_dir="./data/Chinese/train/pinyin"
ori_dir="./data/math/train/ori"
hanzi_dir="./data/math/train/hanzi"
pinyin_dir="./data/math/train/pinyin"
for file in os.listdir(ori_dir):
build_corpus(os.path.join(ori_dir,file),
os.path.join(pinyin_dir,file), os.path.join(hanzi_dir,file))
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment