Commit 5d221a5d authored by szr712

调整数据集:韵母对应汉字,政史地 物化生

parent a6dd2f51
......@@ -117,7 +117,7 @@ def align2(sent):
p = pnyns[i]
if '\u4e00' <= char <= '\u9fa5':
# hanzis.extend([char] + ["_"] * (len(p) - 1))
hanzis.extend(["_"] * (len(p) - 1) + [char])
hanzis.extend(["_"] * (len(p) - 1) + [char]) #韵母对应汉字
else:
for q in p:
hanzis.append(q)
......@@ -215,9 +215,9 @@ if __name__ == "__main__":
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus]
ori_dir="./data/math/train/ori"
hanzi_dir="./data/math/train/hanzi"
pinyin_dir="./data/math/train/pinyin"
ori_dir="./data/zsd/train/ori"
hanzi_dir="./data/zsd/train/hanzi"
pinyin_dir="./data/zsd/train/pinyin"
for file in os.listdir(ori_dir):
build_corpus(os.path.join(ori_dir,file),
os.path.join(pinyin_dir,file), os.path.join(hanzi_dir,file))
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment