Commit 8735fd67 authored by szr712

新增语文数据集

parent fd2c1cfb
@@ -214,12 +214,12 @@ if __name__ == "__main__":
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus]
# ori_dir="./data/train_file/ori_file_split_random_wo_tones"
# hanzi_dir="./data/train_file/hanzi_split_random_wo_tones"
# pinyin_dir="./data/train_file/pinyin_split_random_wo_tones"
# for file in os.listdir(ori_dir):
# build_corpus(os.path.join(ori_dir,file),
# os.path.join(pinyin_dir,file), os.path.join(hanzi_dir,file))
# print("Done")
build_corpus("./data/dev/dev_hanzi.txt",
"./data/dev/dev_pinyin_split.txt", "./data/dev/dev_hanzi_split.txt")
ori_dir="./data/Chinese/train/ori"
hanzi_dir="./data/Chinese/train/hanzi"
pinyin_dir="./data/Chinese/train/pinyin"
for file in os.listdir(ori_dir):
build_corpus(os.path.join(ori_dir,file),
os.path.join(pinyin_dir,file), os.path.join(hanzi_dir,file))
print("Done")
# build_corpus("./data/dev/dev_hanzi.txt",
# "./data/dev/dev_pinyin_split.txt", "./data/dev/dev_hanzi_split.txt")
@@ -39,12 +39,12 @@ def convert_pinyin(file,hanzi_dir,pinyin_dir,new_file):
f.write("\n".join(result))
if __name__=="__main__":
hanzi_dir="./data/test/hanzi"
pinyin_dir="./data/test/pinyin"
hanzi_dir="./data/Chinese/test/hanzi"
pinyin_dir="./data/Chinese/test/pinyin"
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus]
# convert_pinyin("dev_hanzi.txt","./data/dev","./data/dev","dev_pinyin_split.txt")
for file in os.listdir(hanzi_dir):
convert_pinyin(file,hanzi_dir,pinyin_dir,file)
\ No newline at end of file
convert_pinyin("dev_hanzi.txt","./data/Chinese/dev","./data/Chinese/dev","dev_pinyin_split.txt")
# for file in os.listdir(hanzi_dir):
# convert_pinyin(file,hanzi_dir,pinyin_dir,file)
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment