Commit 961b319e authored by szr712

增加数据 改变增强倍数

parent cf45cfe1
*.pyc *.pyc
.vscode .vscode
/weights /weights
data/**/*.txt /data/pkl
data/**/*.pkl /data/result_data
/data/test_data
/data/*.txt
...@@ -132,7 +132,7 @@ def create_dataset(opt, SRC, TRG): ...@@ -132,7 +132,7 @@ def create_dataset(opt, SRC, TRG):
train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device, train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
batch_size_fn=None, train=True, shuffle=True,augment=True) batch_size_fn=None, train=True, shuffle=True,augment=True,change_possibility=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
os.remove('translate_transformer_temp.csv') os.remove('translate_transformer_temp.csv')
......
...@@ -6,21 +6,15 @@ from build_corpus import split_initials_finals, wenzi2pinyin ...@@ -6,21 +6,15 @@ from build_corpus import split_initials_finals, wenzi2pinyin
import random import random
def random_change_tones(tones): def random_change_tones(tones):
options=[0,1,2,3,4] change_possibility=[0.5, 0.6, 0.7, 0.8, 0.9, 1]
change_possibility=random.choice(change_possibility)
random.seed(42) random.seed(42)
for i,x in enumerate(tones): for i,x in enumerate(tones):
if random.randint(0,99) < 30: if random.random() < change_possibility:
tones[i]=random.choice(options) tones[i]=0
return tones return tones
hanzi_dir="./data/test_data/split_random_wo_tones/hanzi" def convert_pinyin(file,hanzi_dir,pinyin_dir,new_file):
pinyin_dir="./data/test_data/split_random_wo_tones/pinyin"
with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
yunmus=f.readlines()
yunmus=[a.strip() for a in yunmus]
for file in os.listdir(hanzi_dir):
print(file) print(file)
with open(os.path.join(hanzi_dir,file),'r',encoding="utf-8") as f: with open(os.path.join(hanzi_dir,file),'r',encoding="utf-8") as f:
contents=f.readlines() contents=f.readlines()
...@@ -29,7 +23,7 @@ for file in os.listdir(hanzi_dir): ...@@ -29,7 +23,7 @@ for file in os.listdir(hanzi_dir):
sent = line.strip() sent = line.strip()
sent = sent.replace(" ","") sent = sent.replace(" ","")
pinyins,tones=wenzi2pinyin(sent) pinyins,tones=wenzi2pinyin(sent)
# tones=random_change_tones(tones) tones=random_change_tones(tones)
pnyns=[] pnyns=[]
i=0 i=0
for pinyin,tone in zip(pinyins,tones): for pinyin,tone in zip(pinyins,tones):
...@@ -41,5 +35,16 @@ for file in os.listdir(hanzi_dir): ...@@ -41,5 +35,16 @@ for file in os.listdir(hanzi_dir):
i+=len(pinyin) i+=len(pinyin)
pnyns = " ".join(list(itertools.chain.from_iterable(pnyns))) pnyns = " ".join(list(itertools.chain.from_iterable(pnyns)))
result.append(pnyns) result.append(pnyns)
with open(os.path.join(pinyin_dir,file),"w",encoding="utf-8") as f: with open(os.path.join(pinyin_dir,new_file),"w",encoding="utf-8") as f:
f.write("\n".join(result)) f.write("\n".join(result))
if __name__=="__main__":
hanzi_dir="./data/test_data/split_random_wo_tones/hanzi"
pinyin_dir="./data/test_data/split_random_wo_tones/pinyin2"
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus]
convert_pinyin("dev_hanzi.txt","./data/dev","./data/dev","dev_pinyin_split.txt")
# for file in os.listdir(hanzi_dir):
# convert_pinyin(file,hanzi_dir,pinyin_dir)
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment