Commit 961b319e authored by szr712

增加数据 改变增强倍数

parent cf45cfe1
*.pyc
.vscode
/weights
data/**/*.txt
data/**/*.pkl
/data/pkl
/data/result_data
/data/test_data
/data/*.txt
......@@ -132,7 +132,7 @@ def create_dataset(opt, SRC, TRG):
train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
batch_size_fn=None, train=True, shuffle=True,augment=True)
batch_size_fn=None, train=True, shuffle=True,augment=True,change_possibility=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
os.remove('translate_transformer_temp.csv')
......
......@@ -6,21 +6,15 @@ from build_corpus import split_initials_finals, wenzi2pinyin
import random
# Randomly perturbs the entries of `tones` in place and returns the same list.
# NOTE(review): this span is rendered from a diff hunk (see the
# "@@ -6,21 +6,15 @@" header above), so it appears to interleave lines removed
# and lines added by the commit, and indentation has been stripped. Confirm
# against the real blob which statements belong to the committed function.
def random_change_tones(tones):
# Candidate replacement values used by random.choice(options) below.
options=[0,1,2,3,4]
# One probability threshold is drawn from this list; the name is then rebound
# from the list to the chosen scalar.
change_possibility=[0.5, 0.6, 0.7, 0.8, 0.9, 1]
change_possibility=random.choice(change_possibility)
# Reseeds the module-level RNG with the fixed value 42. NOTE(review): if this
# line is part of the function body, every call reseeds, and it does so after
# the threshold above has already been drawn -- confirm this is intended.
random.seed(42)
# `x` is not read in the statements shown; only the index `i` is used.
for i,x in enumerate(tones):
# In 30 out of 100 draws, overwrite the entry with a random value from options.
if random.randint(0,99) < 30:
tones[i]=random.choice(options)
# With probability change_possibility, reset the entry to 0.
if random.random() < change_possibility:
tones[i]=0
# The caller's list has been mutated; the same object is returned.
return tones
hanzi_dir="./data/test_data/split_random_wo_tones/hanzi"
pinyin_dir="./data/test_data/split_random_wo_tones/pinyin"
with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
yunmus=f.readlines()
yunmus=[a.strip() for a in yunmus]
for file in os.listdir(hanzi_dir):
def convert_pinyin(file,hanzi_dir,pinyin_dir,new_file):
print(file)
with open(os.path.join(hanzi_dir,file),'r',encoding="utf-8") as f:
contents=f.readlines()
......@@ -29,7 +23,7 @@ for file in os.listdir(hanzi_dir):
sent = line.strip()
sent = sent.replace(" ","")
pinyins,tones=wenzi2pinyin(sent)
# tones=random_change_tones(tones)
tones=random_change_tones(tones)
pnyns=[]
i=0
for pinyin,tone in zip(pinyins,tones):
......@@ -41,5 +35,16 @@ for file in os.listdir(hanzi_dir):
i+=len(pinyin)
pnyns = " ".join(list(itertools.chain.from_iterable(pnyns)))
result.append(pnyns)
with open(os.path.join(pinyin_dir,file),"w",encoding="utf-8") as f:
with open(os.path.join(pinyin_dir,new_file),"w",encoding="utf-8") as f:
f.write("\n".join(result))
# Script entry point: runs a single convert_pinyin call on the dev split.
if __name__=="__main__":
# These two directories are referenced only by the commented-out batch loop
# at the bottom; the active call below passes its own paths.
hanzi_dir="./data/test_data/split_random_wo_tones/hanzi"
pinyin_dir="./data/test_data/split_random_wo_tones/pinyin2"
# with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
# yunmus=f.readlines()
# yunmus=[a.strip() for a in yunmus]
# Reads "dev_hanzi.txt" from ./data/dev and writes "dev_pinyin_split.txt"
# into the same directory.
convert_pinyin("dev_hanzi.txt","./data/dev","./data/dev","dev_pinyin_split.txt")
# NOTE(review): the disabled loop below passes three arguments, while
# convert_pinyin is declared with four (file, hanzi_dir, pinyin_dir, new_file);
# supply an output file name before re-enabling it.
# for file in os.listdir(hanzi_dir):
# convert_pinyin(file,hanzi_dir,pinyin_dir)
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment