修改部分代码;翻译增加短句;batch 增加最后判断(仅产出长度等于 batch_size 的批次)

fd2c1cfb · szr712 · 5e1da936 · fd2c1cfb · fd2c1cfb · fd2c1cfb
Commit fd2c1cfb authored Dec 11, 2021 by szr712
19 changed files
--- a/Batch.py
+++ b/Batch.py
@@ -99,7 +99,8 @@ class MyIterator(data.Iterator):
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
-                        yield b
+                        if len(b) == self.batch_size:
+                            yield b
            self.batches = pool(self.data(), self.random_shuffler)

        else:

--- a/Models.py
+++ b/Models.py
@@ -94,6 +94,14 @@ def get_model_token_classification(opt, src_vocab, trg_vocab):
    if opt.load_weights is not None:
        print("loading pretrained weights...")
        model.load_state_dict(torch.load(f'{opt.load_weights}'))
+        # checkpoint = torch.load(opt.load_weights, map_location=lambda storage, loc: storage)
+        # state_dict = {}
+        # for k, v in checkpoint.items():
+        #     if k.startswith('module'):
+        #         state_dict[k[7:]] = v
+        #     else:
+        #         state_dict[k] = v
+        # model.load_state_dict(state_dict)
    else:
        for p in model.parameters():
            if p.dim() > 1:

--- a/cer.py
+++ b/cer.py
@@ -57,7 +57,7 @@ def cer(preFile,textFile):
                                    round(float(total_edit_distance)/num_chars, 5)))

 if __name__ == "__main__":
-    preFile = "./data/test_data/split_new_data_daoxuehao/result_random_change_tones"
-    textFile = "./data/test_data/hanzi_new"
+    preFile = "./data/test_data/tmp/2"
+    textFile = "./data/test_data/tmp/3"
    cer(preFile,textFile)
    
\ No newline at end of file
--- a/convert_test_set.py
+++ b/convert_test_set.py
@@ -6,7 +6,7 @@ from build_corpus import split_initials_finals, wenzi2pinyin
 import random

 def random_change_tones(tones):
-    change_possibility=[0.5, 0.6, 0.7, 0.8, 0.9, 1]
+    change_possibility=[0.78]
    change_possibility=random.choice(change_possibility)
    random.seed(42)
    for i,x in enumerate(tones):
@@ -39,12 +39,12 @@ def convert_pinyin(file,hanzi_dir,pinyin_dir,new_file):
        f.write("\n".join(result))

 if __name__=="__main__":
-    hanzi_dir="./data/test_data/split_random_wo_tones/hanzi"
-    pinyin_dir="./data/test_data/split_random_wo_tones/pinyin2"
+    hanzi_dir="./data/test/hanzi"
+    pinyin_dir="./data/test/pinyin"

 # with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
 #         yunmus=f.readlines()
 #         yunmus=[a.strip() for a in yunmus]
-    convert_pinyin("dev_hanzi.txt","./data/dev","./data/dev","dev_pinyin_split.txt")
-    # for file in os.listdir(hanzi_dir):
-    #     convert_pinyin(file,hanzi_dir,pinyin_dir)
\ No newline at end of file
+    # convert_pinyin("dev_hanzi.txt","./data/dev","./data/dev","dev_pinyin_split.txt")
+    for file in os.listdir(hanzi_dir):
+        convert_pinyin(file,hanzi_dir,pinyin_dir,file)
\ No newline at end of file
--- a/data/test/pinyin/chemistry.txt
+++ b/data/test/pinyin/chemistry.txt
--- a/data/test/pinyin/chinese.txt
+++ b/data/test/pinyin/chinese.txt
--- a/data/test/pinyin/geography.txt
+++ b/data/test/pinyin/geography.txt
--- a/data/test/pinyin/history.txt
+++ b/data/test/pinyin/history.txt
--- a/data/test/pinyin/math.txt
+++ b/data/test/pinyin/math.txt
--- a/data/test/pinyin/physics.txt
+++ b/data/test/pinyin/physics.txt
--- a/data/test/pinyin/politics.txt
+++ b/data/test/pinyin/politics.txt
--- a/data/test/pinyin/zho_news_2007-2009_1M-sentences_2.txt
+++ b/data/test/pinyin/zho_news_2007-2009_1M-sentences_2.txt
--- a/data/test/pre/chemistry.txt
+++ b/data/test/pre/chemistry.txt
--- a/data/test/pre/chinese.txt
+++ b/data/test/pre/chinese.txt
--- a/data/test/pre/geography.txt
+++ b/data/test/pre/geography.txt
--- a/data/test/pre/history.txt
+++ b/data/test/pre/history.txt
--- a/data/test/pre/math.txt
+++ b/data/test/pre/math.txt
--- a/data/test/pre/physics.txt
+++ b/data/test/pre/physics.txt