使用新增数据训练

6442322f · szr712 · c217436c
Commit 6442322f authored Nov 29, 2021 by szr712
11 changed files
--- a/README.md
+++ b/README.md
@@ -24,14 +24,17 @@ CUDA_VISIBLE_DEVICES=1 python train2.py -src_data data/pinyin_2.txt -trg_data da

 CUDA_VISIBLE_DEVICES=1 nohup python train2.py -src_data data/pinyin_2.txt -trg_data data/hanzi_2.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name token_classification 

-CUDA_VISIBLE_DEVICES=3 python translate_pkl.py -load_weights weights/token_classification/11-09_22:00:55/token_classification_35_0.055335590355098246 -pkl_dir weights/token_classification/11-09_22:00:55 -test_dir data/pkl/test-pkl -result_dir data/pkl/test-pkl-result
+CUDA_VISIBLE_DEVICES=3 python translate_pkl.py -load_weights weights/token_classification/11-09_22:00:55/token_classification_35_0.055335590355098246 -pkl_dir weights/token_classification/11-09_22:00:55 -test_dir data/pkl/pinyin_pkl -result_dir data/pkl/pinyin_pkl_result

 CUDA_VISIBLE_DEVICES=2 nohup python train_token_classification.py -src_data data/pinyin_split.txt -trg_data data/hanzi_split.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name token_classification_split -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt

-CUDA_VISIBLE_DEVICES=1 python translate2.py -load_weights weights/token_classification_split_2/11-19_17:16:18/token_classification_split_2_5_0.05776993067935109 -pkl_dir weights/token_classification_split_2/11-19_17:16:18 -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt
+CUDA_VISIBLE_DEVICES=1 python translate2.py -load_weights weights/token_classification_split_4/11-23_22:02:06/token_classification_split_4_25_0.02742394618457183 -pkl_dir weights/token_classification_split_4/11-23_22:02:06 -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt

-CUDA_VISIBLE_DEVICES=4 nohup python translate_file2.py -load_weights weights/token_classification_split_3/11-22_21:56:11/token_classification_split_3_25_0.029638311734888702 -pkl_dir weights/token_classification_split_3/11-22_21:56:11 -test_dir data/test_data/pinyin_split -result_dir data/test_data/result_split -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt >log1 2>&1 & 
+CUDA_VISIBLE_DEVICES=4 nohup python translate_file2.py -load_weights weights/token_classification_split_4/11-23_22:02:06/token_classification_split_4_25_0.02742394618457183 -pkl_dir weights/token_classification_split_4/11-23_22:02:06 -test_dir data/test_data/pinyin_split -result_dir data/test_data/result_split -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt >log1 2>&1 & 

-CUDA_VISIBLE_DEVICES=3 python eval_model.py -load_weights weights/token_classification_split_3/11-22_21:56:11/token_classification_split_3_1_0.09703897424042225 -pkl_dir weights/token_classification_split_3/11-22_21:56:11 -dev_dir data/dev -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt
+CUDA_VISIBLE_DEVICES=1 python eval_model.py -load_weights weights/token_classification_split_4/11-23_22:02:06/token_classification_split_4_1_0.09183966986835003 -pkl_dir weights/token_classification_split_4/11-23_22:02:06 -dev_dir data/dev -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt >log1 2>&1 & 

-CUDA_VISIBLE_DEVICES=2 nohup python train_token_classification.py -src_data data/pinyin_split.txt -trg_data data/hanzi_split.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name token_classification_split_4 -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt
\ No newline at end of file
+CUDA_VISIBLE_DEVICES=6 nohup python train_token_classification.py -src_data data/pinyin_split.txt -trg_data data/hanzi_split.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name token_classification_split_4 -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt
+
+
+CUDA_VISIBLE_DEVICES=2 nohup python train_token_classification.py -src_data data/pinyin_new_split.txt -trg_data data/hanzi_new_split.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name token_classification_split_new -src_voc ./data/voc/pinyin.txt -trg_voc ./data/voc/hanzi.txt >log1 2>&1 & 
--- a/build_corpus.py
+++ b/build_corpus.py
@@ -214,6 +214,6 @@ if __name__ == "__main__":
    # with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
    #     yunmus=f.readlines()
    #     yunmus=[a.strip() for a in yunmus]
-    build_corpus("./data/train_set_total.txt",
-                 "./data/pinyin_split.txt", "./data/hanzi_split.txt")
+    build_corpus("./data/train_set_new.txt",
+                 "./data/pinyin_new_split.txt", "./data/hanzi_new_split.txt")
    print("Done")
--- a/convert_test_set.py
+++ b/convert_test_set.py
@@ -4,8 +4,8 @@ from tqdm import tqdm

 from build_corpus import split_initials_finals, wenzi2pinyin

-hanzi_dir="./data/test_data/hanzi"
-pinyin_dir="./data/test_data/pinyin_split"
+hanzi_dir="./data/test_data/hanzi_new"
+pinyin_dir="./data/test_data/pinyin_split_new"

 with open("./data/voc/yunmu.txt","r",encoding="utf-8") as f:
        yunmus=f.readlines()

--- a/eval_model.py
+++ b/eval_model.py
@@ -91,7 +91,7 @@ def main():
    i=1
    while i<=60:
        for model_name in os.listdir(opt.pkl_dir):
-            if "token_classification_split_3_"+str(i)+"_" in model_name:
+            if "token_classification_split_new_"+str(i)+"_" in model_name:
                print("model_name:{}".format(model_name))

                opt.load_weights=os.path.join(opt.pkl_dir,model_name)

--- a/log
+++ b/log
--- a/log1
+++ b/log1
--- a/nohup.out
+++ b/nohup.out
--- a/test.py
+++ b/test.py
--- a/tmp.py
+++ b/tmp.py
--- a/translate2.py
+++ b/translate2.py
--- a/translate_pkl.py
+++ b/translate_pkl.py