Initial commit

da232cae · szr712 · da232cae · da232cae · da232cae · da232cae
Commit da232cae authored Nov 08, 2021 by szr712
15 changed files
--- a/.gitignore
+++ b/.gitignore
+*.pyc
+.vscode
+/weights
+data/**/*.txt
+data/**/*.pkl
--- a/Batch.py
+++ b/Batch.py
+import torch
+from torchtext import data
+import numpy as np
+from torch.autograd import Variable
+def nopeak_mask(size, opt):
+    """Build the "no peeking" mask used for target-side self-attention.
+    Returns a boolean tensor of shape (1, size, size) that is True on and
+    below the diagonal and False above it. The mask is moved to CUDA when
+    ``opt.device == 0``.
+    """
+    # Ones strictly above the diagonal mark the positions to hide.
+    hidden = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
+    allowed = Variable(torch.from_numpy(hidden) == 0)
+    return allowed.cuda() if opt.device == 0 else allowed
+def create_masks(src, trg, opt):
+    """Create the padding mask for ``src`` and the combined mask for ``trg``.
+    ``src_mask`` is True wherever ``src`` differs from ``opt.src_pad``. When
+    ``trg`` is given, ``trg_mask`` is the padding mask of ``trg`` (using
+    ``opt.trg_pad``) AND-ed with the mask from ``nopeak_mask``; otherwise it
+    is None.
+    Returns:
+        (src_mask, trg_mask)
+    """
+    src_mask = (src != opt.src_pad).unsqueeze(-2)
+    if trg is None:
+        return src_mask, None
+    pad_mask = (trg != opt.trg_pad).unsqueeze(-2)
+    seq_len = trg.size(1)  # dimension 1 of trg sizes the square no-peek mask
+    # Tensor.cuda() returns a copy, so the original bare `np_mask.cuda()` did
+    # nothing; move the mask to the device of the tensor it is combined with.
+    np_mask = nopeak_mask(seq_len, opt).to(pad_mask.device)
+    return src_mask, pad_mask & np_mask
+# patch on Torchtext's batching process that makes it more efficient
+# from http://nlp.seas.harvard.edu/2018/04/03/attention.html#position-wise-feed-forward-networks
+class MyIterator(data.Iterator):
+    """Iterator that batches together examples with neighbouring sort keys."""
+    def create_batches(self):
+        """Populate ``self.batches`` for one pass over the data.
+        In training mode, chunks of ``batch_size * 100`` examples are sorted by
+        ``sort_key``, cut into batches, and the batches of each chunk are
+        shuffled. Otherwise the data is batched in order and each batch is
+        sorted by ``sort_key``.
+        """
+        if not self.train:
+            self.batches = [
+                sorted(batch, key=self.sort_key)
+                for batch in data.batch(self.data(), self.batch_size,
+                                        self.batch_size_fn)
+            ]
+            return
+        def pool(examples, random_shuffler):
+            # Lazily yields shuffled batches, one sorted chunk at a time.
+            for chunk in data.batch(examples, self.batch_size * 100):
+                ordered = sorted(chunk, key=self.sort_key)
+                chunk_batches = list(data.batch(ordered, self.batch_size,
+                                                self.batch_size_fn))
+                yield from random_shuffler(chunk_batches)
+        self.batches = pool(self.data(), self.random_shuffler)
+# Running maxima for the batch currently being assembled. They are initialised
+# here because the original module-level `global` statement declared nothing,
+# so a first call with count != 1 raised NameError.
+max_src_in_batch = 0
+max_tgt_in_batch = 0
+def batch_size_fn(new, count, sofar):
+    """Keep augmenting batch and calculate total number of tokens + padding.
+    Args:
+        new: example exposing ``src`` and ``trg`` sequences.
+        count: number of examples in the batch including ``new``; a value of
+            1 resets the running maxima.
+        sofar: not used by this function.
+    Returns:
+        The larger of ``count * longest source`` and ``count * longest target``
+        seen since the last reset (target lengths are counted with +2).
+    """
+    global max_src_in_batch, max_tgt_in_batch
+    if count == 1:
+        max_src_in_batch = 0
+        max_tgt_in_batch = 0
+    max_src_in_batch = max(max_src_in_batch, len(new.src))
+    # +2 on the target side -- presumably the <sos>/<eos> tokens; confirm.
+    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
+    src_elements = count * max_src_in_batch
+    tgt_elements = count * max_tgt_in_batch
+    return max(src_elements, tgt_elements)
--- a/Beam.py
+++ b/Beam.py
+import torch
+from Batch import nopeak_mask
+import torch.nn.functional as F
+import math
+def init_vars(src, model, SRC, TRG, opt):
+    """Run the encoder once and seed ``opt.k`` beams with the first decode step.
+    Returns:
+        outputs: (opt.k, opt.max_len) long tensor; column 0 holds '<sos>' and
+            column 1 the top-k first tokens.
+        e_outputs: the encoder output of ``src`` copied once per beam.
+        log_scores: (1, opt.k) tensor with the log-probability of each beam.
+    """
+    on_gpu = opt.device == 0
+    sos_tok = TRG.vocab.stoi['<sos>']
+    pad_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
+    memory = model.encoder(src, pad_mask)
+    # Decode a lone '<sos>' token to obtain the first-step distribution.
+    start = torch.LongTensor([[sos_tok]])
+    if on_gpu:
+        start = start.cuda()
+    first_mask = nopeak_mask(1, opt)
+    logits = model.out(model.decoder(start, memory, pad_mask, first_mask))
+    first_probs, first_ix = F.softmax(logits, dim=-1)[:, -1].data.topk(opt.k)
+    log_scores = torch.Tensor(
+        [math.log(p) for p in first_probs.data[0]]).unsqueeze(0)
+    outputs = torch.zeros(opt.k, opt.max_len).long()
+    if on_gpu:
+        outputs = outputs.cuda()
+    outputs[:, 0] = sos_tok
+    outputs[:, 1] = first_ix[0]
+    e_outputs = torch.zeros(opt.k, memory.size(-2), memory.size(-1))
+    if on_gpu:
+        e_outputs = e_outputs.cuda()
+    e_outputs[:, :] = memory[0]
+    return outputs, e_outputs, log_scores
+def k_best_outputs(outputs, out, log_scores, i, k):
+    """Advance the beams by one token, keeping the ``k`` best continuations.
+    Args:
+        outputs: beam token tensor; updated in place.
+        out: per-beam probabilities; only ``out[:, -1]`` is used.
+        log_scores: cumulative log-probabilities of the current beams.
+        i: column of ``outputs`` that receives the new token.
+        k: beam width.
+    Returns:
+        (outputs, log_scores), with ``log_scores`` of shape (1, k).
+    """
+    step_probs, step_ix = out[:, -1].data.topk(k)
+    step_logs = torch.Tensor([math.log(p) for p in step_probs.data.view(-1)])
+    # Row b holds beam b's k candidates; add that beam's running score to each.
+    totals = step_logs.view(k, -1) + log_scores.transpose(0, 1)
+    best_scores, flat_ix = totals.view(-1).topk(k)
+    beam_ix = flat_ix // k
+    cand_ix = flat_ix % k
+    outputs[:, :i] = outputs[beam_ix, :i]
+    outputs[:, i] = step_ix[beam_ix, cand_ix]
+    return outputs, best_scores.unsqueeze(0)
+def beam_search(src, model, SRC, TRG, opt):
+    """Decode ``src`` with beam search and return the chosen beam as a string.
+    Beams are extended one token at a time until every beam contains '<eos>'
+    or ``opt.max_len`` columns are filled. If all beams finish, the beam with
+    the highest length-normalised score wins; otherwise beam 0 is used.
+    Returns:
+        The tokens between '<sos>' and the first '<eos>' of the chosen beam,
+        joined without separators, or "" if that beam contains no '<eos>'.
+    """
+    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, opt)
+    eos_tok = TRG.vocab.stoi['<eos>']
+    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
+    alpha = 0.7  # exponent of the length normalisation
+    best = 0  # fallback beam when not every beam reaches '<eos>'
+    # `step` is never rebound inside the loop (the original reused `i` for the
+    # beam index as well, which was harmless but easy to misread).
+    for step in range(2, opt.max_len):
+        trg_mask = nopeak_mask(step, opt)
+        out = model.out(model.decoder(outputs[:, :step],
+                                      e_outputs, src_mask, trg_mask))
+        out = F.softmax(out, dim=-1)
+        outputs, log_scores = k_best_outputs(outputs, out, log_scores, step, opt.k)
+        # Allocate next to the beams; the original called .cuda()
+        # unconditionally, which crashed CPU-only decoding.
+        sentence_lengths = torch.zeros(len(outputs), dtype=torch.long,
+                                       device=outputs.device)
+        # nonzero() lists (beam, position) pairs in row-major order, so the
+        # first hit recorded per beam is its earliest '<eos>'.
+        for beam, pos in (outputs == eos_tok).nonzero():
+            if sentence_lengths[beam] == 0:
+                sentence_lengths[beam] = pos
+        if int((sentence_lengths > 0).sum()) == opt.k:
+            div = 1 / (sentence_lengths.type_as(log_scores) ** alpha)
+            _, ind = torch.max(log_scores * div, 1)
+            best = ind.data[0]
+            break
+    # An explicit emptiness check replaces the bare `except:` that used to
+    # hide every error raised while locating '<eos>'.
+    eos_positions = (outputs[best] == eos_tok).nonzero()
+    if eos_positions.numel() == 0:
+        return ""
+    length = int(eos_positions[0])
+    return ''.join([TRG.vocab.itos[tok] for tok in outputs[best][1:length]])
--- a/Embed.py
+++ b/Embed.py
+import torch
+import torch.nn as nn
+import math
+from torch.autograd import Variable
+class Embedder(nn.Module):
+    """Lookup table from token indices to ``d_model``-sized vectors."""
+    def __init__(self, vocab_size, d_model):
+        super().__init__()
+        self.d_model = d_model
+        self.embed = nn.Embedding(num_embeddings=vocab_size,
+                                  embedding_dim=d_model)
+    def forward(self, x):
+        """Return the embedding vectors for the indices in ``x``."""
+        vectors = self.embed(x)
+        return vectors
+class PositionalEncoder(nn.Module):
+    """Scale the input, add a fixed sinusoidal position table, apply dropout.
+    Args:
+        d_model: size of the last dimension of the inputs.
+        max_seq_len: number of positions precomputed in the table.
+        dropout: dropout probability applied to the result.
+    """
+    def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
+        super().__init__()
+        self.d_model = d_model
+        self.dropout = nn.Dropout(dropout)
+        # create constant 'pe' matrix with values dependent on
+        # pos and i
+        pe = torch.zeros(max_seq_len, d_model)
+        for pos in range(max_seq_len):
+            for i in range(0, d_model, 2):
+                pe[pos, i] = \
+                math.sin(pos / (10000 ** ((2 * i)/d_model)))
+                # With an odd d_model the last sine column has no cosine
+                # partner; the original indexed past the end (IndexError).
+                if i + 1 < d_model:
+                    pe[pos, i + 1] = \
+                    math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
+        # The leading dim of 1 broadcasts over dim 0 of the input. Registered
+        # as a buffer, so it is not a parameter but moves with the module.
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
+    def forward(self, x):
+        """Return ``dropout(x * sqrt(d_model) + pe[:, :x.size(1)])``."""
+        # make embeddings relatively larger
+        x = x * math.sqrt(self.d_model)
+        # add constant to embedding
+        seq_len = x.size(1)
+        # The original `pe.cuda()` discarded its result; align devices
+        # explicitly. The buffer needs no Variable wrapper to stay grad-free.
+        pe = self.pe[:, :seq_len].to(x.device)
+        x = x + pe
+        return self.dropout(x)
\ No newline at end of file
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/Layers.py
+++ b/Layers.py
+import torch
+import torch.nn as nn
+from Sublayers import FeedForward, MultiHeadAttention, Norm
+class EncoderLayer(nn.Module):
+    """Encoder layer: normalised self-attention and feed-forward, each added
+    back to its input through dropout."""
+    def __init__(self, d_model, heads, dropout=0.1):
+        super().__init__()
+        # Sub-module creation order is kept as in the original: it determines
+        # parameter ordering and the order random initial values are drawn.
+        self.norm_1 = Norm(d_model)
+        self.norm_2 = Norm(d_model)
+        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.ff = FeedForward(d_model, dropout=dropout)
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+    def forward(self, x, mask):
+        """Apply both residual sub-blocks to ``x`` using attention ``mask``."""
+        normed = self.norm_1(x)
+        attended = self.attn(normed, normed, normed, mask)
+        x = x + self.dropout_1(attended)
+        transformed = self.ff(self.norm_2(x))
+        return x + self.dropout_2(transformed)
+# Decoder layer: two multi-head attention sub-blocks followed by one
+# feed-forward sub-block.
+class DecoderLayer(nn.Module):
+    """Decoder layer: self-attention over the target, attention over the
+    encoder outputs, then feed-forward; each is normalised first and added
+    back to its input through dropout."""
+    def __init__(self, d_model, heads, dropout=0.1):
+        super().__init__()
+        # Sub-module creation order is kept as in the original: it determines
+        # parameter ordering and the order random initial values are drawn.
+        self.norm_1 = Norm(d_model)
+        self.norm_2 = Norm(d_model)
+        self.norm_3 = Norm(d_model)
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+        self.dropout_3 = nn.Dropout(dropout)
+        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.ff = FeedForward(d_model, dropout=dropout)
+    def forward(self, x, e_outputs, src_mask, trg_mask):
+        """Apply the three residual sub-blocks to ``x``.
+        ``trg_mask`` is used by the first attention (queries, keys and values
+        all come from ``x``); ``src_mask`` by the second, whose keys and
+        values are ``e_outputs``.
+        """
+        normed = self.norm_1(x)
+        self_attended = self.attn_1(normed, normed, normed, trg_mask)
+        x = x + self.dropout_1(self_attended)
+        normed = self.norm_2(x)
+        cross_attended = self.attn_2(normed, e_outputs, e_outputs, src_mask)
+        x = x + self.dropout_2(cross_attended)
+        transformed = self.ff(self.norm_3(x))
+        return x + self.dropout_3(transformed)
\ No newline at end of file
--- a/Models.py
+++ b/Models.py
+import torch
+import torch.nn as nn 
+from Layers import EncoderLayer, DecoderLayer
+from Embed import Embedder, PositionalEncoder
+from Sublayers import Norm
+import copy
+def get_clones(module, N):
+    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``."""
+    clones = nn.ModuleList()
+    for _ in range(N):
+        clones.append(copy.deepcopy(module))
+    return clones
+class Encoder(nn.Module):
+    """Embedding and positional encoding followed by ``N`` encoder layers and
+    a final norm."""
+    def __init__(self, vocab_size, d_model, N, heads, dropout):
+        super().__init__()
+        self.N = N
+        self.embed = Embedder(vocab_size, d_model)
+        self.pe = PositionalEncoder(d_model, dropout=dropout)
+        prototype = EncoderLayer(d_model, heads, dropout)
+        self.layers = get_clones(prototype, N)
+        self.norm = Norm(d_model)
+    def forward(self, src, mask):
+        """Encode ``src`` under attention ``mask`` and return the normed result."""
+        hidden = self.pe(self.embed(src))
+        for layer in self.layers:
+            hidden = layer(hidden, mask)
+        return self.norm(hidden)
+class Decoder(nn.Module):
+    """Embedding and positional encoding followed by ``N`` decoder layers and
+    a final norm."""
+    def __init__(self, vocab_size, d_model, N, heads, dropout):
+        super().__init__()
+        self.N = N
+        self.embed = Embedder(vocab_size, d_model)
+        self.pe = PositionalEncoder(d_model, dropout=dropout)
+        prototype = DecoderLayer(d_model, heads, dropout)
+        self.layers = get_clones(prototype, N)
+        self.norm = Norm(d_model)
+    def forward(self, trg, e_outputs, src_mask, trg_mask):
+        """Decode ``trg`` against ``e_outputs`` and return the normed result."""
+        hidden = self.pe(self.embed(trg))
+        for layer in self.layers:
+            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
+        return self.norm(hidden)
+class Transformer(nn.Module):
+    """Encoder-decoder model with a linear projection onto ``trg_vocab`` outputs."""
+    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
+        super().__init__()
+        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
+        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
+        self.out = nn.Linear(d_model, trg_vocab)
+    def forward(self, src, trg, src_mask, trg_mask):
+        """Encode ``src``, decode ``trg`` against it, and project the result."""
+        memory = self.encoder(src, src_mask)
+        decoded = self.decoder(trg, memory, src_mask, trg_mask)
+        return self.out(decoded)
+def get_model(opt, src_vocab, trg_vocab):
+    """Build a ``Transformer`` from ``opt`` and load or initialise its weights.
+    Args:
+        opt: options providing ``d_model``, ``heads``, ``dropout``,
+            ``n_layers``, ``load_weights`` and ``device``.
+        src_vocab: source vocabulary size.
+        trg_vocab: target vocabulary size.
+    Returns:
+        The model, moved to CUDA when ``opt.device == 0``.
+    """
+    assert opt.d_model % opt.heads == 0, "d_model must be divisible by heads"
+    assert opt.dropout < 1, "dropout must be below 1"
+    model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout)
+    if opt.load_weights is not None:
+        print("loading pretrained weights...")
+        # Load onto the CPU first so a checkpoint saved from a GPU run can be
+        # restored on a machine without CUDA; the model is moved below.
+        state = torch.load(f'{opt.load_weights}', map_location='cpu')
+        model.load_state_dict(state)
+    else:
+        # Xavier-initialise every parameter with more than one dimension.
+        for p in model.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    if opt.device == 0:
+        model = model.cuda()
+    return model
--- a/Optim.py
+++ b/Optim.py
+import torch
+import numpy as np
+# code from AllenNLP
+class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
+    """
+    Cosine-annealed learning rate that restarts at the end of every cycle.
+    Parameters
+    ----------
+    optimizer : torch.optim.Optimizer
+    T_max : int
+        Length, in iterations, of the first cycle.
+    eta_min : float, optional (default: 0)
+        Lower bound of the learning rate.
+    last_epoch : int, optional (default: -1)
+        Index of the last epoch.
+    factor : float, optional (default: 1)
+        At each restart the cycle-length multiplier is scaled by this value.
+    """
+    def __init__(self,
+                 optimizer: torch.optim.Optimizer,
+                 T_max: int,
+                 eta_min: float = 0.,
+                 last_epoch: int = -1,
+                 factor: float = 1.) -> None:
+        # pylint: disable=invalid-name
+        self.T_max = T_max
+        self.eta_min = eta_min
+        self.factor = factor
+        self._last_restart: int = 0
+        self._cycle_counter: int = 0
+        self._cycle_factor: float = 1.
+        self._updated_cycle_len: int = T_max
+        self._initialized: bool = False
+        super().__init__(optimizer, last_epoch)
+    def get_lr(self):
+        """Return the learning rates for the next step, restarting when due."""
+        # HACK: the very first get_lr() call is answered with the base rates
+        # so that the schedule itself starts counting from step 0.
+        if not self._initialized:
+            self._initialized = True
+            return self.base_lrs
+        step = self.last_epoch + 1
+        self._cycle_counter = step - self._last_restart
+        cycle_len = self._updated_cycle_len
+        phase = self._cycle_counter % cycle_len
+        # Falls from 2 at the start of a cycle towards 0 at its end.
+        scale = np.cos(np.pi * phase / cycle_len) + 1
+        lrs = [
+            self.eta_min + ((lr - self.eta_min) / 2) * scale
+            for lr in self.base_lrs
+        ]
+        if phase == 0:
+            # Restart: rescale the cycle length and remember where it began.
+            self._cycle_factor *= self.factor
+            self._cycle_counter = 0
+            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
+            self._last_restart = step
+        return lrs
\ No newline at end of file
--- a/Process.py
+++ b/Process.py
+import pandas as pd
+import torchtext
+from torchtext import data
+from Tokenize import tokenize
+from Batch import MyIterator, batch_size_fn
+import os
+import dill as pickle
+from pypinyin import Style, pinyin
+from pypinyin.core import lazy_pinyin
+from tqdm import tqdm 
+def wenzi2pinyin(text):
+    """Return the ``Style.NORMAL`` pinyin of ``text`` joined into one string."""
+    # The original also ran a TONE3 conversion and built a tone list that was
+    # never used; only the Style.NORMAL transcription reaches the result.
+    return "".join(lazy_pinyin(text, style=Style.NORMAL))
+_MAX_LINE_CHARS = 200
+def _read_lines(path, label):
+    """Return the lines of the text file at ``path``; quit if it cannot be opened."""
+    print("loading " + label)
+    try:
+        # utf-8 is assumed for the corpora -- TODO confirm. Without an explicit
+        # encoding the decoded text depends on the machine's locale.
+        with open(path, encoding='utf-8') as handle:
+            return handle.read().strip().split('\n')
+    except OSError:
+        print("error: '" + path + "' file not found")
+        quit()
+def _short_enough(line):
+    """Tell whether ``line`` fits within the per-line character limit."""
+    return len(line) <= _MAX_LINE_CHARS
+def read_data(opt):
+    """Replace the ``opt.src_data`` / ``opt.trg_data`` paths with their lines.
+    Lines longer than 200 characters are dropped. When both files are given
+    and have the same number of lines, a pair is dropped as soon as either
+    side is too long, so line i of the source still matches line i of the
+    target. (The original filtered each side on its own, which shifted the
+    two lists against each other.) A side that is None is left as None.
+    """
+    src_lines = None
+    trg_lines = None
+    if opt.src_data is not None:
+        src_lines = _read_lines(opt.src_data, "src_data")
+    if opt.trg_data is not None:
+        trg_lines = _read_lines(opt.trg_data, "trg_data")
+    parallel = (src_lines is not None and trg_lines is not None
+                and len(src_lines) == len(trg_lines))
+    if parallel:
+        kept = [(s, t)
+                for s, t in tqdm(zip(src_lines, trg_lines), total=len(src_lines))
+                if _short_enough(s) and _short_enough(t)]
+        src_lines = [s for s, _ in kept]
+        trg_lines = [t for _, t in kept]
+    else:
+        # Not pairable line by line; fall back to per-file filtering.
+        if src_lines is not None:
+            src_lines = [x for x in tqdm(src_lines) if _short_enough(x)]
+        if trg_lines is not None:
+            trg_lines = [x for x in tqdm(trg_lines) if _short_enough(x)]
+    opt.src_data = src_lines
+    opt.trg_data = trg_lines
+    # A missing side is reported as 0 rather than crashing on len(None).
+    src_count = 0 if src_lines is None else len(src_lines)
+    trg_count = 0 if trg_lines is None else len(trg_lines)
+    print("len of src_data:{} ; len of trg_data:{}".format(src_count, trg_count))
+def my_tokenize(text):
+    """Split ``text`` into a list of its individual characters."""
+    return [char for char in text]
+def create_fields(opt):
+    """Create the source and target torchtext fields, or load pickled ones.
+    Both fields tokenise with ``my_tokenize``; the target field also gets
+    '<sos>' / '<eos>' tokens. When ``opt.pkl_dir`` is set, the fields are
+    replaced by the ones pickled in ``SRC.pkl`` and ``TRG.pkl`` in that folder.
+    Returns:
+        (SRC, TRG)
+    """
+    # Message kept from the original; no spacy tokenizer is built here.
+    print("loading spacy tokenizers...")
+    TRG = data.Field(tokenize=my_tokenize, init_token='<sos>', eos_token='<eos>')
+    SRC = data.Field(tokenize=my_tokenize)
+    if opt.pkl_dir is not None:
+        try:
+            print("loading presaved fields...")
+            # NOTE: unpickling can execute arbitrary code; only point pkl_dir
+            # at files you created yourself.
+            with open(f'{opt.pkl_dir}/SRC.pkl', 'rb') as src_file:
+                SRC = pickle.load(src_file)
+            with open(f'{opt.pkl_dir}/TRG.pkl', 'rb') as trg_file:
+                TRG = pickle.load(trg_file)
+        except Exception:
+            # The original message named TXT.pkl and concatenated
+            # opt.load_weights, which raised TypeError whenever it was None.
+            print(f"error opening SRC.pkl and TRG.pkl field files, please ensure they are in {opt.pkl_dir}/")
+            quit()
+    return(SRC, TRG)
+def create_dataset(opt, SRC, TRG):
+    """Build the training iterator from ``opt.src_data`` and ``opt.trg_data``.
+    The pairs are written to a temporary CSV, read back as a torchtext
+    ``TabularDataset`` and wrapped in a ``MyIterator``. When
+    ``opt.load_weights`` is None the vocabularies are built from the data and,
+    if ``opt.checkpoint > 0``, the fields are pickled into ``weights/``.
+    Side effects:
+        Sets ``opt.src_pad``, ``opt.trg_pad`` and ``opt.train_len``.
+    Returns:
+        The training iterator.
+    """
+    print("creating dataset and iterator... ")
+    raw_data = {'src': list(opt.src_data), 'trg': list(opt.trg_data)}
+    df = pd.DataFrame(raw_data, columns=["src", "trg"])
+    print(df.head())
+    # The opt.max_strlen filter was already disabled in the original (its mask
+    # was computed but never applied), so the dead computation is dropped.
+    csv_path = 'translate_transformer_temp.csv'
+    df.to_csv(csv_path, index=False)
+    try:
+        data_fields = [('src', SRC), ('trg', TRG)]
+        # to_csv writes a "src,trg" header row; skip it so the header is not
+        # read as a training pair.
+        train = data.TabularDataset('./translate_transformer_temp.csv', format='csv',
+                                    fields=data_fields, skip_header=True)
+        train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
+                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
+                            batch_size_fn=batch_size_fn, train=True, shuffle=True)
+    finally:
+        # Remove the temporary file even if building the dataset fails.
+        os.remove(csv_path)
+    if opt.load_weights is None:
+        SRC.build_vocab(train)
+        print(SRC.vocab.stoi)
+        TRG.build_vocab(train)
+        print(TRG.vocab.stoi)
+        if opt.checkpoint > 0:
+            try:
+                os.mkdir("weights")
+            except FileExistsError:
+                # Refuse to overwrite an existing folder. Other OS errors
+                # (e.g. permissions) now surface instead of this message.
+                print("weights folder already exists, run program with -load_weights weights to load them")
+                quit()
+            with open('weights/SRC.pkl', 'wb') as src_file:
+                pickle.dump(SRC, src_file)
+            with open('weights/TRG.pkl', 'wb') as trg_file:
+                pickle.dump(TRG, trg_file)
+    opt.src_pad = SRC.vocab.stoi['<pad>']
+    opt.trg_pad = TRG.vocab.stoi['<pad>']
+    opt.train_len = get_len(train_iter)
+    return train_iter
+def get_len(train):
+    """Return the number of items produced by iterating ``train`` once.
+    The original returned the last ``enumerate`` index, i.e. one less than
+    the number of items, and raised UnboundLocalError for an empty iterable.
+    """
+    return sum(1 for _ in train)
--- a/README.md
+++ b/README.md
+# Transformer
+This is a pytorch implementation of the transformer model. If you'd like to understand the model, or any of the code better, please refer to <a href=https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec>my tutorial</a>.
+Using the Europarl dataset plus the dataset in the data folder, I was able to achieve a BLEU score of 0.39 on the test set (current SOTA is around 0.42), after 4/5 days of training on a single 8gb GPU. For more results see the tutorial again.
+# Train the model immediately on FloydHub
+[![Run on FloydHub](https://static.floydhub.com/button/button-small.svg)](https://floydhub.com/run)
+Launch a [FloydHub Workspace](https://www.floydhub.com/product/build) to start training this model with 1 click. Workspace is a GPU-enabled cloud IDE for machine learning. It provides a fully configured environment so you can start hacking right away, without worrying about dependencies, data sets, etc.
+Once you've started the workspace, run the 'start_here' notebook or type 'floyd run' into the workspace terminal. This will begin to train the model on the sample dataset.
+# Usage
+Two text files containing parallel sentences (separated by '\n' characters) in two languages are required to train the model. See an example of this in the data/ folder (french.txt and english.txt).
+To begin training, run this code:
+```
+python train.py -src_data path/lang1.txt -trg_data path/lang2.txt -src_lang lang1 -trg_lang lang2
+```
+The spacy tokenizer is used to tokenize the text, hence only languages supported by spacy are supported by this program. The languages supported by Spacy and their codes are:
+English : 'en'<br />
+French : 'fr'<br />
+Portuguese : 'pt'<br />
+Italian : 'it'<br />
+Dutch : 'nl'<br />
+Spanish : 'es'<br />
+German : 'de'<br />
+For example, to train an English->French translator on the datasets provided in the data folder, you would run the following:
+```
+python train.py -src_data data/english.txt -trg_data data/french.txt -src_lang en -trg_lang fr
+```
+Additional parameters:<br />
+-epochs : how many epochs to train data for (default=2)<br />
+-batch_size : measured as number of tokens fed to model in each iteration (default=1500)<br />
+-n_layers : how many layers to have in Transformer model (default=6)<br />
+-heads : how many heads to split into for multi-headed attention (default=8)<br />
+-no_cuda : adding this will disable cuda, and run model on cpu<br />
+-SGDR : adding this will implement stochastic gradient descent with restarts, using cosine annealing<br />
+-d_model : dimension of embedding vector and layers (default=512)<br />
+-dropout : decide how big dropout will be (default=0.1)<br />
+-printevery : how many iterations run before printing (default=100)<br />
+-lr : learning rate (default=0.0001)<br />
+-load_weights : if loading pretrained weights, put path to folder where previous weights and pickles were saved <br />
+-max_strlen : sentences with more words than this will not be included in the dataset (default=80)<br />
+-checkpoint : enter a number of minutes. Model's weights will then be saved every this many minutes to folder 'weights/'<br />
+# Training and Translating
+```
+python train.py -src_data data/english.txt -trg_data data/french.txt -src_lang en -trg_lang fr -epochs 10
+```
+This code gave the following results on a K100 GPU with 8gb RAM:
+![screen shot 2018-09-18 at 21 35 55](https://user-images.githubusercontent.com/28839356/45754258-1656fc00-bc13-11e8-9506-5ace6fb6b79c.png)
+After saving the results to folder 'weights', the model can then be tested:
+```
+python translate.py -load_weights weights
+```
+![screen shot 2018-09-18 at 21 40 08](https://user-images.githubusercontent.com/28839356/45754259-18b95600-bc13-11e8-86c7-a07fe18b1ecc.png)
+So with a small dataset of 150,000 sentences and 1 hour of training, already some quite good results...
+# Features still to add
+- create validation set and get validation scores each epoch
+- function to show translations of sentences from training and validation sets
+CUDA_VISIBLE_DEVICES=5 nohup python train.py -src_data data/train_set_onlyChinese.txt -trg_data data/train_set_pinyin_onlyChinese.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name pinyin_to_hanzi_onlyChinese
+CUDA_VISIBLE_DEVICES=5 python train.py -src_data data/train_set_pinyin_total.txt -trg_data data/train_set_total.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name pinyin_to_hanzi_total -load_weights weights/pinyin_to_hanzi_total/10-29_18:51:57/pinyin_to_hanzi_total_10_0.1508198243379593 -pkl_dir weights/pinyin_to_hanzi_total/10-29_18:51:57
+CUDA_VISIBLE_DEVICES=6 python translate.py -load_weights weights/pinyin_to_hanzi_total/10-29_18:51:57/pinyin_to_hanzi_total_10_0.1508198243379593 -pkl_dir weights/pinyin_to_hanzi_total/10-29_18:51:57
+CUDA_VISIBLE_DEVICES=5 python translate_file.py -load_weights weights/pinyin_to_hanzi_total/10-30_21:22:49/pinyin_to_hanzi_total_9_0.12325442619621754 -pkl_dir weights/pinyin_to_hanzi_total/10-30_21:22:49 -test_dir data/test_data/pinyin_short -result_dir data/test_data/result_tmp
+CUDA_VISIBLE_DEVICES=6 python translate_pkl.py -load_weights weights/pinyin_to_hanzi_total/10-30_21:22:49/pinyin_to_hanzi_total_9_0.12325442619621754 -pkl_dir weights/pinyin_to_hanzi_total/10-30_21:22:49 -test_dir data/pkl/label_pkl -result_dir data/pkl/lable_pkl_result
+CUDA_VISIBLE_DEVICES=2 nohup python translate_file.py -load_weights weights/pinyin_to_hanzi_total/10-30_21:22:49/pinyin_to_hanzi_total_59_0.07513352055102587 -pkl_dir weights/pinyin_to_hanzi_total/10-30_21:22:49 -test_dir data/test_data/pinyin_short -result_dir data/test_data/result_short >log2 2>&1 & 
+CUDA_VISIBLE_DEVICES=3 nohup python translate_file.py -load_weights weights/pinyin_to_hanzi_onlyChinese/11-04_16:36:46/pinyin_to_hanzi_onlyChinese_27_0.0009592685928873834 -pkl_dir weights/pinyin_to_hanzi_onlyChinese/11-04_16:36:46 -test_dir data/test_data/pinyin_onlyChinese -result_dir data/test_data/result_onlyChinese >log2 2>&1 & 
--- a/Sublayers.py
+++ b/Sublayers.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+class Norm(nn.Module):
+    """Normalise the last dimension of a tensor, then apply a learnable scale and shift.
+    Args:
+        d_model: size of the last dimension of the tensors passed to forward().
+        eps: small constant added to the standard deviation to avoid dividing by zero.
+    """
+    def __init__(self, d_model, eps = 1e-6):
+        super().__init__()
+        self.size = d_model
+        self.eps = eps
+        # learnable per-feature scale (initialised to 1) and shift (initialised to 0)
+        self.alpha = nn.Parameter(torch.ones(self.size))
+        self.bias = nn.Parameter(torch.zeros(self.size))
+    def forward(self, x):
+        """Return alpha * (x - mean) / (std + eps) + bias, with mean/std taken over the last axis."""
+        centred = x - x.mean(dim=-1, keepdim=True)
+        spread = x.std(dim=-1, keepdim=True) + self.eps
+        return self.alpha * centred / spread + self.bias
+def attention(q, k, v, d_k, mask=None, dropout=None):
+    """Scaled dot-product attention.
+    Args:
+        q, k, v: query, key and value tensors; k is transposed on its last two axes before the product with q.
+        d_k: the scores are divided by sqrt(d_k).
+        mask: optional tensor; positions where it equals 0 get a score of -1e9 before the softmax. An axis is inserted at dim 1 so it broadcasts against the scores.
+        dropout: optional callable applied to the softmax weights.
+    Returns:
+        The softmax-weighted combination of v.
+    """
+    weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
+    if mask is not None:
+        head_mask = mask.unsqueeze(1)
+        weights = weights.masked_fill(head_mask == 0, -1e9)
+    weights = F.softmax(weights, dim=-1)
+    if dropout is not None:
+        weights = dropout(weights)
+    return torch.matmul(weights, v)
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention: project q/k/v, attend per head, merge the heads, project the result.
+    Args:
+        heads: number of attention heads; must divide d_model evenly.
+        d_model: feature width of the inputs and of the output.
+        dropout: dropout probability applied to the attention weights.
+    Raises:
+        ValueError: if d_model is not a multiple of heads.
+    """
+    def __init__(self, heads, d_model, dropout = 0.1):
+        super().__init__()
+        # If heads does not divide d_model, d_k * heads != d_model and the .view() calls in
+        # forward() either fail or, when the element count happens to divide, silently
+        # regroup features across positions. Fail early with a clear message instead.
+        if d_model % heads != 0:
+            raise ValueError(
+                "d_model ({}) must be divisible by heads ({})".format(d_model, heads))
+        self.d_model = d_model
+        self.d_k = d_model // heads
+        self.h = heads
+        self.q_linear = nn.Linear(d_model, d_model)
+        self.v_linear = nn.Linear(d_model, d_model)
+        self.k_linear = nn.Linear(d_model, d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.out = nn.Linear(d_model, d_model)
+    def forward(self, q, k, v, mask=None):
+        """Attend from q over k/v and return a tensor of shape (bs, -1, d_model).
+        Args:
+            q, k, v: tensors whose first dimension is the batch size and whose last dimension is d_model.
+            mask: optional mask forwarded unchanged to attention().
+        """
+        bs = q.size(0)
+        # project, then split the feature axis into (heads, d_k)
+        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
+        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
+        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
+        # move the head axis forward: (bs, heads, seq_len, d_k)
+        k = k.transpose(1,2)
+        q = q.transpose(1,2)
+        v = v.transpose(1,2)
+        # per-head attention output (dropout is applied to the attention weights inside)
+        scores = attention(q, k, v, self.d_k, mask, self.dropout)
+        # merge the heads back into one feature axis and apply the output projection
+        concat = scores.transpose(1,2).contiguous()\
+        .view(bs, -1, self.d_model)
+        output = self.out(concat)
+        return output
+class FeedForward(nn.Module):
+    """Two linear layers (d_model -> d_ff -> d_model) with ReLU and dropout in between.
+    Args:
+        d_model: input and output feature width.
+        d_ff: width of the hidden layer (2048 by default).
+        dropout: dropout probability applied after the ReLU.
+    """
+    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
+        super().__init__()
+        self.linear_1 = nn.Linear(d_model, d_ff)
+        self.dropout = nn.Dropout(dropout)
+        self.linear_2 = nn.Linear(d_ff, d_model)
+    def forward(self, x):
+        """Expand x to d_ff, apply ReLU then dropout, and project back to d_model."""
+        hidden = F.relu(self.linear_1(x))
+        return self.linear_2(self.dropout(hidden))
--- a/Tokenize.py
+++ b/Tokenize.py
+import spacy
+import re
+class tokenize(object):
+    """Clean a sentence with a fixed set of regex substitutions, then split it with a spaCy tokenizer."""
+    def __init__(self, lang):
+        # lang is passed straight to spacy.load
+        self.nlp = spacy.load(lang)
+    def tokenizer(self, sentence):
+        """Return the lower-cased token texts of sentence, skipping single-space tokens.
+        The substitutions run in order: listed punctuation becomes a space, runs of
+        spaces collapse to one, and repeated '!', ',' or '?' collapse to a single one.
+        """
+        substitutions = (
+            (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
+            (r"[ ]+", " "),
+            (r"\!+", "!"),
+            (r"\,+", ","),
+            (r"\?+", "?"),
+        )
+        cleaned = str(sentence)
+        for pattern, replacement in substitutions:
+            cleaned = re.sub(pattern, replacement, cleaned)
+        cleaned = cleaned.lower()
+        tokens = []
+        for tok in self.nlp.tokenizer(cleaned):
+            if tok.text != " ":
+                tokens.append(tok.text)
+        return tokens
--- a/cer_multi.py
+++ b/cer_multi.py
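+"""
+Compute CER (character error rate) using multiple processes.
+Put the text to be compared (the predictions) in the preFile folder.
+Put the original (reference) text in the textFile folder.
+"""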
+import os
+import re
+from tqdm import tqdm
+import multiprocess
+import math
+import time
+def ishan(text):
+    """Strip every character outside the range U+4E00..U+9FFF from a string.
+    Args:
+        text (str): input string
+    Returns:
+        str: the characters of text that fall inside the range, in their original order
+    Example:
+        ishan('我&&你') == '我你', ishan('abc') == ''
+    """
+    return "".join(filter(lambda ch: '\u4e00' <= ch <= '\u9fff', text))
+def sub_process(textList,preList,result,record,lock,id):
+    """Score one batch of (reference, prediction) line pairs.
+    Each reference line is appended to record before scoring. Both lines are reduced
+    to the characters kept by ishan() and split into per-character lists, which cer()
+    then scores; cer() appends its outcome to result.
+    Args:
+        textList: reference lines.
+        preList: predicted lines, paired with textList by position (zip stops at the shorter list).
+        result: shared list passed through to cer().
+        record: shared list that receives every raw reference line handled.
+        lock, id: passed through to cer() unchanged.
+    """
+    for raw_text, raw_pred in zip(textList, preList):
+        record.append(raw_text)
+        # ishan() output contains only characters it keeps, so re-filtering each one is a no-op
+        reference = list(ishan(raw_text))
+        hypothesis = list(ishan(raw_pred))
+        cer(reference, hypothesis, result, lock, id)
+def cer(r: list, h: list, result,lock,id):
+    """Append the Levenshtein distance between r and h, and the length of r, to result.
+    The pair appended is (edit_distance, float(len(r))); the caller divides the summed
+    distances by the summed lengths to obtain the character error rate.
+    Args:
+        r: reference sequence (one item per character).
+        h: hypothesis sequence.
+        result: list-like object with append(); receives one tuple per call.
+        lock: unused; kept so existing callers keep working.
+        id: unused; kept so existing callers keep working.
+    """
+    # Rolling-row dynamic programme on plain Python ints. A fixed-width uint16 table would
+    # wrap once a distance exceeded 65535, and a NumPy scalar in result can make the
+    # caller's running sum wrap as well; Python ints have neither problem.
+    previous = list(range(len(h) + 1))
+    for i, ref_item in enumerate(r, start=1):
+        current = [i]
+        for j, hyp_item in enumerate(h, start=1):
+            if ref_item == hyp_item:
+                current.append(previous[j - 1])
+            else:
+                # substitution, insertion, deletion -- each costs one edit
+                current.append(1 + min(previous[j - 1], current[j - 1], previous[j]))
+        previous = current
+    result.append((previous[-1], float(len(r))))
+def listener(record,total,start,lock):
+    """Print a progress line roughly every five seconds until at most 100 items remain.
+    Args:
+        record: sized collection that the workers append to; its length is the number of items handled so far.
+        total: number of items expected overall.
+        start: time.time() value taken when the work started.
+        lock: unused; kept so existing callers keep working.
+    """
+    last_report = start
+    while total - len(record) > 100:
+        # sleep between polls so this process does not spin a CPU core at 100%
+        time.sleep(0.5)
+        now = time.time()
+        if now - last_report <= 5:
+            continue
+        last_report = now
+        done = len(record)
+        elapsed = (now - start) / 60
+        fraction = done / float(total)
+        # nothing handled yet: the estimate is undefined, so report inf instead of dividing by zero
+        remaining = elapsed / fraction - elapsed if fraction else float("inf")
+        print("{}/{}, {:.2f}%,cost:{:.2f}m,rest:{:.2f}m".format(done, total, fraction * 100, elapsed, remaining))
+def getList(fileList, dirPath):
+    """Read several UTF-8 text files and return all of their lines as one list.
+    Args:
+        fileList: file names, resolved relative to dirPath.
+        dirPath: directory that contains the files.
+    Returns:
+        list of str: the lines of every file, in fileList order, line endings kept.
+    """
+    lines = []
+    for name in fileList:
+        with open(os.path.join(dirPath, name), "r", encoding="utf-8") as fh:
+            # extend in place; rebuilding the list with + copies everything read so far for each file
+            lines.extend(fh.readlines())
+    return lines
+if __name__ == "__main__":
+    # Script entry point: for every prediction file in preFile, score it line by line
+    # against its reference file in textFile using worker processes, then print the CER.
+    num_process = 128
+    preFile = "./data/test_data/result_short"
+    textFile = "./data/test_data/hanzi_short"
+    lock = multiprocess.Lock()
+    for pre in os.listdir(preFile):
+        # reference name = prediction name minus its last 11 characters, plus ".txt"
+        # NOTE(review): presumably strips a fixed result suffix -- confirm against the naming used when the results are written
+        text=pre[:-11]+".txt"
+        with open(os.path.join(preFile, pre), "r", encoding="utf-8") as fw:
+            preList = fw.readlines()
+        with open(os.path.join(textFile, text), "r", encoding="utf-8") as fw:
+            textList = fw.readlines()
+        # The workers pair lines with zip(), which stops at the shorter list, so only this
+        # many lines are ever recorded. Giving the listener a larger total would leave it
+        # waiting forever when the two files differ in length.
+        total_pairs = min(len(preList), len(textList))
+        start =time.time()
+        print("preFileName:{}\ntextFileName:{}".format(pre,text))
+        with multiprocess.Manager() as m:
+            result = m.list()
+            record = m.list()
+            batch_size = int(
+                math.ceil(float(len(preList))/float(num_process)))
+            print("batch_size:{}".format(batch_size))
+            task_list = []
+            # progress reporter runs alongside the workers
+            p = multiprocess.Process(target=listener, args=(
+               record,total_pairs,start,lock))
+            task_list.append(p)
+            p.start()
+            for i in range(num_process):
+                tmp_pre = preList[i*batch_size:(i+1)*batch_size]
+                tmp_text = textList[i*batch_size:(i+1)*batch_size]
+                p = multiprocess.Process(target=sub_process, args=(tmp_text,tmp_pre,result,record,lock,i))
+                task_list.append(p)
+                p.start()
+            for t in task_list:
+                t.join()
+            w=0
+            n=0
+            for (key,value) in tqdm(result):
+                # int() keeps the running sum a Python int even if the distance arrives as a fixed-width NumPy scalar
+                w+=int(key)
+                n+=value
+            # no reference characters were scored (empty file or no Chinese text): the rate is undefined
+            error_rate = w/float(n) if n else float("nan")
+            print('{} \n total char：{} CER: {:.3f}'.format(pre[:-7],n,error_rate))
--- a/floyd.yml
+++ b/floyd.yml
+# Job configuration: environment name, machine type, and the command to run.
+env: pytorch-0.4
+machine: cpu
+# Downloads the spaCy 'en' and 'fr' models, then trains English -> French for 10 epochs with -checkpoint 15.
+# NOTE(review): the README documents the option as -batch_size, while this command passes -batchsize -- confirm which spelling train.py accepts.
+command: spacy download en && spacy download fr && python train.py -src_data /floyd/input/trans_data/english.txt -trg_data /floyd/input/trans_data/french.txt -src_lang en -trg_lang fr -floyd -checkpoint 15 -batchsize 3000 -epochs 10