Commit da232cae authored by szr712

Initial commit
*.pyc
.vscode
/weights
data/**/*.txt
data/**/*.pkl
import torch
from torchtext import data
import numpy as np
from torch.autograd import Variable
def nopeak_mask(size, opt):
np_mask = np.triu(np.ones((1, size, size)),
k=1).astype('uint8')
np_mask = Variable(torch.from_numpy(np_mask) == 0)
if opt.device == 0:
np_mask = np_mask.cuda()
return np_mask
def create_masks(src, trg, opt):
src_mask = (src != opt.src_pad).unsqueeze(-2)
if trg is not None:
trg_mask = (trg != opt.trg_pad).unsqueeze(-2)
size = trg.size(1) # get seq_len for matrix
np_mask = nopeak_mask(size, opt)
if trg.is_cuda:
np_mask = np_mask.cuda()  # .cuda() is not in-place; reassign so the mask actually moves to the GPU
trg_mask = trg_mask & np_mask
else:
trg_mask = None
return src_mask, trg_mask
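For reference, here is a minimal sketch (not part of the original file) of the shapes these helpers produce on a toy CPU batch; `demo_opt` is a hypothetical stand-in for the argparse options used elsewhere in the project:
```
# minimal sketch: mask shapes for a toy batch (hypothetical opt values)
from types import SimpleNamespace
demo_opt = SimpleNamespace(device=-1, src_pad=1, trg_pad=1)  # any device value other than 0 stays on CPU here
src = torch.LongTensor([[5, 6, 7, 1]])  # one source sentence whose last token is <pad>
trg = torch.LongTensor([[2, 8, 9]])     # one target sentence
src_mask, trg_mask = create_masks(src, trg, demo_opt)
# src_mask has shape (1, 1, 4); trg_mask has shape (1, 3, 3),
# i.e. the target padding mask AND'ed with the upper-triangular no-peek mask
```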
# patch on Torchtext's batching process that makes it more efficient
# from http://nlp.seas.harvard.edu/2018/04/03/attention.html#position-wise-feed-forward-networks
class MyIterator(data.Iterator):
def create_batches(self):
if self.train:
def pool(d, random_shuffler):
for p in data.batch(d, self.batch_size * 100):
p_batch = data.batch(
sorted(p, key=self.sort_key),
self.batch_size, self.batch_size_fn)
for b in random_shuffler(list(p_batch)):
yield b
self.batches = pool(self.data(), self.random_shuffler)
else:
self.batches = []
for b in data.batch(self.data(), self.batch_size,
self.batch_size_fn):
self.batches.append(sorted(b, key=self.sort_key))
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
"Keep augmenting batch and calculate total number of tokens + padding."
global max_src_in_batch, max_tgt_in_batch
if count == 1:
max_src_in_batch = 0
max_tgt_in_batch = 0
max_src_in_batch = max(max_src_in_batch, len(new.src))
max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
src_elements = count * max_src_in_batch
tgt_elements = count * max_tgt_in_batch
return max(src_elements, tgt_elements)
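A small illustration (assumed examples, not from the repository) of how `batch_size_fn` estimates the padded token count of a batch as it grows:
```
# batch_size_fn keeps a running maximum of source/target lengths and returns
# count * max_len, i.e. the token count including padding if `new` joined the batch
from types import SimpleNamespace
ex1 = SimpleNamespace(src=[0] * 10, trg=[0] * 12)
ex2 = SimpleNamespace(src=[0] * 30, trg=[0] * 8)
print(batch_size_fn(ex1, 1, 0))  # 14 -> max(1 * 10, 1 * (12 + 2))
print(batch_size_fn(ex2, 2, 0))  # 60 -> max(2 * 30, 2 * 14)
```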
import torch
from Batch import nopeak_mask
import torch.nn.functional as F
import math
def init_vars(src, model, SRC, TRG, opt):
init_tok = TRG.vocab.stoi['<sos>']
src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
e_output = model.encoder(src, src_mask)
outputs = torch.LongTensor([[init_tok]])
if opt.device == 0:
outputs = outputs.cuda()
trg_mask = nopeak_mask(1, opt)
out = model.out(model.decoder(outputs,
e_output, src_mask, trg_mask))
out = F.softmax(out, dim=-1)
probs, ix = out[:, -1].data.topk(opt.k)
log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)
outputs = torch.zeros(opt.k, opt.max_len).long()
if opt.device == 0:
outputs = outputs.cuda()
outputs[:, 0] = init_tok
outputs[:, 1] = ix[0]
e_outputs = torch.zeros(opt.k, e_output.size(-2),e_output.size(-1))
if opt.device == 0:
e_outputs = e_outputs.cuda()
e_outputs[:, :] = e_output[0]
return outputs, e_outputs, log_scores
def k_best_outputs(outputs, out, log_scores, i, k):
probs, ix = out[:, -1].data.topk(k)
log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0,1)
k_probs, k_ix = log_probs.view(-1).topk(k)
row = k_ix // k
col = k_ix % k
outputs[:, :i] = outputs[row, :i]
outputs[:, i] = ix[row, col]
log_scores = k_probs.unsqueeze(0)
return outputs, log_scores
def beam_search(src, model, SRC, TRG, opt):
outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, opt)
eos_tok = TRG.vocab.stoi['<eos>']
src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
ind = None
for i in range(2, opt.max_len):
trg_mask = nopeak_mask(i, opt)
out = model.out(model.decoder(outputs[:,:i],
e_outputs, src_mask, trg_mask))
out = F.softmax(out, dim=-1)
outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, opt.k)
ones = (outputs==eos_tok).nonzero() # Occurrences of end symbols for all input sentences.
sentence_lengths = torch.zeros(len(outputs), dtype=torch.long, device=outputs.device)  # follow outputs' device instead of forcing CUDA
for vec in ones:
b = vec[0]  # beam index; renamed from i to avoid shadowing the decoding step counter
if sentence_lengths[b]==0: # First end symbol has not been found yet
sentence_lengths[b] = vec[1] # Position of first end symbol
num_finished_sentences = len([s for s in sentence_lengths if s > 0])
if num_finished_sentences == opt.k:
alpha = 0.7
div = 1/(sentence_lengths.type_as(log_scores)**alpha)
_, ind = torch.max(log_scores * div, 1)
ind = ind.data[0]
break
if ind is None:
try:
length = (outputs[0]==eos_tok).nonzero()[0]
except IndexError:  # no <eos> token was produced
return ""
return ''.join([TRG.vocab.itos[tok] for tok in outputs[0][1:length]])
else:
try:
length = (outputs[ind]==eos_tok).nonzero()[0]
except IndexError:  # no <eos> token was produced
return ""
# length = (outputs[ind]==eos_tok).nonzero()[0]
return ''.join([TRG.vocab.itos[tok] for tok in outputs[ind][1:length]])
import torch
import torch.nn as nn
import math
from torch.autograd import Variable
class Embedder(nn.Module):
def __init__(self, vocab_size, d_model):
super().__init__()
self.d_model = d_model
self.embed = nn.Embedding(vocab_size, d_model)
def forward(self, x):
return self.embed(x)
class PositionalEncoder(nn.Module):
def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
super().__init__()
self.d_model = d_model
self.dropout = nn.Dropout(dropout)
# create constant 'pe' matrix with values dependent on
# pos and i
pe = torch.zeros(max_seq_len, d_model)
for pos in range(max_seq_len):
for i in range(0, d_model, 2):
pe[pos, i] = \
math.sin(pos / (10000 ** ((2 * i)/d_model)))
pe[pos, i + 1] = \
math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
def forward(self, x):
# make embeddings relatively larger
x = x * math.sqrt(self.d_model)
#add constant to embedding
seq_len = x.size(1)
pe = Variable(self.pe[:,:seq_len], requires_grad=False)
if x.is_cuda:
pe = pe.cuda()  # .cuda() is not in-place; reassign so the encoding actually moves to the GPU
x = x + pe
return self.dropout(x)
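As a quick sanity check (a sketch, not in the original file), feeding zeros through the encoder with dropout disabled returns the raw sinusoidal table, since the input contributes nothing after scaling:
```
# minimal sketch: the positional signal is fixed and depends only on position and dimension
pe_layer = PositionalEncoder(d_model=8, max_seq_len=50, dropout=0.0)
dummy = torch.zeros(2, 10, 8)   # batch of 2, seq_len 10, d_model 8
out = pe_layer(dummy)           # shape stays (2, 10, 8)
# with zero input and dropout=0.0, out[0] equals the first 10 rows of the 'pe' buffer
```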
import torch
import torch.nn as nn
from Sublayers import FeedForward, MultiHeadAttention, Norm
class EncoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
super().__init__()
self.norm_1 = Norm(d_model)
self.norm_2 = Norm(d_model)
self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = FeedForward(d_model, dropout=dropout)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x, mask):
x2 = self.norm_1(x)
x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
x2 = self.norm_2(x)
x = x + self.dropout_2(self.ff(x2))
return x
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
super().__init__()
self.norm_1 = Norm(d_model)
self.norm_2 = Norm(d_model)
self.norm_3 = Norm(d_model)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
self.dropout_3 = nn.Dropout(dropout)
self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = FeedForward(d_model, dropout=dropout)
def forward(self, x, e_outputs, src_mask, trg_mask):
x2 = self.norm_1(x)
x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
x2 = self.norm_2(x)
x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \
src_mask))
x2 = self.norm_3(x)
x = x + self.dropout_3(self.ff(x2))
return x
import torch
import torch.nn as nn
from Layers import EncoderLayer, DecoderLayer
from Embed import Embedder, PositionalEncoder
from Sublayers import Norm
import copy
def get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
class Encoder(nn.Module):
def __init__(self, vocab_size, d_model, N, heads, dropout):
super().__init__()
self.N = N
self.embed = Embedder(vocab_size, d_model)
self.pe = PositionalEncoder(d_model, dropout=dropout)
self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
self.norm = Norm(d_model)
def forward(self, src, mask):
x = self.embed(src)
x = self.pe(x)
for i in range(self.N):
x = self.layers[i](x, mask)
return self.norm(x)
class Decoder(nn.Module):
def __init__(self, vocab_size, d_model, N, heads, dropout):
super().__init__()
self.N = N
self.embed = Embedder(vocab_size, d_model)
self.pe = PositionalEncoder(d_model, dropout=dropout)
self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
self.norm = Norm(d_model)
def forward(self, trg, e_outputs, src_mask, trg_mask):
x = self.embed(trg)
x = self.pe(x)
for i in range(self.N):
x = self.layers[i](x, e_outputs, src_mask, trg_mask)
return self.norm(x)
class Transformer(nn.Module):
def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
super().__init__()
self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
self.out = nn.Linear(d_model, trg_vocab)
def forward(self, src, trg, src_mask, trg_mask):
e_outputs = self.encoder(src, src_mask)
#print("DECODER")
d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
output = self.out(d_output)
return output
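The following is a minimal end-to-end sketch (hypothetical sizes, not from the original file) of a forward pass through the full model with hand-built masks:
```
# minimal sketch: tiny model, one forward pass
tiny = Transformer(src_vocab=20, trg_vocab=30, d_model=16, N=2, heads=4, dropout=0.1)
src = torch.LongTensor([[3, 4, 5, 6]])              # (batch=1, src_len=4)
trg = torch.LongTensor([[2, 7, 8]])                 # (batch=1, trg_len=3)
src_mask = (torch.ones(1, 1, 4) == 1)               # no padding anywhere
trg_mask = (torch.tril(torch.ones(1, 3, 3)) == 1)   # causal (no-peek) mask
logits = tiny(src, trg, src_mask, trg_mask)         # (1, 3, 30): one score per target-vocab entry
```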
def get_model(opt, src_vocab, trg_vocab):
assert opt.d_model % opt.heads == 0
assert opt.dropout < 1
model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout)
if opt.load_weights is not None:
print("loading pretrained weights...")
model.load_state_dict(torch.load(f'{opt.load_weights}'))
else:
for p in model.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
if opt.device == 0:
model = model.cuda()
return model
import torch
import numpy as np
# code from AllenNLP
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
"""
Cosine annealing with restarts.
Parameters
----------
optimizer : torch.optim.Optimizer
T_max : int
The maximum number of iterations within the first cycle.
eta_min : float, optional (default: 0)
The minimum learning rate.
last_epoch : int, optional (default: -1)
The index of the last epoch.
"""
def __init__(self,
optimizer: torch.optim.Optimizer,
T_max: int,
eta_min: float = 0.,
last_epoch: int = -1,
factor: float = 1.) -> None:
# pylint: disable=invalid-name
self.T_max = T_max
self.eta_min = eta_min
self.factor = factor
self._last_restart: int = 0
self._cycle_counter: int = 0
self._cycle_factor: float = 1.
self._updated_cycle_len: int = T_max
self._initialized: bool = False
super(CosineWithRestarts, self).__init__(optimizer, last_epoch)
def get_lr(self):
"""Get updated learning rate."""
# HACK: We need to check if this is the first time get_lr() was called, since
# we want to start with step = 0, but _LRScheduler calls get_lr with
# last_epoch + 1 when initialized.
if not self._initialized:
self._initialized = True
return self.base_lrs
step = self.last_epoch + 1
self._cycle_counter = step - self._last_restart
lrs = [
(
self.eta_min + ((lr - self.eta_min) / 2) *
(
np.cos(
np.pi *
((self._cycle_counter) % self._updated_cycle_len) /
self._updated_cycle_len
) + 1
)
) for lr in self.base_lrs
]
if self._cycle_counter % self._updated_cycle_len == 0:
# Adjust the cycle length.
self._cycle_factor *= self.factor
self._cycle_counter = 0
self._updated_cycle_len = int(self._cycle_factor * self.T_max)
self._last_restart = step
return lrs
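A minimal usage sketch (the model and optimizer below are placeholders, not part of this repository):
```
# anneal the learning rate over cycles of T_max steps, restarting after each cycle
model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = CosineWithRestarts(optimizer, T_max=300)
for step in range(1000):
    optimizer.step()    # the actual training step would go here
    scheduler.step()    # cosine-decays the learning rate
```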
import pandas as pd
import torchtext
from torchtext import data
from Tokenize import tokenize
from Batch import MyIterator, batch_size_fn
import os
import dill as pickle
from pypinyin import Style, pinyin
from pypinyin.core import lazy_pinyin
from tqdm import tqdm
def wenzi2pinyin(text):
pinyin_list = lazy_pinyin(text, style=Style.TONE3)
# print(pinyin_list)
tones_list = [int(py[-1]) if py[-1].isdigit()
else 0 for py in pinyin_list]
pinyin_list = lazy_pinyin(text, style=Style.NORMAL)
return "".join(pinyin_list)
def read_data(opt):
if opt.src_data is not None:
try:
print("loading src_data")
opt.src_data = open(opt.src_data).read().strip().split('\n')
opt.src_data=[x for x in tqdm(opt.src_data) if len(x)<=200]
# print(len(opt.src_data))
except:
print("error: '" + opt.src_data + "' file not found")
quit()
if opt.trg_data is not None:
try:
print("loading trg_data")
opt.trg_data = open(opt.trg_data).read().strip().split('\n')
# opt.trg_data=[x for x in tqdm(opt.trg_data) if len(wenzi2pinyin(x))<=200]
opt.trg_data=[x for x in tqdm(opt.trg_data) if len(x)<=200]
except:
print("error: '" + opt.trg_data + "' file not found")
quit()
print("len of src_data:{} ; len of trg_data:{}".format(len(opt.src_data),len(opt.trg_data)))
def my_tokenize(text):
return list(text)
def create_fields(opt):
spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
# if opt.src_lang not in spacy_langs:
# print('invalid src language: ' + opt.src_lang + 'supported languages : ' + spacy_langs)
# if opt.trg_lang not in spacy_langs:
# print('invalid trg language: ' + opt.trg_lang + 'supported languages : ' + spacy_langs)
print("loading spacy tokenizers...")
# t_src = tokenize(opt.src_lang)
# t_trg = tokenize(opt.trg_lang)
# TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
# SRC = data.Field(lower=True, tokenize=t_src.tokenizer)
TRG = data.Field(tokenize=my_tokenize, init_token='<sos>', eos_token='<eos>')
SRC = data.Field(tokenize=my_tokenize)
if opt.pkl_dir is not None:
try:
print("loading presaved fields...")
SRC = pickle.load(open(f'{opt.pkl_dir}/SRC.pkl', 'rb'))
TRG = pickle.load(open(f'{opt.pkl_dir}/TRG.pkl', 'rb'))
except:
print("error opening SRC.pkl and TXT.pkl field files, please ensure they are in " + opt.load_weights + "/")
quit()
return(SRC, TRG)
def create_dataset(opt, SRC, TRG):
print("creating dataset and iterator... ")
raw_data = {'src' : [line for line in opt.src_data], 'trg': [line for line in opt.trg_data]}
df = pd.DataFrame(raw_data, columns=["src", "trg"])
print(df.head())
mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
# print(mask)
# df = df.loc[mask]
df.to_csv("translate_transformer_temp.csv", index=False)
data_fields = [('src', SRC), ('trg', TRG)]
train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)
train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
batch_size_fn=batch_size_fn, train=True, shuffle=True)
os.remove('translate_transformer_temp.csv')
if opt.load_weights is None:
SRC.build_vocab(train)
print(SRC.vocab.stoi)
TRG.build_vocab(train)
print(TRG.vocab.stoi)
if opt.checkpoint > 0:
try:
os.mkdir("weights")
except:
print("weights folder already exists, run program with -load_weights weights to load them")
quit()
pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))
opt.src_pad = SRC.vocab.stoi['<pad>']
opt.trg_pad = TRG.vocab.stoi['<pad>']
opt.train_len = get_len(train_iter)
return train_iter
def get_len(train):
for i, b in enumerate(train):
pass
return i
# Transformer
This is a PyTorch implementation of the Transformer model. If you'd like to understand the model or any of the code better, please refer to <a href=https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec>my tutorial</a>.
Using the Europarl dataset plus the dataset in the data folder, I was able to achieve a BLEU score of 0.39 on the test set (current SOTA is around 0.42) after 4-5 days of training on a single 8 GB GPU. For more results, see the tutorial again.
# Train the model immediately on FloydHub
[![Run on FloydHub](https://static.floydhub.com/button/button-small.svg)](https://floydhub.com/run)
Launch a [FloydHub Workspace](https://www.floydhub.com/product/build) to start training this model with 1 click. Workspace is a GPU-enabled cloud IDE for machine learning. It provides a fully configured environment so you can start hacking right away, without worrying about dependencies, data sets, etc.
Once you've started the workspace, run the 'start_here' notebook or type 'floyd run' into the workspace terminal. This will begin to train the model on the sample dataset.
# Usage
Two text files containing parallel sentences (separated by '\n' characters) in two languages are required to train the model. See an example of this in the data/ folder (french.txt and english.txt).
To begin training, run this code:
```
python train.py -src_data path/lang1.txt -trg_data path/lang2.txt -src_lang lang1 -trg_lang lang2
```
The spaCy tokenizer is used to tokenize the text, hence only languages supported by spaCy are supported by this program. The supported languages and their codes are:
English : 'en'<br />
French : 'fr'<br />
Portuguese : 'pt'<br />
Italian : 'it'<br />
Dutch : 'nl'<br />
Spanish : 'es'<br />
German : 'de'<br />
For example, to train an English->French translator on the datasets provided in the data folder, you would run the following:
```
python train.py -src_data data/english.txt -trg_data data/french.txt -src_lang en -trg_lang fr
```
Additional parameters (see the example command after this list):<br />
-epochs : how many epochs to train for (default=2)<br />
-batchsize : measured as the number of tokens fed to the model in each iteration (default=1500)<br />
-n_layers : how many layers to use in the Transformer model (default=6)<br />
-heads : how many heads to split into for multi-headed attention (default=8)<br />
-no_cuda : adding this will disable CUDA and run the model on the CPU<br />
-SGDR : adding this will implement stochastic gradient descent with restarts, using cosine annealing<br />
-d_model : dimension of the embedding vectors and layers (default=512)<br />
-dropout : the dropout probability (default=0.1)<br />
-printevery : how many iterations to run between progress printouts (default=100)<br />
-lr : learning rate (default=0.0001)<br />
-load_weights : if loading pretrained weights, the path to the folder where the previous weights and pickles were saved<br />
-max_strlen : sentences with more words than this will not be included in the dataset (default=80)<br />
-checkpoint : a number of minutes; the model's weights will then be saved to the 'weights/' folder every this many minutes<br />
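For example, you could combine several of the options above like this (an illustrative command, not taken from the original README):
```
python train.py -src_data data/english.txt -trg_data data/french.txt -src_lang en -trg_lang fr -epochs 10 -batchsize 3000 -d_model 512 -n_layers 6 -heads 8 -dropout 0.1 -checkpoint 15
```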
# Training and Translating
```
python train.py -src_data data/english.txt -trg_data data/french.txt -src_lang en -trg_lang fr -epochs 10
```
This code gave the following results on a K100 GPU with 8 GB RAM:
![screen shot 2018-09-18 at 21 35 55](https://user-images.githubusercontent.com/28839356/45754258-1656fc00-bc13-11e8-9506-5ace6fb6b79c.png)
After saving the results to folder 'weights', the model can then be tested:
```
python translate.py -load_weights weights
```
![screen shot 2018-09-18 at 21 40 08](https://user-images.githubusercontent.com/28839356/45754259-18b95600-bc13-11e8-86c7-a07fe18b1ecc.png)
So with a small dataset of 150,000 sentences and 1 hour of training, we already get some quite good results...
# Features still to add
- create validation set and get validation scores each epoch
- function to show translations of sentences from training and validation sets
CUDA_VISIBLE_DEVICES=5 nohup python train.py -src_data data/train_set_onlyChinese.txt -trg_data data/train_set_pinyin_onlyChinese.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name pinyin_to_hanzi_onlyChinese
CUDA_VISIBLE_DEVICES=5 python train.py -src_data data/train_set_pinyin_total.txt -trg_data data/train_set_total.txt -src_lang en_core_web_sm -trg_lang fr_core_news_sm -epochs 100 -model_name pinyin_to_hanzi_total -load_weights weights/pinyin_to_hanzi_total/10-29_18:51:57/pinyin_to_hanzi_total_10_0.1508198243379593 -pkl_dir weights/pinyin_to_hanzi_total/10-29_18:51:57
CUDA_VISIBLE_DEVICES=6 python translate.py -load_weights weights/pinyin_to_hanzi_total/10-29_18:51:57/pinyin_to_hanzi_total_10_0.1508198243379593 -pkl_dir weights/pinyin_to_hanzi_total/10-29_18:51:57
CUDA_VISIBLE_DEVICES=5 python translate_file.py -load_weights weights/pinyin_to_hanzi_total/10-30_21:22:49/pinyin_to_hanzi_total_9_0.12325442619621754 -pkl_dir weights/pinyin_to_hanzi_total/10-30_21:22:49 -test_dir data/test_data/pinyin_short -result_dir data/test_data/result_tmp
CUDA_VISIBLE_DEVICES=6 python translate_pkl.py -load_weights weights/pinyin_to_hanzi_total/10-30_21:22:49/pinyin_to_hanzi_total_9_0.12325442619621754 -pkl_dir weights/pinyin_to_hanzi_total/10-30_21:22:49 -test_dir data/pkl/label_pkl -result_dir data/pkl/lable_pkl_result
CUDA_VISIBLE_DEVICES=2 nohup python translate_file.py -load_weights weights/pinyin_to_hanzi_total/10-30_21:22:49/pinyin_to_hanzi_total_59_0.07513352055102587 -pkl_dir weights/pinyin_to_hanzi_total/10-30_21:22:49 -test_dir data/test_data/pinyin_short -result_dir data/test_data/result_short >log2 2>&1 &
CUDA_VISIBLE_DEVICES=3 nohup python translate_file.py -load_weights weights/pinyin_to_hanzi_onlyChinese/11-04_16:36:46/pinyin_to_hanzi_onlyChinese_27_0.0009592685928873834 -pkl_dir weights/pinyin_to_hanzi_onlyChinese/11-04_16:36:46 -test_dir data/test_data/pinyin_onlyChinese -result_dir data/test_data/result_onlyChinese >log2 2>&1 &
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class Norm(nn.Module):
def __init__(self, d_model, eps = 1e-6):
super().__init__()
self.size = d_model
# create two learnable parameters to calibrate normalisation
self.alpha = nn.Parameter(torch.ones(self.size))
self.bias = nn.Parameter(torch.zeros(self.size))
self.eps = eps
def forward(self, x):
norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
/ (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
return norm
def attention(q, k, v, d_k, mask=None, dropout=None):
scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
if mask is not None:
mask = mask.unsqueeze(1)
scores = scores.masked_fill(mask == 0, -1e9)
scores = F.softmax(scores, dim=-1)
if dropout is not None:
scores = dropout(scores)
output = torch.matmul(scores, v)
return output
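For reference, a shape-only sketch (hypothetical tensors) of the scaled dot-product attention above:
```
# minimal sketch: attention preserves the (batch, heads, query_len, d_k) shape
q = torch.randn(2, 4, 5, 8)         # batch=2, heads=4, query_len=5, d_k=8
k = torch.randn(2, 4, 7, 8)         # key_len=7
v = torch.randn(2, 4, 7, 8)
mask = (torch.ones(2, 5, 7) == 1)   # attend everywhere; unsqueezed to (2, 1, 5, 7) inside
print(attention(q, k, v, 8, mask).shape)  # torch.Size([2, 4, 5, 8])
```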
class MultiHeadAttention(nn.Module):
def __init__(self, heads, d_model, dropout = 0.1):
super().__init__()
self.d_model = d_model
self.d_k = d_model // heads
self.h = heads
self.q_linear = nn.Linear(d_model, d_model)
self.v_linear = nn.Linear(d_model, d_model)
self.k_linear = nn.Linear(d_model, d_model)
self.dropout = nn.Dropout(dropout)
self.out = nn.Linear(d_model, d_model)
def forward(self, q, k, v, mask=None):
bs = q.size(0)
# perform linear operation and split into N heads
k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
# transpose to get dimensions bs * N * sl * d_model
k = k.transpose(1,2)
q = q.transpose(1,2)
v = v.transpose(1,2)
# calculate attention using function we will define next
scores = attention(q, k, v, self.d_k, mask, self.dropout)
# concatenate heads and put through final linear layer
concat = scores.transpose(1,2).contiguous()\
.view(bs, -1, self.d_model)
output = self.out(concat)
return output
class FeedForward(nn.Module):
def __init__(self, d_model, d_ff=2048, dropout = 0.1):
super().__init__()
# We set d_ff as a default to 2048
self.linear_1 = nn.Linear(d_model, d_ff)
self.dropout = nn.Dropout(dropout)
self.linear_2 = nn.Linear(d_ff, d_model)
def forward(self, x):
x = self.dropout(F.relu(self.linear_1(x)))
x = self.linear_2(x)
return x
import spacy
import re
class tokenize(object):
def __init__(self, lang):
self.nlp = spacy.load(lang)
def tokenizer(self, sentence):
sentence = re.sub(
r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
sentence = re.sub(r"[ ]+", " ", sentence)
sentence = re.sub(r"\!+", "!", sentence)
sentence = re.sub(r"\,+", ",", sentence)
sentence = re.sub(r"\?+", "?", sentence)
sentence = sentence.lower()
return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]
"""
多线程计算CER
需比对文字放入preFile文件夹下
原文放入textFile文件夹下
"""
import os
import re
from tqdm import tqdm
import multiprocess
import math
import time
def ishan(text):
"""去除输入字符串中除中文字符串的内容
Args:
text (str): 字符串
Returns:
str: 去除非中文字符后的字符串
"""
# for python 3.x
# sample: ishan('一') == '一', ishan('我&&你') == '我你'
result= [char if '\u4e00' <= char and char<= '\u9fff' else "" for char in text]
return "".join(result)
def sub_process(textList,preList,result,record,lock,id):
for t,p in zip(textList,preList):
record.append(t)
t=ishan(t)
p=ishan(p)
r = [ishan(x) for x in t]
h = [ishan(x) for x in p]
cer(r,h,result,lock,id)
def cer(r: list, h: list, result,lock,id):
"""
Calculation of CER with Levenshtein distance.
"""
# initialisation
import numpy
# print("{}:start initialisation".format(id))
d = numpy.zeros((len(r) + 1) * (len(h) + 1), dtype=numpy.uint16)
d = d.reshape((len(r) + 1, len(h) + 1))
for i in range(len(r) + 1):
for j in range(len(h) + 1):
if i == 0:
d[0][j] = j
elif j == 0:
d[i][0] = i
# computation
# print("{}:start computation".format(id))
for i in range(1, len(r) + 1):
# lock.acquire()
# print("{}: {}".format(id,i))
# lock.release()
for j in range(1, len(h) + 1):
if r[i - 1] == h[j - 1]:
d[i][j] = d[i - 1][j - 1]
else:
substitution = d[i - 1][j - 1] + 1
insertion = d[i][j - 1] + 1
deletion = d[i - 1][j] + 1
d[i][j] = min(substitution, insertion, deletion)
# lock.acquire()
result.append((d[len(r)][len(h)],float(len(r))))
# lock.release()
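A single-process sketch of `cer` (no locking is needed here, so `lock` can be None):
```
# minimal sketch: character lists in, (edit_distance, reference_length) appended to result
demo_result = []
cer(list("今天天气很好"), list("今天天汽很好"), demo_result, None, 0)
# demo_result now holds (edit_distance, reference_length) = (1, 6.0), i.e. CER = 1/6
```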
def listener(record,total,start,lock):
now=start
while total-len(record)>100:
if time.time()-now>5:
now=time.time()
# lock.acquire()
print("{}/{}, {:.2f}%,cost:{:.2f}m,rest:{:.2f}m".format(len(record),total,len(record)/float(total)*100,(now-start)/60,(now-start)/60/(len(record)/float(total))-(now-start)/60))
# if len(record)==84952:
# print(result)
# lock.release()
def getList(fileList, dirPath):
result = []
for file in fileList:
with open(os.path.join(dirPath, file), "r", encoding="utf-8") as fw:
contents = fw.readlines()
result = result+contents
return result
if __name__ == "__main__":
num_process = 128
preFile = "./data/test_data/result_short"
textFile = "./data/test_data/hanzi_short"
lock = multiprocess.Lock()
# preList=getList(preFile,"./preFile")
# textList=getList(textFile,"./textFile")
# for a,b in zip(textList,preList):
# print('pred: {}, gt: {}'.format(b, a))
for pre in os.listdir(preFile):
text=pre[:-11]+".txt"
preList = []
textList = []
with open(os.path.join(preFile, pre), "r", encoding="utf-8") as fw:
preList = fw.readlines()
with open(os.path.join(textFile, text), "r", encoding="utf-8") as fw:
textList = fw.readlines()
start =time.time()
print("preFileName:{}\ntextFileName:{}".format(pre,text))
with multiprocess.Manager() as m:
result = m.list()
record = m.list()
batch_size = int(
math.ceil(float(len(preList))/float(num_process)))
print("batch_size:{}".format(batch_size))
task_list = []
p = multiprocess.Process(target=listener, args=(
record,len(preList),start,lock))
task_list.append(p)
p.start()
for i in range(num_process):
tmp_pre = preList[i*batch_size:(i+1)*batch_size]
tmp_text = textList[i*batch_size:(i+1)*batch_size]
p = multiprocess.Process(target=sub_process, args=(tmp_text,tmp_pre,result,record,lock,i))
task_list.append(p)
p.start()
for t in task_list:
t.join()
w=0
n=0
for (key,value) in tqdm(result):
w+=key
n+=value
# print(n)
print('{} \n total char:{} CER: {:.3f}'.format(pre[:-7],n,w/float(n)))
env: pytorch-0.4
machine: cpu
command: spacy download en && spacy download fr && python train.py -src_data /floyd/input/trans_data/english.txt -trg_data /floyd/input/trans_data/french.txt -src_lang en -trg_lang fr -floyd -checkpoint 15 -batchsize 3000 -epochs 10