first commit
This commit is contained in:
commit 4b6c65862b
7  .gitignore  vendored  Normal file
@@ -0,0 +1,7 @@
/src/dataset/corpus/*
/src/dataset/tok_model/*
/src/dataset/__pycache__
/src/model/__pycache__
/src/output/*
/src/trainer/__pycache__
/.vscode
2  src/dataset/__init__.py  Normal file
@@ -0,0 +1,2 @@
from .tokenizer import BertTokenizer
from .dataset import BERTDataset
158  src/dataset/dataset.py  Normal file
@@ -0,0 +1,158 @@
"""Dataset Class for Bert"""
import random
import tqdm
import torch
import linecache
from torch.utils.data import Dataset
from .tokenizer import BertTokenizer


class BERTDataset(Dataset):
    def __init__(self, corpus_path, tokenizer: BertTokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
        self.tokenizer = tokenizer
        self.seq_len = seq_len

        self.on_memory = on_memory
        self.corpus_lines = corpus_lines
        self.corpus_path = corpus_path
        self.encoding = encoding

        self.corpus_lines = sum(1 for line in open(self.corpus_path))

        # with open(corpus_path, "r", encoding=encoding) as f:
        #     if self.corpus_lines is None and not on_memory:
        #         for _ in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines):
        #             self.corpus_lines += 1

        #     if on_memory:
        #         self.lines = [line[:-1].split("\t")
        #                       for line in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines)]
        #         self.corpus_lines = len(self.lines)

        # if not on_memory:
        #     self.file = open(corpus_path, "r", encoding=encoding)
        #     self.random_file = open(corpus_path, "r", encoding=encoding)

        #     for _ in range(random.randint(0, self.corpus_lines if self.corpus_lines < 1000 else 1000)):
        #         self.random_file.__next__()

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        t1, t2, is_next_label = self.random_sent(item)
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # [CLS] tag = SOS tag, [SEP] tag = EOS tag
        t1 = [self.tokenizer.sos_index] + t1_random + [self.tokenizer.eos_index]
        t2 = t2_random + [self.tokenizer.eos_index]

        t1_label = [self.tokenizer.pad_index] + t1_label + [self.tokenizer.pad_index]
        t2_label = t2_label + [self.tokenizer.pad_index]

        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]

        padding = [self.tokenizer.pad_index for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding)
        bert_label.extend(padding)
        segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}  # pylint: disable=not-callable

    def random_word(self, sentence):
        # tokens = sentence.split()
        output_label = []
        tokens = self.tokenizer.tokenize(sentence)
        for i, token in enumerate(tokens):
            prob = random.random()
            if prob < 0.15:
                prob /= 0.15

                if prob < 0.8:
                    tokens[i] = self.tokenizer.mask_index
                elif prob < 0.9:
                    tokens[i] = self.tokenizer.getRandomTokenID()
                else:
                    tokens[i] = token
                output_label.append(token)
            else:
                tokens[i] = token
                output_label.append(0)
        return tokens, output_label

        # for i, token in enumerate(tokens):
        #     prob = random.random()
        #     if prob < 0.15:
        #         prob /= 0.15

        #         # 80% randomly change token to mask token
        #         if prob < 0.8:
        #             tokens[i] = self.vocab.mask_index

        #         # 10% randomly change token to random token
        #         elif prob < 0.9:
        #             tokens[i] = random.randrange(len(self.vocab))

        #         # 10% randomly change token to current token
        #         else:
        #             tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)

        #         output_label.append(self.vocab.stoi.get(token, self.vocab.unk_index))

        #     else:
        #         tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)
        #         output_label.append(0)

        # return tokens, output_label

    def random_sent(self, index):
        t1, t2 = self.get_corpus_line(index)
        # t1 = self.tokenizer.tokenize(t1)
        # t2 = self.tokenizer.tokenize(t2)
        # output_text, label(isNotNext:0, isNext:1)
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            # rand_line = self.tokenizer.tokenize(self.get_random_line())
            return t1, self.get_random_line(), 0

    # def get_corpus_line(self, item):
    #     if self.on_memory:
    #         return self.lines[item][0], self.lines[item][1]
    #     else:
    #         line = self.file.__next__()
    #         if line is None:
    #             self.file.close()
    #             self.file = open(self.corpus_path, "r", encoding=self.encoding)
    #             line = self.file.__next__()

    #         t1, t2 = line[:-1].split("\t")
    #         return t1, t2
    def get_corpus_line(self, item):
        t1 = linecache.getline(self.corpus_path, item)
        t2 = linecache.getline(self.corpus_path, item + 1)
        return t1, t2

    # def get_random_line(self):
    #     if self.on_memory:
    #         return self.lines[random.randrange(len(self.lines))][1]

    #     line = self.file.__next__()
    #     if line is None:
    #         self.file.close()
    #         self.file = open(self.corpus_path, "r", encoding=self.encoding)
    #     for _ in range(random.randint(0, self.corpus_lines if self.corpus_lines < 1000 else 1000)):
    #         self.random_file.__next__()
    #     line = self.random_file.__next__()
    #     return line[:-1].split("\t")[1]

    def get_random_line(self):
        return linecache.getline(self.corpus_path, random.randint(1, self.corpus_lines))
46  src/dataset/tokenizer.py  Normal file
@@ -0,0 +1,46 @@
""" Tokenizer class """
import os
import random
from pathlib import Path
import tokenizers
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.pre_tokenizers import Digits


class BertTokenizer():
    """Bert Tokenizer using WordPiece Tokenizer Model"""
    def __init__(self, path):
        self.path = path
        text_paths = [str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")]
        savedpath = "./dataset/tok_model/MaLaMo-vocab.txt"
        if os.path.exists(savedpath):
            self.tokenizer = tokenizers.BertWordPieceTokenizer(
                "./dataset/tok_model/MaLaMo-vocab.txt",
            )
        else:
            self.tokenizer = tokenizers.BertWordPieceTokenizer()
            self.tokenizer.train(files=text_paths, special_tokens=[
                "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=14200)
            self.tokenizer.save_model("./dataset/tok_model", "MaLaMo")
        self.tokenizer.enable_truncation(max_length=512)
        self.pretokenizer = tokenizers.pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
        self.vocab = self.tokenizer.get_vocab()
        self.mask_index = self.vocab.get("[MASK]")
        self.pad_index = self.vocab.get("[PAD]")
        self.eos_index = self.vocab.get("[SEP]")
        self.sos_index = self.vocab.get("[CLS]")
        self.unk_index = self.vocab.get("[UNK]")

    def tokenize(self, sentence: str):
        return self.tokenizer.encode(sentence).ids

    def getRandomTokenID(self):
        return random.randint(6, len(self.vocab) - 1)

    def get_vocab(self):
        return self.tokenizer.get_vocab()

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()
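For orientation only (not part of the commit): a minimal usage sketch of the BertTokenizer above, assuming it is run from src/ with the hard-coded ./dataset/corpus/*.txt files or a previously saved ./dataset/tok_model/MaLaMo-vocab.txt in place.

# Illustrative sketch, not committed code: exercises only methods defined above.
from dataset.tokenizer import BertTokenizer

tokenizer = BertTokenizer("./dataset/corpus")        # loads or trains the WordPiece vocab
ids = tokenizer.tokenize("hello world")              # token ids via tokenizer.encode(...).ids
print("vocab size:", tokenizer.get_vocab_size())
print("ids:", ids)
print("special ids:", tokenizer.sos_index, tokenizer.eos_index,
      tokenizer.mask_index, tokenizer.pad_index)     # used by BERTDataset for [CLS]/[SEP]/[MASK]/[PAD]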
77  src/main.py  Normal file
@@ -0,0 +1,77 @@
"""main entry for training"""

import argparse

from torch.utils.data import DataLoader

from model.bert import BERT
from trainer import BERTTrainer
from dataset import BERTDataset, BertTokenizer


def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", type=str, default="./dataset/corpus/train.txt", help="train dataset for training bert")
    parser.add_argument("-t", "--test_dataset", type=str, default="./dataset/corpus/test.txt", help="test set for evaluating the trained model")
    #parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab")
    parser.add_argument("-o", "--output_path", type=str, default="./output/bert.model", help="ex) output/bert.model")

    parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=512, help="maximum sequence len")

    parser.add_argument("-b", "--batch_size", type=int, default=8, help="number of batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=1, help="dataloader worker size")

    parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
    parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines", type=int, default=5110, help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")

    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")

    args = parser.parse_args()

    print("Loading Vocab")
    tokenizer = BertTokenizer("./dataset/corpus")
    vocab_size = tokenizer.get_vocab_size()
    print("Vocab Size: ", vocab_size)

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, tokenizer, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, tokenizer, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(vocab_size, hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, vocab_size, train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)


if __name__ == "__main__":
    train()
2  src/model/__init__.py  Normal file
@@ -0,0 +1,2 @@
from .bert import BERT
from .language_model import BERTLM
64  src/model/bert.py  Normal file
@@ -0,0 +1,64 @@
"""BERT class module"""

from .embedding import BERTEmbedding
import torch
import torch.nn as nn


class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: number of Transformer blocks (layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads

        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        self.src_mask = None

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)

        # multi-layers transformer blocks, deep network
        #self.transformer_blocks = nn.ModuleList(
        #    [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
        encoder_layers = nn.TransformerEncoderLayer(hidden, attn_heads, self.feed_forward_hidden, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, x, segment_info, has_mask=True):
        if has_mask:
            if self.src_mask is None or self.src_mask.size(0) != len(x):
                mask = self._generate_square_subsequent_mask(len(x))
                self.src_mask = mask
        else:
            self.src_mask = None
        # attention masking for padded token
        # torch.ByteTensor([batch_size, 1, seq_len, seq_len)
        #mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        #mask = mask.view(-1, 512, 512)

        #print(x)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)
        x = self.transformer_encoder(x, self.src_mask)

        return x
1  src/model/embedding/__init__.py  Normal file
@@ -0,0 +1 @@
from .bert import BERTEmbedding
BIN  src/model/embedding/__pycache__/__init__.cpython-38.pyc  Normal file  (Binary file not shown.)
BIN  src/model/embedding/__pycache__/bert.cpython-38.pyc  Normal file  (Binary file not shown.)
BIN  src/model/embedding/__pycache__/position.cpython-38.pyc  Normal file  (Binary file not shown.)
BIN  src/model/embedding/__pycache__/segment.cpython-38.pyc  Normal file  (Binary file not shown.)
BIN  src/model/embedding/__pycache__/token.cpython-38.pyc  Normal file  (Binary file not shown.)
36  src/model/embedding/bert.py  Normal file
@@ -0,0 +1,36 @@
"""Bert Embedding Module"""

import torch.nn as nn
from .token import TokenEmbedding
from .position import PositionalEmbedding
from .segment import SegmentEmbedding


class BERTEmbedding(nn.Module):
    """
    BERT Embedding, composed of the following features:
    1. TokenEmbedding : normal embedding matrix
    2. PositionalEmbedding : adding positional information using sin, cos
    3. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)
    The sum of these features is the output of BERTEmbedding.
    """

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
        #self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.segment = nn.Embedding(8, self.token.embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, segment_label):
        #print(segment_label.shape)
        #segmented = self.segment(segment_label)
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)
27  src/model/embedding/position.py  Normal file
@@ -0,0 +1,27 @@
""" Positional Embedding Module """

import math
import torch.nn as nn
import torch


class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_len=512):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe[:, :x.size(1)]
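As a quick illustration (not part of the commit), the buffered encoding table is sliced to the input's sequence length and broadcasts over the batch when added; a minimal shape check, assuming it is run from src/:

# Illustrative sketch: checks the shape returned by PositionalEmbedding.forward.
import torch
from model.embedding.position import PositionalEmbedding

pos = PositionalEmbedding(d_model=256, max_len=512)
x = torch.zeros(8, 128, 256)     # dummy (batch, seq_len, d_model) input; only seq_len is used
print(pos(x).shape)              # torch.Size([1, 128, 256]), broadcast against the batch when added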
6  src/model/embedding/segment.py  Normal file
@@ -0,0 +1,6 @@
import torch.nn as nn


class SegmentEmbedding(nn.Embedding):
    def __init__(self, embed_size=512):
        super().__init__(3, embed_size, padding_idx=0)
6  src/model/embedding/token.py  Normal file
@@ -0,0 +1,6 @@
import torch.nn as nn


class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, embed_size=512):
        super().__init__(vocab_size, embed_size, padding_idx=0)
61  src/model/language_model.py  Normal file
@@ -0,0 +1,61 @@
import torch.nn as nn

from .bert import BERT


class BERTLM(nn.Module):
    """
    BERT Language Model
    Next Sentence Prediction Model + Masked Language Model
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert: BERT model which should be trained
        :param vocab_size: total vocab size for masked_lm
        """

        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(self.bert.hidden)
        self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size)

    def forward(self, x, segment_label):
        x = self.bert(x, segment_label)
        return self.next_sentence(x), self.mask_lm(x)


class NextSentencePrediction(nn.Module):
    """
    2-class classification model : is_next, is_not_next
    """

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x[:, 0]))


class MaskedLanguageModel(nn.Module):
    """
    Predicting the original token from the masked input sequence:
    an n-class classification problem, where n = vocab_size.
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x))
0  src/model/utils/__init__.py  Normal file
12  src/model/utils/gelu.py  Normal file
@@ -0,0 +1,12 @@
import math
import torch.nn as nn
import torch


class GELU(nn.Module):
    """
    GELU Activation function
    """

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
1  src/trainer/__init__.py  Normal file
@@ -0,0 +1 @@
from .pretrain import BERTTrainer
35  src/trainer/optim_schedule.py  Normal file
@@ -0,0 +1,35 @@
'''A wrapper class for optimizer '''
import numpy as np


class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr
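For reference (not part of the commit), a minimal sketch of how ScheduledOptim wraps a plain Adam optimizer, mirroring the wiring BERTTrainer sets up in pretrain.py below; the toy nn.Linear model is an illustrative stand-in, and the snippet assumes it is run from src/.

# Illustrative sketch: inverse-square-root warmup via ScheduledOptim around Adam.
import torch
import torch.nn as nn
from torch.optim import Adam
from trainer.optim_schedule import ScheduledOptim

model = nn.Linear(256, 2)                                   # stand-in model, not the BERT model
optim = Adam(model.parameters(), lr=1e-4, weight_decay=0.01)
schedule = ScheduledOptim(optim, d_model=256, n_warmup_steps=10000)

loss = model(torch.randn(4, 256)).sum()
schedule.zero_grad()
loss.backward()
schedule.step_and_update_lr()                               # updates lr, then steps the inner Adam
print(optim.param_groups[0]["lr"])                          # lr after one warmup step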
152  src/trainer/pretrain.py  Normal file
@@ -0,0 +1,152 @@
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader

from model.bert import BERT
from model.language_model import BERTLM
from .optim_schedule import ScheduledOptim

import tqdm


class BERTTrainer:
    """
    BERTTrainer builds the pretrained BERT model with two LM training methods.

    1. Masked Language Model : 3.3.1 Task #1: Masked LM
    2. Next Sentence Prediction : 3.3.2 Task #2: Next Sentence Prediction

    Please check the details in README.md with a simple example.
    """

    def __init__(self, bert: BERT, vocab_size: int,
                 train_dataloader: DataLoader, test_dataloader: DataLoader = None,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = 10):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        Loop over the data_loader for training or testing.
        If in train status, the backward operation is activated
        and the model is auto-saved every epoch.

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path, which will be file_path + ".ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path