commit 4b6c65862be90d75f949cda37e6914403b2f9f20 Author: Setra Solofoniaina <60129070+Setra-Solofoniaina@users.noreply.github.com> Date: Fri Apr 2 09:54:59 2021 +0300 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ddde8a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/src/dataset/corpus/* +/src/dataset/tok_model/* +/src/dataset/__pycache__ +/src/model/__pycache__ +/src/output/* +/src/trainer/__pycache__ +/.vscode \ No newline at end of file diff --git a/src/dataset/__init__.py b/src/dataset/__init__.py new file mode 100644 index 0000000..6845a39 --- /dev/null +++ b/src/dataset/__init__.py @@ -0,0 +1,2 @@ +from .tokenizer import BertTokenizer +from .dataset import BERTDataset \ No newline at end of file diff --git a/src/dataset/dataset.py b/src/dataset/dataset.py new file mode 100644 index 0000000..7f05bc0 --- /dev/null +++ b/src/dataset/dataset.py @@ -0,0 +1,158 @@ +"""Dataset Class for Bert""" +import random +import tqdm +import torch +import linecache +from torch.utils.data import Dataset +from .tokenizer import BertTokenizer + + +class BERTDataset(Dataset): + def __init__(self, corpus_path, tokenizer: BertTokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True): + self.tokenizer = tokenizer + self.seq_len = seq_len + + self.on_memory = on_memory + self.corpus_lines = corpus_lines + self.corpus_path = corpus_path + self.encoding = encoding + + self.corpus_lines = sum(1 for line in open(self.corpus_path)) + + # with open(corpus_path, "r", encoding=encoding) as f: + # if self.corpus_lines is None and not on_memory: + # for _ in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines): + # self.corpus_lines += 1 + + # if on_memory: + # self.lines = [line[:-1].split("\t") + # for line in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines)] + # self.corpus_lines = len(self.lines) + + # if not on_memory: + # self.file = open(corpus_path, "r", encoding=encoding) + # self.random_file = open(corpus_path, "r", encoding=encoding) + + # for _ in range(random.randint(0, self.corpus_lines if self.corpus_lines < 1000 else 1000)): + # self.random_file.__next__() + + def __len__(self): + return self.corpus_lines + + def __getitem__(self, item): + t1, t2, is_next_label = self.random_sent(item) + t1_random, t1_label = self.random_word(t1) + t2_random, t2_label = self.random_word(t2) + + # [CLS] tag = SOS tag, [SEP] tag = EOS tag + t1 = [self.tokenizer.sos_index] + t1_random + [self.tokenizer.eos_index] + t2 = t2_random + [self.tokenizer.eos_index] + + t1_label = [self.tokenizer.pad_index] + t1_label + [self.tokenizer.pad_index] + t2_label = t2_label + [self.tokenizer.pad_index] + + segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len] + bert_input = (t1 + t2)[:self.seq_len] + bert_label = (t1_label + t2_label)[:self.seq_len] + + padding = [self.tokenizer.pad_index for _ in range(self.seq_len - len(bert_input))] + bert_input.extend(padding) + bert_label.extend(padding) + segment_label.extend(padding) + + output = {"bert_input": bert_input, + "bert_label": bert_label, + "segment_label": segment_label, + "is_next": is_next_label} + + return {key: torch.tensor(value) for key, value in output.items()} #pylint: disable=not-callable + + def random_word(self, sentence): + # tokens = sentence.split() + output_label = [] + tokens = self.tokenizer.tokenize(sentence) + for i, token in enumerate(tokens): + prob = random.random() + if prob < 0.15: + prob /= 0.15 + + if prob < 0.8: + tokens[i] = self.tokenizer.mask_index + 
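+ # MLM corruption of the ~15% selected tokens, mirroring the commented-out reference below:
+ # 80% -> the [MASK] id (handled above), 10% -> a random vocab id, 10% -> kept unchanged;
+ # positions outside the 15% get a 0 label so the trainer's NLLLoss(ignore_index=0) skips them.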
elif prob < 0.9: + tokens[i] = self.tokenizer.getRandomTokenID() + else: + tokens[i] = token + output_label.append(token) + else: + tokens[i] = token + output_label.append(0) + return tokens, output_label + + # for i, token in enumerate(tokens): + # prob = random.random() + # if prob < 0.15: + # prob /= 0.15 + + # # 80% randomly change token to mask token + # if prob < 0.8: + # tokens[i] = self.vocab.mask_index + + # # 10% randomly change token to random token + # elif prob < 0.9: + # tokens[i] = random.randrange(len(self.vocab)) + + # # 10% randomly change token to current token + # else: + # tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index) + + # output_label.append(self.vocab.stoi.get(token, self.vocab.unk_index)) + + # else: + # tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index) + # output_label.append(0) + + # return tokens, output_label + + def random_sent(self, index): + t1, t2 = self.get_corpus_line(index) + # t1 = self.tokenizer.tokenize(t1) + # t2 = self.tokenizer.tokenize(t2) + # output_text, label(isNotNext:0, isNext:1) + if random.random() > 0.5: + return t1, t2, 1 + else: + # rand_line = self.tokenizer.tokenize(self.get_random_line()) + return t1, self.get_random_line(), 0 + + # def get_corpus_line(self, item): + # if self.on_memory: + # return self.lines[item][0], self.lines[item][1] + # else: + # line = self.file.__next__() + # if line is None: + # self.file.close() + # self.file = open(self.corpus_path, "r", encoding=self.encoding) + # line = self.file.__next__() + + # t1, t2 = line[:-1].split("\t") + # return t1, t2 + def get_corpus_line(self, item): + t1 = linecache.getline(self.corpus_path, item) + t2 = linecache.getline(self.corpus_path, item+1) + return t1, t2 + + # def get_random_line(self): + # if self.on_memory: + # return self.lines[random.randrange(len(self.lines))][1] + + # line = self.file.__next__() + # if line is None: + # self.file.close() + # self.file = open(self.corpus_path, "r", encoding=self.encoding) + # for _ in range(random.randint(0, self.corpus_lines if self.corpus_lines < 1000 else 1000)): + # self.random_file.__next__() + # line = self.random_file.__next__() + # return line[:-1].split("\t")[1] + + def get_random_line(self): + return linecache.getline(self.corpus_path, random.randint(1, self.corpus_lines)) \ No newline at end of file diff --git a/src/dataset/tokenizer.py b/src/dataset/tokenizer.py new file mode 100644 index 0000000..3a7249c --- /dev/null +++ b/src/dataset/tokenizer.py @@ -0,0 +1,46 @@ +""" Tokenizer class """ +import os +import random +from pathlib import Path +import tokenizers +from tokenizers.pre_tokenizers import Whitespace +from tokenizers.pre_tokenizers import Digits + + +class BertTokenizer(): + """Bert Tokenizer using WordPiece Tokenizer Model""" + def __init__(self, path): + self.path = path + text_paths = [str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")] + savedpath = "./dataset/tok_model/MaLaMo-vocab.txt" + if os.path.exists(savedpath): + self.tokenizer = tokenizers.BertWordPieceTokenizer( + "./dataset/tok_model/MaLaMo-vocab.txt", + ) + else: + self.tokenizer = tokenizers.BertWordPieceTokenizer() + self.tokenizer.train(files=text_paths, special_tokens=[ + "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=14200) + self.tokenizer.save_model("./dataset/tok_model", "MaLaMo") + self.tokenizer.enable_truncation(max_length=512) + self.pretokenizer = tokenizers.pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)]) + self.vocab = self.tokenizer.get_vocab() + 
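+ # cache the ids of the special tokens once; BERTDataset uses them to build
+ # [CLS] ... [SEP] inputs, padding and mask targets without re-encoding text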
self.mask_index = self.vocab.get("[MASK]") + self.pad_index = self.vocab.get("[PAD]") + self.eos_index = self.vocab.get("[SEP]") + self.sos_index = self.vocab.get("[CLS]") + self.unk_index = self.vocab.get("[UNK]") + + + def tokenize(self, sentence: str): + return self.tokenizer.encode(sentence).ids + + def getRandomTokenID(self): + return random.randint(6, len(self.vocab) - 1) + + def get_vocab(self): + return self.tokenizer.get_vocab() + + def get_vocab_size(self): + return self.tokenizer.get_vocab_size() + \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..a84dcd5 --- /dev/null +++ b/src/main.py @@ -0,0 +1,77 @@ +"""main entry for training""" + +import argparse + +from torch.utils.data import DataLoader + +from model.bert import BERT +from trainer import BERTTrainer +from dataset import BERTDataset, BertTokenizer + + +def train(): + parser = argparse.ArgumentParser() + + parser.add_argument("-c", "--train_dataset", type=str, default="./dataset/corpus/train.txt", help="train dataset for train bert") + parser.add_argument("-t", "--test_dataset", type=str, default="./dataset/corpus/test.txt", help="test set for evaluate train set") + #parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab") + parser.add_argument("-o", "--output_path", type=str, default="./output/bert.model", help="ex)output/bert.model") + + parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model") + parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers") + parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads") + parser.add_argument("-s", "--seq_len", type=int, default=512, help="maximum sequence len") + + parser.add_argument("-b", "--batch_size", type=int, default=8, help="number of batch_size") + parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs") + parser.add_argument("-w", "--num_workers", type=int, default=1, help="dataloader worker size") + + parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false") + parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n") + parser.add_argument("--corpus_lines", type=int, default=5110, help="total number of lines in corpus") + parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids") + parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false") + + parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam") + parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam") + parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value") + + args = parser.parse_args() + + print("Loading Vocab") + tokenizer = BertTokenizer("./dataset/corpus") + vocab_size = tokenizer.get_vocab_size() + print("Vocab Size: ", vocab_size) + + print("Loading Train Dataset", args.train_dataset) + train_dataset = BERTDataset(args.train_dataset, tokenizer, seq_len=args.seq_len, + corpus_lines=args.corpus_lines, on_memory=args.on_memory) + + print("Loading Test Dataset", args.test_dataset) + test_dataset = BERTDataset(args.test_dataset, tokenizer, seq_len=args.seq_len, on_memory=args.on_memory) \ + if 
args.test_dataset is not None else None + + print("Creating Dataloader") + train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers) + test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \ + if test_dataset is not None else None + + print("Building BERT model") + bert = BERT(vocab_size, hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads) + + print("Creating BERT Trainer") + trainer = BERTTrainer(bert, vocab_size, train_dataloader=train_data_loader, test_dataloader=test_data_loader, + lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, + with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq) + + print("Training Start") + for epoch in range(args.epochs): + trainer.train(epoch) + trainer.save(epoch, args.output_path) + + if test_data_loader is not None: + trainer.test(epoch) + +if __name__ == "__main__": + train() diff --git a/src/model/__init__.py b/src/model/__init__.py new file mode 100644 index 0000000..1a62230 --- /dev/null +++ b/src/model/__init__.py @@ -0,0 +1,2 @@ +from .bert import BERT +from .language_model import BERTLM \ No newline at end of file diff --git a/src/model/bert.py b/src/model/bert.py new file mode 100644 index 0000000..22dc5a8 --- /dev/null +++ b/src/model/bert.py @@ -0,0 +1,64 @@ +""" BERT CLASS MODuLE""" + +from .embedding import BERTEmbedding +import torch +import torch.nn as nn + + +class BERT(nn.Module): + """ + BERT model : Bidirectional Encoder Representations from Transformers. + """ + + def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1): + """ + :param vocab_size: vocab_size of total words + :param hidden: BERT model hidden size + :param n_layers: numbers of Transformer blocks(layers) + :param attn_heads: number of attention heads + :param dropout: dropout rate + """ + + super().__init__() + self.hidden = hidden + self.n_layers = n_layers + self.attn_heads = attn_heads + + # paper noted they used 4*hidden_size for ff_network_hidden_size + self.feed_forward_hidden = hidden * 4 + + self.src_mask = None + + # embedding for BERT, sum of positional, segment, token embeddings + self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden) + + # multi-layers transformer blocks, deep network + #self.transformer_blocks = nn.ModuleList( + # [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)]) + encoder_layers = nn.TransformerEncoderLayer(hidden, attn_heads, self.feed_forward_hidden, dropout) + self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers) + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def forward(self, x, segment_info, has_mask=True): + if has_mask: + if self.src_mask is None or self.src_mask.size(0) != len(x): + mask = self._generate_square_subsequent_mask(len(x)) + self.src_mask = mask + else: + self.src_mask = None + # attention masking for padded token + # torch.ByteTensor([batch_size, 1, seq_len, seq_len) + #mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1) + #mask = mask.view(-1, 512, 512) + + #print(x) + + # embedding the indexed sequence to sequence of vectors + x = self.embedding(x, segment_info) + x = self.transformer_encoder(x, self.src_mask) + + return x \ No newline at end of file 
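A minimal shape-check sketch for the encoder above (not part of this commit): it assumes the snippet is run from src/ and that the illustrative vocab size matches the tokenizer's (14200 in tokenizer.py). Note that nn.TransformerEncoder defaults to (seq_len, batch, hidden) inputs while BERTEmbedding returns (batch, seq_len, hidden), so this only confirms that the tensors flow end to end; it does not validate the attention-mask semantics.

# shape_check.py -- hypothetical helper, run from src/
import torch

from model.bert import BERT

bert = BERT(vocab_size=14200, hidden=256, n_layers=8, attn_heads=8)

tokens = torch.randint(0, 14200, (2, 64))    # (batch, seq_len) token ids
segments = torch.randint(1, 3, (2, 64))      # segment ids: 1 = sentence A, 2 = sentence B

hidden_states = bert(tokens, segments)
print(hidden_states.shape)                   # torch.Size([2, 64, 256])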
diff --git a/src/model/embedding/__init__.py b/src/model/embedding/__init__.py new file mode 100644 index 0000000..d9cc742 --- /dev/null +++ b/src/model/embedding/__init__.py @@ -0,0 +1 @@ +from .bert import BERTEmbedding \ No newline at end of file diff --git a/src/model/embedding/__pycache__/__init__.cpython-38.pyc b/src/model/embedding/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..a7b79af Binary files /dev/null and b/src/model/embedding/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/model/embedding/__pycache__/bert.cpython-38.pyc b/src/model/embedding/__pycache__/bert.cpython-38.pyc new file mode 100644 index 0000000..bc97395 Binary files /dev/null and b/src/model/embedding/__pycache__/bert.cpython-38.pyc differ diff --git a/src/model/embedding/__pycache__/position.cpython-38.pyc b/src/model/embedding/__pycache__/position.cpython-38.pyc new file mode 100644 index 0000000..ca746b0 Binary files /dev/null and b/src/model/embedding/__pycache__/position.cpython-38.pyc differ diff --git a/src/model/embedding/__pycache__/segment.cpython-38.pyc b/src/model/embedding/__pycache__/segment.cpython-38.pyc new file mode 100644 index 0000000..475ef68 Binary files /dev/null and b/src/model/embedding/__pycache__/segment.cpython-38.pyc differ diff --git a/src/model/embedding/__pycache__/token.cpython-38.pyc b/src/model/embedding/__pycache__/token.cpython-38.pyc new file mode 100644 index 0000000..0748f5d Binary files /dev/null and b/src/model/embedding/__pycache__/token.cpython-38.pyc differ diff --git a/src/model/embedding/bert.py b/src/model/embedding/bert.py new file mode 100644 index 0000000..5984460 --- /dev/null +++ b/src/model/embedding/bert.py @@ -0,0 +1,36 @@ +"""Bert Embedding Module""" + +import torch.nn as nn +from .token import TokenEmbedding +from .position import PositionalEmbedding +from .segment import SegmentEmbedding + + +class BERTEmbedding(nn.Module): + """ + BERT Embedding which is consisted with under features + 1. TokenEmbedding : normal embedding matrix + 2. PositionalEmbedding : adding positional information using sin, cos + 2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2) + sum of all these features are output of BERTEmbedding + """ + + def __init__(self, vocab_size, embed_size, dropout=0.1): + """ + :param vocab_size: total vocab size + :param embed_size: embedding size of token embedding + :param dropout: dropout rate + """ + super().__init__() + self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size) + self.position = PositionalEmbedding(d_model=self.token.embedding_dim) + #self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim) + self.segment = nn.Embedding(8, self.token.embedding_dim, padding_idx=0) + self.dropout = nn.Dropout(p=dropout) + self.embed_size = embed_size + + def forward(self, sequence, segment_label): + #print(segment_label.shape) + #segmented = self.segment(segment_label) + x = self.token(sequence) + self.position(sequence) + self.segment(segment_label) + return self.dropout(x) \ No newline at end of file diff --git a/src/model/embedding/position.py b/src/model/embedding/position.py new file mode 100644 index 0000000..ddb18f9 --- /dev/null +++ b/src/model/embedding/position.py @@ -0,0 +1,27 @@ +""" Positional Embedding Module """ + +import math +import torch.nn as nn +import torch + + +class PositionalEmbedding(nn.Module): + + def __init__(self, d_model, max_len=512): + super().__init__() + + # Compute the positional encodings once in log space. 
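+ # PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
+ # PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))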
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] \ No newline at end of file diff --git a/src/model/embedding/segment.py b/src/model/embedding/segment.py new file mode 100644 index 0000000..110a5bf --- /dev/null +++ b/src/model/embedding/segment.py @@ -0,0 +1,6 @@ +import torch.nn as nn + + +class SegmentEmbedding(nn.Embedding): + def __init__(self, embed_size=512): + super().__init__(3, embed_size, padding_idx=0) \ No newline at end of file diff --git a/src/model/embedding/token.py b/src/model/embedding/token.py new file mode 100644 index 0000000..de7df41 --- /dev/null +++ b/src/model/embedding/token.py @@ -0,0 +1,6 @@ +import torch.nn as nn + + +class TokenEmbedding(nn.Embedding): + def __init__(self, vocab_size, embed_size=512): + super().__init__(vocab_size, embed_size, padding_idx=0) \ No newline at end of file diff --git a/src/model/language_model.py b/src/model/language_model.py new file mode 100644 index 0000000..608f42a --- /dev/null +++ b/src/model/language_model.py @@ -0,0 +1,61 @@ +import torch.nn as nn + +from .bert import BERT + + +class BERTLM(nn.Module): + """ + BERT Language Model + Next Sentence Prediction Model + Masked Language Model + """ + + def __init__(self, bert: BERT, vocab_size): + """ + :param bert: BERT model which should be trained + :param vocab_size: total vocab size for masked_lm + """ + + super().__init__() + self.bert = bert + self.next_sentence = NextSentencePrediction(self.bert.hidden) + self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size) + + def forward(self, x, segment_label): + x = self.bert(x, segment_label) + return self.next_sentence(x), self.mask_lm(x) + + +class NextSentencePrediction(nn.Module): + """ + 2-class classification model : is_next, is_not_next + """ + + def __init__(self, hidden): + """ + :param hidden: BERT model output size + """ + super().__init__() + self.linear = nn.Linear(hidden, 2) + self.softmax = nn.LogSoftmax(dim=-1) + + def forward(self, x): + return self.softmax(self.linear(x[:, 0])) + + +class MaskedLanguageModel(nn.Module): + """ + predicting origin token from masked input sequence + n-class classification problem, n-class = vocab_size + """ + + def __init__(self, hidden, vocab_size): + """ + :param hidden: output size of BERT model + :param vocab_size: total vocab size + """ + super().__init__() + self.linear = nn.Linear(hidden, vocab_size) + self.softmax = nn.LogSoftmax(dim=-1) + + def forward(self, x): + return self.softmax(self.linear(x)) diff --git a/src/model/utils/__init__.py b/src/model/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/model/utils/gelu.py b/src/model/utils/gelu.py new file mode 100644 index 0000000..9809fcc --- /dev/null +++ b/src/model/utils/gelu.py @@ -0,0 +1,12 @@ +import math +import torch.nn as nn +import torch + + +class GELU(nn.Module): + """ + GELU Activation function + """ + + def forward(self, x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) \ No newline at end of file diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py new file mode 100644 index 0000000..2727c92 --- /dev/null +++ b/src/trainer/__init__.py 
@@ -0,0 +1 @@ +from .pretrain import BERTTrainer \ No newline at end of file diff --git a/src/trainer/optim_schedule.py b/src/trainer/optim_schedule.py new file mode 100644 index 0000000..cfe7472 --- /dev/null +++ b/src/trainer/optim_schedule.py @@ -0,0 +1,35 @@ +'''A wrapper class for optimizer ''' +import numpy as np + + +class ScheduledOptim(): + '''A simple wrapper class for learning rate scheduling''' + + def __init__(self, optimizer, d_model, n_warmup_steps): + self._optimizer = optimizer + self.n_warmup_steps = n_warmup_steps + self.n_current_steps = 0 + self.init_lr = np.power(d_model, -0.5) + + def step_and_update_lr(self): + "Step with the inner optimizer" + self._update_learning_rate() + self._optimizer.step() + + def zero_grad(self): + "Zero out the gradients by the inner optimizer" + self._optimizer.zero_grad() + + def _get_lr_scale(self): + return np.min([ + np.power(self.n_current_steps, -0.5), + np.power(self.n_warmup_steps, -1.5) * self.n_current_steps]) + + def _update_learning_rate(self): + ''' Learning rate scheduling per step ''' + + self.n_current_steps += 1 + lr = self.init_lr * self._get_lr_scale() + + for param_group in self._optimizer.param_groups: + param_group['lr'] = lr \ No newline at end of file diff --git a/src/trainer/pretrain.py b/src/trainer/pretrain.py new file mode 100644 index 0000000..5a4d874 --- /dev/null +++ b/src/trainer/pretrain.py @@ -0,0 +1,152 @@ +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.utils.data import DataLoader + +from model.bert import BERT +from model.language_model import BERTLM +from .optim_schedule import ScheduledOptim + +import tqdm + + +class BERTTrainer: + """ + BERTTrainer make the pretrained BERT model with two LM training method. + + 1. Masked Language Model : 3.3.1 Task #1: Masked LM + 2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction + + please check the details on README.md with simple example. 
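+ (both heads are optimized jointly in iteration(): NLLLoss on the next-sentence
+ output plus NLLLoss on the masked-token output)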
+ + """ + + def __init__(self, bert: BERT, vocab_size: int, + train_dataloader: DataLoader, test_dataloader: DataLoader = None, + lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000, + with_cuda: bool = True, cuda_devices=None, log_freq: int = 10): + """ + :param bert: BERT model which you want to train + :param vocab_size: total word vocab size + :param train_dataloader: train dataset data loader + :param test_dataloader: test dataset data loader [can be None] + :param lr: learning rate of optimizer + :param betas: Adam optimizer betas + :param weight_decay: Adam optimizer weight decay param + :param with_cuda: traning with cuda + :param log_freq: logging frequency of the batch iteration + """ + + # Setup cuda device for BERT training, argument -c, --cuda should be true + cuda_condition = torch.cuda.is_available() and with_cuda + self.device = torch.device("cuda:0" if cuda_condition else "cpu") + + # This BERT model will be saved every epoch + self.bert = bert + # Initialize the BERT Language Model, with BERT model + self.model = BERTLM(bert, vocab_size).to(self.device) + + # Distributed GPU training if CUDA can detect more than 1 GPU + if with_cuda and torch.cuda.device_count() > 1: + print("Using %d GPUS for BERT" % torch.cuda.device_count()) + self.model = nn.DataParallel(self.model, device_ids=cuda_devices) + + # Setting the train and test data loader + self.train_data = train_dataloader + self.test_data = test_dataloader + + # Setting the Adam optimizer with hyper-param + self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) + self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps) + + # Using Negative Log Likelihood Loss function for predicting the masked_token + self.criterion = nn.NLLLoss(ignore_index=0) + + self.log_freq = log_freq + + print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()])) + + def train(self, epoch): + self.iteration(epoch, self.train_data) + + def test(self, epoch): + self.iteration(epoch, self.test_data, train=False) + + def iteration(self, epoch, data_loader, train=True): + """ + loop over the data_loader for training or testing + if on train status, backward operation is activated + and also auto save the model every peoch + + :param epoch: current epoch index + :param data_loader: torch.utils.data.DataLoader for iteration + :param train: boolean value of is train or test + :return: None + """ + str_code = "train" if train else "test" + + # Setting the tqdm progress bar + data_iter = tqdm.tqdm(enumerate(data_loader), + desc="EP_%s:%d" % (str_code, epoch), + total=len(data_loader), + bar_format="{l_bar}{r_bar}") + + avg_loss = 0.0 + total_correct = 0 + total_element = 0 + + for i, data in data_iter: + # 0. batch_data will be sent into the device(GPU or cpu) + data = {key: value.to(self.device) for key, value in data.items()} + + # 1. forward the next_sentence_prediction and masked_lm model + next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"]) + + # 2-1. NLL(negative log likelihood) loss of is_next classification result + next_loss = self.criterion(next_sent_output, data["is_next"]) + + # 2-2. NLLLoss of predicting masked token word + mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"]) + + # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure + loss = next_loss + mask_loss + + # 3. 
backward and optimization only in train + if train: + self.optim_schedule.zero_grad() + loss.backward() + self.optim_schedule.step_and_update_lr() + + # next sentence prediction accuracy + correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item() + avg_loss += loss.item() + total_correct += correct + total_element += data["is_next"].nelement() + + post_fix = { + "epoch": epoch, + "iter": i, + "avg_loss": avg_loss / (i + 1), + "avg_acc": total_correct / total_element * 100, + "loss": loss.item() + } + + if i % self.log_freq == 0: + data_iter.write(str(post_fix)) + + print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", + total_correct * 100.0 / total_element) + + def save(self, epoch, file_path="output/bert_trained.model"): + """ + Saving the current BERT model on file_path + + :param epoch: current epoch number + :param file_path: model output path which gonna be file_path+"ep%d" % epoch + :return: final_output_path + """ + output_path = file_path + ".ep%d" % epoch + torch.save(self.bert.cpu(), output_path) + self.bert.to(self.device) + print("EP:%d Model Saved on:" % epoch, output_path) + return output_path
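A reload sketch (not part of this commit), assuming the same PyTorch version used for training and an importable model package (e.g. run from src/); the path and epoch suffix are illustrative. Since save() pickles the whole module with torch.save, the encoder comes back ready to use:

# load_checkpoint.py -- hypothetical helper, run from src/
import torch

# BERTTrainer.save(9, "./output/bert.model") writes "./output/bert.model.ep9"
bert = torch.load("./output/bert.model.ep9", map_location="cpu")
bert.eval()

The reloaded object is the bare BERT encoder only (the BERTLM heads are not saved), so it can be wrapped in BERTLM again to resume pretraining or queried directly for contextual embeddings as in the shape check shown earlier.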