Setra Solofoniaina 2021-04-02 10:24:30 +03:00
parent 42efbf86b6
commit d988d3e4a3
4 changed files with 3 additions and 82 deletions

View File

@@ -19,23 +19,6 @@ class BERTDataset(Dataset):
self.corpus_lines = sum(1 for line in open(self.corpus_path))
# with open(corpus_path, "r", encoding=encoding) as f:
# if self.corpus_lines is None and not on_memory:
# for _ in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines):
# self.corpus_lines += 1
# if on_memory:
# self.lines = [line[:-1].split("\t")
# for line in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines)]
# self.corpus_lines = len(self.lines)
# if not on_memory:
# self.file = open(corpus_path, "r", encoding=encoding)
# self.random_file = open(corpus_path, "r", encoding=encoding)
# for _ in range(random.randint(0, self.corpus_lines if self.corpus_lines < 1000 else 1000)):
# self.random_file.__next__()
def __len__(self):
return self.corpus_lines
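
The block removed above was the old streaming/on-memory loader; after this change the dataset only needs a total line count. A minimal sketch of that counting step, assuming a placeholder corpus path and a context manager (not present in the diff) so the handle is closed explicitly:

def count_corpus_lines(corpus_path):
    # count newline-terminated lines without loading the corpus into memory
    with open(corpus_path, encoding="utf-8") as f:
        return sum(1 for _ in f)

corpus_lines = count_corpus_lines("./dataset/corpus/example.txt")  # hypothetical path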
@@ -88,71 +71,19 @@ class BERTDataset(Dataset):
output_label.append(0)
return tokens, output_label
# for i, token in enumerate(tokens):
# prob = random.random()
# if prob < 0.15:
# prob /= 0.15
# # 80% randomly change token to mask token
# if prob < 0.8:
# tokens[i] = self.vocab.mask_index
# # 10% randomly change token to random token
# elif prob < 0.9:
# tokens[i] = random.randrange(len(self.vocab))
# # 10% randomly change token to current token
# else:
# tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)
# output_label.append(self.vocab.stoi.get(token, self.vocab.unk_index))
# else:
# tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)
# output_label.append(0)
# return tokens, output_label
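
The commented-out block being deleted here is BERT's standard 80/10/10 token-corruption rule for the masked-LM task. A self-contained sketch of that rule, assuming a vocab object with mask_index, unk_index and a stoi mapping, as in the surrounding code:

import random

def random_word(tokens, vocab):
    output_label = []
    for i, token in enumerate(tokens):
        prob = random.random()
        if prob < 0.15:                 # corrupt 15% of positions
            prob /= 0.15
            if prob < 0.8:              # 80% of those: replace with [MASK]
                tokens[i] = vocab.mask_index
            elif prob < 0.9:            # 10%: replace with a random token id
                tokens[i] = random.randrange(len(vocab))
            else:                       # 10%: keep the original token id
                tokens[i] = vocab.stoi.get(token, vocab.unk_index)
            output_label.append(vocab.stoi.get(token, vocab.unk_index))
        else:
            tokens[i] = vocab.stoi.get(token, vocab.unk_index)
            output_label.append(0)      # 0 marks positions excluded from the loss
    return tokens, output_label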
def random_sent(self, index):
t1, t2 = self.get_corpus_line(index)
# t1 = self.tokenizer.tokenize(t1)
# t2 = self.tokenizer.tokenize(t2)
# output_text, label(isNotNext:0, isNext:1)
if random.random() > 0.5:
return t1, t2, 1
else:
# rand_line = self.tokenizer.tokenize(self.get_random_line())
return t1, self.get_random_line(), 0
# def get_corpus_line(self, item):
# if self.on_memory:
# return self.lines[item][0], self.lines[item][1]
# else:
# line = self.file.__next__()
# if line is None:
# self.file.close()
# self.file = open(self.corpus_path, "r", encoding=self.encoding)
# line = self.file.__next__()
# t1, t2 = line[:-1].split("\t")
# return t1, t2
def get_corpus_line(self, item):
t1 = linecache.getline(self.corpus_path, item)
t2 = linecache.getline(self.corpus_path, item+1)
return t1, t2
# def get_random_line(self):
# if self.on_memory:
# return self.lines[random.randrange(len(self.lines))][1]
# line = self.file.__next__()
# if line is None:
# self.file.close()
# self.file = open(self.corpus_path, "r", encoding=self.encoding)
# for _ in range(random.randint(0, self.corpus_lines if self.corpus_lines < 1000 else 1000)):
# self.random_file.__next__()
# line = self.random_file.__next__()
# return line[:-1].split("\t")[1]
def get_random_line(self):
return linecache.getline(self.corpus_path, random.randint(1, self.corpus_lines))
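
Both accessors now read lazily from disk via linecache instead of keeping open file handles. A small usage sketch; note that linecache.getline is 1-indexed and returns an empty string for out-of-range line numbers, so callers should stay within [1, corpus_lines] (the corpus path below is a placeholder):

import linecache
import random

corpus_path = "./dataset/corpus/example.txt"    # hypothetical corpus file
with open(corpus_path, encoding="utf-8") as f:
    corpus_lines = sum(1 for _ in f)

t1 = linecache.getline(corpus_path, 1)          # a sentence
t2 = linecache.getline(corpus_path, 2)          # the sentence that follows it
rand = linecache.getline(corpus_path, random.randint(1, corpus_lines))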

View File

@@ -12,16 +12,16 @@ class BertTokenizer():
def __init__(self, path):
self.path = path
text_paths = [str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")]
savedpath = "./dataset/tok_model/MaLaMo-vocab.txt"
savedpath = "./dataset/tok_model/MALBERT-vocab.txt"
if os.path.exists(savedpath):
self.tokenizer = tokenizers.BertWordPieceTokenizer(
"./dataset/tok_model/MaLaMo-vocab.txt",
"./dataset/tok_model/MALBERT-vocab.txt",
)
else:
self.tokenizer = tokenizers.BertWordPieceTokenizer()
self.tokenizer.train(files=text_paths, special_tokens=[
"[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=14200)
self.tokenizer.save_model("./dataset/tok_model", "MaLaMo")
self.tokenizer.save_model("./dataset/tok_model", "MALBERT")
self.tokenizer.enable_truncation(max_length=512)
self.pretokenizer = tokenizers.pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
self.vocab = self.tokenizer.get_vocab()
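
This hunk only renames the saved vocabulary prefix from MaLaMo to MALBERT. A hedged usage sketch for the wrapped tokenizers API, assuming the vocab file has already been trained and saved as above; the sample sentence is a placeholder:

import tokenizers

tok = tokenizers.BertWordPieceTokenizer("./dataset/tok_model/MALBERT-vocab.txt")
tok.enable_truncation(max_length=512)

enc = tok.encode("an example sentence")   # placeholder text
print(enc.tokens)   # wordpiece strings, typically wrapped in [CLS] ... [SEP]
print(enc.ids)      # the matching vocabulary ids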

View File

@@ -33,8 +33,6 @@ class BERT(nn.Module):
self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)
# multi-layers transformer blocks, deep network
#self.transformer_blocks = nn.ModuleList(
# [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
encoder_layers = nn.TransformerEncoderLayer(hidden, attn_heads, self.feed_forward_hidden, dropout, activation="gelu")
self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
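
The custom TransformerBlock stack is dropped in favour of torch.nn's built-in encoder. A minimal construction sketch matching the two lines above; the hyperparameter values are illustrative rather than taken from this repository:

import torch
import torch.nn as nn

hidden, attn_heads, n_layers, dropout = 256, 8, 6, 0.1
layer = nn.TransformerEncoderLayer(hidden, attn_heads, hidden * 4, dropout,
                                   activation="gelu")
encoder = nn.TransformerEncoder(layer, n_layers)

x = torch.randn(128, 2, hidden)   # (seq_len, batch, hidden); batch_first is False by default
out = encoder(x)                  # output keeps the input shape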
@@ -50,12 +48,6 @@ class BERT(nn.Module):
self.src_mask = mask
else:
self.src_mask = None
# attention masking for padded token
# torch.ByteTensor([batch_size, 1, seq_len, seq_len)
#mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
#mask = mask.view(-1, 512, 512)
#print(x)
# embedding the indexed sequence to sequence of vectors
x = self.embedding(x, segment_info)
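
The commented-out lines above built a broadcastable ByteTensor padding mask by hand; with nn.TransformerEncoder the usual equivalent is a boolean src_key_padding_mask. A hedged sketch, assuming pad index 0 and a (batch, seq_len) id tensor; this is an alternative, not the exact masking this model applies:

import torch

ids = torch.tensor([[5, 7, 9, 0, 0],
                    [3, 2, 0, 0, 0]])   # (batch, seq_len), 0 = [PAD]
key_padding_mask = ids.eq(0)            # True where attention should be blocked
# would be passed as: transformer_encoder(emb, src_key_padding_mask=key_padding_mask)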

View File

@@ -17,8 +17,6 @@ class BERTTrainer:
1. Masked Language Model : 3.3.1 Task #1: Masked LM
2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction
please check the details on README.md with simple example.
"""
def __init__(self, bert: BERT, vocab_size: int,