complete data processing and first version of training script

zyr 2021-06-06 20:50:36 +08:00
commit b595346a07
12 changed files with 2193 additions and 0 deletions


@@ -0,0 +1,434 @@
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
NamedTuple,
Optional,
Sequence,
Tuple,
Union,
)
import numpy as np
import tokenizers
import torch
from transformers import BatchEncoding
EncodedInput = List[int]
@dataclass
class MyDataCollatorForPreTraining:
# """
# Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
# are not all of the same length.
# Args:
# # tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
# tokenizer (:class:`tokenizers.Tokenizer`)
# The tokenizer used for encoding the data.
# mlm (:obj:`bool`, `optional`, defaults to :obj:`True`):
# Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the
# inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
# non-masked tokens and the value to predict for the masked token.
# mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
# The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
# pad_to_multiple_of (:obj:`int`, `optional`):
# If set will pad the sequence to a multiple of the provided value.
# .. note::
# For best performance, this data collator should be used with a dataset having items that are dictionaries or
# BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
# :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
# argument :obj:`return_special_tokens_mask=True`.
# """
# def __init__(
# self,
# tokenizer: tokenizers.Tokenizer,
# mlm: bool = True,
# mlm_probability: float = 0.15,
# pad_to_multiple_of: Optional[int] = None,
# ):
# self.tokenizer = tokenizer
# self.mlm = mlm
# self.mlm_probability = mlm_probability
# self.pad_to_multiple_of = pad_to_multiple_of
tokenizer: tokenizers.Tokenizer
mlm: bool = True
mlm_probability: float = 0.15
pad_to_multiple_of: Optional[int] = None
def __post_init__(self):
if self.mlm and self.tokenizer.token_to_id("[MASK]") is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
"You should pass `mlm=False` to train on causal language modeling instead."
)
def __call__(
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]],
) -> Dict[str, torch.Tensor]:
# Handle dict or lists with proper padding and conversion to tensor.
if isinstance(examples[0], (dict, BatchEncoding)):
batch = self.pad(
examples,
return_tensors="pt",
pad_to_multiple_of=self.pad_to_multiple_of,
)
else:
batch = {
"input_ids": _collate_batch(
examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of
)
}
# If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
batch["input_ids"], batch["labels"] = self.mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
)
# else:
# labels = batch["input_ids"].clone()
# if self.tokenizer.pad_token_id is not None:
# labels[labels == self.tokenizer.pad_token_id] = -100
# batch["labels"] = labels
return batch
def mask_tokens(
self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
labels = inputs.clone()
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
probability_matrix = torch.full(labels.shape, self.mlm_probability)
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(
val, already_has_special_tokens=True
)
for val in labels.tolist()
]
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
else:
special_tokens_mask = special_tokens_mask.bool()
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = (
torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
)
# inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(
# self.tokenizer.mask_token
# )
inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]")
# 10% of the time, we replace masked input tokens with random word
indices_random = (
torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
& masked_indices
& ~indices_replaced
)
random_words = torch.randint(
self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long
)
inputs[indices_random] = random_words[indices_random]
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def pad(
self,
encoded_inputs: Union[
BatchEncoding,
List[BatchEncoding],
Dict[str, EncodedInput],
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
padding=True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors=None,
verbose: bool = True,
) -> BatchEncoding:
"""
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
in the batch.
Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
``self.pad_token_id`` and ``self.pad_token_type_id``)
.. note::
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
case of PyTorch tensors, you will lose the specific device of your tensors however.
Args:
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
well as in a PyTorch Dataloader collate function.
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta).
return_attention_mask (:obj:`bool`, `optional`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to print more information and warnings.
"""
# If we have a list of dicts, let's convert it into a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
if isinstance(encoded_inputs, (list, tuple)) and isinstance(
encoded_inputs[0], (dict, BatchEncoding)
):
encoded_inputs = {
key: [example[key] for example in encoded_inputs]
for key in encoded_inputs[0].keys()
}
# The model's main input name, usually `input_ids`, has to be passed for padding
# if self.model_input_names[0] not in encoded_inputs:
# raise ValueError(
# "You should supply an encoding or a list of encodings to this method "
# f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
# )
required_input = encoded_inputs["input_ids"]
if not required_input:
if return_attention_mask:
encoded_inputs["attention_mask"] = []
return encoded_inputs
# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
# and rebuild them afterwards if no return_tensors is specified
# Note that we lose the specific device the tensor may be on for PyTorch
first_element = required_input[0]
if isinstance(first_element, (list, tuple)):
# first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
index = 0
while len(required_input[index]) == 0:
index += 1
if index < len(required_input):
first_element = required_input[index][0]
# At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
if not isinstance(first_element, (int, list, tuple)):
if isinstance(first_element, torch.Tensor):
return_tensors = "pt" if return_tensors is None else return_tensors
elif isinstance(first_element, np.ndarray):
return_tensors = "np" if return_tensors is None else return_tensors
else:
raise ValueError(
f"type of {first_element} unknown: {type(first_element)}. "
f"Should be one of a python, numpy, pytorch or tensorflow object."
)
for key, value in encoded_inputs.items():
encoded_inputs[key] = to_py_obj(value)
# # Convert padding_strategy in PaddingStrategy
# padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
# padding=padding, max_length=max_length, verbose=verbose
# )
required_input = encoded_inputs["input_ids"]
if required_input and not isinstance(required_input[0], (list, tuple)):
# encoded_inputs = _pad(
# encoded_inputs,
# max_length=max_length,
# # padding_strategy=padding_strategy,
# pad_to_multiple_of=pad_to_multiple_of,
# return_attention_mask=return_attention_mask,
# )
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
batch_size = len(required_input)
assert all(
len(v) == batch_size for v in encoded_inputs.values()
), "Some items in the output dictionary have a different batch size than others."
# if padding_strategy == PaddingStrategy.LONGEST:
# max_length = max(len(inputs) for inputs in required_input)
# padding_strategy = PaddingStrategy.MAX_LENGTH
batch_outputs = {}
for i in range(batch_size):
inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
# outputs = self._pad(
# inputs,
# max_length=max_length,
# # padding_strategy=padding_strategy,
# pad_to_multiple_of=pad_to_multiple_of,
# return_attention_mask=return_attention_mask,
# )
for key, value in inputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
return BatchEncoding(batch_outputs, tensor_type=return_tensors)
# def _pad(
# self,
# encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
# max_length: Optional[int] = None,
# padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
# pad_to_multiple_of: Optional[int] = None,
# return_attention_mask: Optional[bool] = None,
# ) -> dict:
# """
# Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
# Args:
# encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
# max_length: maximum length of the returned list and optionally padding length (see below).
# Will truncate by taking into account the special tokens.
# padding_strategy: PaddingStrategy to use for padding.
# - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
# - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
# - PaddingStrategy.DO_NOT_PAD: Do not pad
# The tokenizer padding sides are defined in self.padding_side:
# - 'left': pads on the left of the sequences
# - 'right': pads on the right of the sequences
# pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
# This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
# >= 7.5 (Volta).
# return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
# """
# # Load from model defaults
# if return_attention_mask is None:
# return_attention_mask = "attention_mask" in self.model_input_names
# required_input = encoded_inputs[self.model_input_names[0]]
# if padding_strategy == PaddingStrategy.LONGEST:
# max_length = len(required_input)
# if (
# max_length is not None
# and pad_to_multiple_of is not None
# and (max_length % pad_to_multiple_of != 0)
# ):
# max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
# needs_to_be_padded = (
# padding_strategy != PaddingStrategy.DO_NOT_PAD
# and len(required_input) != max_length
# )
# if needs_to_be_padded:
# difference = max_length - len(required_input)
# if self.padding_side == "right":
# if return_attention_mask:
# encoded_inputs["attention_mask"] = [1] * len(required_input) + [
# 0
# ] * difference
# if "token_type_ids" in encoded_inputs:
# encoded_inputs["token_type_ids"] = (
# encoded_inputs["token_type_ids"]
# + [self.pad_token_type_id] * difference
# )
# if "special_tokens_mask" in encoded_inputs:
# encoded_inputs["special_tokens_mask"] = (
# encoded_inputs["special_tokens_mask"] + [1] * difference
# )
# encoded_inputs[self.model_input_names[0]] = (
# required_input + [self.pad_token_id] * difference
# )
# elif self.padding_side == "left":
# if return_attention_mask:
# encoded_inputs["attention_mask"] = [0] * difference + [1] * len(
# required_input
# )
# if "token_type_ids" in encoded_inputs:
# encoded_inputs["token_type_ids"] = [
# self.pad_token_type_id
# ] * difference + encoded_inputs["token_type_ids"]
# if "special_tokens_mask" in encoded_inputs:
# encoded_inputs["special_tokens_mask"] = [
# 1
# ] * difference + encoded_inputs["special_tokens_mask"]
# encoded_inputs[self.model_input_names[0]] = [
# self.pad_token_id
# ] * difference + required_input
# else:
# raise ValueError("Invalid padding strategy:" + str(self.padding_side))
# elif return_attention_mask and "attention_mask" not in encoded_inputs:
# encoded_inputs["attention_mask"] = [1] * len(required_input)
# return encoded_inputs
def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple)):
examples = [torch.tensor(e, dtype=torch.long) for e in examples]
# Check if padding is necessary.
length_of_first = examples[0].size(0)
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
if are_tensors_same_length and (
pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0
):
return torch.stack(examples, dim=0)
# If yes, check if we have a `pad_token`.
if tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(x.size(0) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
for i, example in enumerate(examples):
if tokenizer.padding_side == "right":
result[i, : example.shape[0]] = example
else:
result[i, -example.shape[0] :] = example
return result
def to_py_obj(obj):
if isinstance(obj, torch.Tensor):
return obj.detach().cpu().tolist()
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj

my_data_collator.py Normal file

@@ -0,0 +1,272 @@
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
NamedTuple,
Optional,
Sequence,
Tuple,
Union,
)
import numpy as np
import tokenizers
import torch
from transformers import BatchEncoding
EncodedInput = List[int]
@dataclass
class MyDataCollatorForPreTraining:
tokenizer: tokenizers.Tokenizer
mlm: bool = True
mlm_probability: float = 0.15
pad_to_multiple_of: Optional[int] = None
def __post_init__(self):
if self.mlm and self.tokenizer.token_to_id("[MASK]") is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
"You should pass `mlm=False` to train on causal language modeling instead."
)
def __call__(
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]],
) -> Dict[str, torch.Tensor]:
# Handle dict or lists with proper padding and conversion to tensor.
if isinstance(examples[0], (dict, BatchEncoding)):
batch = pad(
encoded_inputs=examples,
return_tensors="pt",
pad_to_multiple_of=self.pad_to_multiple_of,
)
else:
batch = {
"input_ids": _collate_batch(
examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of
)
}
# If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
batch["input_ids"], batch["labels"] = self.mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
)
return batch
def mask_tokens(
self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
labels = inputs.clone()
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
probability_matrix = torch.full(labels.shape, self.mlm_probability)
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(
val, already_has_special_tokens=True
)
for val in labels.tolist()
]
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
else:
special_tokens_mask = special_tokens_mask.bool()
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = (
torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
)
# inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(
# self.tokenizer.mask_token
# )
inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]")
# 10% of the time, we replace masked input tokens with random word
indices_random = (
torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
& masked_indices
& ~indices_replaced
)
random_words = torch.randint(
self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long
)
inputs[indices_random] = random_words[indices_random]
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def pad(
encoded_inputs: Union[
BatchEncoding,
List[BatchEncoding],
Dict[str, EncodedInput],
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
padding=True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors=None,
verbose: bool = True,
) -> BatchEncoding:
"""
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
in the batch.
Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
``self.pad_token_id`` and ``self.pad_token_type_id``)
.. note::
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
case of PyTorch tensors, you will lose the specific device of your tensors however.
Args:
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
well as in a PyTorch Dataloader collate function.
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta).
return_attention_mask (:obj:`bool`, `optional`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to print more information and warnings.
"""
# If we have a list of dicts, let's convert it into a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
if isinstance(encoded_inputs, (list, tuple)) and isinstance(
encoded_inputs[0], (dict, BatchEncoding)
):
encoded_inputs = {
key: [example[key] for example in encoded_inputs]
for key in encoded_inputs[0].keys()
}
required_input = encoded_inputs["input_ids"]
if not required_input:
if return_attention_mask:
encoded_inputs["attention_mask"] = []
return encoded_inputs
# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
# and rebuild them afterwards if no return_tensors is specified
# Note that we lose the specific device the tensor may be on for PyTorch
first_element = required_input[0]
if isinstance(first_element, (list, tuple)):
# first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
index = 0
while len(required_input[index]) == 0:
index += 1
if index < len(required_input):
first_element = required_input[index][0]
# At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
if not isinstance(first_element, (int, list, tuple)):
if isinstance(first_element, torch.Tensor):
return_tensors = "pt" if return_tensors is None else return_tensors
elif isinstance(first_element, np.ndarray):
return_tensors = "np" if return_tensors is None else return_tensors
else:
raise ValueError(
f"type of {first_element} unknown: {type(first_element)}. "
f"Should be one of a python, numpy, pytorch or tensorflow object."
)
for key, value in encoded_inputs.items():
encoded_inputs[key] = to_py_obj(value)
required_input = encoded_inputs["input_ids"]
if required_input and not isinstance(required_input[0], (list, tuple)):
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
batch_size = len(required_input)
assert all(
len(v) == batch_size for v in encoded_inputs.values()
), "Some items in the output dictionary have a different batch size than others."
batch_outputs = {}
for i in range(batch_size):
inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
for key, value in inputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
return BatchEncoding(batch_outputs, tensor_type=return_tensors)
def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple)):
examples = [torch.tensor(e, dtype=torch.long) for e in examples]
# Check if padding is necessary.
length_of_first = examples[0].size(0)
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
if are_tensors_same_length and (
pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0
):
return torch.stack(examples, dim=0)
# If yes, check if we have a `pad_token`.
if tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(x.size(0) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
for i, example in enumerate(examples):
if tokenizer.padding_side == "right":
result[i, : example.shape[0]] = example
else:
result[i, -example.shape[0] :] = example
return result
def to_py_obj(obj):
if isinstance(obj, torch.Tensor):
return obj.detach().cpu().tolist()
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj
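# --- Usage sketch (illustrative, not part of the module) ---------------------
# A minimal example of wiring this collator into a DataLoader, mirroring how
# my_run_mlm_no_trainer.py below uses it. The tokenizer path, padding length,
# and batch size are placeholders; `tokenized_dataset` stands for any dataset
# yielding dicts with `input_ids`, `token_type_ids` and `special_tokens_mask`
# (e.g. the output of `datasets.map` in the training script).
#
#   import tokenizers
#   from torch.utils.data import DataLoader
#
#   tokenizer = tokenizers.Tokenizer.from_file("tokenizer-inst.1.json")  # placeholder path
#   tokenizer.enable_padding(
#       pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=32
#   )
#   collator = MyDataCollatorForPreTraining(tokenizer=tokenizer, mlm_probability=0.15)
#   loader = DataLoader(tokenized_dataset, shuffle=True, collate_fn=collator, batch_size=16)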

my_run_mlm_no_trainer.py Normal file

@@ -0,0 +1,421 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 Alan Zhao. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# """
# Pre-train the BERT on a dataset without using HuggingFace Trainer.
# """
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
import argparse
import logging
import math
import os
import random
import datasets
import numpy as np
import tokenizers
import torch
import transformers
from accelerate import Accelerator
from datasets import load_dataset
from torch.nn import DataParallel
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AdamW,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
BatchEncoding,
BertConfig,
BertForPreTraining,
DataCollatorForLanguageModeling,
SchedulerType,
get_scheduler,
set_seed,
)
from my_data_collator import MyDataCollatorForPreTraining
from process_data.utils import CURRENT_DATA_BASE
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def parse_args():
parser = argparse.ArgumentParser(
description="Finetune a transformers model on a Masked Language Modeling task"
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=16,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=64,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument(
"--weight_decay", type=float, default=0.0, help="Weight decay to use."
)
parser.add_argument(
"--num_train_epochs",
type=int,
default=40,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=[
"linear",
"cosine",
"cosine_with_restarts",
"polynomial",
"constant",
"constant_with_warmup",
],
)
parser.add_argument(
"--num_warmup_steps",
type=int,
default=0,
help="Number of steps for the warmup in the lr scheduler.",
)
parser.add_argument(
"--output_dir", type=str, default=None, help="Where to store the final model."
)
parser.add_argument(
"--seed", type=int, default=None, help="A seed for reproducible training."
)
parser.add_argument(
"--preprocessing_num_workers",
type=int,
default=None,
help="The number of processes to use for the preprocessing.",
)
parser.add_argument(
"--mlm_probability",
type=float,
default=0.15,
help="Ratio of tokens to mask for masked language modeling loss",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--eval_every_steps",
type=int,
default=5000,
help="Number of steps before evaluating the model.",
)
args = parser.parse_args()
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
def main():
args = parse_args()
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)
# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(
logging.INFO if accelerator.is_local_main_process else logging.ERROR
)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# we load the dataset ourselves;
# there will be several JSON files for training
# `raw_dataset` has two features:
# `text`: "sentA\tsentB"
# `is_next`: 0 or 1
# raw_datasets = load_dataset(
# "json",
# data_files={
# "train": "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
# "validation": "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
# },
# field="data",
# )
train_files = [
os.path.join(CURRENT_DATA_BASE, "inst.1.{}.json".format(i)) for i in range(128)
]
valid_file = "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json"
raw_datasets = load_dataset(
"json",
data_files={"train": train_files, "validation": valid_file,},
field="data",
)
# we use the tokenizer previously trained on the dataset above
tokenizer = tokenizers.Tokenizer.from_file(
os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json")
)
# NOTE: the `length` here must be consistent with the one used in `train_my_tokenizer.py`
tokenizer.enable_padding(
pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=32
)
# NOTE: `max_position_embeddings` here should be consistent with `length` above
# we use a much smaller BERT, config is:
config = BertConfig(
vocab_size=tokenizer.get_vocab_size(),
hidden_size=96,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=384,
max_position_embeddings=32,
)
# initialize a new BERT for pre-training
model = BertForPreTraining(config)
# Preprocessing the datasets.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
# First we apply `tokenize_function` to the dataset.
def tokenize_function(examples):
text = [tuple(sent) for sent in examples["text"]]
encoded_inputs = {}
results = tokenizer.encode_batch(text)
encoded_inputs["input_ids"] = [result.ids for result in results]
encoded_inputs["token_type_ids"] = [result.type_ids for result in results]
encoded_inputs["special_tokens_mask"] = [
result.special_tokens_mask for result in results
]
# according to the HuggingFace documentation for BertForPreTraining,
# `next_sentence_label` is 0 when sentence B is the next sentence
# and 1 when it is not, hence the `1 - label` below
encoded_inputs["next_sentence_label"] = [
1 - label for label in examples["is_next"]
]
# use `np` rather than `pt` here to avoid an error being raised
batch_outputs = BatchEncoding(
encoded_inputs, tensor_type="np", prepend_batch_axis=False,
)
return batch_outputs
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# Data collator
# This one will take care of randomly masking the tokens.
data_collator = MyDataCollatorForPreTraining(
tokenizer=tokenizer, mlm_probability=args.mlm_probability
)
# DataLoaders creation:
train_dataloader = DataLoader(
train_dataset,
shuffle=True,
collate_fn=data_collator,
batch_size=args.per_device_train_batch_size,
)
eval_dataloader = DataLoader(
eval_dataset,
collate_fn=data_collator,
batch_size=args.per_device_eval_batch_size,
)
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)
],
"weight_decay": args.weight_decay,
},
{
"params": [
p
for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Prepare everything with our `accelerator`.
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
# model, optimizer, train_dataloader, eval_dataloader
# )
model, optimizer, train_dataloader = accelerator.prepare(
model, optimizer, train_dataloader
)
model = DataParallel(model)
# model.to("cuda:0")
# Note: the training dataloader needs to be prepared before we grab its length below (because its length will be
# shorter in a multi-process setup)
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / args.gradient_accumulation_steps
)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
args.num_train_epochs = math.ceil(
args.max_train_steps / num_update_steps_per_epoch
)
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Train!
total_batch_size = (
args.per_device_train_batch_size
* accelerator.num_processes
* args.gradient_accumulation_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(
f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
)
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
logger.info(f" Evaluate every {args.eval_every_steps} steps")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
range(args.max_train_steps), disable=not accelerator.is_local_main_process
)
completed_steps = 0
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
if (
step % args.gradient_accumulation_steps == 0
or step == len(train_dataloader) - 1
):
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
completed_steps += 1
if completed_steps >= args.max_train_steps:
break
if completed_steps % args.eval_every_steps == 0:
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
losses.append(
accelerator.gather(loss.repeat(args.per_device_eval_batch_size))
)
losses = torch.cat(losses)
# losses = losses[: len(eval_dataset)]
try:
perplexity = math.exp(torch.mean(losses))
except OverflowError:
perplexity = float("inf")
logger.info(f"steps {completed_steps}: perplexity: {perplexity}")
model.train()
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if __name__ == "__main__":
main()
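# --- After training (illustrative sketch) ------------------------------------
# The weights written by `save_pretrained(args.output_dir)` above can be
# reloaded with the standard transformers API; the path below is a placeholder
# for whatever was passed as --output_dir.
#
#   from transformers import BertForPreTraining
#   model = BertForPreTraining.from_pretrained("path/to/output_dir")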


@@ -0,0 +1,29 @@
import os
from utils import ORIGINAL_DATA_BASE, read_file
def check(filename):
sents = read_file(filename)
result = 0
for sent in sents:
result = max(result, len(sent[:-1].replace("\t", " ").split()))
print("The longest sentence in {} has {} words".format(filename, result))
return result
def main():
longest = 0
# for i in range(6):
for i in [1]:
for group in ("pos", "neg"):
filename = os.path.join(
ORIGINAL_DATA_BASE, "inst.{}.{}.txt".format(i, group)
)
longest = max(check(filename), longest)
print("The longest sentence in all files has {} words.".format(longest))
if __name__ == "__main__":
main()


@@ -0,0 +1,29 @@
import os
from utils import ORIGINAL_DATA_BASE, read_file
def write_file(data, filename):
print("Writing data into {}...".format(filename))
with open(filename, "w", encoding="utf-8") as fout:
for sent in data:
fout.write(sent.replace("<space>", "SPACE"))
def convert(fin, fout):
print("Start the replacement task for {}...".format(fin))
# filename = "/home/ming/malware/data/elfasm_inst_pairs/linux32_00xxxx.all"
sents = read_file(fin)
write_file(sents, fout)
def main():
# for i in range(6):
for i in [1]:
fin = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(i))
fout = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.pos.txt".format(i))
convert(fin, fout)
if __name__ == "__main__":
main()


@@ -0,0 +1,54 @@
import os
from multiprocessing import Pool, Process, Queue
from tqdm import tqdm
from utils import ORIGINAL_DATA_BASE, read_file
q = Queue(128)
BASE = 4600000
def counter_worker(sents):
cnt = set()
for sent in tqdm(sents):
cnt = cnt.union(set(sent[:-1].replace("\t", " ").split()))
print("Process {} get {} words".format(os.getpid(), len(cnt)))
q.put(cnt)
return
def counter(filename):
sents = read_file(filename)
p = Pool(36)
for i in range(64):
p.apply_async(counter_worker, args=(sents[i * BASE : (i + 1) * BASE],))
print("Waiting for all sub-processes done...")
p.close()
p.join()
print("All subprocess done.")
cnt = set()
# for sent in tqdm(sents):
# cnt += set(sent[-1].replace("\t", " ").split())
for _ in tqdm(range(64)):
cnt = cnt.union(q.get())
print("There are {} charcters in {}".format(len(cnt), filename))
return cnt
def main():
cnt = set()
# for i in range(6):
for i in [1]:
for group in ["pos", "neg"]:
filename = os.path.join(
ORIGINAL_DATA_BASE, "inst.{}.{}.txt".format(i, group)
)
cnt = cnt.union(counter(filename))
print("There are {} characters in all files".format(len(cnt)))
if __name__ == "__main__":
main()


@@ -0,0 +1,34 @@
import os
from random import randint
from tqdm import tqdm
from utils import ORIGINAL_DATA_BASE, read_file
def create(pos, neg, tgt):
pos_sents = read_file(pos)
neg_sents = read_file(neg)
neg_length = len(neg_sents)
print("Start writing negative examples to {}...".format(tgt))
with open(tgt, "w", encoding="utf-8") as fout:
for sent in tqdm(pos_sents):
first = sent.split("\t")[0]
index = randint(0, neg_length - 1)
pair = neg_sents[index].split("\t")[randint(0, 1)].replace("\n", "")
fout.write(first + "\t" + pair + "\n")
def main():
# for i in range(6):
for i in [1]:
j = (i + 1) % 6
pos = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(i))
neg = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(j))
tgt = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.neg.txt".format(i))
create(pos, neg, tgt)
if __name__ == "__main__":
main()


@@ -0,0 +1,91 @@
import gc
import json
import os
from multiprocessing import Pool, Process, Queue
from tqdm import tqdm
from utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file
BASE = 4600000
def write_worker(sents, json_file, index):
examples = []
for sent in tqdm(sents):
tmp = sent[:-1].split("\t")
examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])})
examples[-1]["text"] = tuple(examples[-1]["text"])
print("Writing to {}...".format(json_file + "{}.json".format(index)))
results = {"data": examples}
with open(json_file + "{}.json".format(index), "w") as f:
json.dump(results, f)
def merge_to_json(pos, neg, json_file):
sents = read_file(pos)
p = Pool(36)
for i in range(64):
p.apply_async(
write_worker, args=(sents[i * BASE : (i + 1) * BASE], json_file, i,)
)
print("Waiting for all sub-processes done...")
p.close()
p.join()
print("All subprocess done.")
# length = len(sents)
# base = length // 20000000 + 1
# for i in tqdm(range(length)):
# examples = []
# tmp = sents[i][:-1].split("\t")
# examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])})
# examples[i]["text"] = tuple(examples[i]["text"])
# index = i // 20000000
# print("Writing to {}...".format(json_file + "{}.json".format(index)))
# with open(json_file + "{}.json".format(index), "w") as f:
# json.dump(examples, f)
del sents
gc.collect()
sents = read_file(neg)
p = Pool(8)
for i in range(64):
p.apply_async(
write_worker, args=(sents[i * BASE : (i + 1) * BASE], json_file, 64 + i,)
)
print("Waiting for all sub-processes done...")
p.close()
p.join()
print("All subprocess done.")
# length = len(sents)
# for i in tqdm(range(length)):
# examples = []
# tmp = sents[i][:-1].split("\t")
# examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])})
# examples[i]["text"] = tuple(examples[i]["text"])
# index = i // 20000000
# print("Writing to {}...".format(json_file + "{}.json".format(base + index)))
# with open(json_file + "{}.json".format(base + index), "w") as f:
# json.dump(examples, f)
def main():
# for i in range(6):
for i in [1]:
pos = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.pos.label.txt".format(i))
neg = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.neg.label.txt".format(i))
json_file = os.path.join(CURRENT_DATA_BASE, "inst.{}.".format(i))
merge_to_json(pos, neg, json_file)
if __name__ == "__main__":
main()

process_data/readme.md Normal file

@@ -0,0 +1,22 @@
# Pre-processing steps
### 1. run `convert_space_format.py`
Convert the string `<space>` to `SPACE`
### 2. run `create_negtive_examples.py`
We use the file that comes after the current file as the source of its negative examples, which is a reasonable choice.
Specifically, for each instruction pair in the current positive file, we keep its first instruction, randomly choose a line in the next file, and select one of the two instructions in that line as the negative second instruction.
### 3. run `merge_examples_to_json.py`
We dump the positive and negative examples with their corresponding labels into several JSON files.
Each JSON file contains 20M examples; see the sketch of the shard layout below.
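For reference, each shard written in this step has roughly the layout sketched below (field names follow `write_worker` in `merge_examples_to_json.py`; the instruction strings are placeholders). This is the structure that `load_dataset("json", ..., field="data")` consumes in the training script.

```python
# Rough structure of one shard, e.g. inst.1.0.json (instruction text is a placeholder)
shard = {
    "data": [
        {"text": ["<instruction A>", "<instruction B>"], "is_next": 1},
        {"text": ["<instruction C>", "<instruction D>"], "is_next": 0},
    ]
}
```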
### 4. run `check_length.py`
We have to specify the padding length when configuring the tokenizer, `tokenizer.enable_padding(..., length=)`.
So we need to know the length of the longest sentence in the dataset.
### 5. run `count_word_for_vocab.py`
Similarly, we need to specify the vocabulary size when we train the tokenizer, `WordLevelTrainer(vocab_size=, ...)`.
So we need to know how many distinct words there are in the dataset.
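Putting steps 4 and 5 together: the maximum length and the vocabulary size measured above feed into the tokenizer training (`train_my_tokenizer.py`, which is not part of this commit). Below is a minimal sketch of what that setup might look like with the `tokenizers` library; the model choice, file names, and numbers are assumptions, not the actual script.

```python
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer

vocab_size = 5000   # placeholder: use the count reported by count_word_for_vocab.py
pad_length = 32     # placeholder: at least the maximum reported by check_length.py

# Assumed setup: a word-level tokenizer over whitespace-separated instruction tokens.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(
    vocab_size=vocab_size,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train(files=["inst.1.pos.txt"], trainer=trainer)  # placeholder training file
# A post-processor that adds [CLS]/[SEP] around sentence pairs would also be
# configured here (e.g. tokenizers.processors.TemplateProcessing); omitted for brevity.
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=pad_length
)
tokenizer.save("tokenizer-inst.1.json")
```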

process_data/utils.py Normal file

@@ -0,0 +1,11 @@
import os
ORIGINAL_DATA_BASE = "/home/ming/malware/data/elfasm_inst_pairs"
CURRENT_DATA_BASE = "/home/ming/malware/inst2vec_bert/data/asm_bert"
def read_file(filename):
print("Reading data from {}...".format(filename))
with open(filename, "r", encoding="utf-8") as fin:
return fin.readlines()

run_mlm_no_trainer.py Normal file

@@ -0,0 +1,672 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# """
# Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
# on a text file or a dataset without using HuggingFace Trainer.
# Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
# https://huggingface.co/models?filter=masked-lm
# """
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
import argparse
import logging
import math
import os
import random
import datasets
import numpy as np
import tokenizers
import torch
import transformers
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AdamW,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
BatchEncoding,
BertConfig,
BertForPreTraining,
DataCollatorForLanguageModeling,
SchedulerType,
get_scheduler,
set_seed,
)
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def parse_args():
parser = argparse.ArgumentParser(
description="Finetune a transformers model on a Masked Language Modeling task"
)
# parser.add_argument(
# "--dataset_name",
# type=str,
# default=None,
# help="The name of the dataset to use (via the datasets library).",
# )
# parser.add_argument(
# "--dataset_config_name",
# type=str,
# default=None,
# help="The configuration name of the dataset to use (via the datasets library).",
# )
# parser.add_argument(
# "--train_file",
# type=str,
# default=None,
# help="A csv or a json file containing the training data.",
# )
# parser.add_argument(
# "--validation_file",
# type=str,
# default=None,
# help="A csv or a json file containing the validation data.",
# )
# parser.add_argument(
# "--validation_split_percentage",
# default=5,
# help="The percentage of the train set used as validation set in case there's no validation split",
# )
# parser.add_argument(
# "--pad_to_max_length",
# action="store_true",
# help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
# )
# parser.add_argument(
# "--model_name_or_path",
# type=str,
# help="Path to pretrained model or model identifier from huggingface.co/models.",
# required=True,
# )
# parser.add_argument(
# "--config_name",
# type=str,
# default=None,
# help="Pretrained config name or path if not the same as model_name",
# )
# parser.add_argument(
# "--tokenizer_name",
# type=str,
# default=None,
# help="Pretrained tokenizer name or path if not the same as model_name",
# )
# parser.add_argument(
# "--use_slow_tokenizer",
# action="store_true",
# help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
# )
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument(
"--weight_decay", type=float, default=0.0, help="Weight decay to use."
)
parser.add_argument(
"--num_train_epochs",
type=int,
default=40,
help="Total number of training epochs to perform.",
)
# parser.add_argument(
# "--max_train_steps",
# type=int,
# default=None,
# help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
# )
# parser.add_argument(
# "--gradient_accumulation_steps",
# type=int,
# default=1,
# help="Number of updates steps to accumulate before performing a backward/update pass.",
# )
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=[
"linear",
"cosine",
"cosine_with_restarts",
"polynomial",
"constant",
"constant_with_warmup",
],
)
parser.add_argument(
"--num_warmup_steps",
type=int,
default=0,
help="Number of steps for the warmup in the lr scheduler.",
)
parser.add_argument(
"--output_dir", type=str, default=None, help="Where to store the final model."
)
parser.add_argument(
"--seed", type=int, default=None, help="A seed for reproducible training."
)
# parser.add_argument(
# "--model_type",
# type=str,
# default=None,
# help="Model type to use if training from scratch.",
# choices=MODEL_TYPES,
# )
# parser.add_argument(
# "--max_seq_length",
# type=int,
# default=None,
# help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated.",
# )
# parser.add_argument(
# "--line_by_line",
# type=bool,
# default=False,
# help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
# )
parser.add_argument(
"--preprocessing_num_workers",
type=int,
default=None,
help="The number of processes to use for the preprocessing.",
)
# parser.add_argument(
# "--overwrite_cache",
# type=bool,
# default=False,
# help="Overwrite the cached training and evaluation sets",
# )
parser.add_argument(
"--mlm_probability",
type=float,
default=0.15,
help="Ratio of tokens to mask for masked language modeling loss",
)
args = parser.parse_args()
# Sanity checks
# if (
# args.dataset_name is None
# and args.train_file is None
# and args.validation_file is None
# ):
# raise ValueError("Need either a dataset name or a training/validation file.")
# else:
# if args.train_file is not None:
# extension = args.train_file.split(".")[-1]
# assert extension in [
# "csv",
# "json",
# "txt",
# ], "`train_file` should be a csv, json or txt file."
# if args.validation_file is not None:
# extension = args.validation_file.split(".")[-1]
# assert extension in [
# "csv",
# "json",
# "txt",
# ], "`validation_file` should be a csv, json or txt file."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
def main():
args = parse_args()
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)
# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(
logging.INFO if accelerator.is_local_main_process else logging.ERROR
)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# # (the dataset will be downloaded automatically from the datasets Hub).
# #
# # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# # 'text' is found. You can easily tweak this behavior (see below).
# #
# # In distributed training, the load_dataset function guarantee that only one local process can concurrently
# # download the dataset.
# if args.dataset_name is not None:
# # Downloading and loading a dataset from the hub.
# raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
# if "validation" not in raw_datasets.keys():
# raw_datasets["validation"] = load_dataset(
# args.dataset_name,
# args.dataset_config_name,
# split=f"train[:{args.validation_split_percentage}%]",
# )
# raw_datasets["train"] = load_dataset(
# args.dataset_name,
# args.dataset_config_name,
# split=f"train[{args.validation_split_percentage}%:]",
# )
# else:
# data_files = {}
# if args.train_file is not None:
# data_files["train"] = args.train_file
# if args.validation_file is not None:
# data_files["validation"] = args.validation_file
# extension = args.train_file.split(".")[-1]
# if extension == "txt":
# extension = "text"
# raw_datasets = load_dataset(extension, data_files=data_files)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# we use a dataset in JSON format;
# `raw_dataset` has two features: `text`: "sentA\tsentB" and `is_next`: 0 or 1
# we load the dataset ourselves;
# there will be several JSON files for training
# raw_datasets = load_dataset("json", data_files=args.train_file, field="data")
raw_datasets = load_dataset(
"json",
data_files="/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
field="data",
)
# we use the tokenizer trained on the positive dataset before
# tokenizer = tokenizers.Tokenizer.from_file(args.tokenizer_file)
tokenizer = tokenizers.Tokenizer.from_file(
"/home/ming/malware/inst2vec_bert/bert/tokenizer-inst.json"
)
tokenizer.enable_padding(
pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=64
)
# we use a much smaller BERT, config is:
config = BertConfig(
vocab_size=tokenizer.get_vocab_size(),
hidden_size=64,
num_hidden_layers=8,
num_attention_heads=8,
intermediate_size=256,
max_position_embeddings=64,
)
# initialize the new BERT for pre-training
model = BertForPreTraining(config)
# all_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
# all_special_ids = [tokenizer.token_to_id(token) for token in all_special_tokens]
# # Load pretrained model and tokenizer
# #
# # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# # download model & vocab.
# if args.config_name:
# config = AutoConfig.from_pretrained(args.config_name)
# elif args.model_name_or_path:
# config = AutoConfig.from_pretrained(args.model_name_or_path)
# else:
# config = CONFIG_MAPPING[args.model_type]()
# logger.warning("You are instantiating a new config instance from scratch.")
# if args.tokenizer_name:
# tokenizer = AutoTokenizer.from_pretrained(
# args.tokenizer_name, use_fast=not args.use_slow_tokenizer
# )
# elif args.model_name_or_path:
# tokenizer = AutoTokenizer.from_pretrained(
# args.model_name_or_path, use_fast=not args.use_slow_tokenizer
# )
# else:
# raise ValueError(
# "You are instantiating a new tokenizer from scratch. This is not supported by this script."
# "You can do it from another script, save it, and load it from here, using --tokenizer_name."
# )
# if args.model_name_or_path:
# model = AutoModelForMaskedLM.from_pretrained(
# args.model_name_or_path,
# from_tf=bool(".ckpt" in args.model_name_or_path),
# config=config,
# )
# else:
# logger.info("Training new model from scratch")
# model = AutoModelForMaskedLM.from_config(config)
# model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
# if args.max_seq_length is None:
# max_seq_length = tokenizer.model_max_length
# if max_seq_length > 1024:
# logger.warning(
# f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
# "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
# )
# max_seq_length = 1024
# else:
# if args.max_seq_length > tokenizer.model_max_length:
# logger.warning(
# f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
# f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
# )
# max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
# if args.line_by_line:
# # When using line_by_line, we just tokenize each nonempty line.
# padding = "max_length" if args.pad_to_max_length else False
# def tokenize_function(examples):
# # Remove empty lines
# examples["text"] = [
# line
# for line in examples["text"]
# if len(line) > 0 and not line.isspace()
# ]
# return tokenizer(
# examples["text"],
# padding=padding,
# truncation=True,
# max_length=max_seq_length,
# # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
# # receives the `special_tokens_mask`.
# return_special_tokens_mask=True,
# )
# tokenized_datasets = raw_datasets.map(
# tokenize_function,
# batched=True,
# num_proc=args.preprocessing_num_workers,
# remove_columns=[text_column_name],
# load_from_cache_file=not args.overwrite_cache,
# )
# else:
# # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# # efficient when it receives the `special_tokens_mask`.
# def tokenize_function(examples):
# return tokenizer(
# examples[text_column_name], return_special_tokens_mask=True
# )
# tokenized_datasets = raw_datasets.map(
# tokenize_function,
# batched=True,
# num_proc=args.preprocessing_num_workers,
# remove_columns=column_names,
# load_from_cache_file=not args.overwrite_cache,
# )
# # Main data processing function that will concatenate all texts from our dataset and generate chunks of
# # max_seq_length.
# def group_texts(examples):
# # Concatenate all texts.
# concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
# total_length = len(concatenated_examples[list(examples.keys())[0]])
# # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# # customize this part to your needs.
# total_length = (total_length // max_seq_length) * max_seq_length
# # Split by chunks of max_len.
# result = {
# k: [
# t[i : i + max_seq_length]
# for i in range(0, total_length, max_seq_length)
# ]
# for k, t in concatenated_examples.items()
# }
# return result
# # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# # might be slower to preprocess.
# #
# # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
# tokenized_datasets = tokenized_datasets.map(
# group_texts,
# batched=True,
# num_proc=args.preprocessing_num_workers,
# load_from_cache_file=not args.overwrite_cache,
# )
def tokenize_function(examples):
        # Each example's `text` is "sentA\tsentB"; turn it into a (sentA, sentB) tuple
        # so encode_batch treats it as a sentence pair (lists from JSON are handled too).
        text = [
            tuple(sent.split("\t")) if isinstance(sent, str) else tuple(sent)
            for sent in examples["text"]
        ]
encoded_inputs = {}
results = tokenizer.encode_batch(text)
# input_ids, type_ids, special_token_masks = [], [], []
# for i, result in enumerate(results):
# input_ids.append(result.ids)
# type_ids.append(result.type_ids)
# special_token_masks.append(result.special_tokens_mask)
encoded_inputs["input_ids"] = [result.ids for result in results]
encoded_inputs["token_type_ids"] = [result.type_ids for result in results]
# special_tokens_mask = [1 if token in all_special_ids else 0 for token in ids]
encoded_inputs["special_tokens_mask"] = [
result.special_tokens_mask for result in results
]
        # BertForPreTraining expects next_sentence_label == 0 when sentence B really
        # follows A and 1 when it does not, so invert the dataset's `is_next` flag.
        encoded_inputs["next_sentence_label"] = [
            1 - label for label in examples["is_next"]
        ]
batch_outputs = BatchEncoding(
encoded_inputs, tensor_type="np", prepend_batch_axis=False,
)
return batch_outputs
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
# load_from_cache_file=not args.overwrite_cache,
)
    # `load_dataset("json", data_files=...)` only yields a "train" split, so hold out a
    # validation set here instead of indexing a split that does not exist (assumes
    # `args.validation_split_percentage` is defined by the argument parser).
    split_datasets = tokenized_datasets["train"].train_test_split(
        test_size=args.validation_split_percentage / 100
    )
    train_dataset = split_datasets["train"]
    eval_dataset = split_datasets["test"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
    # Data collator
    # This one will take care of randomly masking the tokens. The stock
    # DataCollatorForLanguageModeling expects a transformers PreTrainedTokenizer(Fast),
    # but we pass a raw tokenizers.Tokenizer, so use the custom collator shipped in
    # this repo instead (assumed to be imported at the top of this file).
    data_collator = MyDataCollatorForPreTraining(
        tokenizer=tokenizer, mlm_probability=args.mlm_probability
    )
# DataLoaders creation:
train_dataloader = DataLoader(
train_dataset,
shuffle=True,
collate_fn=data_collator,
batch_size=args.per_device_train_batch_size,
)
eval_dataloader = DataLoader(
eval_dataset,
collate_fn=data_collator,
batch_size=args.per_device_eval_batch_size,
)
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)
],
"weight_decay": args.weight_decay,
},
{
"params": [
p
for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
    # Note: the training dataloader needs to be prepared before we grab its length below
    # (because its length will be shorter under multi-process training).
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / args.gradient_accumulation_steps
)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
args.num_train_epochs = math.ceil(
args.max_train_steps / num_update_steps_per_epoch
)
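    # For example (hypothetical numbers): 10,000 batches per epoch with 4 gradient
    # accumulation steps gives 2,500 optimizer updates per epoch, so 3 epochs correspond
    # to max_train_steps = 7,500; conversely, a fixed max_train_steps is rounded up to
    # a whole number of epochs.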
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Train!
total_batch_size = (
args.per_device_train_batch_size
* accelerator.num_processes
* args.gradient_accumulation_steps
)
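    # For example, a per-device batch size of 8 on 2 processes with 4 accumulation
    # steps (hypothetical values) gives an effective batch size of 8 * 2 * 4 = 64.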
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(
f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
)
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
range(args.max_train_steps), disable=not accelerator.is_local_main_process
)
completed_steps = 0
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
if (
step % args.gradient_accumulation_steps == 0
or step == len(train_dataloader) - 1
):
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
completed_steps += 1
if completed_steps >= args.max_train_steps:
break
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
losses.append(
accelerator.gather(loss.repeat(args.per_device_eval_batch_size))
)
losses = torch.cat(losses)
losses = losses[: len(eval_dataset)]
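        # Perplexity is exp(mean eval loss); e.g. a mean loss of 2.0 (hypothetical)
        # corresponds to a perplexity of e^2 ≈ 7.39.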
try:
perplexity = math.exp(torch.mean(losses))
except OverflowError:
perplexity = float("inf")
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if __name__ == "__main__":
main()

124
train_my_tokenizer.py Normal file
View File

@ -0,0 +1,124 @@
import argparse
import os
from itertools import chain
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer
from process_data.utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file
BASE_PATH = "/home/ming/malware/inst2vec_bert/bert/"
def parse_args():
parser = argparse.ArgumentParser(
description="Train a word level tokenizer for ASM_BERT"
)
parser.add_argument(
"--vocab_size",
type=int,
default=2000,
help="The size of vocabulary used to train the tokenizer.",
)
parser.add_argument(
"--padding_length",
type=int,
default=32,
help="The length will be padded to by the tokenizer.",
)
args = parser.parse_args()
return args
def train_tokenizer(args, dataset):
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer(
vocab_size=args.vocab_size,
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
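    # For illustration: with the Whitespace pre-tokenizer, an instruction such as
    # "mov dword ptr [ebp-4] , eax" (hypothetical) is split into
    # ["mov", "dword", "ptr", "[", "ebp", "-", "4", "]", ",", "eax"]
    # before the word-level vocabulary is learned.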
# def batch_iterator(batch_size=1000):
# for i in range(0, len(dataset), batch_size):
# yield dataset[i : i + batch_size]["text"]
# tokenizer.train_from_iterator(
# batch_iterator(), trainer=trainer, length=len(dataset)
# )
tokenizer.train_from_iterator(dataset, trainer)
return tokenizer
def save_tokenizer(tokenizer, tokenizer_file):
tokenizer.save(tokenizer_file)
def load_tokenizer(tokenizer_file):
if not os.path.exists(tokenizer_file):
print("{} doesn't exist, will be retrained...".format(tokenizer_file))
return None
print("The tokenizer has already been trained.")
return Tokenizer.from_file(tokenizer_file)
def post_process(tokenizer):
tokenizer.post_processor = TemplateProcessing(
single="[CLS] $A [SEP]",
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
special_tokens=[
("[CLS]", tokenizer.token_to_id("[CLS]")),
("[SEP]", tokenizer.token_to_id("[SEP]")),
],
)
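    # For illustration, with this template a (hypothetical) pair ("push ebp", "mov ebp , esp")
    # is rendered as "[CLS] push ebp [SEP] mov ebp , esp [SEP]" with type_ids
    # [0, 0, 0, 0, 1, 1, 1, 1, 1]: the first segment and its [SEP] get type 0,
    # the second segment and its trailing [SEP] get type 1.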
return tokenizer
def tokenizer_encode(tokenizer, data):
return tokenizer.encode_batch(data)
def main(tokenizer_file=""):
args = parse_args()
tokenizer = load_tokenizer(tokenizer_file)
if tokenizer is not None:
return
# json_files = [
# os.path.join(CURRENT_DATA_BASE, "inst.1.{}.json".format(i)) for i in range(128)
# ]
# dataset = load_dataset("json", data_files=json_files, field="data")
text_files = [
os.path.join(ORIGINAL_DATA_BASE, "inst.1.{}.txt".format(group))
for group in ["pos", "neg"]
]
dataset = []
for f in text_files:
dataset += read_file(f)
dataset = [tuple(sent[:-1].split("\t")) for sent in dataset]
print("Trainging tokenizer...")
tokenizer = train_tokenizer(args, chain.from_iterable(dataset))
tokenizer = post_process(tokenizer)
tokenizer.enable_padding(
pad_id=tokenizer.token_to_id("[PAD]"),
pad_token="[PAD]",
length=args.padding_length,
)
save_tokenizer(tokenizer, tokenizer_file)
if __name__ == "__main__":
main(os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json"))
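# Usage sketch (hypothetical instruction pair), once the tokenizer has been saved:
#   tok = Tokenizer.from_file(os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json"))
#   enc = tok.encode("push ebp", "mov ebp , esp")
#   print(enc.tokens, enc.type_ids, enc.attention_mask)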