From b595346a074b5347f34c27d0e0492bad41333b9e Mon Sep 17 00:00:00 2001 From: zyr Date: Sun, 6 Jun 2021 20:50:36 +0800 Subject: [PATCH] complete data processing and first vision of training script --- data_collator_for_language_model.py | 434 +++++++++++++++ my_data_collator.py | 272 +++++++++ my_run_mlm_no_trainer.py | 421 ++++++++++++++ process_data/check_length.py | 29 + process_data/convert_space_format.py | 29 + process_data/count_word_for_vocab.py | 54 ++ process_data/create_negative_examples.py | 34 ++ process_data/merge_examples_to_json.py | 91 +++ process_data/readme.md | 22 + process_data/utils.py | 11 + run_mlm_no_trainer.py | 672 +++++++++++++++++++++++ train_my_tokenizer.py | 124 +++++ 12 files changed, 2193 insertions(+) create mode 100644 data_collator_for_language_model.py create mode 100644 my_data_collator.py create mode 100644 my_run_mlm_no_trainer.py create mode 100644 process_data/check_length.py create mode 100644 process_data/convert_space_format.py create mode 100644 process_data/count_word_for_vocab.py create mode 100644 process_data/create_negative_examples.py create mode 100644 process_data/merge_examples_to_json.py create mode 100644 process_data/readme.md create mode 100644 process_data/utils.py create mode 100644 run_mlm_no_trainer.py create mode 100644 train_my_tokenizer.py diff --git a/data_collator_for_language_model.py b/data_collator_for_language_model.py new file mode 100644 index 0000000..ff73763 --- /dev/null +++ b/data_collator_for_language_model.py @@ -0,0 +1,434 @@ +from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + NamedTuple, + Optional, + Sequence, + Tuple, + Union, +) + +import numpy as np +import tokenizers +import torch +from transformers import BatchEncoding + +EncodedInput = List[int] + + +@dataclass +class MyDataCollatorForPreTraining: + # """ + # Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they + # are not all of the same length. + + # Args: + # # tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): + + # tokenizer (:class:`tokenizers.Tokenizer`) + # The tokenizer used for encoding the data. + # mlm (:obj:`bool`, `optional`, defaults to :obj:`True`): + # Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the + # inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for + # non-masked tokens and the value to predict for the masked token. + # mlm_probability (:obj:`float`, `optional`, defaults to 0.15): + # The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`. + # pad_to_multiple_of (:obj:`int`, `optional`): + # If set will pad the sequence to a multiple of the provided value. + + # .. note:: + + # For best performance, this data collator should be used with a dataset having items that are dictionaries or + # BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a + # :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the + # argument :obj:`return_special_tokens_mask=True`. 
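+    #     Example (a usage sketch; the tokenizer file name matches the one loaded in
+    #     `my_run_mlm_no_trainer.py` and is a placeholder)::
+
+    #         tok = tokenizers.Tokenizer.from_file("tokenizer-inst.1.json")
+    #         collator = MyDataCollatorForPreTraining(tokenizer=tok, mlm_probability=0.15)
+    #         batch = collator(list_of_encoded_examples)  # dict with "input_ids", "labels", ...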
+ # """ + # def __init__( + # self, + # tokenizer: tokenizers.Tokenizer, + # mlm: bool = True, + # mlm_probability: float = 0.15, + # pad_to_multiple_of: Optional[int] = None, + # ): + # self.tokenizer = tokenizer + # self.mlm = mlm + # self.mlm_probability = mlm_probability + # self.pad_to_multiple_of = pad_to_multiple_of + tokenizer: tokenizers.Tokenizer + mlm: bool = True + mlm_probability: float = 0.15 + pad_to_multiple_of: Optional[int] = None + + def __post_init__(self): + if self.mlm and self.tokenizer.token_to_id("[MASK]") is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. " + "You should pass `mlm=False` to train on causal language modeling instead." + ) + + def __call__( + self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]], + ) -> Dict[str, torch.Tensor]: + # Handle dict or lists with proper padding and conversion to tensor. + if isinstance(examples[0], (dict, BatchEncoding)): + batch = pad( + examples, + return_tensors="pt", + pad_to_multiple_of=self.pad_to_multiple_of, + ) + else: + batch = { + "input_ids": _collate_batch( + examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of + ) + } + + # If special token mask has been preprocessed, pop it from the dict. + special_tokens_mask = batch.pop("special_tokens_mask", None) + if self.mlm: + batch["input_ids"], batch["labels"] = self.mask_tokens( + batch["input_ids"], special_tokens_mask=special_tokens_mask + ) + # else: + # labels = batch["input_ids"].clone() + # if self.tokenizer.pad_token_id is not None: + # labels[labels == self.tokenizer.pad_token_id] = -100 + # batch["labels"] = labels + return batch + + def mask_tokens( + self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
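+        With ``mlm_probability=0.15``, roughly 15% of the non-special tokens are selected. Of those,
+        80% are replaced by ``[MASK]``, 10% by a random vocabulary id (the second Bernoulli draw below
+        uses p=0.5 over the remaining 20%, i.e. 0.5 * 0.2 = 10%), and the last 10% are left unchanged.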
+ """ + labels = inputs.clone() + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = torch.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask( + val, already_has_special_tokens=True + ) + for val in labels.tolist() + ] + special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) + else: + special_tokens_mask = special_tokens_mask.bool() + + probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = ( + torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + ) + # inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( + # self.tokenizer.mask_token + # ) + inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]") + + # 10% of the time, we replace masked input tokens with random word + indices_random = ( + torch.bernoulli(torch.full(labels.shape, 0.5)).bool() + & masked_indices + & ~indices_replaced + ) + random_words = torch.randint( + self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long + ) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + +def pad( + self, + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding=True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors=None, + verbose: bool = True, +) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``, + ``self.pad_token_id`` and ``self.pad_token_type_id``) + .. note:: + If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with ``return_tensors``. In the + case of PyTorch tensors, you will lose the specific device of your tensors however. + Args: + encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str, + List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str, + List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as + well as in a PyTorch Dataloader collate function. + Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), + see the note above for the return type. 
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask (:obj:`bool`, `optional`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + `What are attention masks? <../glossary.html#attention-mask>`__ + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. + """ + # If we have a list of dicts, let's convert it in a dict of lists + # We do this to allow using this method as a collate_fn function in PyTorch Dataloader + if isinstance(encoded_inputs, (list, tuple)) and isinstance( + encoded_inputs[0], (dict, BatchEncoding) + ): + encoded_inputs = { + key: [example[key] for example in encoded_inputs] + for key in encoded_inputs[0].keys() + } + + # The model's main input name, usually `input_ids`, has be passed for padding + # if self.model_input_names[0] not in encoded_inputs: + # raise ValueError( + # "You should supply an encoding or a list of encodings to this method " + # f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" + # ) + + required_input = encoded_inputs["input_ids"] + + if not required_input: + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + # Note that we lose the specific device the tensor may be on for PyTorch + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + index = 0 + while len(required_input[index]) == 0: + index += 1 + if index < len(required_input): + first_element = required_input[index][0] + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. 
+ if not isinstance(first_element, (int, list, tuple)): + if isinstance(first_element, torch.Tensor): + return_tensors = "pt" if return_tensors is None else return_tensors + elif isinstance(first_element, np.ndarray): + return_tensors = "np" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + f"Should be one of a python, numpy, pytorch or tensorflow object." + ) + + for key, value in encoded_inputs.items(): + encoded_inputs[key] = to_py_obj(value) + + # # Convert padding_strategy in PaddingStrategy + # padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + # padding=padding, max_length=max_length, verbose=verbose + # ) + + required_input = encoded_inputs["input_ids"] + if required_input and not isinstance(required_input[0], (list, tuple)): + # encoded_inputs = _pad( + # encoded_inputs, + # max_length=max_length, + # # padding_strategy=padding_strategy, + # pad_to_multiple_of=pad_to_multiple_of, + # return_attention_mask=return_attention_mask, + # ) + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(required_input) + assert all( + len(v) == batch_size for v in encoded_inputs.values() + ), "Some items in the output dictionary have a different batch size than others." + + # if padding_strategy == PaddingStrategy.LONGEST: + # max_length = max(len(inputs) for inputs in required_input) + # padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) + # outputs = self._pad( + # inputs, + # max_length=max_length, + # # padding_strategy=padding_strategy, + # pad_to_multiple_of=pad_to_multiple_of, + # return_attention_mask=return_attention_mask, + # ) + for key, value in inputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + +# def _pad( +# self, +# encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], +# max_length: Optional[int] = None, +# padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, +# pad_to_multiple_of: Optional[int] = None, +# return_attention_mask: Optional[bool] = None, +# ) -> dict: +# """ +# Pad encoded inputs (on left/right and up to predefined length or max length in the batch) +# Args: +# encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). +# max_length: maximum length of the returned list and optionally padding length (see below). +# Will truncate by taking into account the special tokens. +# padding_strategy: PaddingStrategy to use for padding. +# - PaddingStrategy.LONGEST Pad to the longest sequence in the batch +# - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) +# - PaddingStrategy.DO_NOT_PAD: Do not pad +# The tokenizer padding sides are defined in self.padding_side: +# - 'left': pads on the left of the sequences +# - 'right': pads on the right of the sequences +# pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. +# This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability +# >= 7.5 (Volta). 
+# return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) +# """ +# # Load from model defaults +# if return_attention_mask is None: +# return_attention_mask = "attention_mask" in self.model_input_names + +# required_input = encoded_inputs[self.model_input_names[0]] + +# if padding_strategy == PaddingStrategy.LONGEST: +# max_length = len(required_input) + +# if ( +# max_length is not None +# and pad_to_multiple_of is not None +# and (max_length % pad_to_multiple_of != 0) +# ): +# max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + +# needs_to_be_padded = ( +# padding_strategy != PaddingStrategy.DO_NOT_PAD +# and len(required_input) != max_length +# ) + +# if needs_to_be_padded: +# difference = max_length - len(required_input) +# if self.padding_side == "right": +# if return_attention_mask: +# encoded_inputs["attention_mask"] = [1] * len(required_input) + [ +# 0 +# ] * difference +# if "token_type_ids" in encoded_inputs: +# encoded_inputs["token_type_ids"] = ( +# encoded_inputs["token_type_ids"] +# + [self.pad_token_type_id] * difference +# ) +# if "special_tokens_mask" in encoded_inputs: +# encoded_inputs["special_tokens_mask"] = ( +# encoded_inputs["special_tokens_mask"] + [1] * difference +# ) +# encoded_inputs[self.model_input_names[0]] = ( +# required_input + [self.pad_token_id] * difference +# ) +# elif self.padding_side == "left": +# if return_attention_mask: +# encoded_inputs["attention_mask"] = [0] * difference + [1] * len( +# required_input +# ) +# if "token_type_ids" in encoded_inputs: +# encoded_inputs["token_type_ids"] = [ +# self.pad_token_type_id +# ] * difference + encoded_inputs["token_type_ids"] +# if "special_tokens_mask" in encoded_inputs: +# encoded_inputs["special_tokens_mask"] = [ +# 1 +# ] * difference + encoded_inputs["special_tokens_mask"] +# encoded_inputs[self.model_input_names[0]] = [ +# self.pad_token_id +# ] * difference + required_input +# else: +# raise ValueError("Invalid padding strategy:" + str(self.padding_side)) +# elif return_attention_mask and "attention_mask" not in encoded_inputs: +# encoded_inputs["attention_mask"] = [1] * len(required_input) + +# return encoded_inputs + + +def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple)): + examples = [torch.tensor(e, dtype=torch.long) for e in examples] + + # Check if padding is necessary. + length_of_first = examples[0].size(0) + are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + if are_tensors_same_length and ( + pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0 + ): + return torch.stack(examples, dim=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. 
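+    # e.g. if the longest example has 30 tokens and pad_to_multiple_of=8, the batch is padded to
+    # ((30 // 8) + 1) * 8 = 32 columns.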
+ max_length = max(x.size(0) for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0] :] = example + return result + + +def to_py_obj(obj): + if isinstance(obj, torch.Tensor): + return obj.detach().cpu().tolist() + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj diff --git a/my_data_collator.py b/my_data_collator.py new file mode 100644 index 0000000..fe3f21c --- /dev/null +++ b/my_data_collator.py @@ -0,0 +1,272 @@ +from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + NamedTuple, + Optional, + Sequence, + Tuple, + Union, +) + +import numpy as np +import tokenizers +import torch +from transformers import BatchEncoding + +EncodedInput = List[int] + + +@dataclass +class MyDataCollatorForPreTraining: + tokenizer: tokenizers.Tokenizer + mlm: bool = True + mlm_probability: float = 0.15 + pad_to_multiple_of: Optional[int] = None + + def __post_init__(self): + if self.mlm and self.tokenizer.token_to_id("[MASK]") is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. " + "You should pass `mlm=False` to train on causal language modeling instead." + ) + + def __call__( + self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]], + ) -> Dict[str, torch.Tensor]: + # Handle dict or lists with proper padding and conversion to tensor. + if isinstance(examples[0], (dict, BatchEncoding)): + batch = pad( + encoded_inputs=examples, + return_tensors="pt", + pad_to_multiple_of=self.pad_to_multiple_of, + ) + else: + batch = { + "input_ids": _collate_batch( + examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of + ) + } + + # If special token mask has been preprocessed, pop it from the dict. + special_tokens_mask = batch.pop("special_tokens_mask", None) + if self.mlm: + batch["input_ids"], batch["labels"] = self.mask_tokens( + batch["input_ids"], special_tokens_mask=special_tokens_mask + ) + return batch + + def mask_tokens( + self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
+ """ + labels = inputs.clone() + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = torch.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask( + val, already_has_special_tokens=True + ) + for val in labels.tolist() + ] + special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) + else: + special_tokens_mask = special_tokens_mask.bool() + + probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = ( + torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + ) + # inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( + # self.tokenizer.mask_token + # ) + inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]") + + # 10% of the time, we replace masked input tokens with random word + indices_random = ( + torch.bernoulli(torch.full(labels.shape, 0.5)).bool() + & masked_indices + & ~indices_replaced + ) + random_words = torch.randint( + self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long + ) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + +def pad( + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding=True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors=None, + verbose: bool = True, +) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``, + ``self.pad_token_id`` and ``self.pad_token_type_id``) + .. note:: + If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with ``return_tensors``. In the + case of PyTorch tensors, you will lose the specific device of your tensors however. + Args: + encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str, + List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str, + List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as + well as in a PyTorch Dataloader collate function. + Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), + see the note above for the return type. 
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask (:obj:`bool`, `optional`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + `What are attention masks? <../glossary.html#attention-mask>`__ + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. + """ + # If we have a list of dicts, let's convert it in a dict of lists + # We do this to allow using this method as a collate_fn function in PyTorch Dataloader + if isinstance(encoded_inputs, (list, tuple)) and isinstance( + encoded_inputs[0], (dict, BatchEncoding) + ): + encoded_inputs = { + key: [example[key] for example in encoded_inputs] + for key in encoded_inputs[0].keys() + } + + required_input = encoded_inputs["input_ids"] + + if not required_input: + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + # Note that we lose the specific device the tensor may be on for PyTorch + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + index = 0 + while len(required_input[index]) == 0: + index += 1 + if index < len(required_input): + first_element = required_input[index][0] + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. + if not isinstance(first_element, (int, list, tuple)): + if isinstance(first_element, torch.Tensor): + return_tensors = "pt" if return_tensors is None else return_tensors + elif isinstance(first_element, np.ndarray): + return_tensors = "np" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + f"Should be one of a python, numpy, pytorch or tensorflow object." 
+ ) + + for key, value in encoded_inputs.items(): + encoded_inputs[key] = to_py_obj(value) + + required_input = encoded_inputs["input_ids"] + if required_input and not isinstance(required_input[0], (list, tuple)): + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(required_input) + assert all( + len(v) == batch_size for v in encoded_inputs.values() + ), "Some items in the output dictionary have a different batch size than others." + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) + for key, value in inputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + +def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple)): + examples = [torch.tensor(e, dtype=torch.long) for e in examples] + + # Check if padding is necessary. + length_of_first = examples[0].size(0) + are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + if are_tensors_same_length and ( + pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0 + ): + return torch.stack(examples, dim=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. + max_length = max(x.size(0) for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0] :] = example + return result + + +def to_py_obj(obj): + if isinstance(obj, torch.Tensor): + return obj.detach().cpu().tolist() + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj diff --git a/my_run_mlm_no_trainer.py b/my_run_mlm_no_trainer.py new file mode 100644 index 0000000..76dbd10 --- /dev/null +++ b/my_run_mlm_no_trainer.py @@ -0,0 +1,421 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 Alan Zhao. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# """ +# Pre-train the BERT on a dataset without using HuggingFace Trainer. +# """ +# You can also adapt this script on your own mlm task. Pointers for this are left as comments. 
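+# Example invocation (a sketch; every flag below is defined in parse_args() and the output path is a
+# placeholder):
+#
+#   python my_run_mlm_no_trainer.py \
+#       --per_device_train_batch_size 16 \
+#       --learning_rate 5e-5 \
+#       --num_train_epochs 40 \
+#       --mlm_probability 0.15 \
+#       --eval_every_steps 5000 \
+#       --output_dir ./pretrained_asm_bert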
+ +import argparse +import logging +import math +import os +import random + +import datasets +import numpy as np +import tokenizers +import torch +import transformers +from accelerate import Accelerator +from datasets import load_dataset +from torch.nn import DataParallel +from torch.utils.data.dataloader import DataLoader +from tqdm.auto import tqdm +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AdamW, + AutoConfig, + AutoModelForMaskedLM, + AutoTokenizer, + BatchEncoding, + BertConfig, + BertForPreTraining, + DataCollatorForLanguageModeling, + SchedulerType, + get_scheduler, + set_seed, +) + +from my_data_collator import MyDataCollatorForPreTraining +from process_data.utils import CURRENT_DATA_BASE + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a Masked Language Modeling task" + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=16, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=64, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--weight_decay", type=float, default=0.0, help="Weight decay to use." + ) + parser.add_argument( + "--num_train_epochs", + type=int, + default=40, + help="Total number of training epochs to perform.", + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument( + "--output_dir", type=str, default=None, help="Where to store the final model." + ) + parser.add_argument( + "--seed", type=int, default=None, help="A seed for reproducible training." + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--mlm_probability", + type=float, + default=0.15, + help="Ratio of tokens to mask for masked language modeling loss", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--eval_every_steps", + type=int, + default=5000, + help="Number of steps before evaluating the model.", + ) + + args = parser.parse_args() + + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + return args + + +def main(): + args = parse_args() + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + accelerator = Accelerator() + # Make one log on every process with the configuration for debugging. 
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state) + + # Setup logging, we only want one process per machine to log things on the screen. + # accelerator.is_local_main_process is only True for one process per machine. + logger.setLevel( + logging.INFO if accelerator.is_local_main_process else logging.ERROR + ) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # we take control of the load of dataset by oursevles + # there will be several json file for training + # `raw_dataset` has two features: + # `text`: "sentA\tsentB" + # `is_next`: 0 or 1 + # raw_datasets = load_dataset( + # "json", + # data_files={ + # "train": "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json", + # "validation": "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json", + # }, + # field="data", + # ) + train_files = [ + os.path.join(CURRENT_DATA_BASE, "inst.1.{}.json".format(i)) for i in range(128) + ] + valid_file = "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json" + raw_datasets = load_dataset( + "json", + data_files={"train": train_files, "validation": valid_file,}, + field="data", + ) + + # we use the tokenizer previously trained on the dataset above + tokenizer = tokenizers.Tokenizer.from_file( + os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json") + ) + + # NOTE: have to promise the `length` here is consistent with the one used in `train_my_tokenizer.py` + tokenizer.enable_padding( + pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=32 + ) + + # NOTE: `max_position_embeddings` here should be consistent with `length` above + # we use a much smaller BERT, config is: + config = BertConfig( + vocab_size=tokenizer.get_vocab_size(), + hidden_size=96, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=384, + max_position_embeddings=32, + ) + + # initalize a new BERT for pre-training + model = BertForPreTraining(config) + + # Preprocessing the datasets. + column_names = raw_datasets["train"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # First we aplly `tokenize_function` on the dataset. 
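+    # Each record produced by process_data/merge_examples_to_json.py has the shape
+    #     {"text": ["<instruction A>", "<instruction B>"], "is_next": 0 or 1}
+    # (the instruction strings here are placeholders), so `examples["text"]` is a batch of sentence
+    # pairs and tokenizer.encode_batch() encodes each tuple as a (sentA, sentB) pair.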
+ def tokenize_function(examples): + text = [tuple(sent) for sent in examples["text"]] + encoded_inputs = {} + results = tokenizer.encode_batch(text) + encoded_inputs["input_ids"] = [result.ids for result in results] + encoded_inputs["token_type_ids"] = [result.type_ids for result in results] + encoded_inputs["special_tokens_mask"] = [ + result.special_tokens_mask for result in results + ] + # according to the document of BERT in HuggingFace + # 0: is + # 1: is not + encoded_inputs["next_sentence_label"] = [ + 1 - label for label in examples["is_next"] + ] + # use `np` rather than `pt` in case of reporting of error + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type="np", prepend_batch_axis=False, + ) + return batch_outputs + + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + ) + + train_dataset = tokenized_datasets["train"] + eval_dataset = tokenized_datasets["validation"] + + # Log a few random samples from the training set: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # Data collator + # This one will take care of randomly masking the tokens. + data_collator = MyDataCollatorForPreTraining( + tokenizer=tokenizer, mlm_probability=args.mlm_probability + ) + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + collate_fn=data_collator, + batch_size=args.per_device_train_batch_size, + ) + eval_dataloader = DataLoader( + eval_dataset, + collate_fn=data_collator, + batch_size=args.per_device_eval_batch_size, + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args.weight_decay, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Prepare everything with our `accelerator`. + # model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( + # model, optimizer, train_dataloader, eval_dataloader + # ) + model, optimizer, train_dataloader = accelerator.prepare( + model, optimizer, train_dataloader + ) + + model = DataParallel(model) + # model.to("cuda:0") + + # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be + # shorter in multiprocess) + + # Scheduler and math around the number of training steps. + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + else: + args.num_train_epochs = math.ceil( + args.max_train_steps / num_update_steps_per_epoch + ) + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.max_train_steps, + ) + + # Train! 
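+    # e.g. with the default per_device_train_batch_size=16, a single process and
+    # gradient_accumulation_steps=1, total_batch_size = 16 * 1 * 1 = 16.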
+ total_batch_size = ( + args.per_device_train_batch_size + * accelerator.num_processes + * args.gradient_accumulation_steps + ) + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info( + f" Instantaneous batch size per device = {args.per_device_train_batch_size}" + ) + logger.info( + f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" + ) + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + logger.info(f" Evalute every {args.eval_every_steps} steps") + # Only show the progress bar once on each machine. + progress_bar = tqdm( + range(args.max_train_steps), disable=not accelerator.is_local_main_process + ) + completed_steps = 0 + + for epoch in range(args.num_train_epochs): + model.train() + for step, batch in enumerate(train_dataloader): + outputs = model(**batch) + loss = outputs.loss + loss = loss / args.gradient_accumulation_steps + accelerator.backward(loss) + if ( + step % args.gradient_accumulation_steps == 0 + or step == len(train_dataloader) - 1 + ): + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + completed_steps += 1 + + if completed_steps >= args.max_train_steps: + break + + if completed_steps % args.eval_every_steps == 0: + model.eval() + losses = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + outputs = model(**batch) + + loss = outputs.loss + losses.append( + accelerator.gather(loss.repeat(args.per_device_eval_batch_size)) + ) + + losses = torch.cat(losses) + # losses = losses[: len(eval_dataset)] + try: + perplexity = math.exp(torch.mean(losses)) + except OverflowError: + perplexity = float("inf") + + logger.info(f"steps {completed_steps}: perplexity: {perplexity}") + model.train() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) + + +if __name__ == "__main__": + main() diff --git a/process_data/check_length.py b/process_data/check_length.py new file mode 100644 index 0000000..e803142 --- /dev/null +++ b/process_data/check_length.py @@ -0,0 +1,29 @@ +import os + +from utils import ORIGINAL_DATA_BASE, read_file + + +def check(filename): + sents = read_file(filename) + result = 0 + for sent in sents: + result = max(result, len(sent[-1].replace("\t", " ").split())) + print("The longest sentence in {} has {} words".format(filename, result)) + return result + + +def main(): + longest = 0 + # for i in range(6): + for i in [1]: + for group in ("pos", "neg"): + filename = os.path.join( + ORIGINAL_DATA_BASE, "inst.{}.{}.txt".format(i, group) + ) + longest = max(check(filename), longest) + print("The longest sentence in all files has {} words.".format(longest)) + + +if __name__ == "__main__": + main() + diff --git a/process_data/convert_space_format.py b/process_data/convert_space_format.py new file mode 100644 index 0000000..43799ed --- /dev/null +++ b/process_data/convert_space_format.py @@ -0,0 +1,29 @@ +import os + +from utils import ORIGINAL_DATA_BASE, read_file + + +def write_file(data, filename): + print("Writing data into {}...".format(filename)) + with open(filename, "w", encoding="utf-8") as fout: + for sent in data: + fout.write(sent.replace("", "SPACE")) + + +def convert(fin, fout): + 
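+    # Read `fin`, replace the special space marker with the literal token "SPACE"
+    # (see write_file above), and dump the result to `fout`.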
print("Start the replacement task for {}...".format(fin)) + # filename = "/home/ming/malware/data/elfasm_inst_pairs/linux32_00xxxx.all" + sents = read_file(fin) + write_file(sents, fout) + + +def main(): + # for i in range(6): + for i in [1]: + fin = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(i)) + fout = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.pos.txt".format(i)) + convert(fin, fout) + + +if __name__ == "__main__": + main() diff --git a/process_data/count_word_for_vocab.py b/process_data/count_word_for_vocab.py new file mode 100644 index 0000000..543590b --- /dev/null +++ b/process_data/count_word_for_vocab.py @@ -0,0 +1,54 @@ +import os +from multiprocessing import Pool, Process, Queue + +from tqdm import tqdm + +from utils import ORIGINAL_DATA_BASE, read_file + +q = Queue(128) +BASE = 4600000 + + +def counter_worker(sents): + cnt = set() + for sent in tqdm(sents): + cnt = cnt.union(set(sent[:-1].replace("\t", " ").split())) + print("Process {} get {} words".format(os.getpid(), len(cnt))) + q.put(cnt) + return + + +def counter(filename): + sents = read_file(filename) + + p = Pool(36) + + for i in range(64): + p.apply_async(counter_worker, args=(sents[i * BASE : (i + 1) * BASE],)) + print("Waiting for all sub-processes done...") + p.close() + p.join() + print("All subprocess done.") + cnt = set() + # for sent in tqdm(sents): + # cnt += set(sent[-1].replace("\t", " ").split()) + for _ in tqdm(range(64)): + cnt = cnt.union(q.get()) + print("There are {} charcters in {}".format(len(cnt), filename)) + return cnt + + +def main(): + cnt = set() + # for i in range(6): + for i in [1]: + for group in ["pos", "neg"]: + filename = os.path.join( + ORIGINAL_DATA_BASE, "inst.{}.{}.txt".format(i, group) + ) + cnt += counter(filename) + print("There are {} charcters in all files".format(len(cnt))) + + +if __name__ == "__main__": + main() diff --git a/process_data/create_negative_examples.py b/process_data/create_negative_examples.py new file mode 100644 index 0000000..6ca9a2a --- /dev/null +++ b/process_data/create_negative_examples.py @@ -0,0 +1,34 @@ +import os +from random import randint + +from tqdm import tqdm + +from utils import ORIGINAL_DATA_BASE, read_file + + +def create(pos, neg, tgt): + pos_sents = read_file(pos) + + neg_sents = read_file(neg) + neg_length = len(neg_sents) + print("Start writing negative examples to {}...".format(tgt)) + with open(tgt, "w", encoding="utf-8") as fout: + for sent in tqdm(pos_sents): + first = sent.split("\t")[0] + index = randint(0, neg_length - 1) + pair = neg_sents[index].split("\t")[randint(0, 1)].replace("\n", "") + fout.write(first + "\t" + pair + "\n") + + +def main(): + # for i in range(6): + for i in [1]: + j = (i + 1) % 6 + pos = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(i)) + neg = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(j)) + tgt = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.neg.txt".format(i)) + create(pos, neg, tgt) + + +if __name__ == "__main__": + main() diff --git a/process_data/merge_examples_to_json.py b/process_data/merge_examples_to_json.py new file mode 100644 index 0000000..9a49068 --- /dev/null +++ b/process_data/merge_examples_to_json.py @@ -0,0 +1,91 @@ +import gc +import json +import os +from multiprocessing import Pool, Process, Queue + +from tqdm import tqdm + +from utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file + +BASE = 4600000 + + +def write_worker(sents, json_file, index): + examples = [] + for sent in tqdm(sents): + tmp = sent[:-1].split("\t") + 
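+        # each labelled line looks like "<is_next>\t<sentA>\t<sentB>": tmp[0] is the 0/1 label,
+        # tmp[1:] holds the instruction sentences that become the "text" pair.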
examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])}) + examples[-1]["text"] = tuple(examples[-1]["text"]) + + print("Writing to {}...".format(json_file + "{}.json".format(index))) + results = {"data": examples} + with open(json_file + "{}.json".format(index), "w") as f: + json.dump(results, f) + + +def merge_to_json(pos, neg, json_file): + sents = read_file(pos) + + p = Pool(36) + + for i in range(64): + p.apply_async( + write_worker, args=(sents[i * BASE : (i + 1) * BASE], json_file, i,) + ) + print("Waiting for all sub-processes done...") + p.close() + p.join() + print("All subprocess done.") + + # length = len(sents) + # base = length // 20000000 + 1 + + # for i in tqdm(range(length)): + # examples = [] + # tmp = sents[i][:-1].split("\t") + # examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])}) + # examples[i]["text"] = tuple(examples[i]["text"]) + # index = i // 20000000 + # print("Writing to {}...".format(json_file + "{}.json".format(index))) + # with open(json_file + "{}.json".format(index), "w") as f: + # json.dump(examples, f) + + del sents + gc.collect() + + sents = read_file(neg) + + p = Pool(8) + + for i in range(64): + p.apply_async( + write_worker, args=(sents[i * BASE : (i + 1) * BASE], json_file, 64 + i,) + ) + print("Waiting for all sub-processes done...") + p.close() + p.join() + print("All subprocess done.") + + # length = len(sents) + # for i in tqdm(range(length)): + # examples = [] + # tmp = sents[i][:-1].split("\t") + # examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])}) + # examples[i]["text"] = tuple(examples[i]["text"]) + # index = i // 20000000 + # print("Writing to {}...".format(json_file + "{}.json".format(base + index))) + # with open(json_file + "{}.json".format(base + index), "w") as f: + # json.dump(examples, f) + + +def main(): + # for i in range(6): + for i in [1]: + pos = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.pos.label.txt".format(i)) + neg = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.neg.label.txt".format(i)) + json_file = os.path.join(CURRENT_DATA_BASE, "inst.{}.".format(i)) + merge_to_json(pos, neg, json_file) + + +if __name__ == "__main__": + main() diff --git a/process_data/readme.md b/process_data/readme.md new file mode 100644 index 0000000..c2e7611 --- /dev/null +++ b/process_data/readme.md @@ -0,0 +1,22 @@ +# Pre-processing steps +### 1. run `convert_space_format.py` +Convert the string `` to `SPACE` + +### 2. run `create_negtive_examples.py` +We use the next file of the current file as its negative examples, which is apparently rational. + +Specifically, for each instruction in the current positive file, we randomly choose a line in its next file and select one of two instructions in the line as its negative example. + +### 3. run `merge_examples_to_json.py` +We dump the positive and negative examples with their corresponding labels into several json files. +Each json file contains 20m lines of examples. + +### 4. run `check_length.py` +We will specify the length padded to when we use the tokenizer, `tokenizer.enable_padding(..., length=)`. + +So we need to know the longest sentences in the dataset. + +### 5. run `count_word_for_vocab.py` +Similarly, we also need to specify the size of vocabulary when we train the tokenizer, `WordLevelTrainer(vocab_size=, ...)`. + +So we need to know how many characters in the dataset. 
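+
+### For reference: how the JSON files are consumed
+The files written in step 3 are loaded by `my_run_mlm_no_trainer.py` via the `datasets` library. A
+minimal sketch of the loading side (the file name is a placeholder for one of the generated
+`inst.<i>.<index>.json` files):
+
+```python
+from datasets import load_dataset
+
+raw = load_dataset(
+    "json",
+    data_files={"train": ["inst.1.0.json"]},  # placeholder path
+    field="data",
+)
+print(raw["train"][0])  # {"text": [..., ...], "is_next": 0 or 1}
+```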
\ No newline at end of file diff --git a/process_data/utils.py b/process_data/utils.py new file mode 100644 index 0000000..f26b57c --- /dev/null +++ b/process_data/utils.py @@ -0,0 +1,11 @@ +import os + +ORIGINAL_DATA_BASE = "/home/ming/malware/data/elfasm_inst_pairs" +CURRENT_DATA_BASE = "/home/ming/malware/inst2vec_bert/data/asm_bert" + + +def read_file(filename): + print("Reading data from {}...".format(filename)) + with open(filename, "r", encoding="utf-8") as fin: + return fin.readlines() + diff --git a/run_mlm_no_trainer.py b/run_mlm_no_trainer.py new file mode 100644 index 0000000..538d5c8 --- /dev/null +++ b/run_mlm_no_trainer.py @@ -0,0 +1,672 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# """ +# Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) +# on a text file or a dataset without using HuggingFace Trainer. +# Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +# https://huggingface.co/models?filter=masked-lm +# """ +# You can also adapt this script on your own mlm task. Pointers for this are left as comments. + +import argparse +import logging +import math +import os +import random + +import datasets +import numpy as np +import tokenizers +import torch +import transformers +from accelerate import Accelerator +from datasets import load_dataset +from torch.utils.data.dataloader import DataLoader +from tqdm.auto import tqdm +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AdamW, + AutoConfig, + AutoModelForMaskedLM, + AutoTokenizer, + BatchEncoding, + BertConfig, + BertForPreTraining, + DataCollatorForLanguageModeling, + SchedulerType, + get_scheduler, + set_seed, +) + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a Masked Language Modeling task" + ) + # parser.add_argument( + # "--dataset_name", + # type=str, + # default=None, + # help="The name of the dataset to use (via the datasets library).", + # ) + # parser.add_argument( + # "--dataset_config_name", + # type=str, + # default=None, + # help="The configuration name of the dataset to use (via the datasets library).", + # ) + # parser.add_argument( + # "--train_file", + # type=str, + # default=None, + # help="A csv or a json file containing the training data.", + # ) + # parser.add_argument( + # "--validation_file", + # type=str, + # default=None, + # help="A csv or a json file containing the validation data.", + # ) + # parser.add_argument( + # "--validation_split_percentage", + # default=5, + # help="The percentage of the train set used as validation set in case there's no validation split", + # ) + # parser.add_argument( + # "--pad_to_max_length", + # action="store_true", + # help="If passed, pad all samples to 
`max_length`. Otherwise, dynamic padding is used.", + # ) + # parser.add_argument( + # "--model_name_or_path", + # type=str, + # help="Path to pretrained model or model identifier from huggingface.co/models.", + # required=True, + # ) + # parser.add_argument( + # "--config_name", + # type=str, + # default=None, + # help="Pretrained config name or path if not the same as model_name", + # ) + # parser.add_argument( + # "--tokenizer_name", + # type=str, + # default=None, + # help="Pretrained tokenizer name or path if not the same as model_name", + # ) + # parser.add_argument( + # "--use_slow_tokenizer", + # action="store_true", + # help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + # ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--weight_decay", type=float, default=0.0, help="Weight decay to use." + ) + parser.add_argument( + "--num_train_epochs", + type=int, + default=40, + help="Total number of training epochs to perform.", + ) + # parser.add_argument( + # "--max_train_steps", + # type=int, + # default=None, + # help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + # ) + # parser.add_argument( + # "--gradient_accumulation_steps", + # type=int, + # default=1, + # help="Number of updates steps to accumulate before performing a backward/update pass.", + # ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument( + "--output_dir", type=str, default=None, help="Where to store the final model." + ) + parser.add_argument( + "--seed", type=int, default=None, help="A seed for reproducible training." + ) + # parser.add_argument( + # "--model_type", + # type=str, + # default=None, + # help="Model type to use if training from scratch.", + # choices=MODEL_TYPES, + # ) + # parser.add_argument( + # "--max_seq_length", + # type=int, + # default=None, + # help="The maximum total input sequence length after tokenization. 
Sequences longer than this will be truncated.", + # ) + # parser.add_argument( + # "--line_by_line", + # type=bool, + # default=False, + # help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.", + # ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + # parser.add_argument( + # "--overwrite_cache", + # type=bool, + # default=False, + # help="Overwrite the cached training and evaluation sets", + # ) + parser.add_argument( + "--mlm_probability", + type=float, + default=0.15, + help="Ratio of tokens to mask for masked language modeling loss", + ) + + args = parser.parse_args() + + # Sanity checks + # if ( + # args.dataset_name is None + # and args.train_file is None + # and args.validation_file is None + # ): + # raise ValueError("Need either a dataset name or a training/validation file.") + # else: + # if args.train_file is not None: + # extension = args.train_file.split(".")[-1] + # assert extension in [ + # "csv", + # "json", + # "txt", + # ], "`train_file` should be a csv, json or txt file." + # if args.validation_file is not None: + # extension = args.validation_file.split(".")[-1] + # assert extension in [ + # "csv", + # "json", + # "txt", + # ], "`validation_file` should be a csv, json or txt file." + + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + return args + + +def main(): + args = parse_args() + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + accelerator = Accelerator() + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state) + + # Setup logging, we only want one process per machine to log things on the screen. + # accelerator.is_local_main_process is only True for one process per machine. + logger.setLevel( + logging.INFO if accelerator.is_local_main_process else logging.ERROR + ) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # # (the dataset will be downloaded automatically from the datasets Hub). + # # + # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # # 'text' is found. You can easily tweak this behavior (see below). + # # + # # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # # download the dataset. + # if args.dataset_name is not None: + # # Downloading and loading a dataset from the hub. 
+    #     raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+    #     if "validation" not in raw_datasets.keys():
+    #         raw_datasets["validation"] = load_dataset(
+    #             args.dataset_name,
+    #             args.dataset_config_name,
+    #             split=f"train[:{args.validation_split_percentage}%]",
+    #         )
+    #         raw_datasets["train"] = load_dataset(
+    #             args.dataset_name,
+    #             args.dataset_config_name,
+    #             split=f"train[{args.validation_split_percentage}%:]",
+    #         )
+    # else:
+    #     data_files = {}
+    #     if args.train_file is not None:
+    #         data_files["train"] = args.train_file
+    #     if args.validation_file is not None:
+    #         data_files["validation"] = args.validation_file
+    #     extension = args.train_file.split(".")[-1]
+    #     if extension == "txt":
+    #         extension = "text"
+    #     raw_datasets = load_dataset(extension, data_files=data_files)
+
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # We use a dataset in JSON format: `raw_datasets` has two features,
+    # `text` ("sentA\tsentB") and `is_next` (0 or 1).
+    # We load the dataset ourselves; eventually there will be several JSON files for training.
+    # raw_datasets = load_dataset("json", data_files=args.train_file, field="data")
+    raw_datasets = load_dataset(
+        "json",
+        data_files="/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
+        field="data",
+    )
+
+    # We reuse the tokenizer trained earlier on the positive dataset.
+    # tokenizer = tokenizers.Tokenizer.from_file(args.tokenizer_file)
+    tokenizer = tokenizers.Tokenizer.from_file(
+        "/home/ming/malware/inst2vec_bert/bert/tokenizer-inst.json"
+    )
+    tokenizer.enable_padding(
+        pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=64
+    )
+
+    # We use a much smaller BERT; its config is:
+    config = BertConfig(
+        vocab_size=tokenizer.get_vocab_size(),
+        hidden_size=64,
+        num_hidden_layers=8,
+        num_attention_heads=8,
+        intermediate_size=256,
+        max_position_embeddings=64,
+    )
+    # Initialize the new BERT for pre-training.
+    model = BertForPreTraining(config)
+
+    # all_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+    # all_special_ids = [tokenizer.token_to_id(token) for token in all_special_tokens]
+
+    # # Load pretrained model and tokenizer
+    # #
+    # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # # download model & vocab.
+    # if args.config_name:
+    #     config = AutoConfig.from_pretrained(args.config_name)
+    # elif args.model_name_or_path:
+    #     config = AutoConfig.from_pretrained(args.model_name_or_path)
+    # else:
+    #     config = CONFIG_MAPPING[args.model_type]()
+    #     logger.warning("You are instantiating a new config instance from scratch.")
+
+    # if args.tokenizer_name:
+    #     tokenizer = AutoTokenizer.from_pretrained(
+    #         args.tokenizer_name, use_fast=not args.use_slow_tokenizer
+    #     )
+    # elif args.model_name_or_path:
+    #     tokenizer = AutoTokenizer.from_pretrained(
+    #         args.model_name_or_path, use_fast=not args.use_slow_tokenizer
+    #     )
+    # else:
+    #     raise ValueError(
+    #         "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+    #         "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+ # ) + + # if args.model_name_or_path: + # model = AutoModelForMaskedLM.from_pretrained( + # args.model_name_or_path, + # from_tf=bool(".ckpt" in args.model_name_or_path), + # config=config, + # ) + # else: + # logger.info("Training new model from scratch") + # model = AutoModelForMaskedLM.from_config(config) + + # model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + column_names = raw_datasets["train"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # if args.max_seq_length is None: + # max_seq_length = tokenizer.model_max_length + # if max_seq_length > 1024: + # logger.warning( + # f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + # "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + # ) + # max_seq_length = 1024 + # else: + # if args.max_seq_length > tokenizer.model_max_length: + # logger.warning( + # f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" + # f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + # ) + # max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) + + # if args.line_by_line: + # # When using line_by_line, we just tokenize each nonempty line. + # padding = "max_length" if args.pad_to_max_length else False + + # def tokenize_function(examples): + # # Remove empty lines + # examples["text"] = [ + # line + # for line in examples["text"] + # if len(line) > 0 and not line.isspace() + # ] + # return tokenizer( + # examples["text"], + # padding=padding, + # truncation=True, + # max_length=max_seq_length, + # # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it + # # receives the `special_tokens_mask`. + # return_special_tokens_mask=True, + # ) + + # tokenized_datasets = raw_datasets.map( + # tokenize_function, + # batched=True, + # num_proc=args.preprocessing_num_workers, + # remove_columns=[text_column_name], + # load_from_cache_file=not args.overwrite_cache, + # ) + # else: + # # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. + # # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more + # # efficient when it receives the `special_tokens_mask`. + # def tokenize_function(examples): + # return tokenizer( + # examples[text_column_name], return_special_tokens_mask=True + # ) + + # tokenized_datasets = raw_datasets.map( + # tokenize_function, + # batched=True, + # num_proc=args.preprocessing_num_workers, + # remove_columns=column_names, + # load_from_cache_file=not args.overwrite_cache, + # ) + + # # Main data processing function that will concatenate all texts from our dataset and generate chunks of + # # max_seq_length. + # def group_texts(examples): + # # Concatenate all texts. + # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + # total_length = len(concatenated_examples[list(examples.keys())[0]]) + # # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # # customize this part to your needs. + # total_length = (total_length // max_seq_length) * max_seq_length + # # Split by chunks of max_len. 
+    #     result = {
+    #         k: [
+    #             t[i : i + max_seq_length]
+    #             for i in range(0, total_length, max_seq_length)
+    #         ]
+    #         for k, t in concatenated_examples.items()
+    #     }
+    #     return result
+
+    # # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+    # # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+    # # might be slower to preprocess.
+    # #
+    # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+
+    # tokenized_datasets = tokenized_datasets.map(
+    #     group_texts,
+    #     batched=True,
+    #     num_proc=args.preprocessing_num_workers,
+    #     load_from_cache_file=not args.overwrite_cache,
+    # )
+
+    def tokenize_function(examples):
+        # Convert each `text` entry into a (sentA, sentB) tuple so encode_batch treats it as a sentence pair.
+        text = [tuple(sent) for sent in examples["text"]]
+        encoded_inputs = {}
+        results = tokenizer.encode_batch(text)
+        # input_ids, type_ids, special_token_masks = [], [], []
+        # for i, result in enumerate(results):
+        #     input_ids.append(result.ids)
+        #     type_ids.append(result.type_ids)
+        #     special_token_masks.append(result.special_tokens_mask)
+        encoded_inputs["input_ids"] = [result.ids for result in results]
+        encoded_inputs["token_type_ids"] = [result.type_ids for result in results]
+        # special_tokens_mask = [1 if token in all_special_ids else 0 for token in ids]
+        encoded_inputs["special_tokens_mask"] = [
+            result.special_tokens_mask for result in results
+        ]
+        # BertForPreTraining expects next_sentence_label == 0 when sentence B really follows
+        # sentence A and 1 otherwise; our `is_next` uses 1 for a true pair, so flip it.
+        encoded_inputs["next_sentence_label"] = [
+            1 - label for label in examples["is_next"]
+        ]
+        batch_outputs = BatchEncoding(
+            encoded_inputs, tensor_type="np", prepend_batch_axis=False,
+        )
+        return batch_outputs
+
+    tokenized_datasets = raw_datasets.map(
+        tokenize_function,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        remove_columns=column_names,
+        # load_from_cache_file=not args.overwrite_cache,
+    )
+
+    train_dataset = tokenized_datasets["train"]
+    # The single JSON file above only provides a "train" split; fall back to it for
+    # evaluation until a dedicated validation file is wired in.
+    eval_dataset = tokenized_datasets.get("validation", tokenized_datasets["train"])
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm_probability=args.mlm_probability
+    )
+
+    # DataLoaders creation:
+    train_dataloader = DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=data_collator,
+        batch_size=args.per_device_train_batch_size,
+    )
+    eval_dataloader = DataLoader(
+        eval_dataset,
+        collate_fn=data_collator,
+        batch_size=args.per_device_eval_batch_size,
+    )
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if not any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
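+
+    # Illustrative sketch (commented out, not part of the original training flow): the
+    # grouping above keys off parameter *names*; for this BertForPreTraining model, e.g.
+    #   "bert.encoder.layer.0.attention.self.query.weight"        -> weight decay applied
+    #   "bert.encoder.layer.0.attention.output.LayerNorm.weight"  -> no weight decay
+    #   "cls.predictions.bias"                                    -> no weight decay
+    # To double-check how the parameters are split, one could log:
+    # decayed = [n for n, _ in model.named_parameters() if not any(nd in n for nd in no_decay)]
+    # skipped = [n for n, _ in model.named_parameters() if any(nd in n for nd in no_decay)]
+    # logger.info(f"{len(decayed)} tensors with weight decay, {len(skipped)} without")
+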
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note -> the training dataloader needs to be prepared before we grab its length below
+    # (because its length will be shorter when running on multiple processes).
+
+    # `--gradient_accumulation_steps` and `--max_train_steps` are commented out in
+    # parse_args() above, so fall back to defaults here to keep the schedule math working.
+    if not hasattr(args, "gradient_accumulation_steps"):
+        args.gradient_accumulation_steps = 1
+    if not hasattr(args, "max_train_steps"):
+        args.max_train_steps = None
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(
+        len(train_dataloader) / args.gradient_accumulation_steps
+    )
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(
+            args.max_train_steps / num_update_steps_per_epoch
+        )
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Train!
+    total_batch_size = (
+        args.per_device_train_batch_size
+        * accelerator.num_processes
+        * args.gradient_accumulation_steps
+    )
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(
+        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
+    )
+    logger.info(
+        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
+    )
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(
+        range(args.max_train_steps), disable=not accelerator.is_local_main_process
+    )
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if (
+                step % args.gradient_accumulation_steps == 0
+                or step == len(train_dataloader) - 1
+            ):
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+        model.eval()
+        losses = []
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+
+            loss = outputs.loss
+            losses.append(
+                accelerator.gather(loss.repeat(args.per_device_eval_batch_size))
+            )
+
+        losses = torch.cat(losses)
+        losses = losses[: len(eval_dataset)]
+        try:
+            perplexity = math.exp(torch.mean(losses))
+        except OverflowError:
+            perplexity = float("inf")
+
+        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/train_my_tokenizer.py b/train_my_tokenizer.py
new file mode 100644
index 0000000..e31eb4f
--- /dev/null
+++ b/train_my_tokenizer.py
@@ -0,0 +1,124 @@
+import argparse
+import os
+from itertools import chain
+
+from datasets import load_dataset
+from tokenizers import Tokenizer
+from tokenizers.models import WordLevel
+from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.processors import TemplateProcessing
+from tokenizers.trainers import WordLevelTrainer
+
+from process_data.utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file
+
+BASE_PATH = "/home/ming/malware/inst2vec_bert/bert/"
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Train a word level tokenizer for ASM_BERT"
+    )
+    parser.add_argument(
+        "--vocab_size",
+        type=int,
+        default=2000,
+        help="The size of the vocabulary used to train the tokenizer.",
+    )
+    parser.add_argument(
+        "--padding_length",
+        type=int,
+        default=32,
+        help="The length to which the tokenizer will pad encoded sequences.",
+    )
+    args = parser.parse_args()
+
+    return args
+
+
+def train_tokenizer(args, dataset):
+    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
+    tokenizer.pre_tokenizer = Whitespace()
+
+    trainer = WordLevelTrainer(
+        vocab_size=args.vocab_size,
+        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+    )
+
+    # def batch_iterator(batch_size=1000):
+    #     for i in range(0, len(dataset), batch_size):
+    #         yield dataset[i : i + batch_size]["text"]
+
+    # tokenizer.train_from_iterator(
+    #     batch_iterator(), trainer=trainer, length=len(dataset)
+    # )
+
+    tokenizer.train_from_iterator(dataset, trainer)
+
+    return tokenizer
+
+
+def save_tokenizer(tokenizer, tokenizer_file):
+    tokenizer.save(tokenizer_file)
+
+
+def load_tokenizer(tokenizer_file):
+    if not os.path.exists(tokenizer_file):
+        print("{} doesn't exist, a new tokenizer will be trained...".format(tokenizer_file))
+        return None
+    print("The tokenizer has already been trained.")
+    return Tokenizer.from_file(tokenizer_file)
+
+
+def post_process(tokenizer):
+    # Add [CLS]/[SEP] around single sentences and sentence pairs, matching BERT's input format.
+    tokenizer.post_processor = TemplateProcessing(
+        single="[CLS] $A [SEP]",
+        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+        special_tokens=[
+            ("[CLS]", tokenizer.token_to_id("[CLS]")),
+            ("[SEP]", tokenizer.token_to_id("[SEP]")),
+        ],
+    )
+    return tokenizer
+
+
+def tokenizer_encode(tokenizer, data):
+    return tokenizer.encode_batch(data)
+
+
+def main(tokenizer_file=""):
+    args = parse_args()
+
+    tokenizer = load_tokenizer(tokenizer_file)
+
+    if tokenizer is not None:
+        return
+
+    # json_files = [
+    #     os.path.join(CURRENT_DATA_BASE, "inst.1.{}.json".format(i)) for i in range(128)
+    # ]
+    # dataset = load_dataset("json", data_files=json_files, field="data")
+
+    text_files = [
+        os.path.join(ORIGINAL_DATA_BASE, "inst.1.{}.txt".format(group))
+        for group in ["pos", "neg"]
+    ]
+
+    dataset = []
+    for f in text_files:
+        dataset += read_file(f)
+
+    # Drop the trailing newline and split each "sentA\tsentB" line into a pair.
+    dataset = [tuple(sent[:-1].split("\t")) for sent in dataset]
+
+    print("Training tokenizer...")
+    tokenizer = train_tokenizer(args, chain.from_iterable(dataset))
+    tokenizer = post_process(tokenizer)
+    tokenizer.enable_padding(
+        pad_id=tokenizer.token_to_id("[PAD]"),
+        pad_token="[PAD]",
+        length=args.padding_length,
+    )
+    save_tokenizer(tokenizer, tokenizer_file)
+
+
+if __name__ == "__main__":
+    main(os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json"))
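+
+# Illustrative usage sketch (commented out, not executed by this script). It assumes the
+# tokenizer above has already been trained and saved; the instruction strings are
+# made-up placeholders, not taken from the real dataset.
+#
+# tok = Tokenizer.from_file(os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json"))
+# pair = tok.encode("mov eax , ebx", "push eax")
+# print(pair.tokens)    # ["[CLS]", "mov", "eax", ",", "ebx", "[SEP]", "push", "eax", "[SEP]", "[PAD]", ...]
+# print(pair.type_ids)  # 0 for [CLS]/sentence A/[SEP] (and padding), 1 for sentence B and its [SEP]
+# print(len(pair.ids))  # padded up to --padding_length (32 by default)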