complete data processing and first version of training script

commit b595346a07

data_collator_for_language_model.py (new file, 434 lines)
@@ -0,0 +1,434 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
List,
|
||||
NamedTuple,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import tokenizers
|
||||
import torch
|
||||
from transformers import BatchEncoding
|
||||
|
||||
EncodedInput = List[int]
|
||||
|
||||
|
||||
@dataclass
|
||||
class MyDataCollatorForPreTraining:
|
||||
# """
|
||||
# Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
|
||||
# are not all of the same length.
|
||||
|
||||
# Args:
|
||||
# # tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
||||
|
||||
# tokenizer (:class:`tokenizers.Tokenizer`)
|
||||
# The tokenizer used for encoding the data.
|
||||
# mlm (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
# Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the
|
||||
# inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
|
||||
# non-masked tokens and the value to predict for the masked token.
|
||||
# mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
|
||||
# The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
|
||||
# pad_to_multiple_of (:obj:`int`, `optional`):
|
||||
# If set will pad the sequence to a multiple of the provided value.
|
||||
|
||||
# .. note::
|
||||
|
||||
# For best performance, this data collator should be used with a dataset having items that are dictionaries or
|
||||
# BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
|
||||
# :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
|
||||
# argument :obj:`return_special_tokens_mask=True`.
|
||||
# """
|
||||
# def __init__(
|
||||
# self,
|
||||
# tokenizer: tokenizers.Tokenizer,
|
||||
# mlm: bool = True,
|
||||
# mlm_probability: float = 0.15,
|
||||
# pad_to_multiple_of: Optional[int] = None,
|
||||
# ):
|
||||
# self.tokenizer = tokenizer
|
||||
# self.mlm = mlm
|
||||
# self.mlm_probability = mlm_probability
|
||||
# self.pad_to_multiple_of = pad_to_multiple_of
|
||||
tokenizer: tokenizers.Tokenizer
|
||||
mlm: bool = True
|
||||
mlm_probability: float = 0.15
|
||||
pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.mlm and self.tokenizer.token_to_id("[MASK]") is None:
|
||||
raise ValueError(
|
||||
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
|
||||
"You should pass `mlm=False` to train on causal language modeling instead."
|
||||
)
|
||||
|
||||
def __call__(
|
||||
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]],
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
# Handle dict or lists with proper padding and conversion to tensor.
|
||||
if isinstance(examples[0], (dict, BatchEncoding)):
|
||||
batch = pad(
|
||||
examples,
|
||||
return_tensors="pt",
|
||||
pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
)
|
||||
else:
|
||||
batch = {
|
||||
"input_ids": _collate_batch(
|
||||
examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of
|
||||
)
|
||||
}
|
||||
|
||||
# If special token mask has been preprocessed, pop it from the dict.
|
||||
special_tokens_mask = batch.pop("special_tokens_mask", None)
|
||||
if self.mlm:
|
||||
batch["input_ids"], batch["labels"] = self.mask_tokens(
|
||||
batch["input_ids"], special_tokens_mask=special_tokens_mask
|
||||
)
|
||||
# else:
|
||||
# labels = batch["input_ids"].clone()
|
||||
# if self.tokenizer.pad_token_id is not None:
|
||||
# labels[labels == self.tokenizer.pad_token_id] = -100
|
||||
# batch["labels"] = labels
|
||||
return batch
|
||||
|
||||
def mask_tokens(
|
||||
self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
|
||||
"""
|
||||
labels = inputs.clone()
|
||||
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
|
||||
probability_matrix = torch.full(labels.shape, self.mlm_probability)
|
||||
if special_tokens_mask is None:
|
||||
special_tokens_mask = [
|
||||
self.tokenizer.get_special_tokens_mask(
|
||||
val, already_has_special_tokens=True
|
||||
)
|
||||
for val in labels.tolist()
|
||||
]
|
||||
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
|
||||
else:
|
||||
special_tokens_mask = special_tokens_mask.bool()
|
||||
|
||||
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
|
||||
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||
labels[~masked_indices] = -100 # We only compute loss on masked tokens
|
||||
|
||||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
|
||||
indices_replaced = (
|
||||
torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
|
||||
)
|
||||
# inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(
|
||||
# self.tokenizer.mask_token
|
||||
# )
|
||||
inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]")
|
||||
|
||||
# 10% of the time, we replace masked input tokens with random word
|
||||
indices_random = (
|
||||
torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
|
||||
& masked_indices
|
||||
& ~indices_replaced
|
||||
)
|
||||
random_words = torch.randint(
|
||||
self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long
|
||||
)
|
||||
inputs[indices_random] = random_words[indices_random]
|
||||
|
||||
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
|
||||
return inputs, labels
|
||||
|
||||
|
||||
def pad(
|
||||
self,
|
||||
encoded_inputs: Union[
|
||||
BatchEncoding,
|
||||
List[BatchEncoding],
|
||||
Dict[str, EncodedInput],
|
||||
Dict[str, List[EncodedInput]],
|
||||
List[Dict[str, EncodedInput]],
|
||||
],
|
||||
padding=True,
|
||||
max_length: Optional[int] = None,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
return_tensors=None,
|
||||
verbose: bool = True,
|
||||
) -> BatchEncoding:
|
||||
"""
|
||||
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
|
||||
in the batch.
|
||||
Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
|
||||
``self.pad_token_id`` and ``self.pad_token_type_id``)
|
||||
.. note::
|
||||
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
|
||||
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
|
||||
case of PyTorch tensors, you will lose the specific device of your tensors however.
|
||||
Args:
|
||||
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
|
||||
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
|
||||
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
|
||||
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
|
||||
well as in a PyTorch Dataloader collate function.
|
||||
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
||||
see the note above for the return type.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
||||
index) among:
|
||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||
single sequence if provided).
|
||||
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
|
||||
maximum acceptable input length for the model if that argument is not provided.
|
||||
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
||||
different lengths).
|
||||
max_length (:obj:`int`, `optional`):
|
||||
Maximum length of the returned list and optionally padding length (see above).
|
||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||
If set will pad the sequence to a multiple of the provided value.
|
||||
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
|
||||
>= 7.5 (Volta).
|
||||
return_attention_mask (:obj:`bool`, `optional`):
|
||||
Whether to return the attention mask. If left to the default, will return the attention mask according
|
||||
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
|
||||
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
|
||||
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not to print more information and warnings.
|
||||
"""
|
||||
# If we have a list of dicts, let's convert it in a dict of lists
|
||||
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
|
||||
if isinstance(encoded_inputs, (list, tuple)) and isinstance(
|
||||
encoded_inputs[0], (dict, BatchEncoding)
|
||||
):
|
||||
encoded_inputs = {
|
||||
key: [example[key] for example in encoded_inputs]
|
||||
for key in encoded_inputs[0].keys()
|
||||
}
|
||||
|
||||
# The model's main input name, usually `input_ids`, has be passed for padding
|
||||
# if self.model_input_names[0] not in encoded_inputs:
|
||||
# raise ValueError(
|
||||
# "You should supply an encoding or a list of encodings to this method "
|
||||
# f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
|
||||
# )
|
||||
|
||||
required_input = encoded_inputs["input_ids"]
|
||||
|
||||
if not required_input:
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = []
|
||||
return encoded_inputs
|
||||
|
||||
# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
|
||||
# and rebuild them afterwards if no return_tensors is specified
|
||||
# Note that we lose the specific device the tensor may be on for PyTorch
|
||||
|
||||
first_element = required_input[0]
|
||||
if isinstance(first_element, (list, tuple)):
|
||||
# first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
|
||||
index = 0
|
||||
while len(required_input[index]) == 0:
|
||||
index += 1
|
||||
if index < len(required_input):
|
||||
first_element = required_input[index][0]
|
||||
# At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
|
||||
if not isinstance(first_element, (int, list, tuple)):
|
||||
if isinstance(first_element, torch.Tensor):
|
||||
return_tensors = "pt" if return_tensors is None else return_tensors
|
||||
elif isinstance(first_element, np.ndarray):
|
||||
return_tensors = "np" if return_tensors is None else return_tensors
|
||||
else:
|
||||
raise ValueError(
|
||||
f"type of {first_element} unknown: {type(first_element)}. "
|
||||
f"Should be one of a python, numpy, pytorch or tensorflow object."
|
||||
)
|
||||
|
||||
for key, value in encoded_inputs.items():
|
||||
encoded_inputs[key] = to_py_obj(value)
|
||||
|
||||
# # Convert padding_strategy in PaddingStrategy
|
||||
# padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
|
||||
# padding=padding, max_length=max_length, verbose=verbose
|
||||
# )
|
||||
|
||||
required_input = encoded_inputs["input_ids"]
|
||||
if required_input and not isinstance(required_input[0], (list, tuple)):
|
||||
# encoded_inputs = _pad(
|
||||
# encoded_inputs,
|
||||
# max_length=max_length,
|
||||
# # padding_strategy=padding_strategy,
|
||||
# pad_to_multiple_of=pad_to_multiple_of,
|
||||
# return_attention_mask=return_attention_mask,
|
||||
# )
|
||||
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
|
||||
|
||||
batch_size = len(required_input)
|
||||
assert all(
|
||||
len(v) == batch_size for v in encoded_inputs.values()
|
||||
), "Some items in the output dictionary have a different batch size than others."
|
||||
|
||||
# if padding_strategy == PaddingStrategy.LONGEST:
|
||||
# max_length = max(len(inputs) for inputs in required_input)
|
||||
# padding_strategy = PaddingStrategy.MAX_LENGTH
|
||||
|
||||
batch_outputs = {}
|
||||
for i in range(batch_size):
|
||||
inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
|
||||
# outputs = self._pad(
|
||||
# inputs,
|
||||
# max_length=max_length,
|
||||
# # padding_strategy=padding_strategy,
|
||||
# pad_to_multiple_of=pad_to_multiple_of,
|
||||
# return_attention_mask=return_attention_mask,
|
||||
# )
|
||||
for key, value in inputs.items():
|
||||
if key not in batch_outputs:
|
||||
batch_outputs[key] = []
|
||||
batch_outputs[key].append(value)
|
||||
|
||||
return BatchEncoding(batch_outputs, tensor_type=return_tensors)
|
||||
|
||||
|
||||
# def _pad(
|
||||
# self,
|
||||
# encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
|
||||
# max_length: Optional[int] = None,
|
||||
# padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
||||
# pad_to_multiple_of: Optional[int] = None,
|
||||
# return_attention_mask: Optional[bool] = None,
|
||||
# ) -> dict:
|
||||
# """
|
||||
# Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
|
||||
# Args:
|
||||
# encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
|
||||
# max_length: maximum length of the returned list and optionally padding length (see below).
|
||||
# Will truncate by taking into account the special tokens.
|
||||
# padding_strategy: PaddingStrategy to use for padding.
|
||||
# - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
|
||||
# - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
|
||||
# - PaddingStrategy.DO_NOT_PAD: Do not pad
|
||||
# The tokenizer padding sides are defined in self.padding_side:
|
||||
# - 'left': pads on the left of the sequences
|
||||
# - 'right': pads on the right of the sequences
|
||||
# pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
||||
# This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
||||
# >= 7.5 (Volta).
|
||||
# return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
||||
# """
|
||||
# # Load from model defaults
|
||||
# if return_attention_mask is None:
|
||||
# return_attention_mask = "attention_mask" in self.model_input_names
|
||||
|
||||
# required_input = encoded_inputs[self.model_input_names[0]]
|
||||
|
||||
# if padding_strategy == PaddingStrategy.LONGEST:
|
||||
# max_length = len(required_input)
|
||||
|
||||
# if (
|
||||
# max_length is not None
|
||||
# and pad_to_multiple_of is not None
|
||||
# and (max_length % pad_to_multiple_of != 0)
|
||||
# ):
|
||||
# max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
|
||||
|
||||
# needs_to_be_padded = (
|
||||
# padding_strategy != PaddingStrategy.DO_NOT_PAD
|
||||
# and len(required_input) != max_length
|
||||
# )
|
||||
|
||||
# if needs_to_be_padded:
|
||||
# difference = max_length - len(required_input)
|
||||
# if self.padding_side == "right":
|
||||
# if return_attention_mask:
|
||||
# encoded_inputs["attention_mask"] = [1] * len(required_input) + [
|
||||
# 0
|
||||
# ] * difference
|
||||
# if "token_type_ids" in encoded_inputs:
|
||||
# encoded_inputs["token_type_ids"] = (
|
||||
# encoded_inputs["token_type_ids"]
|
||||
# + [self.pad_token_type_id] * difference
|
||||
# )
|
||||
# if "special_tokens_mask" in encoded_inputs:
|
||||
# encoded_inputs["special_tokens_mask"] = (
|
||||
# encoded_inputs["special_tokens_mask"] + [1] * difference
|
||||
# )
|
||||
# encoded_inputs[self.model_input_names[0]] = (
|
||||
# required_input + [self.pad_token_id] * difference
|
||||
# )
|
||||
# elif self.padding_side == "left":
|
||||
# if return_attention_mask:
|
||||
# encoded_inputs["attention_mask"] = [0] * difference + [1] * len(
|
||||
# required_input
|
||||
# )
|
||||
# if "token_type_ids" in encoded_inputs:
|
||||
# encoded_inputs["token_type_ids"] = [
|
||||
# self.pad_token_type_id
|
||||
# ] * difference + encoded_inputs["token_type_ids"]
|
||||
# if "special_tokens_mask" in encoded_inputs:
|
||||
# encoded_inputs["special_tokens_mask"] = [
|
||||
# 1
|
||||
# ] * difference + encoded_inputs["special_tokens_mask"]
|
||||
# encoded_inputs[self.model_input_names[0]] = [
|
||||
# self.pad_token_id
|
||||
# ] * difference + required_input
|
||||
# else:
|
||||
# raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
# elif return_attention_mask and "attention_mask" not in encoded_inputs:
|
||||
# encoded_inputs["attention_mask"] = [1] * len(required_input)
|
||||
|
||||
# return encoded_inputs
|
||||
|
||||
|
||||
def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
|
||||
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
|
||||
# Tensorize if necessary.
|
||||
if isinstance(examples[0], (list, tuple)):
|
||||
examples = [torch.tensor(e, dtype=torch.long) for e in examples]
|
||||
|
||||
# Check if padding is necessary.
|
||||
length_of_first = examples[0].size(0)
|
||||
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
|
||||
if are_tensors_same_length and (
|
||||
pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0
|
||||
):
|
||||
return torch.stack(examples, dim=0)
|
||||
|
||||
# If yes, check if we have a `pad_token`.
|
||||
if tokenizer._pad_token is None:
|
||||
raise ValueError(
|
||||
"You are attempting to pad samples but the tokenizer you are using"
|
||||
f" ({tokenizer.__class__.__name__}) does not have a pad token."
|
||||
)
|
||||
|
||||
# Creating the full tensor and filling it with our data.
|
||||
max_length = max(x.size(0) for x in examples)
|
||||
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
|
||||
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
|
||||
result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
|
||||
for i, example in enumerate(examples):
|
||||
if tokenizer.padding_side == "right":
|
||||
result[i, : example.shape[0]] = example
|
||||
else:
|
||||
result[i, -example.shape[0] :] = example
|
||||
return result
|
||||
|
||||
|
||||
def to_py_obj(obj):
|
||||
if isinstance(obj, torch.Tensor):
|
||||
return obj.detach().cpu().tolist()
|
||||
elif isinstance(obj, np.ndarray):
|
||||
return obj.tolist()
|
||||
else:
|
||||
return obj
|
my_data_collator.py (new file, 272 lines)
@@ -0,0 +1,272 @@

from dataclasses import dataclass
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import numpy as np
import tokenizers
import torch
from transformers import BatchEncoding

EncodedInput = List[int]


@dataclass
class MyDataCollatorForPreTraining:
    # NOTE: `tokenizer` is a tokenizers.Tokenizer (not a transformers tokenizer),
    # so special token ids are looked up with `token_to_id`.
    tokenizer: tokenizers.Tokenizer
    mlm: bool = True
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        if self.mlm and self.tokenizer.token_to_id("[MASK]") is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]],
    ) -> Dict[str, torch.Tensor]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = pad(
                encoded_inputs=examples,
                return_tensors="pt",
                pad_to_multiple_of=self.pad_to_multiple_of,
            )
        else:
            batch = {
                "input_ids": _collate_batch(
                    examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of
                )
            }

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        return batch

    def mask_tokens(
        self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            # NOTE: this fallback expects a transformers-style `get_special_tokens_mask`;
            # in this project the mask is always provided by the preprocessing step.
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(
                    val, already_has_special_tokens=True
                )
                for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = (
            torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        )
        # inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(
        #     self.tokenizer.mask_token
        # )
        inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]")

        # 10% of the time, we replace masked input tokens with random word
        indices_random = (
            torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
            & masked_indices
            & ~indices_replaced
        )
        random_words = torch.randint(
            self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long
        )
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels


def pad(
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding=True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors=None,
    verbose: bool = True,
) -> BatchEncoding:
    """
    Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
    in the batch.
    Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
    ``self.pad_token_id`` and ``self.pad_token_type_id``)
    .. note::
        If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
        result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
        case of PyTorch tensors, you will lose the specific device of your tensors however.
    Args:
        encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
            Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
            List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
            List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
            well as in a PyTorch Dataloader collate function.
            Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
            see the note above for the return type.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding
            index) among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
            >= 7.5 (Volta).
        return_attention_mask (:obj:`bool`, `optional`):
            Whether to return the attention mask. If left to the default, will return the attention mask according
            to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
            `What are attention masks? <../glossary.html#attention-mask>`__
        return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
            If set, will return tensors instead of list of python integers. Acceptable values are:
            * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
            * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
            * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to print more information and warnings.
    """
    # If we have a list of dicts, let's convert it in a dict of lists
    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(
        encoded_inputs[0], (dict, BatchEncoding)
    ):
        encoded_inputs = {
            key: [example[key] for example in encoded_inputs]
            for key in encoded_inputs[0].keys()
        }

    required_input = encoded_inputs["input_ids"]

    if not required_input:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch

    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
        index = 0
        while len(required_input[index]) == 0:
            index += 1
        if index < len(required_input):
            first_element = required_input[index][0]
    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (int, list, tuple)):
        if isinstance(first_element, torch.Tensor):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                f"Should be one of a python, numpy, pytorch or tensorflow object."
            )

        for key, value in encoded_inputs.items():
            encoded_inputs[key] = to_py_obj(value)

    required_input = encoded_inputs["input_ids"]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
        len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionary have a different batch size than others."

    batch_outputs = {}
    for i in range(batch_size):
        inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
        for key, value in inputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)


def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # NOTE: this path is only reached for plain list/tensor examples; it expects
    # transformers-style pad attributes (`_pad_token`, `pad_token_id`, `padding_side`).
    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    # Check if padding is necessary.
    length_of_first = examples[0].size(0)
    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length and (
        pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0
    ):
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)
    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
            result[i, : example.shape[0]] = example
        else:
            result[i, -example.shape[0] :] = example
    return result


def to_py_obj(obj):
    if isinstance(obj, torch.Tensor):
        return obj.detach().cpu().tolist()
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj
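A minimal usage sketch of `MyDataCollatorForPreTraining` (the instruction strings below are illustrative, not part of this commit; the tokenizer file name is the one the training script loads):

    import tokenizers
    from my_data_collator import MyDataCollatorForPreTraining

    tokenizer = tokenizers.Tokenizer.from_file("tokenizer-inst.1.json")
    tokenizer.enable_padding(
        pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=32
    )

    collator = MyDataCollatorForPreTraining(tokenizer=tokenizer, mlm_probability=0.15)

    # each example mirrors what `tokenize_function` in my_run_mlm_no_trainer.py produces
    encodings = tokenizer.encode_batch([("mov eax , ebx", "push ebp")])
    examples = [
        {
            "input_ids": enc.ids,
            "token_type_ids": enc.type_ids,
            "special_tokens_mask": enc.special_tokens_mask,
            "next_sentence_label": 0,
        }
        for enc in encodings
    ]
    batch = collator(examples)
    # batch["input_ids"] now holds [MASK]-corrupted inputs, batch["labels"] the MLM targets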
my_run_mlm_no_trainer.py (new file, 421 lines)
@@ -0,0 +1,421 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2021 Alan Zhao. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# """
|
||||
# Pre-train the BERT on a dataset without using HuggingFace Trainer.
|
||||
# """
|
||||
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import tokenizers
|
||||
import torch
|
||||
import transformers
|
||||
from accelerate import Accelerator
|
||||
from datasets import load_dataset
|
||||
from torch.nn import DataParallel
|
||||
from torch.utils.data.dataloader import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_MAPPING,
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModelForMaskedLM,
|
||||
AutoTokenizer,
|
||||
BatchEncoding,
|
||||
BertConfig,
|
||||
BertForPreTraining,
|
||||
DataCollatorForLanguageModeling,
|
||||
SchedulerType,
|
||||
get_scheduler,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
from my_data_collator import MyDataCollatorForPreTraining
|
||||
from process_data.utils import CURRENT_DATA_BASE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Finetune a transformers model on a Masked Language Modeling task"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
type=int,
|
||||
default=16,
|
||||
help="Batch size (per device) for the training dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_eval_batch_size",
|
||||
type=int,
|
||||
default=64,
|
||||
help="Batch size (per device) for the evaluation dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=5e-5,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight_decay", type=float, default=0.0, help="Weight decay to use."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_train_epochs",
|
||||
type=int,
|
||||
default=40,
|
||||
help="Total number of training epochs to perform.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_train_steps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler_type",
|
||||
type=SchedulerType,
|
||||
default="linear",
|
||||
help="The scheduler type to use.",
|
||||
choices=[
|
||||
"linear",
|
||||
"cosine",
|
||||
"cosine_with_restarts",
|
||||
"polynomial",
|
||||
"constant",
|
||||
"constant_with_warmup",
|
||||
],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_warmup_steps",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of steps for the warmup in the lr scheduler.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=str, default=None, help="Where to store the final model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed", type=int, default=None, help="A seed for reproducible training."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preprocessing_num_workers",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The number of processes to use for the preprocessing.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mlm_probability",
|
||||
type=float,
|
||||
default=0.15,
|
||||
help="Ratio of tokens to mask for masked language modeling loss",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--eval_every_steps",
|
||||
type=int,
|
||||
default=5000,
|
||||
help="Number of steps before evaluating the model.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
accelerator = Accelerator()
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger.info(accelerator.state)
|
||||
|
||||
# Setup logging, we only want one process per machine to log things on the screen.
|
||||
# accelerator.is_local_main_process is only True for one process per machine.
|
||||
logger.setLevel(
|
||||
logging.INFO if accelerator.is_local_main_process else logging.ERROR
|
||||
)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
# If passed along, set the training seed now.
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
# we take control of loading the dataset ourselves
# there will be several json files for training
# `raw_datasets` has two features:
#     `text`: ["sentA", "sentB"]
#     `is_next`: 0 or 1
|
||||
# raw_datasets = load_dataset(
|
||||
# "json",
|
||||
# data_files={
|
||||
# "train": "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
|
||||
# "validation": "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
|
||||
# },
|
||||
# field="data",
|
||||
# )
|
||||
train_files = [
|
||||
os.path.join(CURRENT_DATA_BASE, "inst.1.{}.json".format(i)) for i in range(128)
|
||||
]
|
||||
valid_file = "/home/ming/malware/inst2vec_bert/data/test_lm/inst.json"
|
||||
raw_datasets = load_dataset(
|
||||
"json",
|
||||
data_files={"train": train_files, "validation": valid_file,},
|
||||
field="data",
|
||||
)
|
||||
|
||||
# we use the tokenizer previously trained on the dataset above
|
||||
tokenizer = tokenizers.Tokenizer.from_file(
|
||||
os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json")
|
||||
)
|
||||
|
||||
# NOTE: the `length` here must be consistent with the one used in `train_my_tokenizer.py`
|
||||
tokenizer.enable_padding(
|
||||
pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=32
|
||||
)
|
||||
|
||||
# NOTE: `max_position_embeddings` here should be consistent with `length` above
|
||||
# we use a much smaller BERT, config is:
|
||||
config = BertConfig(
|
||||
vocab_size=tokenizer.get_vocab_size(),
|
||||
hidden_size=96,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=384,
|
||||
max_position_embeddings=32,
|
||||
)
|
||||
|
||||
# initialize a new BERT for pre-training
|
||||
model = BertForPreTraining(config)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
column_names = raw_datasets["train"].column_names
|
||||
text_column_name = "text" if "text" in column_names else column_names[0]
|
||||
|
||||
# First we apply `tokenize_function` to the dataset.
|
||||
def tokenize_function(examples):
|
||||
text = [tuple(sent) for sent in examples["text"]]
|
||||
encoded_inputs = {}
|
||||
results = tokenizer.encode_batch(text)
|
||||
encoded_inputs["input_ids"] = [result.ids for result in results]
|
||||
encoded_inputs["token_type_ids"] = [result.type_ids for result in results]
|
||||
encoded_inputs["special_tokens_mask"] = [
|
||||
result.special_tokens_mask for result in results
|
||||
]
|
||||
# according to the HuggingFace docs for BertForPreTraining, `next_sentence_label` uses
# 0: sentence B really follows sentence A, 1: it does not,
# so we flip our `is_next` labels below
|
||||
encoded_inputs["next_sentence_label"] = [
|
||||
1 - label for label in examples["is_next"]
|
||||
]
|
||||
# use `np` rather than `pt` here, otherwise an error is reported
|
||||
batch_outputs = BatchEncoding(
|
||||
encoded_inputs, tensor_type="np", prepend_batch_axis=False,
|
||||
)
|
||||
return batch_outputs
|
||||
|
||||
tokenized_datasets = raw_datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
)
|
||||
|
||||
train_dataset = tokenized_datasets["train"]
|
||||
eval_dataset = tokenized_datasets["validation"]
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
|
||||
# Data collator
|
||||
# This one will take care of randomly masking the tokens.
|
||||
data_collator = MyDataCollatorForPreTraining(
|
||||
tokenizer=tokenizer, mlm_probability=args.mlm_probability
|
||||
)
|
||||
|
||||
# DataLoaders creation:
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
collate_fn=data_collator,
|
||||
batch_size=args.per_device_train_batch_size,
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
eval_dataset,
|
||||
collate_fn=data_collator,
|
||||
batch_size=args.per_device_eval_batch_size,
|
||||
)
|
||||
|
||||
# Optimizer
|
||||
# Split weights in two groups, one with weight decay and the other not.
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in model.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in model.named_parameters()
|
||||
if any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
||||
# model, optimizer, train_dataloader, eval_dataloader
|
||||
# )
|
||||
model, optimizer, train_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader
|
||||
)
|
||||
|
||||
model = DataParallel(model)
|
||||
# model.to("cuda:0")
|
||||
|
||||
# Note -> the training dataloader needs to be prepared before we grab its length below
# (because its length will be shorter in a multi-process setup)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
num_update_steps_per_epoch = math.ceil(
|
||||
len(train_dataloader) / args.gradient_accumulation_steps
|
||||
)
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
else:
|
||||
args.num_train_epochs = math.ceil(
|
||||
args.max_train_steps / num_update_steps_per_epoch
|
||||
)
|
||||
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
)
|
||||
|
||||
# Train!
|
||||
total_batch_size = (
|
||||
args.per_device_train_batch_size
|
||||
* accelerator.num_processes
|
||||
* args.gradient_accumulation_steps
|
||||
)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(
|
||||
f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
|
||||
)
|
||||
logger.info(
|
||||
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
|
||||
)
|
||||
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
logger.info(f" Evalute every {args.eval_every_steps} steps")
|
||||
# Only show the progress bar once on each machine.
|
||||
progress_bar = tqdm(
|
||||
range(args.max_train_steps), disable=not accelerator.is_local_main_process
|
||||
)
|
||||
completed_steps = 0
|
||||
|
||||
for epoch in range(args.num_train_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if (
|
||||
step % args.gradient_accumulation_steps == 0
|
||||
or step == len(train_dataloader) - 1
|
||||
):
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
if completed_steps >= args.max_train_steps:
|
||||
break
|
||||
|
||||
if completed_steps % args.eval_every_steps == 0:
|
||||
model.eval()
|
||||
losses = []
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
|
||||
loss = outputs.loss
|
||||
losses.append(
|
||||
accelerator.gather(loss.repeat(args.per_device_eval_batch_size))
|
||||
)
|
||||
|
||||
losses = torch.cat(losses)
|
||||
# losses = losses[: len(eval_dataset)]
|
||||
try:
|
||||
perplexity = math.exp(torch.mean(losses))
|
||||
except OverflowError:
|
||||
perplexity = float("inf")
|
||||
|
||||
logger.info(f"steps {completed_steps}: perplexity: {perplexity}")
|
||||
model.train()
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
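A sketch of how the training script above would be launched (the checkpoint directory is illustrative; all flags are defined in `parse_args`): `accelerate launch my_run_mlm_no_trainer.py --per_device_train_batch_size 16 --num_train_epochs 40 --output_dir ./asm_bert_ckpt --seed 42`.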
process_data/check_length.py (new file, 29 lines)
@@ -0,0 +1,29 @@

import os

from utils import ORIGINAL_DATA_BASE, read_file


def check(filename):
    sents = read_file(filename)
    result = 0
    for sent in sents:
        # drop the trailing newline so the word count covers the whole line
        result = max(result, len(sent[:-1].replace("\t", " ").split()))
    print("The longest sentence in {} has {} words".format(filename, result))
    return result


def main():
    longest = 0
    # for i in range(6):
    for i in [1]:
        for group in ("pos", "neg"):
            filename = os.path.join(
                ORIGINAL_DATA_BASE, "inst.{}.{}.txt".format(i, group)
            )
            longest = max(check(filename), longest)
    print("The longest sentence in all files has {} words.".format(longest))


if __name__ == "__main__":
    main()
process_data/convert_space_format.py (new file, 29 lines)
@@ -0,0 +1,29 @@

import os

from utils import ORIGINAL_DATA_BASE, read_file


def write_file(data, filename):
    print("Writing data into {}...".format(filename))
    with open(filename, "w", encoding="utf-8") as fout:
        for sent in data:
            fout.write(sent.replace("<space>", "SPACE"))


def convert(fin, fout):
    print("Start the replacement task for {}...".format(fin))
    # filename = "/home/ming/malware/data/elfasm_inst_pairs/linux32_00xxxx.all"
    sents = read_file(fin)
    write_file(sents, fout)


def main():
    # for i in range(6):
    for i in [1]:
        fin = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(i))
        fout = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.pos.txt".format(i))
        convert(fin, fout)


if __name__ == "__main__":
    main()
process_data/count_word_for_vocab.py (new file, 54 lines)
@@ -0,0 +1,54 @@

import os
from multiprocessing import Pool, Process, Queue

from tqdm import tqdm

from utils import ORIGINAL_DATA_BASE, read_file

q = Queue(128)
BASE = 4600000


def counter_worker(sents):
    cnt = set()
    for sent in tqdm(sents):
        cnt = cnt.union(set(sent[:-1].replace("\t", " ").split()))
    print("Process {} got {} words".format(os.getpid(), len(cnt)))
    q.put(cnt)
    return


def counter(filename):
    sents = read_file(filename)

    p = Pool(36)

    for i in range(64):
        p.apply_async(counter_worker, args=(sents[i * BASE : (i + 1) * BASE],))
    print("Waiting for all sub-processes done...")
    p.close()
    p.join()
    print("All subprocess done.")
    cnt = set()
    # for sent in tqdm(sents):
    #     cnt += set(sent[-1].replace("\t", " ").split())
    # collect the partial vocabularies produced by the worker processes
    for _ in tqdm(range(64)):
        cnt = cnt.union(q.get())
    print("There are {} distinct words in {}".format(len(cnt), filename))
    return cnt


def main():
    cnt = set()
    # for i in range(6):
    for i in [1]:
        for group in ["pos", "neg"]:
            filename = os.path.join(
                ORIGINAL_DATA_BASE, "inst.{}.{}.txt".format(i, group)
            )
            cnt |= counter(filename)  # set union ("+=" is not defined for sets)
    print("There are {} distinct words in all files".format(len(cnt)))


if __name__ == "__main__":
    main()
process_data/create_negative_examples.py (new file, 34 lines)
@@ -0,0 +1,34 @@

import os
from random import randint

from tqdm import tqdm

from utils import ORIGINAL_DATA_BASE, read_file


def create(pos, neg, tgt):
    pos_sents = read_file(pos)

    neg_sents = read_file(neg)
    neg_length = len(neg_sents)
    print("Start writing negative examples to {}...".format(tgt))
    with open(tgt, "w", encoding="utf-8") as fout:
        for sent in tqdm(pos_sents):
            first = sent.split("\t")[0]
            index = randint(0, neg_length - 1)
            pair = neg_sents[index].split("\t")[randint(0, 1)].replace("\n", "")
            fout.write(first + "\t" + pair + "\n")


def main():
    # for i in range(6):
    for i in [1]:
        j = (i + 1) % 6
        pos = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(i))
        neg = os.path.join(ORIGINAL_DATA_BASE, "linux32_0{}xxxx.all".format(j))
        tgt = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.neg.txt".format(i))
        create(pos, neg, tgt)


if __name__ == "__main__":
    main()
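A concrete illustration of the sampling above (instruction text is made up): if a line of the positive file is "mov eax , ebx\tpush ebp", the script keeps "mov eax , ebx", picks a random line from the next file, and takes one of its two instructions, producing a negative pair such as "mov eax , ebx\txor ecx , ecx".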
process_data/merge_examples_to_json.py (new file, 91 lines)
@@ -0,0 +1,91 @@

import gc
import json
import os
from multiprocessing import Pool, Process, Queue

from tqdm import tqdm

from utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file

BASE = 4600000


def write_worker(sents, json_file, index):
    examples = []
    for sent in tqdm(sents):
        tmp = sent[:-1].split("\t")
        examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])})
        examples[-1]["text"] = tuple(examples[-1]["text"])

    print("Writing to {}...".format(json_file + "{}.json".format(index)))
    results = {"data": examples}
    with open(json_file + "{}.json".format(index), "w") as f:
        json.dump(results, f)


def merge_to_json(pos, neg, json_file):
    sents = read_file(pos)

    p = Pool(36)

    for i in range(64):
        p.apply_async(
            write_worker, args=(sents[i * BASE : (i + 1) * BASE], json_file, i,)
        )
    print("Waiting for all sub-processes done...")
    p.close()
    p.join()
    print("All subprocess done.")

    # length = len(sents)
    # base = length // 20000000 + 1

    # for i in tqdm(range(length)):
    #     examples = []
    #     tmp = sents[i][:-1].split("\t")
    #     examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])})
    #     examples[i]["text"] = tuple(examples[i]["text"])
    #     index = i // 20000000
    #     print("Writing to {}...".format(json_file + "{}.json".format(index)))
    #     with open(json_file + "{}.json".format(index), "w") as f:
    #         json.dump(examples, f)

    del sents
    gc.collect()

    sents = read_file(neg)

    p = Pool(8)

    for i in range(64):
        p.apply_async(
            write_worker, args=(sents[i * BASE : (i + 1) * BASE], json_file, 64 + i,)
        )
    print("Waiting for all sub-processes done...")
    p.close()
    p.join()
    print("All subprocess done.")

    # length = len(sents)
    # for i in tqdm(range(length)):
    #     examples = []
    #     tmp = sents[i][:-1].split("\t")
    #     examples.append({"text": tuple(tmp[1:]), "is_next": int(tmp[0])})
    #     examples[i]["text"] = tuple(examples[i]["text"])
    #     index = i // 20000000
    #     print("Writing to {}...".format(json_file + "{}.json".format(base + index)))
    #     with open(json_file + "{}.json".format(base + index), "w") as f:
    #         json.dump(examples, f)


def main():
    # for i in range(6):
    for i in [1]:
        pos = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.pos.label.txt".format(i))
        neg = os.path.join(ORIGINAL_DATA_BASE, "inst.{}.neg.label.txt".format(i))
        json_file = os.path.join(CURRENT_DATA_BASE, "inst.{}.".format(i))
        merge_to_json(pos, neg, json_file)


if __name__ == "__main__":
    main()
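To make the JSON layout concrete, this is what `write_worker` does to one labelled line (the instructions are illustrative):

    # input line format: "<is_next label>\t<sentA>\t<sentB>"
    line = "1\tmov eax , ebx\tpush ebp\n"
    tmp = line[:-1].split("\t")
    record = {"text": tuple(tmp[1:]), "is_next": int(tmp[0])}
    # record == {"text": ("mov eax , ebx", "push ebp"), "is_next": 1}
    # write_worker collects such records and dumps {"data": [...]} into inst.<i>.<chunk>.json,
    # which is the layout `load_dataset("json", ..., field="data")` expects in the training script.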
process_data/readme.md (new file, 22 lines)
@@ -0,0 +1,22 @@

# Pre-processing steps

### 1. run `convert_space_format.py`

Convert the string `<space>` to `SPACE`.

### 2. run `create_negative_examples.py`

We use the file that follows the current file as the source of its negative examples, which is a reasonable choice.

Specifically, for each instruction pair in the current positive file, we keep its first instruction, randomly choose a line in the next file, and select one of the two instructions on that line as the negative second sentence.

### 3. run `merge_examples_to_json.py`

We dump the positive and negative examples with their corresponding labels into several json files; each json file holds one chunk of examples (4,600,000 lines per chunk in the current scripts).

### 4. run `check_length.py`

We will specify the length to pad to when we use the tokenizer, `tokenizer.enable_padding(..., length=)`.

So we need to know the longest sentence in the dataset.

### 5. run `count_word_for_vocab.py`

Similarly, we also need to specify the size of the vocabulary when we train the tokenizer, `WordLevelTrainer(vocab_size=, ...)`.

So we need to know how many distinct words there are in the dataset.
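`train_my_tokenizer.py` (referenced by the training script but not part of this commit) is expected to produce the tokenizer used in steps 4 and 5 above; a minimal sketch of such a script, with illustrative values for the vocabulary size, padded length and file paths:

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordLevelTrainer

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(
        vocab_size=60000,  # illustrative; use the count reported by count_word_for_vocab.py
        special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
    )
    tokenizer.train(files=["inst.1.pos.txt", "inst.1.neg.txt"], trainer=trainer)  # illustrative paths

    # BERT-style pair encoding so that encode_batch yields [CLS]/[SEP] and token type ids
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )

    # the padded length should cover the longest sentence found by check_length.py and
    # must not exceed `max_position_embeddings` in the training script (32 there)
    tokenizer.enable_padding(
        pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=32
    )
    tokenizer.save("tokenizer-inst.1.json")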
process_data/utils.py (new file, 11 lines)
@@ -0,0 +1,11 @@

import os

ORIGINAL_DATA_BASE = "/home/ming/malware/data/elfasm_inst_pairs"
CURRENT_DATA_BASE = "/home/ming/malware/inst2vec_bert/data/asm_bert"


def read_file(filename):
    print("Reading data from {}...".format(filename))
    with open(filename, "r", encoding="utf-8") as fin:
        return fin.readlines()
run_mlm_no_trainer.py (new file, 672 lines)
@@ -0,0 +1,672 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# """
|
||||
# Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
|
||||
# on a text file or a dataset without using HuggingFace Trainer.
|
||||
# Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
|
||||
# https://huggingface.co/models?filter=masked-lm
|
||||
# """
|
||||
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import tokenizers
|
||||
import torch
|
||||
import transformers
|
||||
from accelerate import Accelerator
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data.dataloader import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_MAPPING,
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModelForMaskedLM,
|
||||
AutoTokenizer,
|
||||
BatchEncoding,
|
||||
BertConfig,
|
||||
BertForPreTraining,
|
||||
DataCollatorForLanguageModeling,
|
||||
SchedulerType,
|
||||
get_scheduler,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Finetune a transformers model on a Masked Language Modeling task"
|
||||
)
|
||||
# parser.add_argument(
|
||||
# "--dataset_name",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="The name of the dataset to use (via the datasets library).",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--dataset_config_name",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="The configuration name of the dataset to use (via the datasets library).",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--train_file",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="A csv or a json file containing the training data.",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--validation_file",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="A csv or a json file containing the validation data.",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--validation_split_percentage",
|
||||
# default=5,
|
||||
# help="The percentage of the train set used as validation set in case there's no validation split",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--pad_to_max_length",
|
||||
# action="store_true",
|
||||
# help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--model_name_or_path",
|
||||
# type=str,
|
||||
# help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
# required=True,
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--config_name",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="Pretrained config name or path if not the same as model_name",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--tokenizer_name",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--use_slow_tokenizer",
|
||||
# action="store_true",
|
||||
# help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
# )
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size (per device) for the training dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_eval_batch_size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size (per device) for the evaluation dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=5e-5,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight_decay", type=float, default=0.0, help="Weight decay to use."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_train_epochs",
|
||||
type=int,
|
||||
default=40,
|
||||
help="Total number of training epochs to perform.",
|
||||
)
|
||||
# these two arguments are referenced later in main(), so they are kept enabled here
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler_type",
|
||||
type=SchedulerType,
|
||||
default="linear",
|
||||
help="The scheduler type to use.",
|
||||
choices=[
|
||||
"linear",
|
||||
"cosine",
|
||||
"cosine_with_restarts",
|
||||
"polynomial",
|
||||
"constant",
|
||||
"constant_with_warmup",
|
||||
],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_warmup_steps",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of steps for the warmup in the lr scheduler.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=str, default=None, help="Where to store the final model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed", type=int, default=None, help="A seed for reproducible training."
|
||||
)
|
||||
# parser.add_argument(
|
||||
# "--model_type",
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help="Model type to use if training from scratch.",
|
||||
# choices=MODEL_TYPES,
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--max_seq_length",
|
||||
# type=int,
|
||||
# default=None,
|
||||
# help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated.",
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--line_by_line",
|
||||
# type=bool,
|
||||
# default=False,
|
||||
# help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
|
||||
# )
|
||||
parser.add_argument(
|
||||
"--preprocessing_num_workers",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The number of processes to use for the preprocessing.",
|
||||
)
|
||||
# parser.add_argument(
|
||||
# "--overwrite_cache",
|
||||
# type=bool,
|
||||
# default=False,
|
||||
# help="Overwrite the cached training and evaluation sets",
|
||||
# )
|
||||
parser.add_argument(
|
||||
"--mlm_probability",
|
||||
type=float,
|
||||
default=0.15,
|
||||
help="Ratio of tokens to mask for masked language modeling loss",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Sanity checks
|
||||
# if (
|
||||
# args.dataset_name is None
|
||||
# and args.train_file is None
|
||||
# and args.validation_file is None
|
||||
# ):
|
||||
# raise ValueError("Need either a dataset name or a training/validation file.")
|
||||
# else:
|
||||
# if args.train_file is not None:
|
||||
# extension = args.train_file.split(".")[-1]
|
||||
# assert extension in [
|
||||
# "csv",
|
||||
# "json",
|
||||
# "txt",
|
||||
# ], "`train_file` should be a csv, json or txt file."
|
||||
# if args.validation_file is not None:
|
||||
# extension = args.validation_file.split(".")[-1]
|
||||
# assert extension in [
|
||||
# "csv",
|
||||
# "json",
|
||||
# "txt",
|
||||
# ], "`validation_file` should be a csv, json or txt file."
|
||||
|
||||
if args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
accelerator = Accelerator()
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger.info(accelerator.state)
|
||||
|
||||
# Setup logging, we only want one process per machine to log things on the screen.
|
||||
# accelerator.is_local_main_process is only True for one process per machine.
|
||||
logger.setLevel(
|
||||
logging.INFO if accelerator.is_local_main_process else logging.ERROR
|
||||
)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
# If passed along, set the training seed now.
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
# # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
|
||||
# # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
|
||||
# # (the dataset will be downloaded automatically from the datasets Hub).
|
||||
# #
|
||||
# # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
||||
# # 'text' is found. You can easily tweak this behavior (see below).
|
||||
# #
|
||||
# # In distributed training, the load_dataset function guarantee that only one local process can concurrently
|
||||
# # download the dataset.
|
||||
# if args.dataset_name is not None:
|
||||
# # Downloading and loading a dataset from the hub.
|
||||
# raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
# if "validation" not in raw_datasets.keys():
|
||||
# raw_datasets["validation"] = load_dataset(
|
||||
# args.dataset_name,
|
||||
# args.dataset_config_name,
|
||||
# split=f"train[:{args.validation_split_percentage}%]",
|
||||
# )
|
||||
# raw_datasets["train"] = load_dataset(
|
||||
# args.dataset_name,
|
||||
# args.dataset_config_name,
|
||||
# split=f"train[{args.validation_split_percentage}%:]",
|
||||
# )
|
||||
# else:
|
||||
# data_files = {}
|
||||
# if args.train_file is not None:
|
||||
# data_files["train"] = args.train_file
|
||||
# if args.validation_file is not None:
|
||||
# data_files["validation"] = args.validation_file
|
||||
# extension = args.train_file.split(".")[-1]
|
||||
# if extension == "txt":
|
||||
# extension = "text"
|
||||
# raw_datasets = load_dataset(extension, data_files=data_files)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
|
||||
# we use dataset in json format,
|
||||
# `raw_dataset` has two features: `text`:"sentA\tsentB" and `is_next`:0 or 1
|
||||
|
||||
# we take control of loading the dataset ourselves
# there will be several json files for training
|
||||
# raw_datasets = load_dataset("json", data_files=args.train_file, field="data")
|
||||
raw_datasets = load_dataset(
|
||||
"json",
|
||||
data_files="/home/ming/malware/inst2vec_bert/data/test_lm/inst.json",
|
||||
field="data",
|
||||
)
|
||||
|
||||
# we use the tokenizer trained on the positive dataset before
|
||||
# tokenizer = tokenizers.Tokenizer.from_file(args.tokenizer_file)
|
||||
tokenizer = tokenizers.Tokenizer.from_file(
|
||||
"/home/ming/malware/inst2vec_bert/bert/tokenizer-inst.json"
|
||||
)
|
||||
tokenizer.enable_padding(
|
||||
pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=64
|
||||
)
|
||||
|
||||
# we use a much smaller BERT, config is:
|
||||
config = BertConfig(
|
||||
vocab_size=tokenizer.get_vocab_size(),
|
||||
hidden_size=64,
|
||||
num_hidden_layers=8,
|
||||
num_attention_heads=8,
|
||||
intermediate_size=256,
|
||||
max_position_embeddings=64,
|
||||
)
|
||||
# initialize the new BERT for pre-training
|
||||
model = BertForPreTraining(config)
|
||||
|
||||
# all_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
|
||||
# all_special_ids = [tokenizer.token_to_id(token) for token in all_special_tokens]
|
||||
|
||||
# # Load pretrained model and tokenizer
|
||||
# #
|
||||
# # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# # download model & vocab.
|
||||
# if args.config_name:
|
||||
# config = AutoConfig.from_pretrained(args.config_name)
|
||||
# elif args.model_name_or_path:
|
||||
# config = AutoConfig.from_pretrained(args.model_name_or_path)
|
||||
# else:
|
||||
# config = CONFIG_MAPPING[args.model_type]()
|
||||
# logger.warning("You are instantiating a new config instance from scratch.")
|
||||
|
||||
# if args.tokenizer_name:
|
||||
# tokenizer = AutoTokenizer.from_pretrained(
|
||||
# args.tokenizer_name, use_fast=not args.use_slow_tokenizer
|
||||
# )
|
||||
# elif args.model_name_or_path:
|
||||
# tokenizer = AutoTokenizer.from_pretrained(
|
||||
# args.model_name_or_path, use_fast=not args.use_slow_tokenizer
|
||||
# )
|
||||
# else:
|
||||
# raise ValueError(
|
||||
# "You are instantiating a new tokenizer from scratch. This is not supported by this script."
|
||||
# "You can do it from another script, save it, and load it from here, using --tokenizer_name."
|
||||
# )
|
||||
|
||||
# if args.model_name_or_path:
|
||||
# model = AutoModelForMaskedLM.from_pretrained(
|
||||
# args.model_name_or_path,
|
||||
# from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
# config=config,
|
||||
# )
|
||||
# else:
|
||||
# logger.info("Training new model from scratch")
|
||||
# model = AutoModelForMaskedLM.from_config(config)
|
||||
|
||||
# model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# First we tokenize all the texts.
|
||||
column_names = raw_datasets["train"].column_names
|
||||
text_column_name = "text" if "text" in column_names else column_names[0]
|
||||
|
||||
# if args.max_seq_length is None:
|
||||
# max_seq_length = tokenizer.model_max_length
|
||||
# if max_seq_length > 1024:
|
||||
# logger.warning(
|
||||
# f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
|
||||
# "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
|
||||
# )
|
||||
# max_seq_length = 1024
|
||||
# else:
|
||||
# if args.max_seq_length > tokenizer.model_max_length:
|
||||
# logger.warning(
|
||||
# f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
|
||||
# f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
|
||||
# )
|
||||
# max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
|
||||
|
||||
# if args.line_by_line:
|
||||
# # When using line_by_line, we just tokenize each nonempty line.
|
||||
# padding = "max_length" if args.pad_to_max_length else False
|
||||
|
||||
# def tokenize_function(examples):
|
||||
# # Remove empty lines
|
||||
# examples["text"] = [
|
||||
# line
|
||||
# for line in examples["text"]
|
||||
# if len(line) > 0 and not line.isspace()
|
||||
# ]
|
||||
# return tokenizer(
|
||||
# examples["text"],
|
||||
# padding=padding,
|
||||
# truncation=True,
|
||||
# max_length=max_seq_length,
|
||||
# # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
|
||||
# # receives the `special_tokens_mask`.
|
||||
# return_special_tokens_mask=True,
|
||||
# )
|
||||
|
||||
# tokenized_datasets = raw_datasets.map(
|
||||
# tokenize_function,
|
||||
# batched=True,
|
||||
# num_proc=args.preprocessing_num_workers,
|
||||
# remove_columns=[text_column_name],
|
||||
# load_from_cache_file=not args.overwrite_cache,
|
||||
# )
|
||||
# else:
|
||||
# # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||
# # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
||||
# # efficient when it receives the `special_tokens_mask`.
|
||||
# def tokenize_function(examples):
|
||||
# return tokenizer(
|
||||
# examples[text_column_name], return_special_tokens_mask=True
|
||||
# )
|
||||
|
||||
# tokenized_datasets = raw_datasets.map(
|
||||
# tokenize_function,
|
||||
# batched=True,
|
||||
# num_proc=args.preprocessing_num_workers,
|
||||
# remove_columns=column_names,
|
||||
# load_from_cache_file=not args.overwrite_cache,
|
||||
# )
|
||||
|
||||
# # Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||
# # max_seq_length.
|
||||
# def group_texts(examples):
|
||||
# # Concatenate all texts.
|
||||
# concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
# total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# # customize this part to your needs.
|
||||
# total_length = (total_length // max_seq_length) * max_seq_length
|
||||
# # Split by chunks of max_len.
|
||||
# result = {
|
||||
# k: [
|
||||
# t[i : i + max_seq_length]
|
||||
# for i in range(0, total_length, max_seq_length)
|
||||
# ]
|
||||
# for k, t in concatenated_examples.items()
|
||||
# }
|
||||
# return result
|
||||
|
||||
# # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
|
||||
# # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
|
||||
# # might be slower to preprocess.
|
||||
# #
|
||||
# # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||
# # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||
|
||||
# tokenized_datasets = tokenized_datasets.map(
|
||||
# group_texts,
|
||||
# batched=True,
|
||||
# num_proc=args.preprocessing_num_workers,
|
||||
# load_from_cache_file=not args.overwrite_cache,
|
||||
# )
|
||||
|
||||
def tokenize_function(examples):
|
||||
text = [tuple(sent) for sent in examples["text"]]
|
||||
encoded_inputs = {}
|
||||
results = tokenizer.encode_batch(text)
|
||||
# input_ids, type_ids, special_token_masks = [], [], []
|
||||
# for i, result in enumerate(results):
|
||||
# input_ids.append(result.ids)
|
||||
# type_ids.append(result.type_ids)
|
||||
# special_token_masks.append(result.special_tokens_mask)
|
||||
encoded_inputs["input_ids"] = [result.ids for result in results]
|
||||
encoded_inputs["token_type_ids"] = [result.type_ids for result in results]
|
||||
# special_tokens_mask = [1 if token in all_special_ids else 0 for token in ids]
|
||||
encoded_inputs["special_tokens_mask"] = [
|
||||
result.special_tokens_mask for result in results
|
||||
]
|
||||
# next_sentence_label: 0 = the second instruction is the next one, 1 = it is not
encoded_inputs["next_sentence_label"] = [
1 - label for label in examples["is_next"]
]
|
||||
batch_outputs = BatchEncoding(
|
||||
encoded_inputs, tensor_type="np", prepend_batch_axis=False,
|
||||
)
|
||||
return batch_outputs
|
||||
|
||||
tokenized_datasets = raw_datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
# load_from_cache_file=not args.overwrite_cache,
|
||||
)
|
||||
|
||||
train_dataset = tokenized_datasets["train"]
|
||||
eval_dataset = tokenized_datasets["validation"]
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
|
||||
# Data collator
|
||||
# This one will take care of randomly masking the tokens.
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer, mlm_probability=args.mlm_probability
|
||||
)
|
||||
|
||||
# DataLoaders creation:
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
collate_fn=data_collator,
|
||||
batch_size=args.per_device_train_batch_size,
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
eval_dataset,
|
||||
collate_fn=data_collator,
|
||||
batch_size=args.per_device_eval_batch_size,
|
||||
)
|
||||
|
||||
# Optimizer
|
||||
# Split weights in two groups, one with weight decay and the other not.
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in model.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in model.named_parameters()
|
||||
if any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader
|
||||
)
|
||||
|
||||
# Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
# shorter in multiprocess)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
num_update_steps_per_epoch = math.ceil(
|
||||
len(train_dataloader) / args.gradient_accumulation_steps
|
||||
)
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
else:
|
||||
args.num_train_epochs = math.ceil(
|
||||
args.max_train_steps / num_update_steps_per_epoch
|
||||
)
|
||||
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
)
|
||||
|
||||
# Train!
|
||||
total_batch_size = (
|
||||
args.per_device_train_batch_size
|
||||
* accelerator.num_processes
|
||||
* args.gradient_accumulation_steps
|
||||
)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(
|
||||
f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
|
||||
)
|
||||
logger.info(
|
||||
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
|
||||
)
|
||||
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
# Only show the progress bar once on each machine.
|
||||
progress_bar = tqdm(
|
||||
range(args.max_train_steps), disable=not accelerator.is_local_main_process
|
||||
)
|
||||
completed_steps = 0
|
||||
|
||||
for epoch in range(args.num_train_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if (
|
||||
step % args.gradient_accumulation_steps == 0
|
||||
or step == len(train_dataloader) - 1
|
||||
):
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
if completed_steps >= args.max_train_steps:
|
||||
break
|
||||
|
||||
model.eval()
|
||||
losses = []
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
|
||||
loss = outputs.loss
|
||||
losses.append(
|
||||
accelerator.gather(loss.repeat(args.per_device_eval_batch_size))
|
||||
)
|
||||
|
||||
losses = torch.cat(losses)
|
||||
losses = losses[: len(eval_dataset)]
|
||||
try:
|
||||
perplexity = math.exp(torch.mean(losses))
|
||||
except OverflowError:
|
||||
perplexity = float("inf")
|
||||
|
||||
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
124
train_my_tokenizer.py
Normal file
@@ -0,0 +1,124 @@
import argparse
import os
from itertools import chain

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer

from process_data.utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file

BASE_PATH = "/home/ming/malware/inst2vec_bert/bert/"


def parse_args():
    parser = argparse.ArgumentParser(
        description="Train a word level tokenizer for ASM_BERT"
    )
    parser.add_argument(
        "--vocab_size",
        type=int,
        default=2000,
        help="The size of vocabulary used to train the tokenizer.",
    )
    parser.add_argument(
        "--padding_length",
        type=int,
        default=32,
        help="The length will be padded to by the tokenizer.",
    )
    args = parser.parse_args()

    return args


def train_tokenizer(args, dataset):
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(
        vocab_size=args.vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )

    # def batch_iterator(batch_size=1000):
    #     for i in range(0, len(dataset), batch_size):
    #         yield dataset[i : i + batch_size]["text"]

    # tokenizer.train_from_iterator(
    #     batch_iterator(), trainer=trainer, length=len(dataset)
    # )

    tokenizer.train_from_iterator(dataset, trainer)

    return tokenizer


def save_tokenizer(tokenizer, tokenizer_file):
    tokenizer.save(tokenizer_file)


def load_tokenizer(tokenizer_file):
    if not os.path.exists(tokenizer_file):
        print("{} doesn't exist, will be retrained...".format(tokenizer_file))
        return None
    print("The tokenizer has already been trained.")
    return Tokenizer.from_file(tokenizer_file)


def post_process(tokenizer):
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer


def tokenizer_encode(tokenizer, data):
    return tokenizer.encode_batch(data)


def main(tokenizer_file=""):
    args = parse_args()

    tokenizer = load_tokenizer(tokenizer_file)

    if tokenizer is not None:
        return

    # json_files = [
    #     os.path.join(CURRENT_DATA_BASE, "inst.1.{}.json".format(i)) for i in range(128)
    # ]
    # dataset = load_dataset("json", data_files=json_files, field="data")

    text_files = [
        os.path.join(ORIGINAL_DATA_BASE, "inst.1.{}.txt".format(group))
        for group in ["pos", "neg"]
    ]

    dataset = []
    for f in text_files:
        dataset += read_file(f)

    dataset = [tuple(sent[:-1].split("\t")) for sent in dataset]

    print("Training tokenizer...")
    tokenizer = train_tokenizer(args, chain.from_iterable(dataset))
    tokenizer = post_process(tokenizer)
    tokenizer.enable_padding(
        pad_id=tokenizer.token_to_id("[PAD]"),
        pad_token="[PAD]",
        length=args.padding_length,
    )
    save_tokenizer(tokenizer, tokenizer_file)


if __name__ == "__main__":
    main(os.path.join(CURRENT_DATA_BASE, "tokenizer-inst.1.json"))
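Not part of the commit, just a hedged usage sketch: the saved tokenizer can be loaded and applied to an instruction pair the same way `run_mlm_no_trainer.py` does (the path below is a placeholder):

```python
from tokenizers import Tokenizer

# Load the tokenizer written by save_tokenizer() and encode one pair.
# The padding settings saved with it should pad the output to the
# configured length.
tok = Tokenizer.from_file("tokenizer-inst.1.json")
enc = tok.encode("mov eax , ebx", "push eax")
print(enc.tokens)     # e.g. ['[CLS]', 'mov', ..., '[SEP]', ..., '[SEP]', '[PAD]', ...]
print(enc.ids)
print(enc.type_ids)   # 0 for the first instruction, 1 for the second
```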