Complete version
This commit is contained in:
parent 9379fc80d6
commit adb6acc3cf
24  bert/bert.json  Normal file
@@ -0,0 +1,24 @@
{
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 16,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 50,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 2000
}
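As a quick sanity check, this config loads with the standard `transformers` API; a minimal sketch (the prints are illustrative only):

    from transformers import BertConfig, BertForPreTraining

    config = BertConfig.from_json_file("bert/bert.json")
    model = BertForPreTraining(config)
    # A deliberately tiny BERT: 4 layers, hidden size 16, 8 heads, 50-position contexts
    print(config.num_hidden_layers, config.hidden_size, config.num_attention_heads)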
269  bert/my_data_collator.py  Normal file
@@ -0,0 +1,269 @@
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional,
                    Sequence, Tuple, Union)

import numpy as np
import tokenizers
import torch
from transformers import BatchEncoding

EncodedInput = List[int]


@dataclass
class MyDataCollatorForPreTraining:
    tokenizer: tokenizers.Tokenizer
    mlm: bool = True
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        if self.mlm and self.tokenizer.token_to_id("[MASK]") is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]],
    ) -> Dict[str, torch.Tensor]:
        # Handle dicts or lists with proper padding and conversion to tensors.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = pad(
                encoded_inputs=examples,
                return_tensors="pt",
                pad_to_multiple_of=self.pad_to_multiple_of,
            )
        else:
            batch = {
                "input_ids": _collate_batch(
                    examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of
                )
            }

        # If the special tokens mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            batch["input_ids"] = torch.squeeze(batch["input_ids"], dim=0)
            batch["token_type_ids"] = torch.squeeze(batch["token_type_ids"], dim=0)
        return batch

    def mask_tokens(
        self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`).
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(
                    val, already_has_special_tokens=True
                )
                for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens.

        # 80% of the time, we replace masked input tokens with the [MASK] token.
        indices_replaced = (
            torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        )
        inputs[indices_replaced] = self.tokenizer.token_to_id("[MASK]")

        # 10% of the time, we replace masked input tokens with a random word
        # (half of the remaining 20%, hence the 0.5 below).
        indices_random = (
            torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
            & masked_indices
            & ~indices_replaced
        )
        random_words = torch.randint(
            self.tokenizer.get_vocab_size(), labels.shape, dtype=torch.long
        )
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged.
        return inputs, labels


def pad(
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding=True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors=None,
    verbose: bool = True,
) -> BatchEncoding:
    """
    Pad a single encoded input or a batch of encoded inputs up to a predefined length or to the max
    sequence length in the batch.

    Padding side (left/right) and padding token ids are defined at the tokenizer level (with
    ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``).

    .. note::
        If the ``encoded_inputs`` passed are dictionaries of numpy arrays, PyTorch tensors or TensorFlow
        tensors, the result will use the same type unless you provide a different tensor type with
        ``return_tensors``. In the case of PyTorch tensors, you will however lose the specific device of
        your tensors.

    Args:
        encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
            Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
            :obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
            :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[List[int]]]` or
            :obj:`List[Dict[str, List[int]]]`) so you can use this method during preprocessing as well as
            in a PyTorch Dataloader collate function.
            Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
            tensors), see the note above for the return type.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and
            padding index) among:

            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length`
              or to the maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with
              sequences of different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute
            capability >= 7.5 (Volta).
        return_attention_mask (:obj:`bool`, `optional`):
            Whether to return the attention mask. If left to the default, will return the attention mask
            according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

            `What are attention masks? <../glossary.html#attention-mask>`__
        return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
            If set, will return tensors instead of lists of python integers. Acceptable values are:

            * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
            * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
            * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to print more information and warnings.
    """
    # If we have a list of dicts, let's convert it to a dict of lists.
    # We do this to allow using this method as a collate_fn function in a PyTorch Dataloader.
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(
        encoded_inputs[0], (dict, BatchEncoding)
    ):
        encoded_inputs = {
            key: [example[key] for example in encoded_inputs]
            for key in encoded_inputs[0].keys()
        }

    required_input = encoded_inputs["input_ids"]

    if not required_input:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified.
    # Note that we lose the specific device the tensor may be on for PyTorch.

    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases so we grab the first non-empty element.
        index = 0
        while len(required_input[index]) == 0:
            index += 1
        if index < len(required_input):
            first_element = required_input[index][0]
    # At this point, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (int, list, tuple)):
        if isinstance(first_element, torch.Tensor):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                f"Should be one of a python, numpy, pytorch or tensorflow object."
            )

        for key, value in encoded_inputs.items():
            encoded_inputs[key] = to_py_obj(value)

    required_input = encoded_inputs["input_ids"]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
        len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionary have a different batch size than others."

    batch_outputs = {}
    for i in range(batch_size):
        inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
        for key, value in inputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)


def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    # Check if padding is necessary.
    length_of_first = examples[0].size(0)
    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length and (
        pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0
    ):
        return torch.stack(examples, dim=0)

    # If padding is needed, check that we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Create the full tensor and fill it with our data.
    max_length = max(x.size(0) for x in examples)
    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
            result[i, : example.shape[0]] = example
        else:
            result[i, -example.shape[0] :] = example
    return result


def to_py_obj(obj):
    if isinstance(obj, torch.Tensor):
        return obj.detach().cpu().tolist()
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj
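A minimal usage sketch of this collator, assuming a `tokenizers` tokenizer that defines `[PAD]` and `[MASK]` (the `tokenizer.json` path is hypothetical, not part of this commit):

    import tokenizers
    from my_data_collator import MyDataCollatorForPreTraining

    tokenizer = tokenizers.Tokenizer.from_file("tokenizer.json")  # hypothetical path
    tokenizer.enable_padding(
        pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=50
    )

    collator = MyDataCollatorForPreTraining(tokenizer=tokenizer, mlm=True)
    enc = tokenizer.encode("mov eax 1")
    batch = collator([{
        "input_ids": [enc.ids],
        "token_type_ids": [enc.type_ids],
        "special_tokens_mask": [enc.special_tokens_mask],
    }])
    # batch["input_ids"] now has ~15% of non-special positions masked;
    # batch["labels"] is -100 everywhere except the masked positions.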
111  bert/obtain_inst_vec.py  Normal file
@@ -0,0 +1,111 @@
import os

import numpy as np
import tokenizers
import torch

from transformers import (
    BatchEncoding,
    BertConfig,
    BertForPreTraining
)

from .my_data_collator import MyDataCollatorForPreTraining

model_file = os.path.join("./bert/pytorch_model.bin")
tokenizer_file = os.path.join("./bert/tokenizer-inst.all.json")
config_file = os.path.join('./bert/bert.json')

# When running this module directly, use the local paths instead:
# from my_data_collator import MyDataCollatorForPreTraining
# model_file = os.path.join("./pytorch_model.bin")
# tokenizer_file = os.path.join("./tokenizer-inst.all.json")
# config_file = os.path.join('./bert.json')


def load_model():
    config = BertConfig.from_json_file(config_file)
    model = BertForPreTraining(config)
    state_dict = torch.load(model_file)
    model.load_state_dict(state_dict)
    model.eval()

    tokenizer = tokenizers.Tokenizer.from_file(tokenizer_file)
    tokenizer.enable_padding(
        pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=50
    )
    return model, tokenizer


def process_input(inst, tokenizer):
    encoded_input = {}
    if isinstance(inst, str):
        # Build a batch by repeating the single instruction.
        inst = [inst for _ in range(8)]
    results = tokenizer.encode_batch(inst)
    encoded_input["input_ids"] = [result.ids for result in results]
    encoded_input["token_type_ids"] = [result.type_ids for result in results]
    encoded_input["special_tokens_mask"] = [
        result.special_tokens_mask for result in results
    ]

    # Use `np` rather than `pt` here to avoid tensor-conversion errors.
    batch_output = BatchEncoding(
        encoded_input, tensor_type="np", prepend_batch_axis=False,
    )

    # NOTE: the "special_tokens_mask" trick below only works if each input is a single instruction.
    length_mask = 1 - batch_output["special_tokens_mask"]

    data_collator = MyDataCollatorForPreTraining(tokenizer=tokenizer, mlm=False)

    model_input = data_collator([batch_output])

    return model_input, length_mask


def generate_inst_vec(inst, method="mean"):
    model, tokenizer = load_model()

    model_input, length_mask = process_input(inst, tokenizer)
    length_mask = torch.from_numpy(length_mask).to(model_input["input_ids"].device)

    output = model(**model_input, output_hidden_states=True)

    if method == "cls":
        if isinstance(inst, str):
            return output.hidden_states[-1][0][0]
        elif isinstance(inst, list):
            # hidden_states is a tuple of per-layer tensors, so index the last layer first.
            return output.hidden_states[-1][:, 0, :]
    elif method == "mean":
        # Zero out special-token positions before pooling.
        result = output.hidden_states[-1] * torch.unsqueeze(length_mask, dim=-1)
        if isinstance(inst, str):
            result = torch.mean(result[0], dim=0)
        elif isinstance(inst, list):
            result = torch.mean(result, dim=1)
        return result
    elif method == "max":
        result = output.hidden_states[-1] * torch.unsqueeze(length_mask, dim=-1)
        if isinstance(inst, str):
            # torch.max returns a (values, indices) pair along a dim.
            result = torch.max(result[0], dim=0).values
        elif isinstance(inst, list):
            result = torch.max(result, dim=1).values
        return result


def bb2vec(inst):
    tmp = generate_inst_vec(inst, method="mean")
    return list(np.mean(tmp.detach().numpy(), axis=0))


if __name__ == "__main__":
    temp = bb2vec(['adc byte [ ebp - 0x74 ] cl', 'mov dh 0x79', 'adc eax 1'])
    temp = list(temp)
    print(temp)
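The "mean" path above is a masked average over the final hidden layer: `length_mask` zeroes special-token positions before pooling. A self-contained sketch of that pooling step (toy shapes):

    import torch

    hidden = torch.randn(2, 50, 16)             # (batch, seq_len, hidden_size)
    length_mask = torch.randint(0, 2, (2, 50))  # 1 = real token, 0 = special/pad

    masked = hidden * length_mask.unsqueeze(-1) # zero out special positions
    pooled = masked.mean(dim=1)                 # (batch, hidden_size)

Note that `torch.mean` divides by the full sequence length rather than by the number of surviving tokens, exactly as in the code above; a length-normalized variant would compute `masked.sum(dim=1) / length_mask.sum(dim=1, keepdim=True)`.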
BIN  bert/pytorch_model.bin  Normal file
Binary file not shown.
2147  bert/tokenizer-inst.all.json  Normal file
File diff suppressed because it is too large.
202  exe2json.py
@@ -1,11 +1,36 @@
 import os
 import r2pipe
 import re
 import hashlib
 import log_utils
+from my_utils import *
 import json
+# Basic-block extraction
+from bert.obtain_inst_vec import bb2vec
+from tqdm import tqdm
+import numpy as np
+
+
+# Disable tokenizer multithreading
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ret_trap_opcode_family = ["ret", "hlt", "int3", "ud2"]
+
+
+def extract_opcode(disasm_text):
+    """
+    Extract the opcode and operands from a line of disassembly text
+    (operands may contain spaces and commas) and convert them into
+    the BERT model's input format.
+    """
+    op_list = disasm_text.split(' ')
+    res = []
+    for item in op_list:
+        item = item.strip().replace(',', '')
+        if '[' in item:
+            res.append('[')
+        res.append(item.replace('[', '').replace(']', ''))
+        if ']' in item:
+            res.append(']')
+    return ' '.join(res)
+
 def calc_sha256(file_path):
     with open(file_path, 'rb') as f:
         bytes = f.read()
@@ -13,57 +38,88 @@ def calc_sha256(file_path):
     sha256 = sha256obj.hexdigest()
     return sha256
 
-def extract_opcode(disasm_text):
-    """
-    Extract the opcode and operands from disassembly text.
-    The regular expression matches the opcode and operands, allowing for
-    operands that contain spaces and commas.
-    """
-    match = re.search(r"^\s*(\S+)(?:\s+(.*))?$", disasm_text)
-    if match:
-        opcode = match.group(1)
-        # operands_str = match.group(2) if match.group(2) is not None else ""
-        # split_pattern = re.compile(r",(?![^\[]*\])")  # regex used to split the operands
-        # operands = split_pattern.split(operands_str)
-        # return opcode, [op.strip() for op in operands if op.strip()]
-        return opcode
-    return ""
 
-def get_graph_cfg_r2pipe(r2pipe_open):
+def get_graph_cfg_r2pipe(r2pipe_open, file_path):
     # CFG extraction
     acfg_item = []
     try:
         # Get the function list
         function_list = r2pipe_open.cmdj("aflj")
 
         for function in function_list:
             # Per-function feature extraction
             node_list = []
             edge_list = []
             temp_edge_list = []
-            block_list = r2pipe_open.cmdj("afbj @" + str(function['offset']))
-            block_number = len(block_list)
-            block_feature_list = []
-            for block in block_list:
-                node_list.append(block["addr"])
+            # List of basic blocks
+            block_list = r2pipe_open.cmdj("afbj @" + str(function['offset']))
+            # Number of basic blocks
+            block_number = len(block_list)
+
+            for block in block_list:
+                # Statements inside this basic block
+                block_addr = block["addr"]
+                block_Statement = []
+
+                node_list.append(block["addr"])
                 # Get the basic block's disassembled instructions
                 disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"]))
                 if disasm:
-                    for op in disasm:
-                        if op["type"] == "invalid":
-                            continue
-                        # TODO: instruction feature extraction goes here
-                        block_feature = ''
-                        block_feature_list.append(block_feature)
+                    for op_index, op in enumerate(disasm):
+                        # Extract the opcode and convert it into the BERT model's input format
+                        block_Statement.append(extract_opcode(op["disasm"]))
 
+                        # Handle jump opcodes and build the CFG
+                        if 'jump' in op:
+                            if op['jump'] == 0:
+                                if op_index != len(disasm) - 1:
+                                    node_list.append(disasm[op_index + 1]['offset'])
 
-                        # Handle jump instructions
-                        if "jump" in op and op["jump"] != 0:
-                            temp_edge_list.append([block["addr"], op["jump"]])
+                            elif op['type'] == 'jmp':
+                                temp_edge_list.append([block["addr"], op['jump']])
+                                if op_index != len(disasm) - 1:
+                                    node_list.append(disasm[op_index + 1]['offset'])
+
+                            elif op['type'] == 'cjmp':
+                                temp_edge_list.append([block["addr"], op['jump']])
+                                if op_index == len(disasm) - 1:
+                                    temp_edge_list.append([block_addr, op['jump']])
+
+                                else:
+                                    temp_edge_list.append([block_addr, disasm[op_index + 1]["offset"]])
+                                    node_list.append(disasm[op_index + 1]["offset"])
+                            elif op['type'] == 'call':
+                                temp_edge_list.append([block_addr, op["jump"]])
+                                temp_edge_list.append([op["jump"], block_addr])
+                                if op_index == len(disasm) - 1:
+                                    temp_edge_list.append([block_addr, op["offset"] + op["size"]])
+                            else:
+                                logger.warning(
+                                    f"Binary executable parsing warning: failed to recognize jump instruction {op}")
+
+                        # The opcode is not a jump instruction
+                        else:
+                            if op_index != len(disasm) - 1:
+                                # The current instruction is not the last one in the basic block
+                                if op in ret_trap_opcode_family and op["type"] in ["ret", "trap"]:
+                                    node_list.append(disasm[op_index + 1]["offset"])
+
+                            else:
+                                # The current instruction is the last one in the basic block
+                                if op not in ret_trap_opcode_family or op["type"] not in ["ret", "trap"]:
+                                    temp_edge_list.append([block_addr, op["offset"] + op["size"]])
+
+                # Convert block features with the BERT model
+                block_feature_list = bb2vec(block_Statement)
+                # block_feature_list = []
             # Filter out edges that do not exist
             for temp_edge in temp_edge_list:
-                if temp_edge[1] in node_list:
+                if temp_edge[0] in node_list and temp_edge[1] in node_list:
                     edge_list.append(temp_edge)
+            # Dedicated error logging
+            if block_number == 0 or len(block_feature_list) == 0:
+                logger.warning(f"Binary executable parsing error, file: {file_path}, function offset: {function['offset']}, basic-block count {block_number}, basic-block features {block_feature_list}")
             # Build the CFG
             acfg = {
                 'block_number': block_number,
                 'block_edges': [[d[0] for d in edge_list], [d[1] for d in edge_list]],
@@ -74,45 +130,16 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path):
     except Exception as e:
         return False, e, None
 
-    # for block in block_list:
-    #     node_list.append(block["addr"])
-    #
-    #     # Get the basic block's disassembled instructions
-    #     disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"]))
-    #     node_info = []
-    #     if disasm:
-    #         for op in disasm:
-    #             if op["type"] == "invalid":
-    #                 continue
-    #             opcode, operands = extract_opcode(op["disasm"])
-    #             # Handle jump instructions
-    #             if "jump" in op and op["jump"] != 0:
-    #                 temp_edge_list.append([block["addr"], op["jump"]])
-    #                 node_info.append([op["offset"], op["bytes"], opcode, op["jump"]])
-    #             else:
-    #                 node_info.append([op["offset"], op["bytes"], opcode, None])
-    #         node_info_list.append(node_info)
-
-    # After the CFG is built, check for and clean up non-existent out-edges
-
-    # Get the original indices of the sorted elements
-    # sorted_indices = [i for i, v in sorted(enumerate(node_list), key=lambda x: x[1])]
-    # # Reorder according to these indices
-    # node_list = [node_list[i] for i in sorted_indices]
-    # node_info_list = [node_info_list[i] for i in sorted_indices]
-    #
-    # return True, "Binary executable parsed successfully", node_list, edge_list, node_info_list
-    # except Exception as e:
-    #     return False, e, None, None, None
 
 def get_graph_fcg_r2pipe(r2pipe_open):
     # FCG extraction
     try:
-        function_list = r2pipe_open.cmdj("aflj")
         node_list = []
         func_name_list = []
         edge_list = []
         temp_edge_list = []
 
+        function_list = r2pipe_open.cmdj("aflj")
+        function_num = len(function_list)
 
         for function in function_list:
@@ -121,13 +148,11 @@ def get_graph_fcg_r2pipe(r2pipe_open):
             pdf = r2pipe_open.cmdj('pdfj')
             if pdf is None:
                 continue
 
             node_bytes = ""
             node_opcode = ""
             for op in pdf["ops"]:
                 if op["type"] == "invalid":
                     continue
 
                 node_bytes += op["bytes"]
                 opcode = extract_opcode(op["disasm"])
                 node_opcode += opcode + " "
@@ -141,13 +166,14 @@ def get_graph_fcg_r2pipe(r2pipe_open):
         for temp_edge in temp_edge_list:
             if temp_edge[1] in node_list:
                 edge_list.append(temp_edge)
-        sub_function_name_list = ('fcn.', 'loc.', 'main', 'entry')
-        func_name_list = [func_name for func_name in func_name_list if not func_name.startswith(sub_function_name_list)]
+        sub_function_name_list = ('sym.', 'sub', 'imp')
+        func_name_list = [func_name for func_name in func_name_list if func_name.startswith(sub_function_name_list)]
         return True, "Binary executable parsed successfully", function_num, edge_list, func_name_list
     except Exception as e:
         return False, e, None, None, None
 
 def get_r2pipe(file_path):
     # Initialize r2pipe
     try:
         r2 = r2pipe.open(file_path, flags=['-2'])
         r2.cmd("aaa")
@@ -157,16 +183,21 @@ def get_r2pipe(file_path):
         return None
 
 def init_logging():
-    log_file = "./out/exe2json.log"
-    logging = log_utils.setup_logger('exe2json', log_file)
-    return logging
+    # Initialize logging
+    log_file = "./log/exe2json.log"
+    return setup_logger('exe2json', log_file)
 
 
-def exe_to_json(file_path, output_path):
-    logging = init_logging()
+def exe_to_json(file_path):
+    output_path = "./out/json/malware"
 
 
     # Get r2pipe and parse the file; release r2 as soon as parsing is done
     r2 = get_r2pipe(file_path)
     fcg_Operation_flag, fcg_Operation_message, function_num, function_fcg_edge_list, function_names = get_graph_fcg_r2pipe(r2)
-    cfg_Operation_flag, cfg_Operation_message, cfg_item = get_graph_cfg_r2pipe(r2)
+    cfg_Operation_flag, cfg_Operation_message, cfg_item = get_graph_cfg_r2pipe(r2, file_path)
+    r2.quit()
     # Build the file's JSON
     file_fingerprint = calc_sha256(file_path)
     if fcg_Operation_flag and cfg_Operation_flag:
         json_obj = {
@@ -178,19 +209,24 @@ def exe_to_json(file_path, output_path):
             'function_names': function_names
         }
     else:
-        logging.error(f"Binary executable parsing failed, file path: {file_path}")
+        logger.error(f"Binary executable parsing failed, file name: {file_path}")
         if not fcg_Operation_flag:
-            logging.error(f"fcg error: {fcg_Operation_message}")
+            logger.error(f"fcg error: {fcg_Operation_message}")
         if not cfg_Operation_flag:
-            logging.error(f"cfg error: {cfg_Operation_message}")
+            logger.error(f"cfg error: {cfg_Operation_message}")
         return False
-    r2.quit()
-    result = json.dumps(json_obj, ensure_ascii=False)
+    # Write the JSON
+    result = json.dumps(json_obj, ensure_ascii=False, default=lambda x: float(x) if isinstance(x, np.float32) else x)
     os.makedirs(output_path, exist_ok=True)
     with open(os.path.join(output_path, file_fingerprint + '.jsonl'), 'w') as out:
         out.write(result)
     out.close()
     return True
 
 if __name__ == '__main__':
-    test_file_path = '/mnt/d/bishe/exe2json/sample/VirusShare_0a3b625380161cf92c4bb10135326bb5'
-    exe_to_json(test_file_path, './out/json')
+    logger = init_logging()
+    sample_file_path = "/mnt/d/bishe/dataset/sample_malware"
+    sample_file_list = os.listdir(sample_file_path)
+    multi_thread(exe_to_json, [os.path.join(sample_file_path, file_name) for file_name in sample_file_list])
+    # test_file_path = '/mnt/d/bishe/exe2json/sample/VirusShare_0a3b625380161cf92c4bb10135326bb5'
+    # exe_to_json(test_file_path)
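For intuition about the new `extract_opcode`: it normalizes one disassembly line into the space-separated token form the BERT tokenizer expects, with brackets emitted as standalone tokens. A hedged example of the intended behavior (the input string mimics typical r2 output):

    >>> extract_opcode("adc byte [ebp - 0x74], cl")
    'adc byte [ ebp - 0x74 ] cl'

This matches the instruction format used by the `bb2vec` test in `bert/obtain_inst_vec.py`.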
65  my_utils.py  Normal file
@@ -0,0 +1,65 @@
import logging
import os

"""
Logging helper

Usage:
    logger = setup_logger(<logger instance name>, <log file path>)
"""
def setup_logger(name, log_file, level=logging.INFO):
    """Set up as many loggers as you want."""
    if not os.path.exists(os.path.dirname(log_file)):
        os.makedirs(os.path.dirname(log_file))

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)

    # Uncomment to also log to the console:
    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formatter)

    logger = logging.getLogger(name)
    logger.setLevel(level)
    logger.addHandler(handler)
    # Console output:
    # logger.addHandler(stream_handler)

    # Truncate any pre-existing log file
    if os.path.exists(log_file):
        open(log_file, 'w').close()

    return logger


"""
Multithreading helper
"""
THREAD_FULL = os.cpu_count()
THREAD_HALF = int(os.cpu_count() / 2)
def multi_thread(func, args, thread_num=THREAD_FULL):
    """
    Run a function over a list of arguments with a thread pool.
    :param func: the function to run
    :param args: list of arguments, one per call
    :param thread_num: number of threads
    :return: list of results, in completion order
    """
    import concurrent.futures
    from tqdm import tqdm
    logger = setup_logger('multi_thread', './multi_thread.log')
    result = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=thread_num) as executor:
        futures_to_args = {
            executor.submit(func, arg): arg for arg in args
        }
        for future in tqdm(concurrent.futures.as_completed(futures_to_args), total=len(args)):
            try:
                result.append(future.result())
            except Exception as exc:
                logger.error('%r generated an exception: %s' % (futures_to_args[future], exc))
    return result
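A small usage sketch of these helpers (the squaring task is illustrative only):

    from my_utils import setup_logger, multi_thread, THREAD_HALF

    logger = setup_logger('demo', './log/demo.log')

    def square(x):
        return x * x

    results = multi_thread(square, [1, 2, 3, 4], thread_num=THREAD_HALF)
    logger.info('results: %s', results)

Note that `multi_thread` collects results in completion order, not submission order, since it iterates `as_completed`.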