MalGraph/samples/PreProcess.py

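"""Pre-process raw ACFG dumps (.jsonl) into cached PyTorch Geometric objects.

Each input line describes one binary. Inferred from the fields read below,
a record looks roughly like:

    {
        "hash": "...",                       # sample identifier
        "acfg_list": [                       # one ACFG per local function
            {"block_features": [...],        # per-basic-block feature vectors
             "block_edges": [[...], [...]]}  # CFG edges in PyG edge_index format
        ],
        "function_names": [...],             # local function names first, then external
        "function_edges": [...]              # function-level call-graph edges (stored as-is)
    }

One torch_geometric.data.Data object is saved to ./cache/ per binary.
"""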
import json
import os
import torch
from torch_geometric.data import Data
from tqdm import tqdm
from src.utils.Vocabulary import Vocab


def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab):
    # def parse_json_list_2_pyg_object(jsonl_file: str):
    # create the output directory used by torch.save below, if missing
    os.makedirs("./cache", exist_ok=True)
    index = 0
    with open(jsonl_file, "r", encoding="utf-8") as file:
        for item in tqdm(file):
            item = json.loads(item)
            item_hash = item['hash']
            acfg_list = []
            for one_acfg in item['acfg_list']:  # one ACFG dict per local function
                block_features = one_acfg['block_features']
                block_edges = one_acfg['block_edges']
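                # x: per-basic-block feature matrix [num_blocks, feature_dim];
                # edge_index: [2, num_edges] intra-function CFG edges (PyG convention)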
                one_acfg_data = Data(x=torch.tensor(block_features, dtype=torch.float),
                                     edge_index=torch.tensor(block_edges, dtype=torch.long))
                acfg_list.append(one_acfg_data)
            item_function_names = item['function_names']
            item_function_edges = item['function_edges']
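            # 'function_names' lists the local functions (those with an ACFG) first,
            # followed by external functions, which are mapped to vocabulary indices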
            local_function_name_list = item_function_names[:len(acfg_list)]
            assert len(acfg_list) == len(local_function_name_list), \
                "The length of ACFG_List should be equal to the length of Local_Function_List"
            external_function_name_list = item_function_names[len(acfg_list):]
            external_function_index_list = [vocab[f_name] for f_name in external_function_name_list]
            index += 1
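            # persist one Data object per binary: local ACFGs, external-function
            # indices, function-level call-graph edges, and the class label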
            torch.save(Data(hash=item_hash,
                            local_acfgs=acfg_list,
                            external_list=external_function_index_list,
                            function_edges=item_function_edges,
                            targets=label),
                       "./cache/benign_{}.pt".format(index))


if __name__ == '__main__':
    json_path = "./benign_result.jsonl"
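    # frequency file of external function names collected from the training split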
    train_vocab_file = "../data/processed_dataset/train_external_function_name_vocab.jsonl"
    # train_vocab_file = "./res.jsonl"
    max_vocab_size = 10000
    vocabulary = Vocab(freq_file=train_vocab_file, max_vocab_size=max_vocab_size)
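    # the label is stored verbatim in 'targets'; every sample from
    # benign_result.jsonl is tagged with label 1 here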
    parse_json_list_2_pyg_object(jsonl_file=json_path, label=1, vocab=vocabulary)