backup
parent ddf9ff3b59
commit d599236e94
@@ -2,7 +2,14 @@
 <project version="4">
   <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
     <serverData>
-      <paths name="root@region-41.seetacloud.com:29208">
+      <paths name="304">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="root@region-42.seetacloud.com:58034 password">
         <serverdata>
           <mappings>
             <mapping local="$PROJECT_DIR$" web="/" />
|
81
Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
Normal file
81
Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
Normal file
@ -0,0 +1,81 @@
class HierarchicalGraphNeuralNetwork(nn.Module):
    def __init__(self, external_vocab: Vocab):
        super(HierarchicalGraphNeuralNetwork, self).__init__()
        self.pool = 'global_max_pool'
        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
        cfg_filter_list = [200, 200]
        cfg_filter_list.insert(0, 11)
        self.cfg_filter_length = len(cfg_filter_list)
        cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
                                i in range(self.cfg_filter_length - 1)]
        cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
        cfg_constructor = cfg_conv['constructor']
        for i in range(self.cfg_filter_length - 1):
            setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
        self.dropout = nn.Dropout(p=0.2)
        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
                                                     embedding_dim=cfg_filter_list[-1],
                                                     padding_idx=external_vocab.pad_idx)
        fcg_filter_list = [200, 200]
        fcg_filter_list.insert(0, cfg_filter_list[-1])
        self.fcg_filter_length = len(fcg_filter_list)
        fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
                                i in range(self.fcg_filter_length - 1)]
        fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
        fcg_constructor = fcg_conv['constructor']
        for i in range(self.fcg_filter_length - 1):
            setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
        # Last Projection Function: gradually project with more linear layers
        self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
        self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
        self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
        self.last_activation = nn.Softmax(dim=1)

    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
                bt_all_function_edges: list):
        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
        x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
        fcg_list = []
        fcg_internal_list = []
        for idx_batch in range(len(real_bt_positions) - 1):
            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
            fcg_internal_list.append(idx_x_cfg)
            idx_x_external = self.external_embedding_layer(
                torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
            idx_x_external = idx_x_external.squeeze(dim=0)
            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
            idx_graph_data.validate()
            fcg_list.append(idx_graph_data)
        fcg_batch = Batch.from_data_list(fcg_list)
        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)  # [batch_size, max_node_size, dim]
        x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
        batch_final = x_fcg_pool
        # step last project to the number_of_classes (multiclass)
        bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
        bt_pred = self.last_activation(bt_final_embed)
        return bt_pred

    def forward_cfg_gnn(self, local_batch: Batch):
        in_x, edge_index = local_batch.x, local_batch.edge_index
        for i in range(self.cfg_filter_length - 1):
            out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
            out_x = torch.nn.functional.relu(out_x, inplace=True)
            out_x = self.dropout(out_x)
            in_x = out_x
        local_batch.x = in_x
        return local_batch

    def forward_fcg_gnn(self, function_batch: Batch):
        in_x, edge_index = function_batch.x, function_batch.edge_index
        for i in range(self.fcg_filter_length - 1):
            out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
            out_x = torch.nn.functional.relu(out_x, inplace=True)
            out_x = self.dropout(out_x)
            in_x = out_x
        function_batch.x = in_x
        return function_batch
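A quick smoke test for the class above (an illustrative sketch, not part of the commit): it assumes torch and torch_geometric are installed and that the module has its usual imports in scope (torch, torch.nn as nn, torch_geometric, and Batch/Data from torch_geometric.data, plus the project's Vocab class, none of which appear in this hunk). DummyVocab below is a hypothetical stand-in for Vocab, and the 11-dimensional node features match the 11 prepended to cfg_filter_list.

import torch
from torch_geometric.data import Batch, Data

class DummyVocab:  # hypothetical stand-in for the project's Vocab
    max_vocab_size = 100
    pad_idx = 0

model = HierarchicalGraphNeuralNetwork(external_vocab=DummyVocab())

# one binary made of two internal functions (CFGs with 3 and 2 basic blocks, 11 features per block)
cfg1 = Data(x=torch.randn(3, 11), edge_index=torch.tensor([[0, 1], [1, 2]]))
cfg2 = Data(x=torch.randn(2, 11), edge_index=torch.tensor([[0], [1]]))
local_batch = Batch.from_data_list([cfg1, cfg2])

real_bt_positions = [0, 2]                        # pooled CFG rows 0..1 belong to binary 0
bt_external_names = [[1, 2]]                      # external functions of binary 0, as vocabulary indices
bt_all_function_edges = [[[0, 1, 2], [1, 3, 0]]]  # FCG edges of binary 0 as [sources, targets]

pred = model(local_batch, real_bt_positions, bt_external_names, bt_all_function_edges)
print(pred.shape)                                 # expected: torch.Size([1, 6])

Whether this runs as-is also depends on the installed torch_geometric version still exposing torch_geometric.nn.glob.global_max_pool and Data.validate(), which newer releases may have moved or renamed.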
@@ -6,7 +6,7 @@ import os
 from tqdm import tqdm


-def convert(start, end):
+def convert(start, end, overhaul):
     for workflow in range(start, end):
         # workflow = 0
         cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
@@ -16,6 +16,12 @@ def convert(start, end):
         log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
         process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)

+        if overhaul:
+            if os.path.exists(log_path):
+                os.remove(log_path)
+            if os.path.exists(process_log_path):
+                os.remove(process_log_path)
+
         with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
             logged = log.readline()
             if logged == '':
@@ -112,5 +118,116 @@ def convert(start, end):
                 process_log.write("index {}, {} process done.\n".format(index, cfg))


+def convert_benign(overhaul):
+    cfg_dir = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
+    output_dir = "D:\\hkn\\infected\\datasets\\benign_json\\new"
+    dot_dir = "D:\\hkn\\infected\\datasets\\benign_dot\\new"
+
+    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
+    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
+
+    if overhaul:
+        if os.path.exists(log_path):
+            os.remove(log_path)
+        if os.path.exists(process_log_path):
+            os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        logged = log.readline()
+        if logged == '':
+            log_index = 0
+        else:
+            log_index = int(logged)
+
+        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+            if index < log_index:
+                continue
+
+            name = cfg[:-4]  # file name without extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
+            try:
+                data = pk.load(cfg_file)
+            except EOFError:
+                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+                continue
+            except ValueError:
+                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+                continue
+            finally:
+                cfg_file.close()
+
+            dot_file_path = os.path.join(dot_dir, name + '.dot')
+            if not os.path.exists(dot_file_path):
+                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
+            else:
+                # open the .dot file to get the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) file generated by IDA contains all functions, while data.raw_graph_list only contains the internal ones
+                functions_list = []
+                with open(dot_file_path, 'r') as dot:
+                    for line in dot:
+                        if '->' in line:
+                            raw_function_edges.append(re.findall(r'\b\d+\b', line))
+                        elif 'label' in line:
+                            functions_list.append(line[line.find('= "') + 3:line.find('",')])
+
+                # no internal function was detected; this should not normally happen, so drop the sample to be safe
+                if raw_function_edges.__len__() == 0:
+                    continue
+
+                # build the json object for the current PE file
+                json_obj = {
+                    'hash': data.binary_name[11:],
+                    # 2023.8.12 bug fix: this would only count the internal functions
+                    # 'function_number': data.raw_graph_list.__len__(),
+                    'function_number': len(functions_list),
+                    'function_edges': [[int(d[0]) for d in raw_function_edges],
+                                       [int(d[1]) for d in raw_function_edges]],
+                    'acfg_list': [],
+                    'function_names': functions_list
+                }
+
+                # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA, not the external ones, so the function list and count must not be taken from it
+                # read the pkl file; each acfg is derived from one function
+                for acfg in data.raw_graph_list:
+                    # external functions do not need a CFG
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
+                        continue

+                    # index 2 is used because the Genius framework stores the offspring count at position 2
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # for unknown reasons the two arrays can differ in length, although they should match
+                    # follow the framework and trim bb_features to the same length as g.node
+                    diff = acfg.g.__len__() - len(acfg.bb_features)
+                    if diff != 0:
+                        del acfg.bb_features[diff:]
+                    # append the offspring count to bb_features
+
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+
+                    acfg_item = {
+                        'block_number': acfg.g.__len__(),
+                        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+                        'block_features': acfg.bb_features
+                    }
+
+                    json_obj['acfg_list'].append(acfg_item)
+                    # json_obj['function_names'].append(acfg.funcname)
+
+                # write the result to a local json file
+                result = json.dumps(json_obj, ensure_ascii=False)
+
+                with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+                    out.write(result)
+
+            log.truncate(0)
+            log.seek(0)
+            log.write(str(index))
+            log.flush()
+            process_log.write("index {}, {} process done.\n".format(index, cfg))
+
+
 if __name__ == '__main__':
-    convert(35, 69)
+    # convert(35, 69)
+    convert_benign(True)
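For reference, a record that convert_benign() writes to <name>.jsonl has roughly the following shape. This is an illustrative example with invented values; only the key names are taken from the json_obj built above, and the real per-block feature width is whatever acfg.bb_features carries plus the appended offspring count.

example_record = {
    'hash': '0a1b2c3d4e5f6a7b8c9d',                # data.binary_name[11:]
    'function_number': 4,                          # number of label entries found in the .dot FCG
    'function_edges': [[0, 1, 2], [1, 3, 0]],      # call-graph edges as [sources, targets]
    'acfg_list': [                                 # one attributed CFG per internal function
        {
            'block_number': 3,
            'block_edges': [[0, 1], [1, 2]],       # basic-block edges as [sources, targets]
            'block_features': [[0.0] * 11, [0.0] * 11, [0.0] * 11],
        },
    ],
    'function_names': ['start', 'sub_401000', 'CreateFileA', 'ExitProcess'],
}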
@@ -19,10 +19,75 @@ def call_preprocess(cmd_line):
     subprocess.call(cmd_line, shell=True)


-def batch_mode(start, end):
+# benign-software analysis mode; workflow is set to -1 in the IDA command line
+def benign_batch_mode(overhaul):
+    # total number of failed samples
+    total_failed = 0
+
+    log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
+    process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
+    benign_pe_dir = 'D:\\hkn\\infected\\datasets\\benign\\new'
+
+    if overhaul:
+        if os.path.exists(log_path):
+            os.remove(log_path)
+        if os.path.exists(process_log_path):
+            os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        logged = log.readline()
+        if logged == '':
+            log_index = 0
+        else:
+            log_index = int(logged)
+
+        for index, pe in enumerate(tqdm(sorted(os.listdir(benign_pe_dir)))):
+            if index < log_index:
+                continue
+
+            cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout {}'.format(
+                os.path.join(benign_pe_dir, pe))
+
+            p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
+            p.start()
+            flag_kill = True
+            start = time.time()
+            while time.time() - start <= TIMEOUT:
+                if not p.is_alive():
+                    flag_kill = False
+                    break
+                else:
+                    time.sleep(1)
+
+            if flag_kill:
+                subprocess.call('taskkill /im idaq64.exe /f')
+                process_log.write(
+                    "index {}, {} stuck, process terminated.\n".format(index, pe))
+
+                total_failed += 1
+            else:
+                # finished normally
+                log.truncate(0)
+                log.seek(0)
+                log.write(str(index))
+                log.flush()
+                process_log.write("index {}, {} process done.\n".format(index, pe))
+        # delete all by-products
+        delete_output()
+
+    print('总失败数{}'.format(total_failed))
+
+
+def mal_batch_mode(start, end):
     # only analyze PEs from these families; skip all the others
     families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
                                 'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
+    # record which families the samples that IDA failed on belong to
+    failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
+                     'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
+    # total number of failed samples
+    total_failed = 0
+
     for workflow in range(start, end):
         # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
         pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
@@ -73,6 +138,9 @@ def batch_mode(start, end):
                     subprocess.call('taskkill /im idaq64.exe /f')
                     process_log.write(
                         "index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
+
+                    failed_family[pe_family] += 1
+                    total_failed += 1
                 else:
                     # finished normally
                     log.truncate(0)
@@ -85,6 +153,10 @@ def batch_mode(start, end):
         # delete all by-products after each workflow finishes
         delete_output()
+
+    print(families_need_to_analyze)
+    print('\n')
+    print(failed_family, '总失败数{}'.format(total_failed))


 def delete_output():
     out_dir = 'F:\\iout'
@@ -96,4 +168,5 @@ def delete_output():
 # Note: this .py file must be placed in the IDA root directory and must be run from cmd, otherwise the Python libraries cannot be linked
 # F:\\kkk\\IDA_6.6
 if __name__ == '__main__':
-    batch_mode(36, 69)
+    benign_batch_mode(True)
+    # mal_batch_mode(35, 69)
@@ -1,11 +1,8 @@
 # -*- coding: UTF-8 -*-
 import pickle
 from func import *
-from raw_graphs import *
 from idc import *
-import idautils
 import os
-import sys


 def preprocess():
@@ -18,9 +15,13 @@ def preprocess():
     binary_name = idc.GetInputFile()

     workflow = idc.ARGV[1]
-    # workflow = 0
-    cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
-    gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
+    # when workflow is the special value, analyze benign software; otherwise analyze malware
+    if workflow == '-1':
+        cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
+        gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name)
+    else:
+        cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
+        gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)

 analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
 analysis_flags &= ~idc.AF_IMMOFF
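How this ties to the batch driver shown earlier (my note, not part of the commit): benign_batch_mode() launches IDA with -S"...preprocessing_ida.py -1", and IDAPython exposes those script arguments through idc.ARGV, so inside preprocess() the value of idc.ARGV[1] is the string '-1' and the benign branch above is taken. A minimal sketch of that mapping, assuming the standard idc.ARGV behaviour:

# launched as: idaq64 -c -A -S"preprocessing_ida.py -1" -oF:\iout sample.exe
# inside IDA:  idc.ARGV == ['preprocessing_ida.py', '-1']
workflow = idc.ARGV[1]
if workflow == '-1':
    print('benign paths selected')   # benign_cfg / benign_dot, as in the hunk above
else:
    print('malware workflow {}'.format(workflow))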
@@ -46,7 +46,6 @@ def create(parent_dir, folder):
     os.mkdir(os.path.join(parent_dir, folder))

-
 def change_max_item_lines():
     f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
     s = f.read()
@@ -89,6 +88,7 @@ def delete_error():


 def check_json():
+    print('start checking json')
     for workflow in tqdm(range(0, 69)):
         json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
         for json_file in os.listdir(json_dir):
@@ -99,9 +99,13 @@ def check_json():
                 continue
             finally:
                 f.close()
-            for acfg in data['acfg_list']:
-                if acfg['block_number'] != len(acfg['block_features']):
-                    print("{} {}\n".format(workflow, json_file))
+            if len(data['function_edges'][0]) == 0:
+                print("{} {} function_edges null\n".format(workflow, json_file))
+                # continue
+            # for acfg in data['acfg_list']:
+            #     if acfg['block_number'] != len(acfg['block_features']):
+            #         print("{} {}\n".format(workflow, json_file))


 # temporary helper: delete all jsonl files
@@ -112,21 +116,44 @@ def delete_jsonl():
         os.remove(os.path.join(json_dir, f))


-# temporary helper: rename the .pt files so that they match the code
-def rename():
+def delete_all_local():
+    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
+    dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
+            'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
+    for d in dirs:
+        path = os.path.join(src, d)
+        for f in os.listdir(path):
+            os.remove(os.path.join(path, f))
+
+
+# rename the .pt files so that they match the code
+def rename(mal_or_be, postfix):
     tag_set = ['train', 'test', 'valid']
     for tag in tag_set:
-        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
+        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
         for index, f in enumerate(os.listdir(data_dir)):
             os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
     for tag in tag_set:
-        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
+        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
         for index, f in enumerate(os.listdir(data_dir)):
-            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'malware_{}.pt'.format(index)))
+            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))


-def split_samples():
-    path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
+def split_samples(flag):
+    postfix = ''
+    if flag == 'one_family':
+        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
+        tag = 'malware'
+    elif flag == 'standard':
+        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
+        postfix = '_backup'
+        tag = 'malware'
+    elif flag == 'benign':
+        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
+        tag = 'benign'
+    else:
+        return
+
     out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
     os_list = os.listdir(path)
     random.shuffle(os_list)
@@ -135,11 +162,12 @@ def split_samples():
     test_len = int(train_len / 8)
     for index, f in enumerate(os_list):
         if index < train_len:
-            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_malware'))
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
         elif train_len <= index < train_len + test_len:
-            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_malware'))
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
         else:
-            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_malware'))
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
+    rename(tag, postfix)


 def half_divide():
@@ -206,6 +234,19 @@ def del_redundant():
         os.remove(os.path.join(pe_dir, name))


+def delete_pe():
+    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
+    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
+    dot_list = os.listdir(dot_dir)
+    for cfg in os.listdir(cfg_dir):
+        name = cfg[:-4] + ".dot"
+        if name in dot_list:
+            continue
+        else:
+            print(os.path.join(dot_dir, name))
+            # os.remove(os.path.join(dot_dir, cfg))
+
+
 if __name__ == '__main__':
     # create_dir()
     # change_max_item_lines()
@@ -213,9 +254,20 @@ if __name__ == '__main__':
     # delete_error()
     # test()
     # delete_jsonl()
+    delete_all_local()
     # check_json()
-    split_samples()
-    # rename()
+    # delete_pe()
+    # rename('malware', '_backup')
+
+    # pass 'standard', 'benign' or 'one_family'
+    # 'standard' processes all malware samples
+    # split_samples('standard')
+    # 'one_family' processes a single family only; used only to test the original model's binary classification
+    # split_samples('one_family')
+    # 'benign' processes benign samples
+    # split_samples('benign')
+
     # half_divide()
     # copy_train_data()
     # clear_dot()