Compare commits

...

6 Commits

Author SHA1 Message Date
96971fd0a7 Merge branch 'master' into detached
# Conflicts:
#	Genius3/raw-feature-extractor/convert_pkl_to_json.py
#	Genius3/raw-feature-extractor/preprocessing_ida.py
2024-03-03 16:59:09 +08:00
1317c38bc6 Write logs 2024-03-03 16:39:19 +08:00
835b23f7be Add multithreading and logging 2024-03-03 14:34:47 +08:00
93db227535 Add multithreading and logging 2024-03-03 14:34:19 +08:00
65d25d42de Working py2.7 version 2024-03-01 16:42:02 +08:00
548eedb292 Batch operations 2024-03-01 16:11:26 +08:00
16 changed files with 189 additions and 920 deletions

View File

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="malgraph" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="malgraph" project-jdk-type="Python SDK" />
</project>

View File

@@ -2,7 +2,7 @@
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\train_benign"
set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"

View File

@@ -0,0 +1,16 @@
@echo off
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
for %%f in ("%FOLDER_PATH%\*") do (
echo !time! %%f
D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
)
endlocal

View File

@@ -1,16 +0,0 @@
# -*- coding: UTF-8 -*-
import sys
from func import *
from raw_graphs import *
from idc import *
import os
import argparse
if __name__ == '__main__':
print "hello"
#
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
# -c delete the old database  -A autonomous mode: analyze without showing dialogs
# -B is equivalent to -c -A
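For context, a hedged sketch of how a script passed via -S receives its arguments inside IDAPython: idc.ARGV[0] is the script path and user arguments follow. It assumes IDA 6.x IDAPython (Python 2); the flag behavior matches the notes above.

import idc

if __name__ == '__main__':
    # e.g. idaq.exe -c -A -S"preprocessing_ida.py 0" sample.exe
    workflow = idc.ARGV[1] if len(idc.ARGV) > 1 else None
    print "workflow = %s" % workflow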

View File

@@ -1,81 +0,0 @@
class HierarchicalGraphNeuralNetwork(nn.Module):
def __init__(self, external_vocab: Vocab):
super(HierarchicalGraphNeuralNetwork, self).__init__()
self.pool = 'global_max_pool'
# Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
cfg_filter_list = [200, 200]
cfg_filter_list.insert(0, 11)
self.cfg_filter_length = len(cfg_filter_list)
cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
i in range(self.cfg_filter_length - 1)]
cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
cfg_constructor = cfg_conv['constructor']
for i in range(self.cfg_filter_length - 1):
setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
self.dropout = nn.Dropout(p=0.2)
# Hierarchical 2: Function Call Graph (FCG) embedding and pooling
self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
embedding_dim=cfg_filter_list[-1],
padding_idx=external_vocab.pad_idx)
fcg_filter_list = [200, 200]
fcg_filter_list.insert(0, cfg_filter_list[-1])
self.fcg_filter_length = len(fcg_filter_list)
fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
i in range(self.fcg_filter_length - 1)]
fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
fcg_constructor = fcg_conv['constructor']
for i in range(self.fcg_filter_length - 1):
setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
# Last Projection Function: gradually project with more linear layers
self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
self.last_activation = nn.Softmax(dim=1)
def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
bt_all_function_edges: list):
rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
fcg_list = []
fcg_internal_list = []
for idx_batch in range(len(real_bt_positions) - 1):
start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
idx_x_cfg = x_cfg_pool[start_pos: end_pos]
fcg_internal_list.append(idx_x_cfg)
idx_x_external = self.external_embedding_layer(
torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
idx_x_external = idx_x_external.squeeze(dim=0)
idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
idx_graph_data.validate()
fcg_list.append(idx_graph_data)
fcg_batch = Batch.from_data_list(fcg_list)
# Hierarchical 2: Function Call Graph (FCG) embedding and pooling
rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch) # [batch_size, max_node_size, dim]
x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
batch_final = x_fcg_pool
# step last project to the number_of_classes (multiclass)
bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
bt_pred = self.last_activation(bt_final_embed)
return bt_pred
def forward_cfg_gnn(self, local_batch: Batch):
in_x, edge_index = local_batch.x, local_batch.edge_index
for i in range(self.cfg_filter_length - 1):
out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
out_x = torch.nn.functional.relu(out_x, inplace=True)
out_x = self.dropout(out_x)
in_x = out_x
local_batch.x = in_x
return local_batch
def forward_fcg_gnn(self, function_batch: Batch):
in_x, edge_index = function_batch.x, function_batch.edge_index
for i in range(self.fcg_filter_length - 1):
out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
out_x = torch.nn.functional.relu(out_x, inplace=True)
out_x = self.dropout(out_x)
in_x = out_x
function_batch.x = in_x
return function_batch
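As a sanity check, a hedged usage sketch for the deleted model above (not from the repo): ToyVocab is a hypothetical stand-in for the Vocab type, and the sketch assumes a PyG release (around 2.3) where torch_geometric.nn.glob and Data.validate() both exist, plus the torch / torch_geometric / Data / Batch imports the deleted file itself relied on.

import torch
from torch_geometric.data import Batch, Data

class ToyVocab(object):
    # hypothetical stand-in: only the two attributes the model reads
    max_vocab_size = 10
    pad_idx = 0

model = HierarchicalGraphNeuralNetwork(external_vocab=ToyVocab())

# Two internal functions, each a tiny CFG with 11-dim basic-block features.
cfg1 = Data(x=torch.rand(3, 11), edge_index=torch.tensor([[0, 1], [1, 2]]))
cfg2 = Data(x=torch.rand(2, 11), edge_index=torch.tensor([[0], [1]]))
local_batch = Batch.from_data_list([cfg1, cfg2])

positions = [0, 2]              # one sample owning both pooled CFG vectors
external_ids = [[3]]            # one external-function id per sample
fcg_edges = [[[0, 1], [1, 2]]]  # per-sample FCG edge_index (2 x E)

pred = model(local_batch, positions, external_ids, fcg_edges)
print(pred.shape)  # expected: torch.Size([1, 6]) class probabilities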

View File

@@ -1,264 +0,0 @@
# coding=utf-8
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
import networkx as nx
import pdb
from graph_analysis_ida import *
from graph_property import *
# import wingdbstub
# wingdbstub.Ensure()
def get_funcs(ea):
funcs = {}
# Get current ea
# Loop from start to end in the current segment
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
func = get_func(funcea)
blocks = FlowChart(func)
funcs[funcname] = []
for bl in blocks:
start = bl.startEA
end = bl.endEA
funcs[funcname].append((start, end))
return funcs
# Apparently an unused function
# def get_funcs_for_discoverRe(ea):
# features = {}
# for funcea in Functions(SegStart(ea)):
# funcname = GetFunctionName(funcea)
# print(funcname)
# func = get_func(funcea)
# feature = get_discoverRe_feature(func)
# features[funcname] = feature
# return features
# Get the 11-dimensional attribute features of every basic block:
# calls / transfer / arithmetic / logic / compare / move / interrupt / data declaration / total instructions / string-or-integer constants / number of offspring
def get_bb_features(func):
bb_features = []
blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
for bl in blocks:
calls = calCalls(bl)
transferIns = calTransferIns(bl)
mathematicsIns = calArithmeticIns(bl)
logicIns = calLogicInstructions(bl)
cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
totalIns = calInsts(bl)
consts = getBBconsts(bl)
stringOrIntConsts = len(consts[0]) + len(consts[1])
bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
interruptIns, declareIns, totalIns, stringOrIntConsts])
return bb_features
def get_discoverRe_feature(func, icfg):
start = func.startEA
end = func.endEA
features = []
FunctionCalls = getFuncCalls(func)
# 1
features.append(FunctionCalls)
LogicInstr = getLogicInsts(func)
# 2
features.append(LogicInstr)
Transfer = getTransferInsts(func)
# 3
features.append(Transfer)
Locals = getLocalVariables(func)
# 4
features.append(Locals)
BB = getBasicBlocks(func)
# 5
features.append(BB)
Edges = len(icfg.edges())
# 6
features.append(Edges)
Incoming = getIncommingCalls(func)
# 7
features.append(Incoming)
# 8
Instrs = getIntrs(func)
features.append(Instrs)
between = retrieveGP(icfg)
# 9
features.append(between)
strings, consts = getfunc_consts(func)
# 10
features.append(strings)
# 11
features.append(consts)
return features
def get_func_names(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
funcs[funcname] = funcea
return funcs
def get_func_bases(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
funcs[funcea] = funcname
return funcs
def get_func_range(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
func = get_func(funcea)
funcs[funcname] = (func.startEA, func.endEA)
return funcs
def get_func_sequences(ea):
funcs_bodylist = {}
funcs = get_funcs(ea)
for funcname in funcs:
if funcname not in funcs_bodylist:
funcs_bodylist[funcname] = []
for start, end in funcs[funcname]:
inst_addr = start
while inst_addr <= end:
opcode = GetMnem(inst_addr)
funcs_bodylist[funcname].append(opcode)
inst_addr = NextHead(inst_addr)
return funcs_bodylist
def get_func_cfgs(ea):
func_cfglist = {}
i = 0
start, end = get_section('LOAD')
# print start, end
for funcea in Functions(SegStart(ea)):
if start <= funcea <= end:
funcname = GetFunctionName(funcea)
func = get_func(funcea)
print(i)
i += 1
try:
icfg = cfg.cfg_construct(func)
func_cfglist[funcname] = icfg
except:
pass
return func_cfglist
def get_section(t):
base = SegByName(t)
start = SegByBase(base)
end = SegEnd(start)
return start, end
def get_func_cfg_sequences(func_cfglist):
func_cfg_seqlist = {}
for funcname in func_cfglist:
func_cfg_seqlist[funcname] = {}
cfg = func_cfglist[funcname][0]
for start, end in cfg:
codesq = get_sequences(start, end)
func_cfg_seqlist[funcname][(start, end)] = codesq
return func_cfg_seqlist
def get_sequences(start, end):
seq = []
inst_addr = start
while inst_addr <= end:
opcode = GetMnem(inst_addr)
seq.append(opcode)
inst_addr = NextHead(inst_addr)
return seq
def get_stack_arg(func_addr):
print(func_addr)
args = []
stack = GetFrame(func_addr)
if not stack:
return []
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <= lastM:
mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack, i)
if mSize:
i = i + mSize
else:
i = i + 4
if mName not in args and mName and ' s' not in mName and ' r' not in mName:
args.append(mName)
return args
# pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processDataSegs():
funcdata = {}
datafunc = {}
for n in xrange(idaapi.get_segm_qty()):
seg = idaapi.getnseg(n)
ea = seg.startEA
segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
start = idc.SegStart(ea)
end = idc.SegEnd(ea)
cur = start
while cur <= end:
refs = [v for v in DataRefsTo(cur)]
for fea in refs:
name = GetFunctionName(fea)
if len(name) == 0:
continue
if name not in funcdata:
funcdata[name] = [cur]
else:
funcdata[name].append(cur)
if cur not in datafunc:
datafunc[cur] = [name]
else:
datafunc[cur].append(name)
cur = NextHead(cur)
return funcdata, datafunc
def obtainDataRefs(callgraph):
datarefs = {}
funcdata, datafunc = processDataSegs()
for node in callgraph:
if node in funcdata:
datas = funcdata[node]
for dd in datas:
refs = datafunc[dd]
refs = list(set(refs))
if node in datarefs:
print(refs)
datarefs[node] += refs
datarefs[node] = list(set(datarefs[node]))
else:
datarefs[node] = refs
return datarefs
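For reference, a hedged summary of the per-basic-block feature order produced by get_bb_features above. Note the function body appends 10 entries; the 11th from its comment (offspring, 'offs') appears to be attached elsewhere. The constant name is illustrative, not from the repo.

BB_FEATURE_ORDER = [
    'calls', 'transfer', 'arithmetic', 'logic', 'compare',
    'move', 'interrupt', 'data_declaration', 'total_instructions',
    'string_or_int_constants',
]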

View File

@@ -16,9 +16,7 @@ from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *
sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
#import wingdbstub
#wingdbstub.Ensure()

View File

@@ -119,24 +119,23 @@ def getIncommingCalls(func):
def get_stackVariables(func_addr):
#print func_addr
args = []
stack = GetFrame(func_addr)
if not stack:
return 0
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <=lastM:
mName = GetMemberName(stack,i)
mSize = GetMemberSize(stack,i)
if mSize:
i = i + mSize
else:
i = i+4
if mName not in args and mName and 'var_' in mName:
args.append(mName)
return len(args)
args = []
stack = GetFrame(func_addr)
if not stack:
return 0
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <= lastM:
mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack, i)
if mSize:
i = i + mSize
else:
i = i + 4
if mName not in args and mName and 'var_' in mName:
args.append(mName)
return len(args)
# Count arithmetic instructions

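For context, a hedged usage sketch of the cleaned-up helper above; ScreenEA and calling GetFrame with any address inside the function are IDA 6.x idc assumptions.

ea = ScreenEA()
print get_stackVariables(ea)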
View File

@@ -0,0 +1,26 @@
import logging
import os
def setup_logger(name, log_file, level=logging.INFO):
"""Function setup as many loggers as you want"""
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
handler = logging.FileHandler(log_file, mode='w')  # truncate the log at the start of each run
handler.setFormatter(formatter)
# Also add a stream handler for console output
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger = logging.getLogger(name)
logger.setLevel(level)
logger.addHandler(handler)
logger.addHandler(stream_handler)
return logger
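A short usage sketch, mirroring the call this diff's multithreading script makes further down:

from log_utils import setup_logger

log = setup_logger('thread_out', 'ida_asm_create_out.log')
log.info('batch run started')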

View File

@@ -1,73 +1,54 @@
# coding=utf-8
import os
# -*- coding: UTF-8 -*-
import pickle
import idc
import idaapi
from func import *
from idc import *
import os
# Define constants
DATA_DIR = "D:\\bishe\\dataset"
INFECTED_DIR = os.path.join(DATA_DIR, "infected")
BENIGN_DIR = os.path.join(DATA_DIR, "benign")
CFG_EXTENSION = ".ida"
GDL_EXTENSION = ".dot"
ASM_EXTENSION = ".asm"
def preprocess(binary_name, workflow):
cfg_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
f"{binary_name}{CFG_EXTENSION}"
)
gdl_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
f"{binary_name}{GDL_EXTENSION}"
)
asm_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
f"{binary_name}{ASM_EXTENSION}"
)
def preprocess():
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
# print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
# print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
# print idc.ARGV[2]
# print type(idc.ARGV[2])
if os.path.exists(cfg_path):
idc.Exit(0)
binary_name = idc.GetInputFile()
workflow = idc.ARGV[1]
# When workflow is a specific value (-1), analyze benign software; otherwise analyze malware
if workflow == '-1':
cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
else:
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF
idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
idaapi.autoWait()
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF
idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
idaapi.autoWait()
# Generate the CFG
generate_cfg(binary_name, cfg_path)
# Generate the list of CFGs for the PE file
# cfgs = get_func_cfgs_c(FirstSeg())
# Save the CFGs as .ida
# pickle.dump(cfgs, open(cfg_path, 'w'))
# Generate the GDL
generate_gdl(gdl_path)
# Generate the PE file's FCG and save it as a .dot file
# idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL)  # emits a .gdl file, a format that is nearly impossible to find documented online
# idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
# Generate the ASM
generate_asm(asm_path)
# Close IDA Pro
idc.Exit(0)
def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg())
with open(cfg_path, 'wb') as cfg_file:
pickle.dump(cfgs, cfg_file)
def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
def generate_asm(asm_path):
# Generate the .asm file
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
# Main entry point
def main():
binary_name = idc.GetInputFile()
try:
workflow = idc.ARGV[1]
except IndexError:
print("Workflow argument not provided.")
return
preprocess(binary_name, workflow)
# Close IDA Pro
idc.Exit(0)
# When run as an IDA Pro script, invoke main
if __name__ == "__main__":
main()
# General command-line format: idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
# Here: idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path; the full command lines are shown below
# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
if __name__ == '__main__':
preprocess()

View File

@@ -1,101 +0,0 @@
# -*- coding: UTF-8 -*-
import sys
from matplotlib import pyplot as plt
import networkx as nx
import pickle
# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
def print_obj(obj):
# "打印对象的所有属性"
print(obj.__dict__)
# sub_10F20 308: the decompiled code contains strings, but no strings appear in this feature extraction; indirectly referenced constants may go unrecognized. Checking the features of all functions, almost none have string constants; they are probably stored elsewhere and referenced.
# sub_166C4 393
if __name__ == '__main__':
testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
fr = open(testpath, 'r')
data = pickle.load(fr)  # the ACFGs of one binary
fr.close()
# print(type(data1))
# print_obj(data1)
# print data1.raw_graph_list[393]
# print_obj(data1.raw_graph_list[393])
# nx.draw(data1.raw_graph_list[393].g,with_labels=True)
# plt.show()
print("一个二进制文件的所有函数的原始特征list。")
print_obj(data) # acfg list
print("\n")
print("一个函数的原始特征由old_gdiscovRe方法的ACFGgGenius方法的ACFGfun_feature表示函数级别的特征的向量三部分构成")
print_obj(data.raw_graph_list[0]) # 一个函数的acfg
print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
# feature = data.raw_graph_list[0].fun_features
print("old_g:{}".format(data.raw_graph_list[0].old_g))
print("g:{}".format(data.raw_graph_list[0].g))
# G = data1.raw_graph_list[393].old_g
# print G.node[0]  # G.node[i] is a dict
# for key, value in G.node[0].items():
# print('{key}:{value}'.format(key=key, value=value))
# Basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count
G = data.raw_graph_list[0].g
print("# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 后代数量 #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 逻辑如AND #8'numTIs' 转移指令数量")
# print(G.node[0])
# print("\n")
# Features of every basic block in the function
for key, value in G.node.items():
print('{}:{}'.format(key, value))
# old_g is the CFG as read from IDA, so node counts, edge directions, etc. match; g is generated from old_g, so it matches as well
#old g
G = data.raw_graph_list[0].old_g
nx.draw(G, with_labels=True)
#plt.title('old_g')
plt.show()
# g
G = data.raw_graph_list[0].g
nx.draw(G, with_labels=True)
#plt.title('Genius_g')
plt.show()
# draw graph with labels
pos = nx.spring_layout(G)
nx.draw(G, pos)
node_labels = nx.get_node_attributes(G, 'v')  # networkx nodes carry attributes; g's attribute is 'v', the vector of raw features; see cfg_constructor.py for old_g's attributes
nx.draw_networkx_labels(G, pos, labels=node_labels)
#plt.title('Genius_g with raw feature vector')
plt.show()
# 1 function calls: number of call instructions in this function (call, jal, jalr); note ARM has none of these
# 2 logic instructions: number of logic instructions, e.g. and, or
# 3 TransferIns: number of transfer instructions (jmp; mov on ARM)
# 4 LocalVariables: number of local variables
# 5 BB: number of basic blocks
# 6 Edges: number of ICFG edges; the ICFG comes from another paper (discovRe) and is ignored here for now
# 7 IncommingCalls: number of call sites that invoke this function
# 8 Intrs: number of instructions
# 9 between: betweenness from the structural features
# 10 strings: string constants
# 11 consts: numeric constants
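A hedged sketch of consuming that vector in the order just listed (the local names are illustrative, not from the repo):

feat = data.raw_graph_list[0].fun_features
(n_calls, n_logic, n_transfer, n_locals, n_bbs, n_edges,
 n_incoming, n_instrs, betweenness, n_strings, n_consts) = feat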

View File

@@ -0,0 +1,83 @@
# coding=utf-8
import os
import subprocess
import threading
import time
from log_utils import setup_logger
# Maximum number of concurrent threads
max_threads = 20
# Lock and condition variable for thread synchronization
thread_lock = threading.Lock()
active_threads = 0
threads_completed = 0
condition = threading.Condition(thread_lock)
timer_event = threading.Event()
def execute_command(cmd, log):
"""
在子线程中执行给定的命令
"""
global active_threads, threads_completed
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
with condition:
if stdout:
log.info("stdout: %s" % stdout.decode('gbk'))
if stderr:
log.warning("stderr: %s\n err_cmd: %s" % (stderr.decode('gbk'), cmd))
# Release this thread's slot once its work is done
active_threads -= 1
threads_completed += 1
condition.notify_all()
print threads_completed
def timer_thread():
print 'start timer thread'
while True:
with condition:
# Check once per second and print how many commands have completed
while not timer_event.is_set() and threads_completed < len(commands):
timer_event.wait(1)
print "done file: %d" % threads_completed
# Stop the timer thread once every command has completed
if threads_completed == len(commands):
timer_event.set()
break
if __name__ == '__main__':
# timer = threading.Thread(target=timer_thread)
# timer.start()
# timer_event.clear()
log = setup_logger('thread_out', 'ida_asm_create_out.log')
# Sample directory
sample_dir = "D:/bishe/dataset/sample_20230130_458"
# Build the command list, then create and start the threads
commands = []
for file in os.listdir(sample_dir):
com = r'D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out '
commands.append(com + "D:/bishe/dataset/sample_20230130_458/" + file)
threads = []
for cmd in commands:
    with condition:
        # Claim a thread slot under the lock; wait while all slots are busy
        while active_threads >= max_threads:
            condition.wait()
        active_threads += 1
    thread = threading.Thread(target=execute_command, args=(cmd, log))
    thread.start()
    threads.append(thread)
# Wait for all threads to finish
for thread in threads:
thread.join()
print("所有命令已执行完毕.")

View File

@@ -1,356 +0,0 @@
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
import numpy as np
from refactoring import *
import pymongo
from pymongo import MongoClient
def initDB():
client = MongoClient('mongodb://localhost:27017/')
db = client['iot-encoding']
return db
db = initDB()
posts = db.posts
class db:
def __init__(self):
self.feature_list = {}
self.engine = None
def loadHashmap(self, feature_size, result_n):
# Create redis storage adapter
redis_object = Redis(host='localhost', port=6379, db=0)
redis_storage = RedisStorage(redis_object)
pdb.set_trace()
try:
# Get hash config from redis
config = redis_storage.load_hash_configuration('test')
# Config is existing, create hash with None parameters
lshash = RandomBinaryProjections(None, None)
# Apply configuration loaded from redis
lshash.apply_config(config)
except:
# Config is not existing, create hash from scratch, with 10 projections
lshash = RandomBinaryProjections('test', 0)
# Create engine for feature space of 100 dimensions and use our hash.
# This will set the dimension of the lshash only the first time, not when
# using the configuration loaded from redis. Use redis storage to store
# buckets.
nearest = NearestFilter(1000)
#self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
pdb.set_trace()
self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
# Do some stuff like indexing or querying with the engine...
# Finally store hash configuration in redis for later use
redis_storage.store_hash_configuration(lshash)
def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
if fvector is None:
return
#ftuple = tuple([fvector])
self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
def batch_appendDB(self, binary_name, features, firmware_name=""):
for funcname in features:
feature = features[funcname]
#pdb.set_trace()
self.appendToDB(binary_name, funcname, feature, firmware_name)
def batch_appendDBbyDir(self, base_dir):
cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
i = 0
for v in cursor:
print i
i+=1
binary_name = v['binary_name']
funcname = v['func_name']
firmware_name = v['firmware_name']
feature = v['fvector']
self.appendToDB(binary_name, funcname, feature, firmware_name)
def batch_appendDBbyDir1(self, base_dir):
image_dir = os.path.join(base_dir, "image")
firmware_featrues={}
bnum = 0
fnum = 0
i = 0
pdb.set_trace()
for firmware_name in os.listdir(image_dir):
print firmware_name
firmware_featrues[firmware_name] = {}
firmware_dir = os.path.join(image_dir, firmware_name)
for binary_name in os.listdir(firmware_dir):
if binary_name.endswith(".features"):
bnum += 1
featrues_dir = os.path.join(firmware_dir, binary_name)
featrues = pickle.load(open(featrues_dir, "r"))
for funcname in featrues:
fnum +=1
#pdb.set_trace()
feature = featrues[funcname]
self.appendToDB(binary_name, funcname, feature, firmware_name)
del featrues
print("bnum ", bnum)
print("fnum ", fnum)
def dump(self, base_dir):
db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
pickle.dump(self.feature_list, open(db_dir, 'w'))
db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
pickle.dump(self.engine, open(db_dir, 'w'))
def loadDB(self, base_dir):
db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
self.feature_list = pickle.load(open(db_dir, 'r'))
db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
self.engine = pickle.load(open(db_dir, 'r'))
def findF(self, binary_name, funcname):
x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
return x[0]
def retrieveFeaturesByDir(n, base_dir):
firmware_featrues={}
i = 0
for firmware_name in os.listdir(base_dir):
if firmware_name.endswith(".features"):
firmware_featrues[firmware_name] = {}
firmware_dir = os.path.join(base_dir, firmware_name)
if i > 0:
break
i += 1
pdb.set_trace()
for binary_name in os.listdir(firmware_dir):
featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
for funcname in featrues:
feature = featrues[funcname]
self.appendToDB(firmware_name, binary_name, funcname, feature)
del featrues
def retrieveFeatures(n, base_dir, filename, funcs):
feature_dic = {}
featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
#featuresx = retrieveFeaturesx(filename)
for name in featrues:
#if name in funcs:
x = featrues[name]
#+ featuresx[name]
feature_dic[name] = np.asarray(x)
return feature_dic
def retrieveVuldb(base_input_dir):
vul_path = os.path.join(base_input_dir, "vul")
vul_db = pickle.load(open(vul_path, "r"))
return vul_db
def retrieveFeaturesx(filename):
ida_input_dir = os.path.join("./data/", filename + ".features")
featuresx = pickle.load(open(ida_input_dir, "r"))
return featuresx
def retrieveQueries(n, base_dir, filename1, featrues_src):
queries = {}
featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
#featuresx = retrieveFeaturesx(filename1)
for name in featrues:
#if name in featrues_src:
x = featrues[name]
#+ featuresx[name]
queries[name] = np.asarray(x)
return queries
def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
queries = {}
featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
for name in featrues:
#del featrues[name][5]
queries[name] = np.asarray(featrues[name])
return queries
def retrieveQuery(n, base_dir, filename, funcname):
featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
f = [featrues[v] for v in featrues if funcname in v ][0]
return np.asarray(f)
def parse_command():
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
parser.add_argument('--output_dir', type=str, help="output dir")
parser.add_argument("--filename1", type=str, help="the size of each graphlet")
parser.add_argument("--filename2", type=str, help="the size of each graphlet")
parser.add_argument("--size", type=int, help="the size of each graphlet")
#parser.add_argument("--size", type=int, help="the size of each graphlet")
args = parser.parse_args()
return args
def loadFuncs(path):
funcs = {}
x86_dir = os.path.join(path, "func_candid")
#mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
fp = open(x86_dir,"r")
for line in fp:
items = line.split("\n")
funcname = items[0]
funcs[funcname] = 1
return funcs
def dump(path, featrues, queries):
fp = open(path + "/" + "matrix", 'w')
for name in featrues:
row = []
row.append("x86")
row.append(name)
row += featrues[name]
fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
for name in queries:
row = []
row.append("mips")
row.append(name)
row += queries[name]
fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
fp.close()
def queryBytwo(base_input_dir, filename1, filename2, n):
threthold = 50
db_instance = db()
funcs = loadFuncs(base_input_dir)
db_instance.loadHashmap(n, 50000)
#pdb.set_trace()
featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
queries = retrieveQueries(n, base_input_dir, filename2, funcs)
#queries = refactoring(queries, featrues)
vul_db = retrieveVuldb(base_input_dir)
pdb.set_trace()
#dump(base_input_dir, featrues, queries)
#start = time.time()
#db_instance.batch_appendDBbyDir(base_input_dir)
#end = time.time()
#total = end - start
#print total
db_instance.batch_appendDB(filename1, featrues)
pdb.set_trace()
ranks = []
times = []
for threthold in xrange(1, 210, 10):
hit = []
i = 0
for name in queries:
#print i
i += 1
'''
if i == 1000:
print (sum(times)/len(times))
pdb.set_trace()
print "s"
'''
#if name not in vul_db['openssl']:
# continue
if name not in featrues:
continue
#pdb.set_trace()
query = queries[name]
#start = time.time()
x = db_instance.engine.neighbours(query)
#end = time.time()
#total = end - start
#times.append(total)
#print total
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
ranks.append((name, rank))
if rank <= threthold:
hit.append(1)
else:
hit.append(0)
except:
#pdb.set_trace()
hit.append(0)
pass
#pdb.set_trace()
acc = sum(hit) * 1.0 / len(hit)
print acc
def queryAll(base_dir, firmware_name, filename1, n):
threthold = 155
db_instance = db()
db_instance.loadHashmap(n, 50000)
queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
start = time.time()
pdb.set_trace()
db_instance.batch_appendDBbyDir(n, base_dir)
end = time.time()
dur = end - start
print dur
pdb.set_trace()
hit = []
i = 0
times = []
for name in queries:
print i
i += 1
query = queries[name]
start = time.clock()
x = db_instance.engine.neighbours(query)
end = time.clock()
dur = end - start
times.append(dur)
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]]
if len(rank) > 1:
pdb.set_trace()
print "stop"
if rank[0] <= threthold:
hit.append(1)
else:
hit.append(0)
except:
hit.append(0)
acc = sum(hit) * 1.0 / len(hit)
mean = np.mean(times)
std = np.std(times)
#pdb.set_trace()
print acc
if __name__ == "__main__":
args = parse_command()
base_dir = args.base_input_dir
filename1 = args.filename1
filename2 = args.filename2
n = args.size
pdb.set_trace()
queryBytwo(base_dir, filename1, filename2, n)
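For reference, a minimal nearpy round trip (assuming the standard nearpy API, without the redis-backed storage used above):

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

dim = 192  # matches the dimension hard-coded in loadHashmap above
engine = Engine(dim, lshashes=[RandomBinaryProjections('test', 10)])
engine.store_vector(np.random.rand(dim), 'binary_name.funcname')
matches = engine.neighbours(np.random.rand(dim))  # [(vector, data, distance), ...]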

View File

@@ -1,16 +0,0 @@
@echo off
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\train_malware"
for %%f in ("%FOLDER_PATH%\*") do (
echo !time! %%f
D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
)
endlocal