From 0f1e3378a2db6dd4c05f5f5f3eabcac9b7c75299 Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Fri, 1 Mar 2024 14:45:10 +0800
Subject: [PATCH 1/2] Batch processing operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../convert_pkl_to_json.py | 216 +++++++++---------
.../preprocessing_ida.py | 99 ++++----
2 files changed, 173 insertions(+), 142 deletions(-)
diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
index 837483b..7807f52 100644
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -1,4 +1,5 @@
# coding=utf-8
+import hashlib
import pickle as pk
import re
import json
@@ -6,125 +7,133 @@ import os
from tqdm import tqdm
-def convert(start, end, overhaul):
- for workflow in range(start, end):
- # workflow = 0
- cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
- output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
- dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
+def calc_sha256(file_path):
+    # hash the entire file; the digest identifies the sample in the output JSON
+    with open(file_path, 'rb') as f:
+        content = f.read()  # avoid shadowing the built-in name 'bytes'
+    return hashlib.sha256(content).hexdigest()
- log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
- process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
- if overhaul:
- if os.path.exists(log_path):
- os.remove(log_path)
- if os.path.exists(process_log_path):
- os.remove(process_log_path)
+def convert_malware(overhaul):
+ cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
+ output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
+ dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
+ raw_dir = "D:\\bishe\\dataset\\train_malware"
- with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
- logged = log.readline()
- if logged == '':
- log_index = 0
+ log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
+ process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
+
+ if overhaul:
+ if os.path.exists(log_path):
+ os.remove(log_path)
+ if os.path.exists(process_log_path):
+ os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        log.seek(0)  # 'a+' may position at end-of-file; rewind before reading the resume index
+        logged = log.readline()
+        if logged == '':
+ log_index = 0
+ else:
+ log_index = int(logged)
+
+ for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+ if index < log_index:
+ continue
+
+            name = cfg[:-4]  # bare file name, without extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'rb')  # the pickle is dumped in binary mode
+ try:
+ data = pk.load(cfg_file)
+ except EOFError:
+ process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+ continue
+ except ValueError:
+ process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+ continue
+ finally:
+ cfg_file.close()
+
+ dot_file_path = os.path.join(dot_dir, name + '.dot')
+ if not os.path.exists(dot_file_path):
+ process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
else:
- log_index = int(logged)
+                # parse the .dot file to obtain the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
+ functions_list = []
+ with open(dot_file_path, 'r') as dot:
+ for line in dot:
+ if '->' in line:
+ raw_function_edges.append(re.findall(r'\b\d+\b', line))
+ elif 'label' in line:
+ functions_list.append(line[line.find('= "') + 3:line.find('",')])
- for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
- if index < log_index:
+            # no internal function detected; this should not normally happen, so discard this sample to be safe
+            if len(raw_function_edges) == 0:
continue
-            name = cfg[:-4]  # bare file name, without extension
- cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
- try:
- data = pk.load(cfg_file)
- except EOFError:
- process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
- continue
- except ValueError:
- process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
- continue
- finally:
- cfg_file.close()
+                # build the JSON object for the current PE file
+ json_obj = {
+ 'hash': calc_sha256(raw_dir + "\\" + name),
+                    # 2023.8.12 bug fix: this would only count the internal functions
+ # 'function_number': data.raw_graph_list.__len__(),
+ 'function_number': len(functions_list),
+ 'function_edges': [[int(d[0]) for d in raw_function_edges],
+ [int(d[1]) for d in raw_function_edges]],
+ 'acfg_list': [],
+ 'function_names': functions_list
+ }
- dot_file_path = os.path.join(dot_dir, name + '.dot')
- if not os.path.exists(dot_file_path):
- process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
- else:
-                # parse the .dot file to obtain the FCG
- raw_function_edges = []
-                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
- functions_list = []
- with open(dot_file_path, 'r') as dot:
- for line in dot:
- if '->' in line:
- raw_function_edges.append(re.findall(r'\b\d+\b', line))
- elif 'label' in line:
- functions_list.append(line[line.find('= "') + 3:line.find('",')])
-
-            # no internal function detected; this should not normally happen, so discard this sample to be safe
- if raw_function_edges.__len__() == 0:
+                # 2023.8.12 bug fix: data.raw_graph_list holds only the internal functions detected by IDA, not the external ones, so the function list and count must not be taken from it
+                # read the pkl data; each ACFG is derived from one function
+                for acfg in data.raw_graph_list:
+                    # external function: no CFG needs to be built for it
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
continue
-                # build the JSON object for the current PE file
- json_obj = {
- 'hash': data.binary_name[11:],
-                    # 2023.8.12 bug fix: this would only count the internal functions
- # 'function_number': data.raw_graph_list.__len__(),
- 'function_number': len(functions_list),
- 'function_edges': [[int(d[0]) for d in raw_function_edges],
- [int(d[1]) for d in raw_function_edges]],
- 'acfg_list': [],
- 'function_names': functions_list
+                    # index 2 because the Genius framework stores the offspring count at position 2 of each node's feature vector
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # for unknown reasons the two arrays can differ in length, although they should match;
+                    # trust the framework and trim bb_features to the same length as g.node
+                    diff = len(acfg.g) - len(acfg.bb_features)
+                    if diff != 0:
+                        del acfg.bb_features[diff:]
+                    # append the offspring count to each basic block's feature vector
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+
+ acfg_item = {
+                        'block_number': len(acfg.g),
+ 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+ 'block_features': acfg.bb_features
}
-                # 2023.8.12 bug fix: data.raw_graph_list holds only the internal functions detected by IDA, not the external ones, so the function list and count must not be taken from it
-                # read the pkl data; each ACFG is derived from one function
-                for acfg in data.raw_graph_list:
-                    # external function: no CFG needs to be built for it
- if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
- continue
+ json_obj['acfg_list'].append(acfg_item)
+ # json_obj['function_names'].append(acfg.funcname)
-                    # index 2 because the Genius framework stores the offspring count at position 2 of each node's feature vector
-                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
-                    # for unknown reasons the two arrays can differ in length, although they should match;
-                    # trust the framework and trim bb_features to the same length as g.node
- diff = acfg.g.__len__() - len(acfg.bb_features)
- if diff != 0:
- del acfg.bb_features[diff:]
-                    # append the offspring count to each basic block's feature vector
+                # write the result to a local JSON file
+ result = json.dumps(json_obj, ensure_ascii=False)
- for i, offs in enumerate(offspring):
- acfg.bb_features[i].append(offs)
+ with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+ out.write(result)
- acfg_item = {
- 'block_number': acfg.g.__len__(),
- 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
- 'block_features': acfg.bb_features
- }
-
- json_obj['acfg_list'].append(acfg_item)
- # json_obj['function_names'].append(acfg.funcname)
-
-                # write the result to a local JSON file
- result = json.dumps(json_obj, ensure_ascii=False)
-
- with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
- out.write(result)
-
- log.truncate(0)
- log.seek(0)
- log.write(str(index))
- log.flush()
- process_log.write("index {}, {} process done.\n".format(index, cfg))
+ log.truncate(0)
+ log.seek(0)
+ log.write(str(index))
+ log.flush()
+ process_log.write("index {}, {} process done.\n".format(index, cfg))
def convert_benign(overhaul):
- cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
- dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
- output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
+ cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
+ dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
+ output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
+ raw_dir = "D:\\bishe\\dataset\\train_benign"
- log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
- process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
+ log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
+ process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
if overhaul:
if os.path.exists(log_path):
@@ -145,6 +154,7 @@ def convert_benign(overhaul):
continue
            name = cfg[:-4]  # bare file name
+
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
try:
data = pk.load(cfg_file)
@@ -180,7 +190,7 @@ def convert_benign(overhaul):
                # build the JSON object for the current PE file
json_obj = {
- 'hash': data.binary_name[11:],
+ 'hash': calc_sha256(raw_dir + "\\" + name),
                    # 2023.8.12 bug fix: this would only count the internal functions
# 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list),
@@ -233,4 +243,6 @@ def convert_benign(overhaul):
if __name__ == '__main__':
# convert(35, 69)
- convert_benign(False)
+ # convert_benign(True)
+ convert_benign(True)
+ convert_malware(True)
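
Not part of the patch: a minimal sketch, assuming the json_obj layout built
above, of how a downstream consumer might load one emitted .jsonl file (the
sample path is hypothetical):

    import json

    with open("D:\\bishe\\dataset\\infected\\infected_jsonl\\sample.jsonl", "r") as f:
        obj = json.load(f)

    # function_edges holds two parallel lists: [sources], [targets]
    edges = list(zip(obj["function_edges"][0], obj["function_edges"][1]))
    print(obj["hash"], obj["function_number"], len(edges), len(obj["acfg_list"]))
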
diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py
index 14afd45..4744c07 100644
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@@ -1,54 +1,73 @@
-# -*- coding: UTF-8 -*-
-import pickle
-from func import *
-from idc import *
+# coding=utf-8
import os
+import pickle
+import idc
+import idaapi
+# constants
+DATA_DIR = "D:\\bishe\\dataset"
+INFECTED_DIR = os.path.join(DATA_DIR, "infected")
+BENIGN_DIR = os.path.join(DATA_DIR, "benign")
+CFG_EXTENSION = ".ida"
+GDL_EXTENSION = ".dot"
+ASM_EXTENSION = ".asm"
-def preprocess():
- # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
- # print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
- # print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
- # print idc.ARGV[2]
- # print type(idc.ARGV[2])
+def preprocess(binary_name, workflow):
+    # workflow == "-1" selects the benign dataset, anything else the malware dataset.
+    # IDA 6.8 bundles Python 2.7, which has no f-strings, so stick to concatenation.
+    if workflow != "-1":
+        base_dir, prefix = INFECTED_DIR, "infected_"
+    else:
+        base_dir, prefix = BENIGN_DIR, "refind_"
+    # keep the per-type subdirectories (infected_cfg/refind_cfg, ...) that
+    # convert_pkl_to_json.py expects to read from
+    cfg_path = os.path.join(base_dir, prefix + "cfg", binary_name + CFG_EXTENSION)
+    gdl_path = os.path.join(base_dir, prefix + "dot", binary_name + GDL_EXTENSION)
+    asm_path = os.path.join(base_dir, prefix + "asm", binary_name + ASM_EXTENSION)
- binary_name = idc.GetInputFile()
-
- workflow = idc.ARGV[1]
-    # a specific workflow value selects benign samples; otherwise malware is analyzed
- if workflow == '-1':
- cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
- gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
- asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
+ if os.path.exists(cfg_path):
+ idc.Exit(0)
else:
- cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
- gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
- asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
+ analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
+ analysis_flags &= ~idc.AF_IMMOFF
+ idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
- analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
- analysis_flags &= ~idc.AF_IMMOFF
- idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
- idaapi.autoWait()
+ idaapi.autoWait()
-    # generate the CFG list of the PE file
+    # generate the CFG
+ generate_cfg(binary_name, cfg_path)
+
+    # generate the GDL (call graph as .dot)
+    generate_gdl(gdl_path)
+
+    # generate the ASM listing
+    generate_asm(asm_path)
+
+    # close IDA Pro
+    idc.Exit(0)
+
+def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg())
-    # save the CFGs as .ida
- pickle.dump(cfgs, open(cfg_path, 'w'))
+ with open(cfg_path, 'wb') as cfg_file:
+ pickle.dump(cfgs, cfg_file)
-    # generate the FCG of the PE file and save it as a .dot file
-    # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) emits a .gdl file, a format that is barely documented anywhere
+def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
-    # generate the .asm file
+def generate_asm(asm_path):
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
-    # close IDA Pro
-    idc.Exit(0)
+# entry point
+def main():
+ binary_name = idc.GetInputFile()
+ try:
+ workflow = idc.ARGV[1]
+ except IndexError:
+ print("Workflow argument not provided.")
+ return
+ preprocess(binary_name, workflow)
-
-# general command-line format: idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
-# used here as: idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path; full command lines below:
-# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
-# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
-if __name__ == '__main__':
- preprocess()
+# when run as an IDA Pro script, invoke main
+if __name__ == "__main__":
+ main()
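
For reference, preprocessing_ida.py is driven headlessly from IDA's command
line; "0" is the workflow argument that main() reads from idc.ARGV[1], and the
invocation below is the same one the batch script added in the next patch uses:

    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
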
From 548eedb29236e6377d9c45b26cb02b10bc6340f9 Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Fri, 1 Mar 2024 16:11:26 +0800
Subject: [PATCH 2/2] Batch processing operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/Gencoding3.iml | 2 +-
.idea/misc.xml | 2 +-
.../bat_file/benign/ida_file_cerate.bat | 2 +-
.../malware/ida_file_cerate_malware.bat | 16 +
Genius3/main.py | 16 -
.../HierarchicalGraphModel_mine.py | 81 ----
.../convert_pkl_to_json.py | 2 -
Genius3/raw-feature-extractor/discovRe.py | 264 -------------
Genius3/raw-feature-extractor/func.py | 4 +-
.../graph_analysis_ida.py | 35 +-
.../preprocessing_ida.py | 16 +-
Genius3/raw-feature-extractor/read_idaFILE.py | 101 -----
Genius3/search-engine/db.py | 356 ------------------
ida_file_cerate_malware.bat | 16 -
14 files changed, 48 insertions(+), 865 deletions(-)
rename ida_file_cerate.bat => Genius3/bat_file/benign/ida_file_cerate.bat (82%)
create mode 100644 Genius3/bat_file/malware/ida_file_cerate_malware.bat
delete mode 100644 Genius3/main.py
delete mode 100644 Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
delete mode 100644 Genius3/raw-feature-extractor/discovRe.py
delete mode 100644 Genius3/raw-feature-extractor/read_idaFILE.py
delete mode 100644 Genius3/search-engine/db.py
delete mode 100644 ida_file_cerate_malware.bat
diff --git a/.idea/Gencoding3.iml b/.idea/Gencoding3.iml
index 7805102..f7a47fa 100644
--- a/.idea/Gencoding3.iml
+++ b/.idea/Gencoding3.iml
@@ -4,7 +4,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 7ba73c2..b20e505 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/ida_file_cerate.bat b/Genius3/bat_file/benign/ida_file_cerate.bat
similarity index 82%
rename from ida_file_cerate.bat
rename to Genius3/bat_file/benign/ida_file_cerate.bat
index f86dbde..618ba13 100644
--- a/ida_file_cerate.bat
+++ b/Genius3/bat_file/benign/ida_file_cerate.bat
@@ -2,7 +2,7 @@
setlocal EnableDelayedExpansion
-set "FOLDER_PATH=D:\bishe\dataset\train_benign"
+set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"
diff --git a/Genius3/bat_file/malware/ida_file_cerate_malware.bat b/Genius3/bat_file/malware/ida_file_cerate_malware.bat
new file mode 100644
index 0000000..c061ccf
--- /dev/null
+++ b/Genius3/bat_file/malware/ida_file_cerate_malware.bat
@@ -0,0 +1,16 @@
+@echo off
+setlocal EnableDelayedExpansion
+
+
+set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
+
+
+
+for %%f in ("%FOLDER_PATH%\*") do (
+ echo !time! %%f
+ D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
+
+)
+
+endlocal
+
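
Not part of the patch: a small post-run check, assuming every sample in
FOLDER_PATH should yield a matching .ida file in the cfg directory (paths taken
from the scripts in this series):

    import os

    src = "D:\\bishe\\dataset\\sample_20230130_458"
    cfg = "D:\\bishe\\dataset\\infected\\infected_cfg"
    missing = [f for f in os.listdir(src)
               if not os.path.exists(os.path.join(cfg, f + ".ida"))]
    print("{} of {} samples missing CFGs".format(len(missing), len(os.listdir(src))))
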
diff --git a/Genius3/main.py b/Genius3/main.py
deleted file mode 100644
index 266873d..0000000
--- a/Genius3/main.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-
-from func import *
-from raw_graphs import *
-from idc import *
-import os
-import argparse
-if __name__ == '__main__':
- print "hello"
-
- #
- # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
-    # -c delete the old database  -A auto-analysis without dialogs
-    # -B is equivalent to -c -A
-
diff --git a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py b/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
deleted file mode 100644
index b28ad47..0000000
--- a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
+++ /dev/null
@@ -1,81 +0,0 @@
-class HierarchicalGraphNeuralNetwork(nn.Module):
- def __init__(self, external_vocab: Vocab):
- super(HierarchicalGraphNeuralNetwork, self).__init__()
- self.pool = 'global_max_pool'
- # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
- cfg_filter_list =[200, 200]
- cfg_filter_list.insert(0, 11)
- self.cfg_filter_length = len(cfg_filter_list)
- cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
- i in range(self.cfg_filter_length - 1)]
- cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
- cfg_constructor = cfg_conv['constructor']
- for i in range(self.cfg_filter_length - 1):
- setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
- self.dropout = nn.Dropout(p=0.2)
- # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
- self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
- embedding_dim=cfg_filter_list[-1],
- padding_idx=external_vocab.pad_idx)
- fcg_filter_list = [200, 200]
- fcg_filter_list.insert(0, cfg_filter_list[-1])
- self.fcg_filter_length = len(fcg_filter_list)
- fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
- i in range(self.fcg_filter_length - 1)]
- fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
- fcg_constructor = fcg_conv['constructor']
- for i in range(self.fcg_filter_length - 1):
- setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
- # Last Projection Function: gradually project with more linear layers
- self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
- self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
- self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
- self.last_activation = nn.Softmax(dim=1)
-
- def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
- bt_all_function_edges: list):
- rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
- x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
- fcg_list = []
- fcg_internal_list = []
- for idx_batch in range(len(real_bt_positions) - 1):
- start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
- idx_x_cfg = x_cfg_pool[start_pos: end_pos]
- fcg_internal_list.append(idx_x_cfg)
- idx_x_external = self.external_embedding_layer(
- torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
- idx_x_external = idx_x_external.squeeze(dim=0)
- idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
- idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
- idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
- idx_graph_data.validate()
- fcg_list.append(idx_graph_data)
- fcg_batch = Batch.from_data_list(fcg_list)
- # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
- rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch) # [batch_size, max_node_size, dim]
- x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
- batch_final = x_fcg_pool
- # step last project to the number_of_classes (multiclass)
- bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
- bt_pred = self.last_activation(bt_final_embed)
- return bt_pred
-
- def forward_cfg_gnn(self, local_batch: Batch):
- in_x, edge_index = local_batch.x, local_batch.edge_index
- for i in range(self.cfg_filter_length - 1):
- out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
- out_x = torch.nn.functional.relu(out_x, inplace=True)
- out_x = self.dropout(out_x)
- in_x = out_x
- local_batch.x = in_x
- return local_batch
-
- def forward_fcg_gnn(self, function_batch: Batch):
- in_x, edge_index = function_batch.x, function_batch.edge_index
- for i in range(self.fcg_filter_length - 1):
- out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
- out_x = torch.nn.functional.relu(out_x, inplace=True)
- out_x = self.dropout(out_x)
- in_x = out_x
- function_batch.x = in_x
- return function_batch
\ No newline at end of file
diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
index 7807f52..5b291ef 100644
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -242,7 +242,5 @@ def convert_benign(overhaul):
if __name__ == '__main__':
- # convert(35, 69)
- # convert_benign(True)
convert_benign(True)
convert_malware(True)
diff --git a/Genius3/raw-feature-extractor/discovRe.py b/Genius3/raw-feature-extractor/discovRe.py
deleted file mode 100644
index 451999e..0000000
--- a/Genius3/raw-feature-extractor/discovRe.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# coding=utf-8
-#
-# Reference Lister
-#
-# List all functions and all references to them in the current section.
-#
-# Implemented with the idautils module
-#
-import networkx as nx
-import pdb
-from graph_analysis_ida import *
-from graph_property import *
-
-
-# import wingdbstub
-# wingdbstub.Ensure()
-
-def get_funcs(ea):
- funcs = {}
- # Get current ea
- # Loop from start to end in the current segment
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- func = get_func(funcea)
- blocks = FlowChart(func)
- funcs[funcname] = []
- for bl in blocks:
- start = bl.startEA
- end = bl.endEA
- funcs[funcname].append((start, end))
- return funcs
-
-
-# apparently unused function
-# def get_funcs_for_discoverRe(ea):
-# features = {}
-# for funcea in Functions(SegStart(ea)):
-# funcname = GetFunctionName(funcea)
-# print(funcname)
-# func = get_func(funcea)
-# feature = get_discoverRe_feature(func)
-# features[funcname] = feature
-# return features
-
-
-# get the 11-dimensional attribute features of every basic block:
-# calls / transfers / arithmetic / logic / compares / moves / interrupts / data declarations / total instructions / string or integer constants / number of offspring
-def get_bb_features(func):
- bb_features = []
- blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
- for bl in blocks:
- calls = calCalls(bl)
- transferIns = calTransferIns(bl)
- mathematicsIns = calArithmeticIns(bl)
- logicIns = calLogicInstructions(bl)
- cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
- movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
- interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
- declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
- totalIns = calInsts(bl)
- consts = getBBconsts(bl)
- stringOrIntConsts = len(consts[0]) + len(consts[1])
- bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
- interruptIns, declareIns, totalIns, stringOrIntConsts])
- return bb_features
-
-
-def get_discoverRe_feature(func, icfg):
- start = func.startEA
- end = func.endEA
- features = []
- FunctionCalls = getFuncCalls(func)
- # 1
- features.append(FunctionCalls)
- LogicInstr = getLogicInsts(func)
- # 2
- features.append(LogicInstr)
- Transfer = getTransferInsts(func)
- # 3
- features.append(Transfer)
- Locals = getLocalVariables(func)
- # 4
- features.append(Locals)
- BB = getBasicBlocks(func)
- # 5
- features.append(BB)
- Edges = len(icfg.edges())
- # 6
- features.append(Edges)
- Incoming = getIncommingCalls(func)
- # 7
- features.append(Incoming)
- # 8
- Instrs = getIntrs(func)
- features.append(Instrs)
- between = retrieveGP(icfg)
- # 9
- features.append(between)
-
- strings, consts = getfunc_consts(func)
- # 10
- features.append(strings)
- # 11
- features.append(consts)
- return features
-
-
-def get_func_names(ea):
- funcs = {}
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- funcs[funcname] = funcea
- return funcs
-
-
-def get_func_bases(ea):
- funcs = {}
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- funcs[funcea] = funcname
- return funcs
-
-
-def get_func_range(ea):
- funcs = {}
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- func = get_func(funcea)
- funcs[funcname] = (func.startEA, func.endEA)
- return funcs
-
-
-def get_func_sequences(ea):
- funcs_bodylist = {}
- funcs = get_funcs(ea)
- for funcname in funcs:
- if funcname not in funcs_bodylist:
- funcs_bodylist[funcname] = []
- for start, end in funcs[funcname]:
- inst_addr = start
- while inst_addr <= end:
- opcode = GetMnem(inst_addr)
- funcs_bodylist[funcname].append(opcode)
- inst_addr = NextHead(inst_addr)
- return funcs_bodylist
-
-
-def get_func_cfgs(ea):
- func_cfglist = {}
- i = 0
- start, end = get_section('LOAD')
- # print start, end
- for funcea in Functions(SegStart(ea)):
- if start <= funcea <= end:
- funcname = GetFunctionName(funcea)
- func = get_func(funcea)
- print(i)
- i += 1
- try:
- icfg = cfg.cfg_construct(func)
- func_cfglist[funcname] = icfg
- except:
- pass
-
- return func_cfglist
-
-
-def get_section(t):
- base = SegByName(t)
- start = SegByBase(base)
- end = SegEnd(start)
- return start, end
-
-
-def get_func_cfg_sequences(func_cfglist):
- func_cfg_seqlist = {}
- for funcname in func_cfglist:
- func_cfg_seqlist[funcname] = {}
- cfg = func_cfglist[funcname][0]
- for start, end in cfg:
- codesq = get_sequences(start, end)
- func_cfg_seqlist[funcname][(start, end)] = codesq
-
- return func_cfg_seqlist
-
-
-def get_sequences(start, end):
- seq = []
- inst_addr = start
- while inst_addr <= end:
- opcode = GetMnem(inst_addr)
- seq.append(opcode)
- inst_addr = NextHead(inst_addr)
- return seq
-
-
-def get_stack_arg(func_addr):
- print(func_addr)
- args = []
- stack = GetFrame(func_addr)
- if not stack:
- return []
- firstM = GetFirstMember(stack)
- lastM = GetLastMember(stack)
- i = firstM
- while i <= lastM:
- mName = GetMemberName(stack, i)
- mSize = GetMemberSize(stack, i)
- if mSize:
- i = i + mSize
- else:
- i = i + 4
- if mName not in args and mName and ' s' not in mName and ' r' not in mName:
- args.append(mName)
- return args
-
- # pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
-
-
-def processDataSegs():
- funcdata = {}
- datafunc = {}
- for n in xrange(idaapi.get_segm_qty()):
- seg = idaapi.getnseg(n)
- ea = seg.startEA
- segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
- if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
- start = idc.SegStart(ea)
- end = idc.SegEnd(ea)
- cur = start
- while cur <= end:
- refs = [v for v in DataRefsTo(cur)]
- for fea in refs:
- name = GetFunctionName(fea)
- if len(name) == 0:
- continue
- if name not in funcdata:
- funcdata[name] = [cur]
- else:
- funcdata[name].append(cur)
- if cur not in datafunc:
- datafunc[cur] = [name]
- else:
- datafunc[cur].append(name)
- cur = NextHead(cur)
- return funcdata, datafunc
-
-
-def obtainDataRefs(callgraph):
- datarefs = {}
- funcdata, datafunc = processDataSegs()
- for node in callgraph:
- if node in funcdata:
- datas = funcdata[node]
- for dd in datas:
- refs = datafunc[dd]
- refs = list(set(refs))
- if node in datarefs:
- print(refs)
- datarefs[node] += refs
- datarefs[node] = list(set(datarefs[node]))
- else:
- datarefs[node] = refs
- return datarefs
diff --git a/Genius3/raw-feature-extractor/func.py b/Genius3/raw-feature-extractor/func.py
index 33020aa..61f207a 100644
--- a/Genius3/raw-feature-extractor/func.py
+++ b/Genius3/raw-feature-extractor/func.py
@@ -16,9 +16,7 @@ from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *
-sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
-#import wingdbstub
-#wingdbstub.Ensure()
+
diff --git a/Genius3/raw-feature-extractor/graph_analysis_ida.py b/Genius3/raw-feature-extractor/graph_analysis_ida.py
index 390f8f1..66194b3 100644
--- a/Genius3/raw-feature-extractor/graph_analysis_ida.py
+++ b/Genius3/raw-feature-extractor/graph_analysis_ida.py
@@ -119,24 +119,23 @@ def getIncommingCalls(func):
def get_stackVariables(func_addr):
- #print func_addr
- args = []
- stack = GetFrame(func_addr)
- if not stack:
- return 0
- firstM = GetFirstMember(stack)
- lastM = GetLastMember(stack)
- i = firstM
- while i <=lastM:
- mName = GetMemberName(stack,i)
- mSize = GetMemberSize(stack,i)
- if mSize:
- i = i + mSize
- else:
- i = i+4
- if mName not in args and mName and 'var_' in mName:
- args.append(mName)
- return len(args)
+ args = []
+ stack = GetFrame(func_addr)
+ if not stack:
+ return 0
+ firstM = GetFirstMember(stack)
+ lastM = GetLastMember(stack)
+ i = firstM
+ while i <= lastM:
+ mName = GetMemberName(stack, i)
+ mSize = GetMemberSize(stack, i)
+ if mSize:
+ i = i + mSize
+ else:
+ i = i + 4
+ if mName not in args and mName and 'var_' in mName:
+ args.append(mName)
+ return len(args)
# count arithmetic instructions
diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py
index 4744c07..4a968da 100644
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@@ -1,7 +1,7 @@
-# coding=utf-8
import os
import pickle
-import idc
+from func import *
+from idc import *
+import idc  # keep the module name available for the qualified idc.* calls below
import idaapi
# constants
@@ -12,6 +12,7 @@ CFG_EXTENSION = ".ida"
GDL_EXTENSION = ".dot"
ASM_EXTENSION = ".asm"
+
def preprocess(binary_name, workflow):
     # workflow == "-1" selects the benign dataset, anything else the malware dataset.
     if workflow != "-1":
@@ -29,9 +30,9 @@ def preprocess(binary_name, workflow):
if os.path.exists(cfg_path):
idc.Exit(0)
else:
- analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
- analysis_flags &= ~idc.AF_IMMOFF
- idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
+ analysis_flags = idc.GetShortPrm(idc.INF_AF2)
+        analysis_flags &= ~idc.AF_IMMOFF  # ida_ida does not exist in IDA 6.8's bundled Python; AF_IMMOFF comes from idc
+ idc.SetShortPrm(idc.INF_AF2, analysis_flags)
idaapi.autoWait()
@@ -47,17 +48,21 @@ def preprocess(binary_name, workflow):
    # close IDA Pro
idc.Exit(0)
+
def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg())
with open(cfg_path, 'wb') as cfg_file:
pickle.dump(cfgs, cfg_file)
+
def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
+
def generate_asm(asm_path):
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
+
# entry point
def main():
binary_name = idc.GetInputFile()
@@ -68,6 +73,7 @@ def main():
return
preprocess(binary_name, workflow)
+
# when run as an IDA Pro script, invoke main
if __name__ == "__main__":
main()
diff --git a/Genius3/raw-feature-extractor/read_idaFILE.py b/Genius3/raw-feature-extractor/read_idaFILE.py
deleted file mode 100644
index aae5416..0000000
--- a/Genius3/raw-feature-extractor/read_idaFILE.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-from matplotlib import pyplot as plt
-import networkx as nx
-import pickle
-# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
-# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
-
-
-def print_obj(obj):
- # "打印对象的所有属性"
- print(obj.__dict__)
-
-
-# sub_10F20 308: the decompiled code contains strings, but the extracted features have no string constants, probably because they are referenced indirectly and not recognized. Checking all functions' features, almost none contain string constants; they are likely stored elsewhere and referenced.
-# sub_166C4 393
-if __name__ == '__main__':
- testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
- fr = open(testpath, 'r')
-    data = pickle.load(fr)  # the ACFGs of one binary
- fr.close()
-
- # print(type(data1))
- # print_obj(data1)
- # print data1.raw_graph_list[393]
- # print_obj(data1.raw_graph_list[393])
- # nx.draw(data1.raw_graph_list[393].g,with_labels=True)
- # plt.show()
-
- print("一个二进制文件的所有函数的原始特征,list。")
- print_obj(data) # acfg list
- print("\n")
-
- print("一个函数的原始特征,由old_g(discovRe方法的ACFG),g(Genius方法的ACFG),fun_feature(表示函数级别的特征的向量)三部分构成")
- print_obj(data.raw_graph_list[0]) # 一个函数的acfg
- print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
- # feature = data.raw_graph_list[0].fun_features
- print("old_g:{}".format(data.raw_graph_list[0].old_g))
- print("g:{}".format(data.raw_graph_list[0].g))
-
-
- # G = data1.raw_graph_list[393].old_g
-    # print G.node[0]  # G.node[i] is a dict
- # for key, value in G.node[0].items():
- # print('{key}:{value}'.format(key=key, value=value))
-
-    # basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count
-    G = data.raw_graph_list[0].g
-    print("# basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count")
- # print(G.node[0])
- # print("\n")
-    # features of all basic blocks in the function
- for key, value in G.node.items():
- print('{}:{}'.format(key, value))
-
-
-
-    # old_g is the CFG read from IDA, so node counts, edge directions etc. are identical; g is generated from old_g and matches as well
- #old g
- G = data.raw_graph_list[0].old_g
- nx.draw(G, with_labels=True)
- #plt.title('old_g')
- plt.show()
-
-
- # g
- G = data.raw_graph_list[0].g
- nx.draw(G, with_labels=True)
- #plt.title('Genius_g')
- plt.show()
-
- # draw graph with labels
- pos = nx.spring_layout(G)
- nx.draw(G, pos)
-    node_labels = nx.get_node_attributes(G, 'v')  # networkx nodes carry attributes; g uses 'v', the vector of raw features. For old_g's attributes see cfg_constructor.py
- nx.draw_networkx_labels(G, pos, labels=node_labels)
- #plt.title('Genius_g with raw feature vector')
- plt.show()
-
-
-# 1 function calls: number of function-call instructions (call, jal, jalr) in this function; note that ARM has none of these
-
-# 2 logic instructions: number of logic instructions in this function, e.g. and, or
-
-# 3 TransferIns: number of transfer instructions (e.g. jmp; mov on ARM)
-
-# 4 LocalVariables: number of local variables
-
-# 5 BB: number of basic blocks
-
-# 6 Edges: number of ICFG edges; the ICFG is a feature from the discovRe paper and is ignored here for now
-
-# 7 IncommingCalls: number of instructions that call this function
-
-# 8 Intrs: instruction count
-
-# 9 between: betweenness from the structural features
-
-# 10 strings: string constants
-
-# 11 consts: numeric constants
\ No newline at end of file
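
The deleted reader above remains the reference for the pickled .ida layout; a
minimal replacement sketch (assuming the repo's raw_graphs classes are
importable, since pickle needs them to unpickle, and a hypothetical path):

    import pickle

    with open("D:\\bishe\\dataset\\infected\\infected_cfg\\sample.ida", "rb") as fr:
        data = pickle.load(fr)

    for acfg in data.raw_graph_list:
        print(acfg.funcname, len(acfg.g))  # one ACFG per internal function
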
diff --git a/Genius3/search-engine/db.py b/Genius3/search-engine/db.py
deleted file mode 100644
index bc6c864..0000000
--- a/Genius3/search-engine/db.py
+++ /dev/null
@@ -1,356 +0,0 @@
-import cPickle as pickle
-from search import *
-from nearpy import Engine
-from nearpy.hashes import RandomDiscretizedProjections
-from nearpy.filters import NearestFilter, UniqueFilter
-from nearpy.distances import EuclideanDistance
-from nearpy.distances import CosineDistance
-from nearpy.hashes import RandomBinaryProjections
-from nearpy.experiments import DistanceRatioExperiment
-from redis import Redis
-from nearpy.storage import RedisStorage
-from feature import *
-import numpy as np
-import os
-import pdb
-import argparse
-import time
-import numpy as np
-from refactoring import *
-import pymongo
-from pymongo import MongoClient
-
-def initDB():
- client = MongoClient()
- client = MongoClient('localhost', 27017)
- client = MongoClient('mongodb://localhost:27017/')
- db = client.test_database
- db = client['iot-encoding']
- return db
-
-db = initDB()
-posts = db.posts
-
-class db:
-
- def __init__(self):
- self.feature_list = {}
- self.engine = None
-
- def loadHashmap(self, feature_size, result_n):
- # Create redis storage adapter
- redis_object = Redis(host='localhost', port=6379, db=0)
- redis_storage = RedisStorage(redis_object)
- pdb.set_trace()
- try:
- # Get hash config from redis
- config = redis_storage.load_hash_configuration('test')
- # Config is existing, create hash with None parameters
- lshash = RandomBinaryProjections(None, None)
- # Apply configuration loaded from redis
- lshash.apply_config(config)
-
- except:
- # Config is not existing, create hash from scratch, with 10 projections
- lshash = RandomBinaryProjections('test', 0)
-
-
- # Create engine for feature space of 100 dimensions and use our hash.
- # This will set the dimension of the lshash only the first time, not when
- # using the configuration loaded from redis. Use redis storage to store
- # buckets.
- nearest = NearestFilter(1000)
- #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
- pdb.set_trace()
- self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
-
- # Do some stuff like indexing or querying with the engine...
-
- # Finally store hash configuration in redis for later use
- redis_storage.store_hash_configuration(lshash)
-
- def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
- if fvector is None:
- return
- #ftuple = tuple([fvector])
- self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
-
- def batch_appendDB(self, binary_name, features, firmware_name=""):
- for funcname in features:
- feature = features[funcname]
- #pdb.set_trace()
- self.appendToDB(binary_name, funcname, feature, firmware_name)
-
- def batch_appendDBbyDir(self, base_dir):
- cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
- i = 0
- for v in cursor:
- print i
- i+=1
- binary_name = v['binary_name']
- funcname = v['func_name']
- firmware_name = v['firmware_name']
- feature = v['fvector']
- self.appendToDB(binary_name, funcname, feature, firmware_name)
-
- def batch_appendDBbyDir1(self, base_dir):
- image_dir = os.path.join(base_dir, "image")
- firmware_featrues={}
- bnum = 0
- fnum = 0
- i = 0
- pdb.set_trace()
- for firmware_name in os.listdir(image_dir):
- print firmware_name
- firmware_featrues[firmware_name] = {}
- firmware_dir = os.path.join(image_dir, firmware_name)
- for binary_name in os.listdir(firmware_dir):
- if binary_name.endswith(".features"):
- bnum += 1
- featrues_dir = os.path.join(firmware_dir, binary_name)
- featrues = pickle.load(open(featrues_dir, "r"))
- for funcname in featrues:
- fnum +=1
- #pdb.set_trace()
- feature = featrues[funcname]
- self.appendToDB(binary_name, funcname, feature, firmware_name)
- del featrues
- print("bnum ", bnum)
- print("fnum ", fnum)
-
- def dump(self, base_dir):
- db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
- pickle.dump(self.feature_list, open(db_dir, 'w'))
- db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
- pickle.dump(self.engine, open(db_dir, 'w'))
-
- def loadDB(self, base_dir):
- db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
- self.feature_list = pickle.load(open(db_dir, 'r'))
- db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
- self.engine = pickle.load(open(db_dir, 'r'))
-
- def findF(self, binary_name, funcname):
- x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
- return x[0]
-
-def retrieveFeaturesByDir(n, base_dir):
- firmware_featrues={}
- i = 0
- for firmware_name in os.listdir(base_dir):
- if firmware_name.endWith(".features"):
- firmware_featrues[firmware_name] = {}
- firmware_dir = os.path.join(base_dir, firmware_name)
- if i > 0:
- break
- i += 1
- pdb.set_trace()
- for binary_name in os.listdir(firmware_dir):
- featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- for funcname in featrues:
- feature = featrues[funcname]
- self.appendToDB(firmware_name, binary_name, funcname, feature)
- del featrues
-
-def retrieveFeatures(n, base_dir, filename, funcs):
- feature_dic = {}
- featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- #featuresx = retrieveFeaturesx(filename)
- for name in featrues:
- #if name in funcs:
- x = featrues[name]
- #+ featuresx[name]
- feature_dic[name] = np.asarray(x)
- return feature_dic
-
-def retrieveVuldb(base_input_dir):
- vul_path = os.path.join(base_input_dir, "vul")
- vul_db = pickle.load(open(vul_path, "r"))
- return vul_db
-
-
-def retrieveFeaturesx(filename):
- ida_input_dir = os.path.join("./data/", filename + ".features")
- featuresx = pickle.load(open(ida_input_dir, "r"))
- return featuresx
-
-def retrieveQueries(n, base_dir, filename1, featrues_src):
- queries = {}
- featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- #featuresx = retrieveFeaturesx(filename1)
- for name in featrues:
- #if name in featrues_src:
- x = featrues[name]
- #+ featuresx[name]
- queries[name] = np.asarray(x)
- return queries
-
-def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
- queries = {}
- featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- for name in featrues:
- #del featrues[name][5]
- queries[name] = np.asarray(featrues[name])
- return queries
-
-def retrieveQuery(n, base_dir, filename, funcname):
- featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- f = [featrues[v] for v in featrues if funcname in v ][0]
- return np.asarray(f)
-
-def parse_command():
- parser = argparse.ArgumentParser(description='Process some integers.')
- parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
- parser.add_argument('--output_dir', type=str, help="output dir")
- parser.add_argument("--filename1", type=str, help="the size of each graphlet")
- parser.add_argument("--filename2", type=str, help="the size of each graphlet")
- parser.add_argument("--size", type=int, help="the size of each graphlet")
- #parser.add_argument("--size", type=int, help="the size of each graphlet")
- args = parser.parse_args()
- return args
-
-def loadFuncs(path):
- funcs = {}
- x86_dir = os.path.join(path, "func_candid")
- #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
- fp = open(x86_dir,"r")
- for line in fp:
- items = line.split("\n")
- funcname = items[0]
- funcs[funcname] = 1
- return funcs
-
-def dump(path, featrues, queries):
- fp = open(path + "/" + "matrix", 'w')
- for name in featrues:
- row = []
- row.append("x86")
- row.append(name)
- row += featrues[name]
- fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
- for name in queries:
- row = []
- row.append("mips")
- row.append(name)
- row += queries[name]
- fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
- fp.close()
-
-
-def queryBytwo(base_input_dir, filename1, filename2, n):
- threthold = 50
- db_instance = db()
- funcs = loadFuncs(base_input_dir)
- db_instance.loadHashmap(n, 50000)
- #pdb.set_trace()
- featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
- queries = retrieveQueries(n, base_input_dir, filename2, funcs)
- #queries = refactoring(queries, featrues)
- vul_db = retrieveVuldb(base_input_dir)
- pdb.set_trace()
- #dump(base_input_dir, featrues, queries)
- #start = time.time()
- #db_instance.batch_appendDBbyDir(base_input_dir)
- #end = time.time()
- #total = end - start
- #print total
- db_instance.batch_appendDB(filename1, featrues)
- pdb.set_trace()
- ranks = []
- times = []
- for threthold in xrange(1, 210, 10):
- hit = []
- i = 0
- for name in queries:
- #print i
- i += 1
- '''
- if i == 1000:
- print (sum(times)/len(times))
- pdb.set_trace()
- print "s"
- '''
- #if name not in vul_db['openssl']:
- # continue
- if name not in featrues:
- continue
- #pdb.set_trace()
- query = queries[name]
- #start = time.time()
- x = db_instance.engine.neighbours(query)
- #end = time.time()
- #total = end - start
- #times.append(total)
- #print total
- #pdb.set_trace()
- try:
- rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
- ranks.append((name, rank))
- if rank <= threthold:
- hit.append(1)
- else:
- hit.append(0)
- except:
- #pdb.set_trace()
- hit.append(0)
- pass
- #pdb.set_trace()
- acc = sum(hit) * 1.0 / len(hit)
- print acc
-
-def queryAll(base_dir, firmware_name, filename1, n):
- threthold = 155
- db_instance = db()
- db_instance.loadHashmap(n, 50000)
- queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
- start = time.time()
- pdb.set_trace()
- db_instance.batch_appendDBbyDir(n, base_dir)
- end = time.time()
- dur = end - start
- print dur
- pdb.set_trace()
- hit = []
- i = 0
- times = []
- for name in queries:
- print i
- i += 1
- query = queries[name]
- start = time.clock()
- x = db_instance.engine.neighbours(query)
- end = time.clock()
- dur = end - start
- times.append(dur)
- #pdb.set_trace()
- try:
- rank = [v for v in xrange(len(x)) if name in x[v][1]]
- if len(rank) > 1:
- pdb.set_trace()
- print "stop"
- if rank[0] <= threthold:
- hit.append(1)
- else:
- hit.append(0)
- except:
- hit.append(0)
-
- acc = sum(hit) * 1.0 / len(hit)
- mean = np.mean(times)
- std = np.std(times)
- #pdb.set_trace()
- print acc
-
-if __name__ == "__main__":
- args = parse_command()
- base_dir = args.base_input_dir
- filename1 = args.filename1
- filename2 = args.filename2
- n = args.size
- pdb.set_trace()
- queryBytwo(base_dir, filename1, filename2, n)
diff --git a/ida_file_cerate_malware.bat b/ida_file_cerate_malware.bat
deleted file mode 100644
index cd555ed..0000000
--- a/ida_file_cerate_malware.bat
+++ /dev/null
@@ -1,16 +0,0 @@
-@echo off
-setlocal EnableDelayedExpansion
-
-
-set "FOLDER_PATH=D:\bishe\dataset\train_malware"
-
-
-
-for %%f in ("%FOLDER_PATH%\*") do (
- echo !time! %%f
- D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
-
-)
-
-endlocal
-