backup
This commit is contained in:
parent
ddf9ff3b59
commit
d599236e94
@ -2,7 +2,14 @@
|
|||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
|
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
|
||||||
<serverData>
|
<serverData>
|
||||||
<paths name="root@region-41.seetacloud.com:29208">
|
<paths name="304">
|
||||||
|
<serverdata>
|
||||||
|
<mappings>
|
||||||
|
<mapping local="$PROJECT_DIR$" web="/" />
|
||||||
|
</mappings>
|
||||||
|
</serverdata>
|
||||||
|
</paths>
|
||||||
|
<paths name="root@region-42.seetacloud.com:58034 password">
|
||||||
<serverdata>
|
<serverdata>
|
||||||
<mappings>
|
<mappings>
|
||||||
<mapping local="$PROJECT_DIR$" web="/" />
|
<mapping local="$PROJECT_DIR$" web="/" />
|
||||||
|
81
Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
Normal file
81
Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
class HierarchicalGraphNeuralNetwork(nn.Module):
    """Two-level GraphSAGE classifier over control-flow and function-call graphs.

    Level 1 runs a stack of ``SAGEConv`` layers over the control flow graphs
    (CFGs) and max-pools every CFG into a single vector.  Level 2 uses those
    vectors, concatenated with learned embeddings of external function names,
    as node features of one function call graph (FCG) per sample, runs a
    second ``SAGEConv`` stack, max-pools every FCG and projects the result
    through three linear layers to 6 softmax outputs.

    The convolution layers are registered under the attribute names
    ``CFG_gnn_<i>`` and ``FCG_gnn_<i>`` (1-based).
    """

    def __init__(self, external_vocab: Vocab):
        """Create all layers.

        Args:
            external_vocab: vocabulary of external function names; its
                ``max_vocab_size`` and ``pad_idx`` attributes size the
                embedding table and select the padding row.
        """
        super(HierarchicalGraphNeuralNetwork, self).__init__()
        self.pool = 'global_max_pool'

        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling.
        # Layer widths: 11 input channels, followed by two 200-wide layers.
        cfg_filter_list = [11, 200, 200]
        self.cfg_filter_length = len(cfg_filter_list)
        self._register_sage_stack('CFG_gnn_{}', cfg_filter_list)
        self.dropout = nn.Dropout(p=0.2)

        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling.
        # External functions have no CFG, so they get a learned embedding of
        # the same width as a pooled CFG vector.
        self.external_embedding_layer = nn.Embedding(
            num_embeddings=external_vocab.max_vocab_size + 2,
            embedding_dim=cfg_filter_list[-1],
            padding_idx=external_vocab.pad_idx)
        fcg_filter_list = [cfg_filter_list[-1], 200, 200]
        self.fcg_filter_length = len(fcg_filter_list)
        self._register_sage_stack('FCG_gnn_{}', fcg_filter_list)

        # Last projection function: gradually project with more linear layers.
        hidden = fcg_filter_list[-1]
        self.pj1 = torch.nn.Linear(in_features=hidden, out_features=int(hidden / 2))
        self.pj2 = torch.nn.Linear(in_features=int(hidden / 2), out_features=int(hidden / 4))
        self.pj3 = torch.nn.Linear(in_features=int(hidden / 4), out_features=6)
        self.last_activation = nn.Softmax(dim=1)

    def _register_sage_stack(self, name_template: str, widths: list):
        """Register one ``SAGEConv`` per consecutive pair in ``widths``.

        Layers are attached with ``setattr`` under ``name_template.format(i)``
        for ``i = 1, 2, ...`` so their parameter names stay stable.
        """
        for position, (in_width, out_width) in enumerate(zip(widths, widths[1:]), start=1):
            layer = torch_geometric.nn.conv.SAGEConv(in_channels=in_width, out_channels=out_width, bias=True)
            setattr(self, name_template.format(position), layer)

    def _run_sage_stack(self, name_template: str, filter_length: int, graph_batch: Batch):
        """Apply SAGEConv -> ReLU -> dropout for every layer of one stack.

        The resulting node features are written back into ``graph_batch.x``
        and the same batch object is returned.
        """
        features, edge_index = graph_batch.x, graph_batch.edge_index
        for position in range(1, filter_length):
            layer = getattr(self, name_template.format(position))
            features = layer(x=features, edge_index=edge_index)
            features = torch.nn.functional.relu(features, inplace=True)
            features = self.dropout(features)
        graph_batch.x = features
        return graph_batch

    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
                bt_all_function_edges: list):
        """Classify a batch of samples.

        Args:
            real_local_batch: batch holding all CFGs of all samples.
            real_bt_positions: boundaries into the pooled CFG rows; entries
                ``i`` and ``i + 1`` delimit the rows that belong to sample ``i``.
            bt_external_names: per sample, the indices looked up in the
                external-name embedding table.
            bt_all_function_edges: per sample, the FCG edges in a form
                accepted as ``edge_index`` once converted to a long tensor.

        Returns:
            Softmax output of the final projection, one row per sample.
        """
        # Hierarchical 1: embed every CFG and pool it into one vector.
        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
        x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)

        # Index tensors must live on the same device as the pooled features,
        # otherwise the embedding lookup and the convolutions fail when the
        # model runs on a GPU.
        device = x_cfg_pool.device

        fcg_list = []
        for idx_batch in range(len(real_bt_positions) - 1):
            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
            idx_x_cfg = x_cfg_pool[start_pos: end_pos]

            external_ids = torch.tensor([bt_external_names[idx_batch]], dtype=torch.long, device=device)
            idx_x_external = self.external_embedding_layer(external_ids).squeeze(dim=0)

            # FCG node features: pooled CFG vectors first, external embeddings after.
            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long, device=device)

            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
            idx_graph_data.validate()
            fcg_list.append(idx_graph_data)
        fcg_batch = Batch.from_data_list(fcg_list)

        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling.
        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)
        x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)

        # Last step: project to the number of classes (multiclass).
        bt_final_embed = self.pj3(self.pj2(self.pj1(x_fcg_pool)))
        return self.last_activation(bt_final_embed)

    def forward_cfg_gnn(self, local_batch: Batch):
        """Run the CFG convolution stack; ``local_batch.x`` is replaced in place."""
        return self._run_sage_stack('CFG_gnn_{}', self.cfg_filter_length, local_batch)

    def forward_fcg_gnn(self, function_batch: Batch):
        """Run the FCG convolution stack; ``function_batch.x`` is replaced in place."""
        return self._run_sage_stack('FCG_gnn_{}', self.fcg_filter_length, function_batch)
|
@ -6,7 +6,7 @@ import os
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def convert(start, end):
|
def convert(start, end, overhaul):
|
||||||
for workflow in range(start, end):
|
for workflow in range(start, end):
|
||||||
# workflow = 0
|
# workflow = 0
|
||||||
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
||||||
@ -16,6 +16,12 @@ def convert(start, end):
|
|||||||
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
|
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
|
||||||
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
|
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
|
||||||
|
|
||||||
|
if overhaul:
|
||||||
|
if os.path.exists(log_path):
|
||||||
|
os.remove(log_path)
|
||||||
|
if os.path.exists(process_log_path):
|
||||||
|
os.remove(process_log_path)
|
||||||
|
|
||||||
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
|
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
|
||||||
logged = log.readline()
|
logged = log.readline()
|
||||||
if logged == '':
|
if logged == '':
|
||||||
@ -112,5 +118,116 @@ def convert(start, end):
|
|||||||
process_log.write("index {}, {} process done.\n".format(index, cfg))
|
process_log.write("index {}, {} process done.\n".format(index, cfg))
|
||||||
|
|
||||||
|
|
||||||
|
def convert_benign(overhaul):
    """Convert the pickled benign CFG dumps into one JSON file per sample.

    For every entry of the CFG directory the pickle is loaded, the matching
    ``.dot`` call graph is parsed, and ``<name>.jsonl`` is written with the
    call-graph edges, the function names and one ACFG record per retained
    function.  The index of the last converted entry is stored in a log file
    so that an interrupted run resumes from there.

    :param overhaul: when truthy, both log files are deleted first so the
        conversion starts again from index 0.
    """
    cfg_dir = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
    output_dir = "D:\\hkn\\infected\\datasets\\benign_json\\new"
    dot_dir = "D:\\hkn\\infected\\datasets\\benign_dot\\new"

    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
    # NOTE(review): the "{}" is never formatted here, so the file name keeps a
    # literal "{}" - looks like a leftover placeholder; confirm before renaming.
    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"

    if overhaul:
        for stale_log in (log_path, process_log_path):
            if os.path.exists(stale_log):
                os.remove(stale_log)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        logged = log.readline()
        log_index = 0 if logged == '' else int(logged)

        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
            if index < log_index:
                continue

            name = cfg[:-4]  # file name without its 4-character extension

            # NOTE(review): pickle executes code on load; only feed it dumps
            # produced by the project's own IDA preprocessing step.
            with open(os.path.join(cfg_dir, name + '.ida'), 'r') as cfg_file:
                try:
                    data = pk.load(cfg_file)
                except EOFError:
                    process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
                    continue
                except ValueError:
                    process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
                    continue

            dot_file_path = os.path.join(dot_dir, name + '.dot')
            if not os.path.exists(dot_file_path):
                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
                continue

            # Read the function call graph from the dot file.
            # 2023.8.12 bug fix: the IDA-generated .dot file lists every
            # function, whereas data.raw_graph_list only holds internal ones,
            # so names and counts are taken from the dot file.
            raw_function_edges = []
            functions_list = []
            with open(dot_file_path, 'r') as dot:
                for line in dot:
                    if '->' in line:
                        raw_function_edges.append(re.findall(r'\b\d+\b', line))
                    elif 'label' in line:
                        functions_list.append(line[line.find('= "') + 3:line.find('",')])

            # No call edge was parsed; this is not expected, so the sample is
            # dropped to be safe.
            if not raw_function_edges:
                continue

            # JSON object describing the current PE file.
            json_obj = {
                'hash': data.binary_name[11:],
                'function_number': len(functions_list),
                'function_edges': [[int(d[0]) for d in raw_function_edges],
                                   [int(d[1]) for d in raw_function_edges]],
                'acfg_list': [],
                'function_names': functions_list
            }

            # Each entry of raw_graph_list is the ACFG of one function.
            for acfg in data.raw_graph_list:
                # Only 'start', 'start_0' and names containing 'sub_' get a
                # CFG; everything else is treated as an external function.
                if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
                    continue

                # Index 2 is where the Genius feature extraction stores the
                # number of offspring of a node.
                offspring = [d.get('v')[2] for d in acfg.g.node.values()]

                # The two lengths should agree but sometimes do not; the graph
                # is authoritative, so surplus feature rows are cut off.
                node_count = len(acfg.g)
                if len(acfg.bb_features) > node_count:
                    del acfg.bb_features[node_count:]

                # Append the offspring count to each block's feature vector.
                for i, offs in enumerate(offspring):
                    acfg.bb_features[i].append(offs)

                json_obj['acfg_list'].append({
                    'block_number': node_count,
                    'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
                    'block_features': acfg.bb_features
                })

            # Write the result to a local json file.
            result = json.dumps(json_obj, ensure_ascii=False)
            with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
                out.write(result)

            # Persist the resume point: the log holds only the latest index.
            log.truncate(0)
            log.seek(0)
            log.write(str(index))
            log.flush()
            process_log.write("index {}, {} process done.\n".format(index, cfg))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
convert(35, 69)
|
# convert(35, 69)
|
||||||
|
convert_benign(True)
|
||||||
|
@ -19,10 +19,75 @@ def call_preprocess(cmd_line):
|
|||||||
subprocess.call(cmd_line, shell=True)
|
subprocess.call(cmd_line, shell=True)
|
||||||
|
|
||||||
|
|
||||||
def batch_mode(start, end):
|
# 良性软件分析模式,ida的命令中将workflow改为-1
|
||||||
|
def benign_batch_mode(overhaul):
    """Run the IDA preprocessing script on every benign PE file, one by one.

    Each file is handed to ``idaq64`` in a child process (the script receives
    ``-1`` as its workflow argument).  A run that is still alive after
    ``TIMEOUT`` seconds is killed and counted as failed.  The index of the
    last successfully processed file is stored in a log file so that an
    interrupted batch resumes from there.

    :param overhaul: when truthy, both log files are deleted first so the
        batch starts again from index 0.
    """
    # Total number of samples that failed.
    total_failed = 0

    log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
    process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
    benign_pe_dir = 'D:\\hkn\\infected\\datasets\\benign\\new'

    if overhaul:
        for stale_log in (log_path, process_log_path):
            if os.path.exists(stale_log):
                os.remove(stale_log)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        logged = log.readline()
        log_index = 0 if logged == '' else int(logged)

        for index, pe in enumerate(tqdm(sorted(os.listdir(benign_pe_dir)))):
            if index < log_index:
                continue

            # The input path is quoted so that a file name containing spaces
            # reaches IDA as a single argument.
            cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout "{}"'.format(
                os.path.join(benign_pe_dir, pe))

            p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
            p.start()
            # Wait at most TIMEOUT seconds; returns early once IDA is done.
            p.join(TIMEOUT)

            if p.is_alive():
                # IDA is stuck: kill it and record the failure.  The resume
                # index is deliberately left untouched in this case.
                subprocess.call('taskkill /im idaq64.exe /f')
                process_log.write(
                    "index {}, {} stuck, process terminated.\n".format(index, pe))
                total_failed += 1
            else:
                # Finished normally: the log holds only the latest index.
                log.truncate(0)
                log.seek(0)
                log.write(str(index))
                log.flush()
                process_log.write("index {}, {} process done.\n".format(index, pe))

    # Remove all by-products.
    delete_output()

    print('总失败数{}'.format(total_failed))
|
||||||
|
|
||||||
|
|
||||||
|
def mal_batch_mode(start, end):
|
||||||
# 只选其中这些类的pe进行分析,其他的就直接跳过
|
# 只选其中这些类的pe进行分析,其他的就直接跳过
|
||||||
families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
|
families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
|
||||||
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
|
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
|
||||||
|
# 记录ida处理报错的数据来自哪些家族
|
||||||
|
failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
|
||||||
|
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
|
||||||
|
# 总失败数据数量
|
||||||
|
total_failed = 0
|
||||||
|
|
||||||
for workflow in range(start, end):
|
for workflow in range(start, end):
|
||||||
# pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
|
# pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
|
||||||
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
|
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
|
||||||
@ -73,6 +138,9 @@ def batch_mode(start, end):
|
|||||||
subprocess.call('taskkill /im idaq64.exe /f')
|
subprocess.call('taskkill /im idaq64.exe /f')
|
||||||
process_log.write(
|
process_log.write(
|
||||||
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
|
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
|
||||||
|
|
||||||
|
failed_family[pe_family] += 1
|
||||||
|
total_failed += 1
|
||||||
else:
|
else:
|
||||||
# 正常运行结束
|
# 正常运行结束
|
||||||
log.truncate(0)
|
log.truncate(0)
|
||||||
@ -85,6 +153,10 @@ def batch_mode(start, end):
|
|||||||
# 一次workflow结束后将所有副产物删除
|
# 一次workflow结束后将所有副产物删除
|
||||||
delete_output()
|
delete_output()
|
||||||
|
|
||||||
|
print(families_need_to_analyze)
|
||||||
|
print('\n')
|
||||||
|
print(failed_family, '总失败数{}'.format(total_failed))
|
||||||
|
|
||||||
|
|
||||||
def delete_output():
|
def delete_output():
|
||||||
out_dir = 'F:\\iout'
|
out_dir = 'F:\\iout'
|
||||||
@ -96,4 +168,5 @@ def delete_output():
|
|||||||
# 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库
|
# 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库
|
||||||
# F:\\kkk\\IDA_6.6
|
# F:\\kkk\\IDA_6.6
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
batch_mode(36, 69)
|
benign_batch_mode(True)
|
||||||
|
# mal_batch_mode(35, 69)
|
||||||
|
@ -1,11 +1,8 @@
|
|||||||
# -*- coding: UTF-8 -*-
|
# -*- coding: UTF-8 -*-
|
||||||
import pickle
|
import pickle
|
||||||
from func import *
|
from func import *
|
||||||
from raw_graphs import *
|
|
||||||
from idc import *
|
from idc import *
|
||||||
import idautils
|
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess():
|
def preprocess():
|
||||||
@ -18,7 +15,11 @@ def preprocess():
|
|||||||
binary_name = idc.GetInputFile()
|
binary_name = idc.GetInputFile()
|
||||||
|
|
||||||
workflow = idc.ARGV[1]
|
workflow = idc.ARGV[1]
|
||||||
# workflow = 0
|
# workflow为特定值时分析良性软件,否则分析恶意软件
|
||||||
|
if workflow == '-1':
|
||||||
|
cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
|
||||||
|
gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name)
|
||||||
|
else:
|
||||||
cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
||||||
gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
|
gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
|
||||||
|
|
||||||
|
@ -46,7 +46,6 @@ def create(parent_dir, folder):
|
|||||||
os.mkdir(os.path.join(parent_dir, folder))
|
os.mkdir(os.path.join(parent_dir, folder))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def change_max_item_lines():
|
def change_max_item_lines():
|
||||||
f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
|
f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
|
||||||
s = f.read()
|
s = f.read()
|
||||||
@ -89,6 +88,7 @@ def delete_error():
|
|||||||
|
|
||||||
|
|
||||||
def check_json():
|
def check_json():
|
||||||
|
print('start checking json')
|
||||||
for workflow in tqdm(range(0, 69)):
|
for workflow in tqdm(range(0, 69)):
|
||||||
json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
|
json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
|
||||||
for json_file in os.listdir(json_dir):
|
for json_file in os.listdir(json_dir):
|
||||||
@ -99,9 +99,13 @@ def check_json():
|
|||||||
continue
|
continue
|
||||||
finally:
|
finally:
|
||||||
f.close()
|
f.close()
|
||||||
for acfg in data['acfg_list']:
|
|
||||||
if acfg['block_number'] != len(acfg['block_features']):
|
if len(data['function_edges'][0]) == 0:
|
||||||
print("{} {}\n".format(workflow, json_file))
|
print("{} {} function_edges null\n".format(workflow, json_file))
|
||||||
|
# continue
|
||||||
|
# for acfg in data['acfg_list']:
|
||||||
|
# if acfg['block_number'] != len(acfg['block_features']):
|
||||||
|
# print("{} {}\n".format(workflow, json_file))
|
||||||
|
|
||||||
|
|
||||||
# 临时函数,删除所有jsonl文件
|
# 临时函数,删除所有jsonl文件
|
||||||
@ -112,21 +116,44 @@ def delete_jsonl():
|
|||||||
os.remove(os.path.join(json_dir, f))
|
os.remove(os.path.join(json_dir, f))
|
||||||
|
|
||||||
|
|
||||||
# 临时函数,重命名pt文件使之与代码相符
|
def delete_all_local():
|
||||||
def rename():
|
src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
|
||||||
|
dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
|
||||||
|
'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
|
||||||
|
for d in dirs:
|
||||||
|
path = os.path.join(src, d)
|
||||||
|
for f in os.listdir(path):
|
||||||
|
os.remove(os.path.join(path, f))
|
||||||
|
|
||||||
|
|
||||||
|
# 重命名pt文件使之与代码相符
|
||||||
|
def rename(mal_or_be, postfix):
|
||||||
tag_set = ['train', 'test', 'valid']
|
tag_set = ['train', 'test', 'valid']
|
||||||
for tag in tag_set:
|
for tag in tag_set:
|
||||||
data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
|
data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
|
||||||
for index, f in enumerate(os.listdir(data_dir)):
|
for index, f in enumerate(os.listdir(data_dir)):
|
||||||
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
|
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
|
||||||
for tag in tag_set:
|
for tag in tag_set:
|
||||||
data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
|
data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
|
||||||
for index, f in enumerate(os.listdir(data_dir)):
|
for index, f in enumerate(os.listdir(data_dir)):
|
||||||
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'malware_{}.pt'.format(index)))
|
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
|
||||||
|
|
||||||
|
|
||||||
def split_samples():
|
def split_samples(flag):
|
||||||
|
postfix = ''
|
||||||
|
if flag == 'one_family':
|
||||||
|
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
|
||||||
|
tag = 'malware'
|
||||||
|
elif flag == 'standard':
|
||||||
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
|
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
|
||||||
|
postfix = '_backup'
|
||||||
|
tag = 'malware'
|
||||||
|
elif flag == 'benign':
|
||||||
|
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
|
||||||
|
tag = 'benign'
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
|
out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
|
||||||
os_list = os.listdir(path)
|
os_list = os.listdir(path)
|
||||||
random.shuffle(os_list)
|
random.shuffle(os_list)
|
||||||
@ -135,11 +162,12 @@ def split_samples():
|
|||||||
test_len = int(train_len / 8)
|
test_len = int(train_len / 8)
|
||||||
for index, f in enumerate(os_list):
|
for index, f in enumerate(os_list):
|
||||||
if index < train_len:
|
if index < train_len:
|
||||||
shutil.copy(os.path.join(path, f), os.path.join(out, 'train_malware'))
|
shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
|
||||||
elif train_len <= index < train_len + test_len:
|
elif train_len <= index < train_len + test_len:
|
||||||
shutil.copy(os.path.join(path, f), os.path.join(out, 'test_malware'))
|
shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
|
||||||
else:
|
else:
|
||||||
shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_malware'))
|
shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
|
||||||
|
rename(tag, postfix)
|
||||||
|
|
||||||
|
|
||||||
def half_divide():
|
def half_divide():
|
||||||
@ -206,6 +234,19 @@ def del_redundant():
|
|||||||
os.remove(os.path.join(pe_dir, name))
|
os.remove(os.path.join(pe_dir, name))
|
||||||
|
|
||||||
|
|
||||||
|
def delete_pe():
    """Print the expected ``.dot`` path of every benign CFG file that has none.

    For each entry of the CFG directory the last four characters of the file
    name are replaced by ``.dot``; if no such file exists in the dot
    directory, the path it would have is printed.  Nothing is deleted.
    """
    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
    # A set gives constant-time membership tests inside the loop.
    existing_dots = set(os.listdir(dot_dir))
    for cfg in os.listdir(cfg_dir):
        name = cfg[:-4] + ".dot"
        if name not in existing_dots:
            print(os.path.join(dot_dir, name))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# create_dir()
|
# create_dir()
|
||||||
# change_max_item_lines()
|
# change_max_item_lines()
|
||||||
@ -213,9 +254,20 @@ if __name__ == '__main__':
|
|||||||
# delete_error()
|
# delete_error()
|
||||||
# test()
|
# test()
|
||||||
# delete_jsonl()
|
# delete_jsonl()
|
||||||
|
delete_all_local()
|
||||||
# check_json()
|
# check_json()
|
||||||
split_samples()
|
# delete_pe()
|
||||||
# rename()
|
|
||||||
|
# rename('malware', '_backup')
|
||||||
|
|
||||||
|
# 指定 'standard' or 'benign' or 'one_family'
|
||||||
|
# standard表示处理所有恶意样本
|
||||||
|
# split_samples('standard')
|
||||||
|
# one_family表示仅处理一个家族,仅用于测试原模型的二分类
|
||||||
|
# split_samples('one_family')
|
||||||
|
# benign表示处理良性样本
|
||||||
|
# split_samples('benign')
|
||||||
|
|
||||||
# half_divide()
|
# half_divide()
|
||||||
# copy_train_data()
|
# copy_train_data()
|
||||||
# clear_dot()
|
# clear_dot()
|
||||||
|
Loading…
Reference in New Issue
Block a user