This commit is contained in:
TinyCaviar 2023-10-10 22:12:18 +08:00
parent ddf9ff3b59
commit d599236e94
6 changed files with 359 additions and 28 deletions

View File

@ -2,7 +2,14 @@
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="root@region-41.seetacloud.com:29208">
<paths name="304">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@region-42.seetacloud.com:58034 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />

View File

@ -0,0 +1,81 @@
# NOTE(review): this file appears to contain no import statements; the class
# needs `torch`, `torch.nn as nn`, `torch_geometric`, `Data`/`Batch` from
# `torch_geometric.data`, and the project's `Vocab` in scope -- confirm.
class HierarchicalGraphNeuralNetwork(nn.Module):
    """Two-level GraphSAGE classifier over CFGs and function call graphs.

    Level 1 runs a SAGEConv stack over every control flow graph (CFG) and
    max-pools each graph into a single vector. Level 2 builds one function
    call graph (FCG) per sample whose node features are those pooled vectors
    followed by learned embeddings of the sample's external function names,
    runs a second SAGEConv stack, max-pools each FCG, and projects the result
    through three linear layers to a 6-way softmax.
    """

    def __init__(self, external_vocab: Vocab):
        """Build both GNN stacks, the external-name embedding and the head.

        Args:
            external_vocab: vocabulary of external function names; only its
                ``max_vocab_size`` and ``pad_idx`` attributes are read here.
        """
        super(HierarchicalGraphNeuralNetwork, self).__init__()
        self.pool = 'global_max_pool'

        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling.
        # The first entry is the input feature size of a CFG node.
        cfg_filter_list = [11, 200, 200]
        self.cfg_filter_length = len(cfg_filter_list)
        # Layers are registered as CFG_gnn_1..N so state_dict keys stay stable.
        for i in range(self.cfg_filter_length - 1):
            layer = torch_geometric.nn.conv.SAGEConv(
                in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True)
            setattr(self, 'CFG_gnn_{}'.format(i + 1), layer)
        self.dropout = nn.Dropout(p=0.2)

        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling.
        # External functions have no CFG, so they get a learned embedding with
        # the same width as a pooled CFG vector.
        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
                                                     embedding_dim=cfg_filter_list[-1],
                                                     padding_idx=external_vocab.pad_idx)
        fcg_filter_list = [cfg_filter_list[-1], 200, 200]
        self.fcg_filter_length = len(fcg_filter_list)
        for i in range(self.fcg_filter_length - 1):
            layer = torch_geometric.nn.conv.SAGEConv(
                in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True)
            setattr(self, 'FCG_gnn_{}'.format(i + 1), layer)

        # Last projection: shrink gradually (dim -> dim/2 -> dim/4 -> 6 classes).
        final_dim = fcg_filter_list[-1]
        self.pj1 = torch.nn.Linear(in_features=final_dim, out_features=final_dim // 2)
        self.pj2 = torch.nn.Linear(in_features=final_dim // 2, out_features=final_dim // 4)
        self.pj3 = torch.nn.Linear(in_features=final_dim // 4, out_features=6)
        self.last_activation = nn.Softmax(dim=1)

    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
                bt_all_function_edges: list):
        """Classify a batch of samples.

        Args:
            real_local_batch: batch holding every CFG of every sample; its
                ``x``, ``edge_index`` and ``batch`` attributes are used and
                ``x`` is overwritten in place with the embedded features.
            real_bt_positions: offsets into the pooled CFG vectors; entries
                ``i`` and ``i + 1`` delimit the functions of sample ``i``.
            bt_external_names: per sample, the embedding indices of its
                external functions (presumably vocabulary ids -- confirm
                against the caller).
            bt_all_function_edges: per sample, the FCG edge index handed to
                ``Data(edge_index=...)``.

        Returns:
            Softmax over dim 1 of the 6-way projection, one row per sample.
            NOTE(review): if training uses ``nn.CrossEntropyLoss`` the softmax
            would be applied twice -- confirm the loss function.
        """
        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
        x_cfg_pool = torch_geometric.nn.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
        # Index tensors must live on the same device as the model's activations,
        # otherwise the embedding lookup / message passing fails on GPU.
        device = x_cfg_pool.device

        fcg_list = []
        for idx_batch in range(len(real_bt_positions) - 1):
            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
            idx_x_external = self.external_embedding_layer(
                torch.tensor([bt_external_names[idx_batch]], dtype=torch.long, device=device))
            idx_x_external = idx_x_external.squeeze(dim=0)
            # FCG node order: internal functions first, then external ones.
            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long, device=device)
            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
            idx_graph_data.validate()
            fcg_list.append(idx_graph_data)
        fcg_batch = Batch.from_data_list(fcg_list)

        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)
        x_fcg_pool = torch_geometric.nn.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)

        # Final step: project to the number of classes (multiclass).
        bt_final_embed = self.pj3(self.pj2(self.pj1(x_fcg_pool)))
        return self.last_activation(bt_final_embed)

    def _apply_gnn_stack(self, batch, prefix, num_layers):
        """Run layers ``<prefix>_gnn_1..num_layers`` (conv -> ReLU -> dropout) and store the result in ``batch.x``."""
        x, edge_index = batch.x, batch.edge_index
        for i in range(num_layers):
            x = getattr(self, '{}_gnn_{}'.format(prefix, i + 1))(x=x, edge_index=edge_index)
            x = torch.nn.functional.relu(x, inplace=True)
            x = self.dropout(x)
        batch.x = x
        return batch

    def forward_cfg_gnn(self, local_batch: Batch):
        """Embed CFG nodes with the CFG stack; mutates and returns ``local_batch``."""
        return self._apply_gnn_stack(local_batch, 'CFG', self.cfg_filter_length - 1)

    def forward_fcg_gnn(self, function_batch: Batch):
        """Embed FCG nodes with the FCG stack; mutates and returns ``function_batch``."""
        return self._apply_gnn_stack(function_batch, 'FCG', self.fcg_filter_length - 1)

View File

@ -6,7 +6,7 @@ import os
from tqdm import tqdm
def convert(start, end):
def convert(start, end, overhaul):
for workflow in range(start, end):
# workflow = 0
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
@ -16,6 +16,12 @@ def convert(start, end):
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
if overhaul:
if os.path.exists(log_path):
os.remove(log_path)
if os.path.exists(process_log_path):
os.remove(process_log_path)
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
logged = log.readline()
if logged == '':
@ -112,5 +118,116 @@ def convert(start, end):
process_log.write("index {}, {} process done.\n".format(index, cfg))
def _read_fcg_dot(dot_file_path):
    """Parse an IDA-generated call-graph .dot file into (edges, function names).

    Each edge is the list of integer tokens found on a line containing '->';
    each name is the text between '= "' and '",' on a line containing 'label'.
    """
    raw_function_edges = []
    functions_list = []
    with open(dot_file_path, 'r') as dot:
        for line in dot:
            if '->' in line:
                raw_function_edges.append(re.findall(r'\b\d+\b', line))
            elif 'label' in line:
                functions_list.append(line[line.find('= "') + 3:line.find('",')])
    return raw_function_edges, functions_list


def _acfg_to_item(acfg):
    """Turn one internal function's ACFG into its JSON dict (mutates acfg.bb_features)."""
    block_count = len(acfg.g)
    # Index 2 because the Genius framework stores the offspring count at
    # position 2 of the node attribute vector when extracting features.
    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
    # The two sequences occasionally differ in length for unknown reasons.
    # The graph is authoritative: drop surplus feature rows. (Slicing from
    # block_count is a no-op when bb_features is already shorter, instead of
    # deleting valid rows as a positive `diff` slice would.)
    del acfg.bb_features[block_count:]
    # Append the offspring count to each basic block's feature vector.
    for i, offs in enumerate(offspring):
        acfg.bb_features[i].append(offs)
    return {
        'block_number': block_count,
        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
        'block_features': acfg.bb_features
    }


def convert_benign(overhaul):
    """Convert the benign samples' pickled CFGs plus .dot call graphs to .jsonl.

    For every file in the benign cfg directory, loads ``<name>.ida`` with
    pickle, reads the matching ``<name>.dot`` call graph and writes
    ``<name>.jsonl`` with the keys hash, function_number, function_edges,
    acfg_list and function_names. Progress is checkpointed as the index of
    the last converted file so an interrupted run can resume.

    Args:
        overhaul: when true, delete both log files first so the run restarts
            from index 0.
    """
    cfg_dir = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
    output_dir = "D:\\hkn\\infected\\datasets\\benign_json\\new"
    dot_dir = "D:\\hkn\\infected\\datasets\\benign_dot\\new"

    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
    # There is no workflow number for benign data, so the path must not carry
    # a stray '{}' placeholder.
    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log.log"

    if overhaul:
        for stale_log in (log_path, process_log_path):
            if os.path.exists(stale_log):
                os.remove(stale_log)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        # 'a+' may position the stream at end-of-file; rewind before reading
        # the checkpoint or the resume index is always lost.
        log.seek(0)
        logged = log.readline().strip()
        log_index = int(logged) if logged else 0

        # Sorted so that the checkpoint index refers to the same file on
        # every run (os.listdir order is arbitrary).
        for index, cfg in enumerate(tqdm(sorted(os.listdir(cfg_dir)))):
            if index < log_index:
                continue

            name = cfg[:-4]  # bare file name without the 4-character extension
            try:
                # NOTE(review): pickle executes arbitrary code on load; only
                # run this on .ida files produced by our own IDA pipeline.
                with open(os.path.join(cfg_dir, name + '.ida'), 'r') as cfg_file:
                    data = pk.load(cfg_file)
            except EOFError:
                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
                continue
            except ValueError:
                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
                continue

            dot_file_path = os.path.join(dot_dir, name + '.dot')
            if not os.path.exists(dot_file_path):
                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
                continue

            # 2023.8.12 bug fix: the IDA-generated FCG (.dot) lists every
            # function, while data.raw_graph_list only holds internal ones, so
            # the function names and count are taken from the .dot file.
            raw_function_edges, functions_list = _read_fcg_dot(dot_file_path)

            # No call edge found. That should not normally happen, so the
            # sample is dropped to be safe -- but leave a trace in the log.
            if not raw_function_edges:
                process_log.write("index {}, {} process failed. no function edges.\n".format(index, cfg))
                continue

            # JSON object for the current PE file.
            json_obj = {
                'hash': data.binary_name[11:],
                'function_number': len(functions_list),
                'function_edges': [[int(d[0]) for d in raw_function_edges],
                                   [int(d[1]) for d in raw_function_edges]],
                'acfg_list': [],
                'function_names': functions_list
            }

            # Each entry of raw_graph_list is the ACFG of one function.
            for acfg in data.raw_graph_list:
                # Only 'start', 'start_0' and 'sub_*' functions are treated as
                # internal; anything else is external and gets no CFG.
                if acfg.funcname not in ('start', 'start_0') and 'sub_' not in acfg.funcname:
                    continue
                json_obj['acfg_list'].append(_acfg_to_item(acfg))

            # Write the result to the local json file.
            result = json.dumps(json_obj, ensure_ascii=False)
            with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
                out.write(result)

            # Checkpoint: replace the log content with the index just finished.
            log.truncate(0)
            log.seek(0)
            log.write(str(index))
            log.flush()
            process_log.write("index {}, {} process done.\n".format(index, cfg))
if __name__ == '__main__':
convert(35, 69)
# convert(35, 69)
convert_benign(True)

View File

@ -19,10 +19,75 @@ def call_preprocess(cmd_line):
subprocess.call(cmd_line, shell=True)
def batch_mode(start, end):
# Benign-software analysis mode: the IDA command passes -1 as the workflow argument.
def benign_batch_mode(overhaul):
    """Run the IDA preprocessing script over every benign PE file.

    Each file is analysed in a child process that is given TIMEOUT seconds;
    if it is still running after that, idaq64.exe is killed and the file is
    counted as failed. The index of the last successfully analysed file is
    checkpointed so an interrupted run can resume.

    Args:
        overhaul: when true, delete both log files first so the run restarts
            from index 0.
    """
    # Total number of samples that failed.
    total_failed = 0

    log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
    process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
    benign_pe_dir = 'D:\\hkn\\infected\\datasets\\benign\\new'

    if overhaul:
        for stale_log in (log_path, process_log_path):
            if os.path.exists(stale_log):
                os.remove(stale_log)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        # 'a+' may position the stream at end-of-file; rewind before reading
        # the checkpoint or the resume index is always lost.
        log.seek(0)
        logged = log.readline().strip()
        log_index = int(logged) if logged else 0

        for index, pe in enumerate(tqdm(sorted(os.listdir(benign_pe_dir)))):
            if index < log_index:
                continue

            # The input path is quoted because benign file names may contain spaces.
            cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout "{}"'.format(
                os.path.join(benign_pe_dir, pe))

            p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
            p.start()

            # Poll once per second until the child exits or the time budget is spent.
            start = time.time()
            while p.is_alive() and time.time() - start <= TIMEOUT:
                time.sleep(1)

            if p.is_alive():
                # IDA is stuck: kill it and record the failure.
                subprocess.call('taskkill /im idaq64.exe /f')
                process_log.write(
                    "index {}, {} stuck, process terminated.\n".format(index, pe))
                total_failed += 1
            else:
                # Finished normally: checkpoint the index just processed.
                log.truncate(0)
                log.seek(0)
                log.write(str(index))
                log.flush()
                process_log.write("index {}, {} process done.\n".format(index, pe))

    # Remove every by-product IDA left behind.
    delete_output()
    print('总失败数{}'.format(total_failed))
def mal_batch_mode(start, end):
# 只选其中这些类的pe进行分析其他的就直接跳过
families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
# 记录ida处理报错的数据来自哪些家族
failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
# 总失败数据数量
total_failed = 0
for workflow in range(start, end):
# pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
@ -73,6 +138,9 @@ def batch_mode(start, end):
subprocess.call('taskkill /im idaq64.exe /f')
process_log.write(
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
failed_family[pe_family] += 1
total_failed += 1
else:
# 正常运行结束
log.truncate(0)
@ -85,6 +153,10 @@ def batch_mode(start, end):
# 一次workflow结束后将所有副产物删除
delete_output()
print(families_need_to_analyze)
print('\n')
print(failed_family, '总失败数{}'.format(total_failed))
def delete_output():
out_dir = 'F:\\iout'
@ -96,4 +168,5 @@ def delete_output():
# 注意该py文件必须放在IDA的根目录下且必须使用cmd命令执行否则无法链接到python库
# F:\\kkk\\IDA_6.6
if __name__ == '__main__':
batch_mode(36, 69)
benign_batch_mode(True)
# mal_batch_mode(35, 69)

View File

@ -1,11 +1,8 @@
# -*- coding: UTF-8 -*-
import pickle
from func import *
from raw_graphs import *
from idc import *
import idautils
import os
import sys
def preprocess():
@ -18,9 +15,13 @@ def preprocess():
binary_name = idc.GetInputFile()
workflow = idc.ARGV[1]
# workflow = 0
cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
# workflow为特定值时分析良性软件否则分析恶意软件
if workflow == '-1':
cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name)
else:
cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF

View File

@ -46,7 +46,6 @@ def create(parent_dir, folder):
os.mkdir(os.path.join(parent_dir, folder))
def change_max_item_lines():
f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
s = f.read()
@ -89,6 +88,7 @@ def delete_error():
def check_json():
print('start checking json')
for workflow in tqdm(range(0, 69)):
json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
for json_file in os.listdir(json_dir):
@ -99,9 +99,13 @@ def check_json():
continue
finally:
f.close()
for acfg in data['acfg_list']:
if acfg['block_number'] != len(acfg['block_features']):
print("{} {}\n".format(workflow, json_file))
if len(data['function_edges'][0]) == 0:
print("{} {} function_edges null\n".format(workflow, json_file))
# continue
# for acfg in data['acfg_list']:
# if acfg['block_number'] != len(acfg['block_features']):
# print("{} {}\n".format(workflow, json_file))
# 临时函数删除所有jsonl文件
@ -112,21 +116,44 @@ def delete_jsonl():
os.remove(os.path.join(json_dir, f))
# 临时函数重命名pt文件使之与代码相符
def rename():
def delete_all_local():
    """Remove every file inside each train/test/valid split directory.

    The directories themselves are kept; only their direct entries are
    deleted with os.remove.
    """
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    split_dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
                  'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
    for split_dir in split_dirs:
        split_path = os.path.join(src, split_dir)
        entries = os.listdir(split_path)
        for entry in entries:
            os.remove(os.path.join(split_path, entry))
# Rename the .pt files so that their names match what the code expects.
def rename(mal_or_be, postfix):
    """Renumber the .pt files of the train/test/valid split directories.

    Every file in ``{tag}_{mal_or_be}{postfix}/`` ends up named
    ``{mal_or_be}_{index}.pt``, where index is the file's position in
    os.listdir.

    Args:
        mal_or_be: sample kind used in both the directory name and the new
            file name (the callers in this file pass 'malware' or 'benign').
        postfix: suffix of the split directory name, e.g. '' or '_backup'.
    """
    tag_set = ['train', 'test', 'valid']
    data_dirs = ['D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
                 for tag in tag_set]

    # Pass 1: prefix every name with 'm' -- presumably so that the final names
    # assigned in pass 2 cannot collide with a file that has not been renamed yet.
    for data_dir in data_dirs:
        for f in os.listdir(data_dir):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))

    # Pass 2: assign the final sequential names.
    for data_dir in data_dirs:
        for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
def split_samples():
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
def split_samples(flag):
postfix = ''
if flag == 'one_family':
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
tag = 'malware'
elif flag == 'standard':
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
postfix = '_backup'
tag = 'malware'
elif flag == 'benign':
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
tag = 'benign'
else:
return
out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
os_list = os.listdir(path)
random.shuffle(os_list)
@ -135,11 +162,12 @@ def split_samples():
test_len = int(train_len / 8)
for index, f in enumerate(os_list):
if index < train_len:
shutil.copy(os.path.join(path, f), os.path.join(out, 'train_malware'))
shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
elif train_len <= index < train_len + test_len:
shutil.copy(os.path.join(path, f), os.path.join(out, 'test_malware'))
shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
else:
shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_malware'))
shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
rename(tag, postfix)
def half_divide():
@ -206,6 +234,19 @@ def del_redundant():
os.remove(os.path.join(pe_dir, name))
def delete_pe():
    """Report benign cfg files that have no matching .dot file.

    For every entry in the benign cfg directory, the last four characters of
    its name are replaced by '.dot'; if no file of that name exists in the
    benign dot directory, the expected .dot path is printed. Despite the
    function's name, nothing is deleted.
    """
    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
    # A set gives O(1) membership tests instead of scanning a list per cfg file.
    dot_names = set(os.listdir(dot_dir))
    for cfg in os.listdir(cfg_dir):
        name = cfg[:-4] + ".dot"
        if name not in dot_names:
            print(os.path.join(dot_dir, name))
if __name__ == '__main__':
# create_dir()
# change_max_item_lines()
@ -213,11 +254,22 @@ if __name__ == '__main__':
# delete_error()
# test()
# delete_jsonl()
delete_all_local()
# check_json()
split_samples()
# rename()
# delete_pe()
# rename('malware', '_backup')
# 指定 'standard' or 'benign' or 'one_family'
# standard表示处理所有恶意样本
# split_samples('standard')
# one_family表示仅处理一个家族仅用于测试原模型的二分类
# split_samples('one_family')
# benign表示处理良性样本
# split_samples('benign')
# half_divide()
# copy_train_data()
# clear_dot()
# read_test()
# del_redundant()
# del_redundant()