backup
This commit is contained in:
parent 2ec7e5e212
commit 4637fd0d97
.idea/deployment.xml (new file, 14 lines added)
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
+    <serverData>
+      <paths name="root@region-41.seetacloud.com:29208">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+    </serverData>
+  </component>
+</project>
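
(Note: the mapping above publishes the project root to "/" on the remote host. A minimal illustration of the path translation, with a hypothetical local path; this is an inference from the config, not part of the commit:)

    # local="$PROJECT_DIR$" maps onto web="/" on root@region-41.seetacloud.com:29208
    local_root = "/home/user/project"            # hypothetical value of $PROJECT_DIR$
    local_file = local_root + "/src/convert.py"  # hypothetical project file
    remote_file = "/" + local_file[len(local_root):].lstrip("/")
    print(remote_file)  # -> /src/convert.py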
@@ -12,6 +12,7 @@ def convert(start, end):
     cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
     output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
     dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
+
     log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
     process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
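
(For concreteness, the directory name these templates expand to for workflow = 0; sketch only:)

    workflow = 0
    print("D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow))
    # -> D:\hkn\infected\datasets\virusshare_infected0_cfg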
@@ -36,8 +37,8 @@ def convert(start, end):
         except ValueError:
             process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
             continue
         finally:
             cfg_file.close()

         dot_file_path = os.path.join(dot_dir, name + '.dot')
         if not os.path.exists(dot_file_path):
@@ -45,24 +46,47 @@ def convert(start, end):
         else:
             # open the .dot file to get the fcg
             raw_function_edges = []
+            # 2023.8.12 bug fix: the fcg (.dot) file generated by IDA contains all functions, while data.raw_graph_list contains only the internal ones
+            functions_list = []
             with open(dot_file_path, 'r') as dot:
                 for line in dot:
                     if '->' in line:
                         raw_function_edges.append(re.findall(r'\b\d+\b', line))
+                    elif 'label' in line:
+                        functions_list.append(line[line.find('= "') + 3:line.find('",')])
+
+            # no internal function was detected; to be safe, drop this sample
+            if raw_function_edges.__len__() == 0:
+                continue

             # create a json object for the current PE file
             json_obj = {
                 'hash': data.binary_name[11:],
-                'function_number': data.raw_graph_list.__len__(),
-                'function_edges': [[d[0] for d in raw_function_edges], [d[1] for d in raw_function_edges]],
+                # 2023.8.12 bug fix: what was fetched here was only the number of internal functions
+                # 'function_number': data.raw_graph_list.__len__(),
+                'function_number': len(functions_list),
+                'function_edges': [[int(d[0]) for d in raw_function_edges],
+                                   [int(d[1]) for d in raw_function_edges]],
                 'acfg_list': [],
-                'function_names': []
+                'function_names': functions_list
             }

+            # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA, not the external ones, so the function list and count must not be taken from it
             # read the pkl file; each acfg is decomposed from one function
             for acfg in data.raw_graph_list:
+                # the function is external, so no cfg needs to be built for it
+                if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
+                    continue
+
                 # index 2 because the Genius framework places the offspring count at position 2 when extracting features
                 offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                # for some unknown reason the two arrays can differ in length, though they should match
+                # defer to the framework and trim bb_features to the length of g.node
+                diff = acfg.g.__len__() - len(acfg.bb_features)
+                if diff != 0:
+                    del acfg.bb_features[diff:]
                 # put the offspring count into bb_features
+
                 for i, offs in enumerate(offspring):
                     acfg.bb_features[i].append(offs)
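
(For context, a standalone sketch of the .dot parsing this hunk implements; the sample lines are hypothetical stand-ins for IDA-generated FCG output:)

    import re

    sample = [
        '"5" [label = "sub_401000", color=blue];\n',  # node line: function name inside the label
        '"5" -> "12";\n',                             # edge line: caller id -> callee id
    ]
    edges, names = [], []
    for line in sample:
        if '->' in line:
            edges.append(re.findall(r'\b\d+\b', line))                # ['5', '12']
        elif 'label' in line:
            names.append(line[line.find('= "') + 3:line.find('",')])  # 'sub_401000'
    print(edges, names)  # [['5', '12']] ['sub_401000']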
@@ -73,7 +97,7 @@ def convert(start, end):
             }

             json_obj['acfg_list'].append(acfg_item)
-            json_obj['function_names'].append(acfg.funcname)
+            # json_obj['function_names'].append(acfg.funcname)

             # write the result to a local json file
             result = json.dumps(json_obj, ensure_ascii=False)
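
(Why ensure_ascii=False matters here: it keeps non-ASCII names readable in the output file. Minimal sketch; the sample name is hypothetical:)

    import json
    print(json.dumps({'name': '入口'}, ensure_ascii=False))  # {"name": "入口"}
    print(json.dumps({'name': '入口'}))                      # {"name": "\u5165\u53e3"}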
@@ -89,4 +113,4 @@ def convert(start, end):


 if __name__ == '__main__':
-    convert(20, 35)
+    convert(0, 35)
@@ -139,7 +139,7 @@ def get_func_cfgs_c(ea):
         icfg = cfg.getCfg(func, externs_eas, ea_externs)
         func_f = get_discoverRe_feature(func, icfg[0])
         bb_f = get_bb_features(func)
-        raw_g = raw_graph(funcname, icfg, func_f, bb_f)  # todo: generate bb_features for every bb
+        raw_g = raw_graph(funcname, icfg, func_f, bb_f)
         raw_cfgs.append(raw_g)  # raw_graphs is another Python class that stores a list of raw_graph; defined in raw_graph.py
         #print(raw_g.__dict__)
         #print(raw_g)  since raw_graph and raw_graphs are classes, printing one directly only yields <raw_graphs.raw_graphs instance at 0x09888FD0>, not the object's attributes.  #https://blog.51cto.com/steed/2046408 print_obj, print(obj.__dict__)
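
(The comment's point in a minimal sketch; the class here is a hypothetical stand-in for raw_graphs.raw_graph:)

    class RawGraph:  # hypothetical stand-in
        def __init__(self):
            self.funcname = 'start'

    g = RawGraph()
    print(g)           # <__main__.RawGraph object at 0x...>, attributes hidden
    print(g.__dict__)  # {'funcname': 'start'}, attributes visible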
@@ -15,7 +15,7 @@ def print_obj(obj):
 # sub_10F20 308: the decompiled code does contain strings, but no string constant shows up in this feature extraction; they are probably referenced indirectly and thus not recognized. Checking the features of all functions, almost none has a string constant; the strings are likely stored elsewhere and referenced.
 # sub_166C4 393
 if __name__ == '__main__':
-    testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected11_cfg\\VirusShare_5c088a2a6e0391b7c6ab22e4648eab3a.ida"
+    testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
     fr = open(testpath, 'r')
     data = pickle.load(fr)  # the acfgs of one binary file
     fr.close()
@@ -1,7 +1,11 @@
+# coding=utf-8
 import re
 import os
 import subprocess
 import time
+import json
+import random
+import shutil


 def func():
@@ -19,20 +23,20 @@ def func1():
     print(f[:-4])


-def gen_dir():
+def create_dir():
     parent_dir = "D:\\hkn\\infected\\datasets"
-    for workflow in range(0, 35):
+    for workflow in range(35, 40):
+        # create the raw data folders
         # infected = "virusshare_infected{}".format(workflow)
         # cfg = "virusshare_infected{}_cfg".format(workflow)
         # dot = "virusshare_infected{}_dot".format(workflow)
-        # jsonl = "virusshare_infected{}_json".format(workflow)
-        iout = "virusshare_infected{}_iout".format(workflow)
+        jsonl = "virusshare_infected{}_json".format(workflow)

         # os.mkdir(os.path.join(parent_dir, infected))
         # os.mkdir(os.path.join(parent_dir, cfg))
         # os.mkdir(os.path.join(parent_dir, dot))
-        # os.mkdir(os.path.join(parent_dir, jsonl))
-        os.rmdir(os.path.join(parent_dir, iout))
+        os.mkdir(os.path.join(parent_dir, jsonl))
+        # iout = "virusshare_infected{}_iout".format(workflow)
+        # os.rmdir(os.path.join(parent_dir, iout))
         # os.rmdir(os.path.join(parent_dir, ida))


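
(Side note, not part of the commit: os.mkdir raises FileExistsError when the directory already exists, so rerunning create_dir over existing folders will fail. A tolerant variant, if reruns are expected:)

    os.makedirs(os.path.join(parent_dir, jsonl), exist_ok=True)  # no error if it already exists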
@@ -77,8 +81,119 @@ def delete_error():
             os.remove(os.path.join(json_dir, name))


+def check_json():
+    for workflow in range(5, 16):
+        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
+        for json_file in os.listdir(json_dir):
+            f = open(os.path.join(json_dir, json_file), 'r')
+            try:
+                data = json.load(f)
+            except UnicodeDecodeError:
+                continue
+            finally:
+                f.close()
+            for acfg in data['acfg_list']:
+                if acfg['block_number'] != len(acfg['block_features']):
+                    print("{} {}\n".format(workflow, json_file))
+
+
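
(The per-acfg shape check_json assumes, with hypothetical values: one feature vector per basic block, so block_number must equal len(block_features):)

    acfg_item = {
        'block_number': 2,
        'block_features': [[0, 1, 4], [2, 3, 7]],  # hypothetical feature vectors
    }
    assert acfg_item['block_number'] == len(acfg_item['block_features'])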
+# temporary helper: delete all json files
+def delete_jsonl():
+    for workflow in range(0, 35):
+        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
+        for f in os.listdir(json_dir):
+            os.remove(os.path.join(json_dir, f))
+
+
+# temporary helper: rename the .pt files to match the code
+def rename():
+    tag_set = ['train', 'test', 'valid']
+    for tag in tag_set:
+        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
+        for index, f in enumerate(os.listdir(data_dir)):
+            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
+    for tag in tag_set:
+        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
+        for index, f in enumerate(os.listdir(data_dir)):
+            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'malware_{}.pt'.format(index)))
+
+
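
(A plausible reading of the two passes, inferred rather than stated in the commit: prefixing every file with 'm' first guarantees that no original name can collide with a freshly assigned 'malware_{index}.pt' while the second pass is still running.)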
+def split_samples():
+    path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
+    out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
+    os_list = os.listdir(path)
+    random.shuffle(os_list)
+    # split the data 8/1/1
+    train_len = int(len(os_list) * 0.8)
+    test_len = int(train_len / 8)
+    for index, f in enumerate(os_list):
+        if index < train_len:
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_malware'))
+        elif train_len <= index < train_len + test_len:
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_malware'))
+        else:
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_malware'))
+
+
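
(Quick check of the 8/1/1 arithmetic: train_len = 0.8*N and test_len = train_len/8 = 0.1*N, leaving roughly 0.1*N for valid:)

    N = 1000
    train_len = int(N * 0.8)              # 800
    test_len = int(train_len / 8)         # 100
    valid_len = N - train_len - test_len  # 100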
+def half_divide():
+    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
+
+    test = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'
+    valid = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'
+
+    flag = True
+    for f in os.listdir(src):
+        if 'pt' not in f:
+            continue
+        if flag:
+            shutil.copy(os.path.join(src, f), test)
+        else:
+            shutil.copy(os.path.join(src, f), valid)
+        flag = not flag
+
+
+def copy_train_data():
+    all = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
+    dest = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\train_malware'
+    train = set(os.listdir(all)) - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware')) - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'))
+    for f in train:
+        shutil.copy(os.path.join(all, f), dest)
+
+
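
(The set difference above derives the train split as everything in all that sits in neither test_malware nor valid_malware; a tiny sketch with hypothetical file names:)

    all_files = {'malware_0.pt', 'malware_1.pt', 'malware_2.pt'}
    test_files = {'malware_1.pt'}
    valid_files = {'malware_2.pt'}
    print(all_files - test_files - valid_files)  # {'malware_0.pt'}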
+def clear_dot():
+    for workflow in range(0, 35):
+        path = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\'.format(workflow)
+        for name in os.listdir(path):
+            full = os.path.join(path, name)
+            f = open(full, 'r')
+            data = f.read()
+            f.close()
+            if 'start' not in data and 'sub_' not in data:
+                # print("delete")
+                os.remove(full)
+
+
+def read_test():
+    dot_file_path = "D:\\hkn\\infected\\datasets\\virusshare_infected23_dot\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.dot"
+    with open(dot_file_path, 'r') as dot:
+        for line in dot:
+            if '->' in line:
+                print(re.findall(r'\b\d+\b', line))
+            elif 'label' in line:
+                print(line[line.find('= "') + 3:line.find('",')])
+
+
 if __name__ == '__main__':
-    # gen_dir()
+    # create_dir()
     # change_max_item_lines()
     # subprocess.call('taskkill /im idaq64.exe /f')
-    delete_error()
+    # delete_error()
+    # test()
+    # delete_jsonl()
+    # check_json()
+    split_samples()
+    rename()
+    # half_divide()
+    # copy_train_data()
+    # clear_dot()
+    # read_test()