diff --git a/.idea/deployment.xml b/.idea/deployment.xml new file mode 100644 index 0000000..81b14c5 --- /dev/null +++ b/.idea/deployment.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py index c07301c..84cac1c 100644 --- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py +++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py @@ -12,6 +12,7 @@ def convert(start, end): cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow) output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow) dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow) + log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow) process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow) @@ -36,8 +37,8 @@ def convert(start, end): except ValueError: process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg)) continue - - cfg_file.close() + finally: + cfg_file.close() dot_file_path = os.path.join(dot_dir, name + '.dot') if not os.path.exists(dot_file_path): @@ -45,24 +46,47 @@ def convert(start, end): else: # 打开dot文件获取fcg raw_function_edges = [] + # 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数 + functions_list = [] with open(dot_file_path, 'r') as dot: for line in dot: if '->' in line: raw_function_edges.append(re.findall(r'\b\d+\b', line)) + elif 'label' in line: + functions_list.append(line[line.find('= "') + 3:line.find('",')]) + + # 没有内部函数被检测到,保险起见还是不要这数据了 + if raw_function_edges.__len__() == 0: + continue # 为当前pe文件创建json对象 json_obj = { 'hash': data.binary_name[11:], - 'function_number': data.raw_graph_list.__len__(), - 'function_edges': [[d[0] for d in raw_function_edges], [d[1] for d in raw_function_edges]], + # 2023.8.12 bug fix: 这里获取的是内部函数的数量 + # 'function_number': data.raw_graph_list.__len__(), + 'function_number': len(functions_list), + 'function_edges': [[int(d[0]) for d in raw_function_edges], + [int(d[1]) for d in raw_function_edges]], 'acfg_list': [], - 'function_names': [] + 'function_names': functions_list } + + # 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取 # 读取pkl文件,一个acfg由一个函数分解而来 for acfg in data.raw_graph_list: + # 函数为外部函数,不需要构建cfg + if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname: + continue + # 这里2是因为Genius框架提取特征时将后代数量放在2 offspring = [d.get('v')[2] for d in acfg.g.node.values()] + # 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的 + # 以框架为主,将bb_features数组削减为和g.node长度一致 + diff = acfg.g.__len__() - len(acfg.bb_features) + if diff != 0: + del acfg.bb_features[diff:] # 将后代数量的特征放入bb_features中 + for i, offs in enumerate(offspring): acfg.bb_features[i].append(offs) @@ -73,7 +97,7 @@ def convert(start, end): } json_obj['acfg_list'].append(acfg_item) - json_obj['function_names'].append(acfg.funcname) + # json_obj['function_names'].append(acfg.funcname) # 将结果写入json本地文件 result = json.dumps(json_obj, ensure_ascii=False) @@ -89,4 +113,4 @@ def convert(start, end): if __name__ == '__main__': - convert(20, 35) + convert(0, 35) diff --git a/Genius3/raw-feature-extractor/func.py b/Genius3/raw-feature-extractor/func.py index 6e67820..33020aa 100644 --- a/Genius3/raw-feature-extractor/func.py +++ b/Genius3/raw-feature-extractor/func.py @@ -139,7 +139,7 @@ def get_func_cfgs_c(ea): icfg = cfg.getCfg(func, externs_eas, ea_externs) func_f = get_discoverRe_feature(func, icfg[0]) bb_f = get_bb_features(func) - raw_g = raw_graph(funcname, icfg, func_f, bb_f) # todo 为每个bb生成bb_features + raw_g = raw_graph(funcname, icfg, func_f, bb_f) raw_cfgs.append(raw_g) # raw_graphs 是另一个python class,存储raw_graph的list。定义在 raw_graph.py #print(raw_g.__dict__) #print(raw_g) 由于raw_graph、raw_graphs都是class,直接print只会打印,不能打印对象的属性。 #https://blog.51cto.com/steed/2046408 print_obj、 print(obj.__dict__) diff --git a/Genius3/raw-feature-extractor/read_idaFILE.py b/Genius3/raw-feature-extractor/read_idaFILE.py index 463b53d..aae5416 100644 --- a/Genius3/raw-feature-extractor/read_idaFILE.py +++ b/Genius3/raw-feature-extractor/read_idaFILE.py @@ -15,7 +15,7 @@ def print_obj(obj): # sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant,可能是间接引用的,不识别。看了下所有函数的特征,几乎都没有字符串常量,可能都是写在别的地方然后引用的。 # sub_166C4 393 if __name__ == '__main__': - testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected11_cfg\\VirusShare_5c088a2a6e0391b7c6ab22e4648eab3a.ida" + testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida" fr = open(testpath, 'r') data = pickle.load(fr) #一个二进制文件的acfgs fr.close() diff --git a/Genius3/raw-feature-extractor/test.py b/Genius3/raw-feature-extractor/test.py index 4a5c124..8cb8f16 100644 --- a/Genius3/raw-feature-extractor/test.py +++ b/Genius3/raw-feature-extractor/test.py @@ -1,7 +1,11 @@ +# coding=utf-8 import re import os import subprocess import time +import json +import random +import shutil def func(): @@ -19,20 +23,20 @@ def func1(): print(f[:-4]) -def gen_dir(): +def create_dir(): parent_dir = "D:\\hkn\\infected\\datasets" - for workflow in range(0, 35): + for workflow in range(35, 40): + # 生成raw data文件夹 # infected = "virusshare_infected{}".format(workflow) # cfg = "virusshare_infected{}_cfg".format(workflow) # dot = "virusshare_infected{}_dot".format(workflow) - # jsonl = "virusshare_infected{}_json".format(workflow) - iout = "virusshare_infected{}_iout".format(workflow) - + jsonl = "virusshare_infected{}_json".format(workflow) # os.mkdir(os.path.join(parent_dir, infected)) # os.mkdir(os.path.join(parent_dir, cfg)) # os.mkdir(os.path.join(parent_dir, dot)) - # os.mkdir(os.path.join(parent_dir, jsonl)) - os.rmdir(os.path.join(parent_dir, iout)) + os.mkdir(os.path.join(parent_dir, jsonl)) + # iout = "virusshare_infected{}_iout".format(workflow) + # os.rmdir(os.path.join(parent_dir, iout)) # os.rmdir(os.path.join(parent_dir, ida)) @@ -77,8 +81,119 @@ def delete_error(): os.remove(os.path.join(json_dir, name)) +def check_json(): + for workflow in range(5, 16): + json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow) + for json_file in os.listdir(json_dir): + f = open(os.path.join(json_dir, json_file), 'r') + try: + data = json.load(f) + except UnicodeDecodeError: + continue + finally: + f.close() + for acfg in data['acfg_list']: + if acfg['block_number'] != len(acfg['block_features']): + print("{} {}\n".format(workflow, json_file)) + + +# 临时函数,删除所有jsonl文件 +def delete_jsonl(): + for workflow in range(0, 35): + json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow) + for f in os.listdir(json_dir): + os.remove(os.path.join(json_dir, f)) + + +# 临时函数,重命名pt文件使之与代码相符 +def rename(): + tag_set = ['train', 'test', 'valid'] + for tag in tag_set: + data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag) + for index, f in enumerate(os.listdir(data_dir)): + os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f)) + for tag in tag_set: + data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag) + for index, f in enumerate(os.listdir(data_dir)): + os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'malware_{}.pt'.format(index))) + + +def split_samples(): + path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all' + out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt' + os_list = os.listdir(path) + random.shuffle(os_list) + # 8/1/1 分数据 + train_len = int(len(os_list) * 0.8) + test_len = int(train_len / 8) + for index, f in enumerate(os_list): + if index < train_len: + shutil.copy(os.path.join(path, f), os.path.join(out, 'train_malware')) + elif train_len <= index < train_len + test_len: + shutil.copy(os.path.join(path, f), os.path.join(out, 'test_malware')) + else: + shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_malware')) + + +def half_divide(): + src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt' + + test = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware' + valid = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware' + + flag = True + for f in os.listdir(src): + if 'pt' not in f: + continue + if flag: + shutil.copy(os.path.join(src, f), test) + else: + shutil.copy(os.path.join(src, f), valid) + flag = not flag + + +def copy_train_data(): + all = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all' + dest = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\train_malware' + train = set(os.listdir(all)) - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware')) - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware')) + for f in train: + shutil.copy(os.path.join(all, f), dest) + + +def clear_dot(): + for workflow in range(0, 35): + path = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\'.format(workflow) + for name in os.listdir(path): + full = os.path.join(path, name) + f = open(full, 'r') + data = f.read() + f.close() + if 'start' not in data and 'sub_' not in data: + # print("delete") + os.remove(full) + + +def read_test(): + dot_file_path = "D:\\hkn\\infected\\datasets\\virusshare_infected23_dot\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.dot" + with open(dot_file_path, 'r') as dot: + for line in dot: + if '->' in line: + print(re.findall(r'\b\d+\b', line)) + elif 'label' in line: + print(line[line.find('= "') + 3:line.find('",')]) + + if __name__ == '__main__': - # gen_dir() + # create_dir() # change_max_item_lines() # subprocess.call('taskkill /im idaq64.exe /f') - delete_error() + # delete_error() + # test() + # delete_jsonl() + # check_json() + split_samples() + rename() + # half_divide() + # copy_train_data() + # clear_dot() + # read_test() \ No newline at end of file