backup
This commit is contained in:
parent
badd4eada6
commit
636ec90a1c
@ -3,24 +3,49 @@ import pickle as pk
|
|||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def convert():
|
def convert(start, end):
|
||||||
# for workflow in range(0, 20):
|
for workflow in range(start, end):
|
||||||
workflow = 0
|
# workflow = 0
|
||||||
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
||||||
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
|
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
|
||||||
dot_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
|
dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
|
||||||
|
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
|
||||||
|
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
|
||||||
|
|
||||||
|
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
|
||||||
|
logged = log.readline()
|
||||||
|
if logged == '':
|
||||||
|
log_index = 0
|
||||||
|
else:
|
||||||
|
log_index = int(logged)
|
||||||
|
|
||||||
|
for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
|
||||||
|
if index < log_index:
|
||||||
|
continue
|
||||||
|
|
||||||
for cfg in os.listdir(cfg_dir):
|
|
||||||
name = cfg[:-4] # 纯文件名,不带后缀
|
name = cfg[:-4] # 纯文件名,不带后缀
|
||||||
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
|
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
|
||||||
|
try:
|
||||||
data = pk.load(cfg_file)
|
data = pk.load(cfg_file)
|
||||||
|
except EOFError:
|
||||||
|
process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
|
||||||
|
continue
|
||||||
|
except ValueError:
|
||||||
|
process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
|
||||||
|
continue
|
||||||
|
|
||||||
cfg_file.close()
|
cfg_file.close()
|
||||||
|
|
||||||
|
dot_file_path = os.path.join(dot_dir, name + '.dot')
|
||||||
|
if not os.path.exists(dot_file_path):
|
||||||
|
process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
|
||||||
|
else:
|
||||||
# 打开dot文件获取fcg
|
# 打开dot文件获取fcg
|
||||||
raw_function_edges = []
|
raw_function_edges = []
|
||||||
with open(os.path.join(dot_path, name + '.dot'), 'r') as dot:
|
with open(dot_file_path, 'r') as dot:
|
||||||
for line in dot:
|
for line in dot:
|
||||||
if '->' in line:
|
if '->' in line:
|
||||||
raw_function_edges.append(re.findall(r'\b\d+\b', line))
|
raw_function_edges.append(re.findall(r'\b\d+\b', line))
|
||||||
@ -38,8 +63,8 @@ def convert():
|
|||||||
# 这里2是因为Genius框架提取特征时将后代数量放在2
|
# 这里2是因为Genius框架提取特征时将后代数量放在2
|
||||||
offspring = [d.get('v')[2] for d in acfg.g.node.values()]
|
offspring = [d.get('v')[2] for d in acfg.g.node.values()]
|
||||||
# 将后代数量的特征放入bb_features中
|
# 将后代数量的特征放入bb_features中
|
||||||
for i, f in enumerate(acfg.bb_features):
|
for i, offs in enumerate(offspring):
|
||||||
f.append(offspring[i])
|
acfg.bb_features[i].append(offs)
|
||||||
|
|
||||||
acfg_item = {
|
acfg_item = {
|
||||||
'block_number': acfg.g.__len__(),
|
'block_number': acfg.g.__len__(),
|
||||||
@ -51,10 +76,17 @@ def convert():
|
|||||||
json_obj['function_names'].append(acfg.funcname)
|
json_obj['function_names'].append(acfg.funcname)
|
||||||
|
|
||||||
# 将结果写入json本地文件
|
# 将结果写入json本地文件
|
||||||
result = json.dumps(json_obj)
|
result = json.dumps(json_obj, ensure_ascii=False)
|
||||||
|
|
||||||
with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
|
with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
|
||||||
out.write(result)
|
out.write(result)
|
||||||
|
|
||||||
|
log.truncate(0)
|
||||||
|
log.seek(0)
|
||||||
|
log.write(str(index))
|
||||||
|
log.flush()
|
||||||
|
process_log.write("index {}, {} process done.\n".format(index, cfg))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
convert()
|
convert(20, 35)
|
||||||
|
@ -6,6 +6,8 @@ from tqdm import tqdm
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
# 单个pe文件处理超时/s
|
# 单个pe文件处理超时/s
|
||||||
|
# 多次处理,一批数据中只有少量文件会超时
|
||||||
|
# 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理,若仍然超时则放弃
|
||||||
TIMEOUT = 60
|
TIMEOUT = 60
|
||||||
|
|
||||||
|
|
||||||
@ -14,7 +16,7 @@ def call_preprocess(cmd_line):
|
|||||||
|
|
||||||
|
|
||||||
def batch_mode():
|
def batch_mode():
|
||||||
for workflow in range(0, 1):
|
for workflow in range(1, 20):
|
||||||
# workflow = 0
|
# workflow = 0
|
||||||
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
|
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
|
||||||
# for test
|
# for test
|
||||||
@ -36,7 +38,8 @@ def batch_mode():
|
|||||||
# for test
|
# for test
|
||||||
# cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
|
# cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
|
||||||
# workflow, os.path.join(pe_dir, pe))
|
# workflow, os.path.join(pe_dir, pe))
|
||||||
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(workflow, os.path.join(pe_dir, pe))
|
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
|
||||||
|
workflow, os.path.join(pe_dir, pe))
|
||||||
|
|
||||||
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
|
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
|
||||||
p.start()
|
p.start()
|
||||||
@ -51,7 +54,8 @@ def batch_mode():
|
|||||||
|
|
||||||
if flag_kill:
|
if flag_kill:
|
||||||
subprocess.call('taskkill /im idaq64.exe /f')
|
subprocess.call('taskkill /im idaq64.exe /f')
|
||||||
process_log.write("index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
|
process_log.write(
|
||||||
|
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
|
||||||
else:
|
else:
|
||||||
# 正常运行结束
|
# 正常运行结束
|
||||||
log.truncate(0)
|
log.truncate(0)
|
||||||
@ -59,6 +63,14 @@ def batch_mode():
|
|||||||
log.write(str(index))
|
log.write(str(index))
|
||||||
log.flush()
|
log.flush()
|
||||||
process_log.write("index {}, {} process done.\n".format(index, pe))
|
process_log.write("index {}, {} process done.\n".format(index, pe))
|
||||||
|
# 一次workflow结束后将所有副产物删除
|
||||||
|
delete_output()
|
||||||
|
|
||||||
|
|
||||||
|
def delete_output(out_dir='F:\\iout'):
    """Empty the IDA output directory by deleting and recreating it.

    Called after a workflow finishes so that the by-products written to the
    ``-o`` output location do not accumulate between workflows.

    Args:
        out_dir: Directory to reset. Defaults to the output location used on
            the IDA command line in this script.

    Raises:
        OSError: If the directory tree cannot be removed or recreated
            (for example, a file inside it is still locked by a process).
    """
    # Local import: the file's top-level import block is not guaranteed to
    # provide shutil.
    import shutil

    # os.rmdir only removes *empty* directories, so it fails precisely when
    # there are by-products to delete. rmtree removes the whole tree.
    if os.path.isdir(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
|
||||||
|
|
||||||
|
|
||||||
# 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库
|
# 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库
|
||||||
|
@ -15,7 +15,7 @@ def print_obj(obj):
|
|||||||
# sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant,可能是间接引用的,不识别。看了下所有函数的特征,几乎都没有字符串常量,可能都是写在别的地方然后引用的。
|
# sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant,可能是间接引用的,不识别。看了下所有函数的特征,几乎都没有字符串常量,可能都是写在别的地方然后引用的。
|
||||||
# sub_166C4 393
|
# sub_166C4 393
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected0_cfg\\VirusShare_cd53c6637ca75ac5fc1cbe6d2ced41a1.ida"
|
testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected11_cfg\\VirusShare_5c088a2a6e0391b7c6ab22e4648eab3a.ida"
|
||||||
fr = open(testpath, 'r')
|
fr = open(testpath, 'r')
|
||||||
data = pickle.load(fr) #一个二进制文件的acfgs
|
data = pickle.load(fr) #一个二进制文件的acfgs
|
||||||
fr.close()
|
fr.close()
|
||||||
|
@ -63,8 +63,22 @@ def clock():
|
|||||||
subprocess.call('taskkill /im idaq64.exe /f')
|
subprocess.call('taskkill /im idaq64.exe /f')
|
||||||
|
|
||||||
|
|
||||||
|
def delete_error(start=0, end=35,
                 log_pattern="D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log",
                 json_dir_pattern="D:\\hkn\\infected\\datasets\\virusshare_infected{}_json"):
    """Delete the .jsonl outputs of samples whose conversion logged an error.

    For every workflow index in ``range(start, end)`` the conversion process
    log is scanned. Each line containing ``'Error occurred'`` names a sample;
    the matching ``<name>.jsonl`` file in that workflow's json directory is
    removed if it exists.

    Args:
        start: First workflow index (inclusive).
        end: Last workflow index (exclusive).
        log_pattern: Format string for the process-log path; ``{}`` receives
            the workflow index.
        json_dir_pattern: Format string for the json output directory; ``{}``
            receives the workflow index.

    Returns:
        The number of .jsonl files that were removed.
    """
    removed = 0
    for workflow in range(start, end):
        convert_log_path = log_pattern.format(workflow)
        json_dir = json_dir_pattern.format(workflow)

        # A workflow without a log has nothing to clean up. Skip it so that
        # one missing log does not abort the cleanup of later workflows.
        if not os.path.isfile(convert_log_path):
            continue

        with open(convert_log_path, 'r') as log:
            for line in log:
                if 'Error occurred' not in line:
                    continue
                # The sample name is the text between the first ", " and the
                # first "." of the line, i.e. the logged file name without
                # its extension.
                name = line[line.find(',') + 2: line.find('.')] + '.jsonl'
                json_path = os.path.join(json_dir, name)
                if os.path.exists(json_path):
                    os.remove(json_path)
                    removed += 1
    return removed
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# gen_dir()
|
# gen_dir()
|
||||||
# change_max_item_lines()
|
# change_max_item_lines()
|
||||||
subprocess.call('taskkill /im idaq64.exe /f')
|
# subprocess.call('taskkill /im idaq64.exe /f')
|
||||||
|
delete_error()
|
||||||
|
Loading…
Reference in New Issue
Block a user