# Gencoding_Ke/Genius3/raw-feature-extractor/test.py
# coding=utf-8
import re
import os
import subprocess
import time
import json
import random
import shutil
from tqdm import tqdm
import csv


def func():
    path = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot"
    result = []
    with open(path, 'r') as f:
        for line in f:
            if '->' in line:
                result.append(re.findall(r'\b\d+\b', line))
    print(result)
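

# A minimal illustration of the edge parsing above (the .dot line below is a
# hypothetical IDA CFG export, not taken from this dataset): an edge line
# like '"4" -> "7";' yields the source and target basic-block ids.
def _edge_parse_demo():
    sample_line = '"4" -> "7";'
    print(re.findall(r'\b\d+\b', sample_line))  # prints ['4', '7']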


def func1():
    for f in os.listdir("D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"):
        print(f[:-4])


def create_dir():
    parent_dir = "D:\\hkn\\infected\\datasets"
    for workflow in range(40, 70):
        # create the raw-data folders
        infected = "virusshare_infected{}".format(workflow)
        cfg = "virusshare_infected{}_cfg".format(workflow)
        dot = "virusshare_infected{}_dot".format(workflow)
        jsonl = "virusshare_infected{}_json".format(workflow)
        create(parent_dir, infected)
        create(parent_dir, cfg)
        create(parent_dir, dot)
        create(parent_dir, jsonl)
        # iout = "virusshare_infected{}_iout".format(workflow)
        # os.rmdir(os.path.join(parent_dir, iout))
        # os.rmdir(os.path.join(parent_dir, ida))


def create(parent_dir, folder):
    if not os.path.exists(os.path.join(parent_dir, folder)):
        os.mkdir(os.path.join(parent_dir, folder))


def change_max_item_lines():
    # raise IDA's MAX_ITEM_LINES limit so that very large items are not truncated
    f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
    s = f.read()
    f.close()
    index = s.find(b'MAX_ITEM_LINES = 5000')
    news = s.replace(b'MAX_ITEM_LINES = 5000', b'MAX_ITEM_LINES = 50000')
    # print(news[index:index + 50])
    f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'wb')
    f.write(news)
    f.close()


def clock(p):
    # watchdog for the IDA worker: `p` was undefined in the original, so it
    # is made an explicit parameter here; it is expected to be a process
    # handle with an is_alive() method (e.g. a multiprocessing.Process)
    TIMEOUT = 10
    start = time.time()
    flag_kill = True
    while time.time() - start <= TIMEOUT:
        if not p.is_alive():
            flag_kill = False
            break
        else:
            time.sleep(1)  # just to avoid hogging the CPU
    if flag_kill:
        subprocess.call('taskkill /im idaq64.exe /f')
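

# A minimal sketch of how clock() is presumably driven: run one IDA batch job
# per sample in a worker process and let clock() kill idaq64.exe if the job
# hangs. The idaq64 command line and sample_path argument are hypothetical,
# not taken from this repo.
def run_with_watchdog(sample_path):
    from multiprocessing import Process
    worker = Process(target=subprocess.call,
                     args=('idaq64 -A -S"extractor.py" {}'.format(sample_path),))
    worker.start()
    clock(worker)  # force-kills idaq64.exe if the job outlives TIMEOUT
    worker.join()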


def delete_error():
    # drop the .jsonl outputs of samples whose conversion logged an error
    for workflow in range(0, 35):
        convert_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
        json_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
        with open(convert_log_path, 'r') as log:
            for line in log:
                if 'Error occurred' in line:
                    # the sample name sits between ', ' and '.' in the log line
                    name = line[line.find(',') + 2: line.find('.')] + '.jsonl'
                    # print(os.path.join(json_dir, name))
                    if os.path.exists(os.path.join(json_dir, name)):
                        os.remove(os.path.join(json_dir, name))


def check_json():
    print('start checking json')
    for workflow in tqdm(range(0, 69)):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for json_file in os.listdir(json_dir):
            f = open(os.path.join(json_dir, json_file), 'r')
            try:
                data = json.load(f)
            except UnicodeDecodeError:
                continue
            finally:
                f.close()
            if len(data['function_edges'][0]) == 0:
                print("{} {} function_edges null\n".format(workflow, json_file))
            # for acfg in data['acfg_list']:
            #     if acfg['block_number'] != len(acfg['block_features']):
            #         print("{} {}\n".format(workflow, json_file))


# temporary helper: delete all jsonl files
def delete_jsonl():
    for workflow in range(0, 35):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for f in os.listdir(json_dir):
            os.remove(os.path.join(json_dir, f))


def delete_all_local():
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
            'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
    for d in dirs:
        path = os.path.join(src, d)
        for f in os.listdir(path):
            os.remove(os.path.join(path, f))


# rename the .pt files so their names match what the training code expects
def rename(mal_or_be, postfix):
    tag_set = ['train', 'test', 'valid']
    # first pass: prefix every file with 'm' so that the final names assigned
    # in the second pass cannot collide with a file still holding an old name
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
    # second pass: assign the final sequential names
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
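

# Usage note (paths as configured above): after split_samples('benign') has
# filled the train/test/valid folders, rename('benign', '') leaves each folder
# with files named benign_0.pt, benign_1.pt, ... regardless of their old names.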


def split_samples(flag):
    postfix = ''
    if flag == 'one_family':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
        tag = 'malware'
    elif flag == 'standard':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
        postfix = '_backup'
        tag = 'malware'
    elif flag == 'benign':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
        tag = 'benign'
    else:
        return
    out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    os_list = os.listdir(path)
    random.shuffle(os_list)
    # split the data 8/1/1 into train/test/valid
    train_len = int(len(os_list) * 0.8)
    test_len = int(train_len / 8)
    for index, f in enumerate(os_list):
        if index < train_len:
            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
        elif train_len <= index < train_len + test_len:
            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
        else:
            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
    rename(tag, postfix)


def half_divide():
    # alternately copy the .pt files in src into the test and valid folders
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    test = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'
    valid = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'
    flag = True
    for f in os.listdir(src):
        if 'pt' not in f:
            continue
        if flag:
            shutil.copy(os.path.join(src, f), test)
        else:
            shutil.copy(os.path.join(src, f), valid)
        flag = not flag


def copy_train_data():
    # everything that is in neither test nor valid goes to train
    all_dir = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
    dest = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\train_malware'
    train = (set(os.listdir(all_dir))
             - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'))
             - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware')))
    for f in train:
        shutil.copy(os.path.join(all_dir, f), dest)


def clear_dot():
    # delete .dot files that contain neither a 'start' entry point nor any
    # 'sub_' function, i.e. exports with no usable CFG
    for workflow in range(0, 35):
        path = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\'.format(workflow)
        for name in os.listdir(path):
            full = os.path.join(path, name)
            f = open(full, 'r')
            data = f.read()
            f.close()
            if 'start' not in data and 'sub_' not in data:
                os.remove(full)


def read_test():
    dot_file_path = "D:\\hkn\\infected\\datasets\\virusshare_infected23_dot\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.dot"
    with open(dot_file_path, 'r') as dot:
        for line in dot:
            if '->' in line:
                print(re.findall(r'\b\d+\b', line))  # edge: source and target block ids
            elif 'label' in line:
                print(line[line.find('= "') + 3:line.find('",')])  # node label text


# temporary tool: some PE files never went through API-based family
# classification; delete them outright
def del_redundant():
    for workflow in range(0, 68):
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        with open(family_file_path, 'r') as f_file:
            family = f_file.read()
        for name in os.listdir(pe_dir):
            # name[11:] strips the 'VirusShare_' prefix
            if name[11:] not in family:
                os.remove(os.path.join(pe_dir, name))


def delete_pe():
    # print the .dot path for every .cfg file that has no matching .dot export
    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
    dot_list = os.listdir(dot_dir)
    for cfg in os.listdir(cfg_dir):
        name = cfg[:-4] + ".dot"
        if name not in dot_list:
            print(os.path.join(dot_dir, name))
            # os.remove(os.path.join(dot_dir, cfg))


def delete_error_benign():
    # drop every benign sample whose .jsonl output is missing, together with
    # all of its derived files
    jsonl_dir = 'F:\\kkk\\dataset\\benign\\refind_jsonl'
    dot_dir = 'F:\\kkk\\dataset\\benign\\refind_dot'
    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
    asm_dir = "F:\\kkk\\dataset\\benign\\refind_asm"
    pe_dir = "F:\\kkk\\dataset\\benign\\refind"
    alist = os.listdir(pe_dir)
    for f in alist:
        if not os.path.exists(os.path.join(jsonl_dir, f + '.jsonl')):
            os.remove(os.path.join(pe_dir, f))
            if os.path.exists(os.path.join(asm_dir, f + '.asm')):
                os.remove(os.path.join(asm_dir, f + '.asm'))
            if os.path.exists(os.path.join(cfg_dir, f + '.ida')):
                os.remove(os.path.join(cfg_dir, f + '.ida'))
            if os.path.exists(os.path.join(dot_dir, f + '.dot')):
                os.remove(os.path.join(dot_dir, f + '.dot'))


def generate_benign_csv():
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    csv_out = 'F:\\kkk\\dataset\\benign_family.csv'
    fieldnames = ['Id', 'Class']
    # 'wb' is the Python 2 csv idiom; every benign sample gets class label '5'
    with open(csv_out, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in os.listdir(benign_pe_dir):
            writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})
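

# On Python 3 the csv module wants text mode instead of 'wb'; a minimal
# sketch of the same writer, assuming the paths used above:
def generate_benign_csv_py3():
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    csv_out = 'F:\\kkk\\dataset\\benign_family.csv'
    # newline='' lets the csv module control line endings itself
    with open(csv_out, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=['Id', 'Class'])
        writer.writeheader()
        for f in os.listdir(benign_pe_dir):
            writer.writerow({'Id': f, 'Class': '5'})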


if __name__ == '__main__':
    generate_benign_csv()
    # create_pixel_intensity()
    # create_dir()
    # change_max_item_lines()
    # subprocess.call('taskkill /im idaq64.exe /f')
    # delete_error_benign()
    # test()
    # delete_jsonl()
    # delete_all_local()
    # check_json()
    # delete_pe()
    # rename('malware', '_backup')
    # pass 'standard', 'benign' or 'one_family' to split_samples:
    # 'standard' processes all malware samples
    # split_samples('standard')
    # 'one_family' processes a single family only, used solely to test the
    # original model's binary classification
    # split_samples('one_family')
    # 'benign' processes the benign samples
    # split_samples('benign')
    # half_divide()
    # copy_train_data()
    # clear_dot()
    # read_test()
    # del_redundant()