# coding=utf-8
"""One-off housekeeping helpers for the virusshare / benign dataset folders.

Almost every function works on hard-coded Windows paths and is meant to be
enabled by hand in the ``__main__`` section at the bottom of the file.
"""
import re
import os
import subprocess
import sys
import time
import json
import random
import shutil
import csv

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the utilities usable without it.
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: return *iterable* as is."""
        return iterable


def func():
    """Print the integer tokens found on every edge line of one sample .dot file."""
    path = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot"
    with open(path, 'r') as f:
        # Only lines containing '->' are considered.
        result = [re.findall(r'\b\d+\b', line) for line in f if '->' in line]
    print(result)


def func1():
    """Print every entry of the infected0 dot folder minus its last 4 characters."""
    for f in os.listdir("D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"):
        print(f[:-4])


def create_dir():
    """Create the raw-data folders (sample, cfg, dot, json) for workflows 40..69."""
    parent_dir = "D:\\hkn\\infected\\datasets"
    for workflow in range(40, 70):
        # Create the raw-data folders.
        infected = "virusshare_infected{}".format(workflow)
        cfg = "virusshare_infected{}_cfg".format(workflow)
        dot = "virusshare_infected{}_dot".format(workflow)
        jsonl = "virusshare_infected{}_json".format(workflow)

        create(parent_dir, infected)
        create(parent_dir, cfg)
        create(parent_dir, dot)
        create(parent_dir, jsonl)
        # iout = "virusshare_infected{}_iout".format(workflow)
        # os.rmdir(os.path.join(parent_dir, iout))
        # os.rmdir(os.path.join(parent_dir, ida))


def create(parent_dir, folder):
    """Create ``parent_dir/folder`` unless that path already exists."""
    target = os.path.join(parent_dir, folder)
    if not os.path.exists(target):
        os.mkdir(target)


def change_max_item_lines(cfg_path="F:\\kkk\\IDA_6.6\\cfg\\ida.cfg"):
    """Raise the MAX_ITEM_LINES setting in an IDA config file from 5000 to 50000.

    The file is read and written as bytes.  The rewrite is idempotent: a value
    that is already 50000 (or any other number merely starting with "5000")
    is left untouched.

    :param cfg_path: path of the config file to patch in place.
    """
    with open(cfg_path, 'rb') as f:
        content = f.read()
    # The negative lookahead stops "50000" from being matched again and turned
    # into "500000" on a second run.  Spacing around '=' is matched loosely so
    # the column alignment used in the file does not matter.
    patched = re.sub(br'(MAX_ITEM_LINES[ \t]*=[ \t]*)5000(?!\d)', br'\g<1>50000', content)
    with open(cfg_path, 'wb') as f:
        f.write(patched)


def clock():
    """Wait up to TIMEOUT seconds for ``p`` to finish, then kill idaq64.exe.

    NOTE(review): ``p`` is not defined anywhere in this module.  It has to
    exist as a module-level global exposing ``is_alive()`` before this is
    called, otherwise the first loop iteration raises NameError.
    """
    TIMEOUT = 10
    start = time.time()
    flag_kill = True
    while time.time() - start <= TIMEOUT:
        if not p.is_alive():
            flag_kill = False
            break
        else:
            time.sleep(1)  # Just to avoid hogging the CPU

    if flag_kill:
        subprocess.call('taskkill /im idaq64.exe /f')


def delete_error():
    """Delete the .jsonl output of every sample the convert logs flag as failed.

    Processes workflows 0..34.  For each log line containing
    'Error occurred', the sample name is the text between two characters
    after the first comma and the first dot.
    """
    for workflow in range(0, 35):
        convert_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
        json_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)

        with open(convert_log_path, 'r') as log:
            for line in log:
                if 'Error occurred' not in line:
                    continue
                name = line[line.find(',') + 2: line.find('.')] + '.jsonl'
                target = os.path.join(json_dir, name)
                # print(target)
                if os.path.exists(target):
                    os.remove(target)


def check_json():
    """Report json files (workflows 0..68) whose first function_edges entry is empty.

    Files that raise UnicodeDecodeError while being loaded are skipped.
    """
    print('start checking json')
    for workflow in tqdm(range(0, 69)):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for json_file in os.listdir(json_dir):
            try:
                # The context manager closes the file on every path.
                with open(os.path.join(json_dir, json_file), 'r') as f:
                    data = json.load(f)
            except UnicodeDecodeError:
                continue

            if len(data['function_edges'][0]) == 0:
                print("{} {} function_edges null\n".format(workflow, json_file))
                # continue
            # for acfg in data['acfg_list']:
            #     if acfg['block_number'] != len(acfg['block_features']):
            #         print("{} {}\n".format(workflow, json_file))


# Temporary function: delete all jsonl files.
def delete_jsonl():
    """Remove every file in the json folders of workflows 0..34."""
    for workflow in range(0, 35):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for f in os.listdir(json_dir):
            os.remove(os.path.join(json_dir, f))


def delete_all_local():
    """Empty all train/test/valid split folders (including the *_backup ones)."""
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
            'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
    for d in dirs:
        path = os.path.join(src, d)
        for f in os.listdir(path):
            os.remove(os.path.join(path, f))


# Rename the .pt files so that they match what the code expects.
def rename(mal_or_be, postfix):
    """Rename the files of each split folder to ``<mal_or_be>_<index>.pt``.

    Runs in two passes: every file first gets an 'm' prefix, and only then
    are the files numbered.

    :param mal_or_be: label used in the folder name and the new file names.
    :param postfix: suffix of the split folder names (e.g. '' or '_backup').
    """
    tag_set = ['train', 'test', 'valid']
    # Pass 1: move every file to a temporary 'm'-prefixed name.
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for f in os.listdir(data_dir):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
    # Pass 2: assign the final, index-based names.
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))


def split_samples(flag):
    """Shuffle one sample pool and copy it into train/test/valid folders (8/1/1).

    :param flag: 'one_family', 'standard' or 'benign'; any other value makes
        the function return without doing anything.
    """
    postfix = ''
    if flag == 'one_family':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
        tag = 'malware'
    elif flag == 'standard':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
        postfix = '_backup'
        tag = 'malware'
    elif flag == 'benign':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
        tag = 'benign'
    else:
        return

    out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    os_list = os.listdir(path)
    random.shuffle(os_list)
    # Split the data 8/1/1.
    train_len = int(len(os_list) * 0.8)
    test_len = int(train_len / 8)
    for index, f in enumerate(os_list):
        if index < train_len:
            subset = 'train'
        elif index < train_len + test_len:
            subset = 'test'
        else:
            subset = 'valid'
        shutil.copy(os.path.join(path, f), os.path.join(out, '{}_{}'.format(subset, tag) + postfix))
    rename(tag, postfix)


def half_divide():
    """Copy entries of the pt folder whose name contains 'pt' alternately to test and valid."""
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'

    test = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'
    valid = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'

    flag = True
    for f in os.listdir(src):
        if 'pt' not in f:
            continue
        shutil.copy(os.path.join(src, f), test if flag else valid)
        flag = not flag


def copy_train_data():
    """Copy every file of 'all' that is in neither test_malware nor valid_malware to train_malware."""
    all_dir = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
    dest = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\train_malware'
    held_out = set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'))
    held_out |= set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'))
    train = set(os.listdir(all_dir)) - held_out
    for f in train:
        shutil.copy(os.path.join(all_dir, f), dest)


def clear_dot():
    """Delete .dot files (workflows 0..34) containing neither 'start' nor 'sub_'."""
    for workflow in range(0, 35):
        path = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\'.format(workflow)
        for name in os.listdir(path):
            full = os.path.join(path, name)
            # The file is closed before os.remove() is attempted.
            with open(full, 'r') as f:
                data = f.read()
            if 'start' not in data and 'sub_' not in data:
                # print("delete")
                os.remove(full)


def read_test():
    """Print the edges and labels found in one hard-coded sample .dot file."""
    dot_file_path = "D:\\hkn\\infected\\datasets\\virusshare_infected23_dot\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.dot"
    with open(dot_file_path, 'r') as dot:
        for line in dot:
            if '->' in line:
                print(re.findall(r'\b\d+\b', line))
            elif 'label' in line:
                # The label text sits between '= "' and '",'.
                print(line[line.find('= "') + 3:line.find('",')])


# Temporary tool: some PE files were never classified through the API; delete them outright.
def del_redundant():
    """Delete PE files (workflows 0..67) whose name is missing from the family list.

    A file is kept when its name without the first 11 characters occurs
    anywhere in the text of the matching virusshare_family file.
    """
    for workflow in range(0, 68):
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)

        with open(family_file_path, 'r') as f_file:
            family = f_file.read()
            for name in os.listdir(pe_dir):
                if name[11:] not in family:
                    # print(name)
                    os.remove(os.path.join(pe_dir, name))


def delete_pe():
    """Print the expected .dot path of every benign cfg entry that has no .dot file.

    Nothing is deleted: the removal call is commented out.
    """
    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
    dot_names = set(os.listdir(dot_dir))  # set: constant-time membership tests
    for cfg in os.listdir(cfg_dir):
        name = cfg[:-4] + ".dot"
        if name not in dot_names:
            print(os.path.join(dot_dir, name))
            # os.remove(os.path.join(dot_dir, cfg))


def delete_error_benign():
    """Delete benign samples lacking a .jsonl file, together with their artefacts.

    For every entry of the PE folder without ``<name>.jsonl``, the PE file is
    removed, followed by its .asm, .ida and .dot files where they exist.
    """
    jsonl_dir = 'F:\\kkk\\dataset\\benign\\refind_jsonl'
    dot_dir = 'F:\\kkk\\dataset\\benign\\refind_dot'
    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
    asm_dir = "F:\\kkk\\dataset\\benign\\refind_asm"
    pe_dir = "F:\\kkk\\dataset\\benign\\refind"
    # (folder, extension) of the derived files, in the order they are removed.
    derived = [(asm_dir, '.asm'), (cfg_dir, '.ida'), (dot_dir, '.dot')]
    for f in os.listdir(pe_dir):
        if os.path.exists(os.path.join(jsonl_dir, f + '.jsonl')):
            continue
        os.remove(os.path.join(pe_dir, f))
        for folder, extension in derived:
            artefact = os.path.join(folder, f + extension)
            if os.path.exists(artefact):
                os.remove(artefact)


def generate_benign_csv(benign_pe_dir='F:\\kkk\\dataset\\benign\\refind',
                        csv_out='F:\\kkk\\dataset\\benign_family.csv'):
    """Write an ``Id,Class`` CSV listing every entry of the benign PE folder.

    Each entry gets the class value '5'.

    :param benign_pe_dir: folder whose entry names become the ``Id`` column.
    :param csv_out: path of the CSV file to (over)write.
    """
    fieldnames = ['Id', 'Class']
    # The csv module wants a text-mode file opened with newline='' on
    # Python 3, but a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        output_file = open(csv_out, "w", newline="")
    else:
        output_file = open(csv_out, "wb")
    with output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in os.listdir(benign_pe_dir):
            writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})


if __name__ == '__main__':
    generate_benign_csv()
    # create_pixel_intensity()
    # create_dir()
    # change_max_item_lines()
    # subprocess.call('taskkill /im idaq64.exe /f')
    # delete_error_benign()
    # test()
    # delete_jsonl()
    # delete_all_local()
    # check_json()
    # delete_pe()

    # rename('malware', '_backup')

    # Specify 'standard', 'benign' or 'one_family'.
    # 'standard' processes all malware samples.
    # split_samples('standard')
    # 'one_family' processes a single family only; it is used solely to test
    # the binary classification of the original model.
    # split_samples('one_family')
    # 'benign' processes the benign samples.
    # split_samples('benign')

    # half_divide()
    # copy_train_data()
    # clear_dot()
    # read_test()
    # del_redundant()