diff --git a/Genius3/raw-feature-extractor/test.py b/Genius3/raw-feature-extractor/test.py index 7a7ad41..3a5a06b 100644 --- a/Genius3/raw-feature-extractor/test.py +++ b/Genius3/raw-feature-extractor/test.py @@ -8,21 +8,7 @@ import random import shutil from tqdm import tqdm import csv - - -def func(): - path = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot" - result = [] - with open(path, 'r') as f: - for line in f: - if '->' in line: - result.append(re.findall(r'\b\d+\b', line)) - print(result) - - -def func1(): - for f in os.listdir("D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"): - print(f[:-4]) +import pandas as pd def create_dir(): @@ -118,13 +104,16 @@ def delete_jsonl(): def delete_all_local(): - src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt' - dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign', - 'train_malware_backup', 'test_malware_backup', 'valid_malware_backup'] - for d in dirs: - path = os.path.join(src, d) - for f in os.listdir(path): - os.remove(os.path.join(path, f)) + data_dirs = ['D:\\hkn\\infected\\datasets\\virusshare_train\\1', + 'D:\\hkn\\infected\\datasets\\virusshare_train\\2', + 'D:\\hkn\\infected\\datasets\\virusshare_train\\3', + 'D:\\hkn\\infected\\datasets\\virusshare_train\\4', + 'D:\\hkn\\infected\\datasets\\virusshare_train\\5', + ] + for d in data_dirs: + path = os.listdir(d) + for f in path: + os.remove(os.path.join(d, f)) # 重命名pt文件使之与代码相符 @@ -140,35 +129,20 @@ def rename(mal_or_be, postfix): os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index))) -def split_samples(flag): - postfix = '' - if flag == 'one_family': - path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware' - tag = 'malware' - elif flag == 'standard': - path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all' - postfix = '_backup' - tag = 'malware' - elif flag == 'benign': - path = 
'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign' - tag = 'benign' - else: - return - - out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt' - os_list = os.listdir(path) - random.shuffle(os_list) - # 8/1/1 分数据 - train_len = int(len(os_list) * 0.8) - test_len = int(train_len / 8) - for index, f in enumerate(os_list): - if index < train_len: - shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix)) - elif train_len <= index < train_len + test_len: - shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix)) - else: - shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix)) - rename(tag, postfix) +def split_data_by_label(): + all = 'D:\\hkn\\infected\\datasets\\virusshare_train\\all_pt' + dest = 'D:\\hkn\\infected\\datasets\\virusshare_train' + csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv' + with open(csv_path, 'r') as label: + label.readline() + labels = label.readlines() + for lines in labels: + name, cls = lines.strip().split(',') + fpath = os.path.join(all, name + '.pt') + if os.path.exists(fpath): + shutil.move(fpath, os.path.join(dest, cls)) + else: + print(fpath, 'file not exist.') def half_divide(): @@ -277,8 +251,108 @@ def generate_benign_csv(): writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'}) +def process_csv(): + csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv' + files = os.listdir('D:\\hkn\\infected\\datasets\\virusshare_train\\pe') + print(files.__len__()) + df = pd.read_csv(csv_path) + df = df[df['Id'].isin(files)] + df = df.drop_duplicates('Id') + df['Id'] = 'VirusShare_' + df['Id'] + df.to_csv(csv_path, index=False) + + +def generate_virusshare_csv(): + index = {'wacatac': 1, 'ulpm': 2, 'fugrafa': 3, 'redcap': 4} + fieldnames = ['Id', 'Class'] + pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe' + family_dir = 'D:\\hkn\\infected\\datasets\\virusshare_family' + csv_out = 'D:\\hkn\\infected\\datasets\\virusshare_family.csv' + with open(csv_out, "wb")
as output_file: + writer = csv.DictWriter(output_file, fieldnames=fieldnames) + writer.writeheader() + for f in tqdm(os.listdir(family_dir)): + with open(os.path.join(family_dir, f), 'r') as family: + lines = family.readlines() + for line in lines: + md5, label = line.strip().split('\t') + if label in index: + if os.path.exists(os.path.join(pe_dir, 'VirusShare_' + md5)): + writer.writerow({fieldnames[0]: 'VirusShare_' + md5, fieldnames[1]: index[label]}) + + +def findlostone(): + pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe' + asm_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\asm' + for f in os.listdir(pe_dir): + if not os.path.exists(os.path.join(asm_dir, f + '.asm')): + print(f) + + +def find_pe_in_original_set(): + for workflow in range(0, 69): + data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow) + for f in os.listdir(data_dir): + if f[:-6] == 'VirusShare_0f07b29873cf503a0fb69fa064ce76a3': + print(workflow) + return + + +def select_jsonl(): + csv_paths = 'F:\\kkk\\dataset\\virusshare_family.csv' + jsonl_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\malware_jsonl' + + with open(csv_paths, 'r') as csv_path: + labels = csv.reader(csv_path, delimiter=',') + data = list(labels) + for workflow in range(0, 69): + data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow) + for f in os.listdir(data_dir): + for line in data: + if f[:-6] in line: + shutil.copy(os.path.join(data_dir, f), jsonl_dir) + break + + +def generate_csv(): + pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\pe' + csv_path = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\virusshare_5.csv' + fieldnames = ['Id', 'Class'] + with open(csv_path, "wb") as output_file: + writer = csv.DictWriter(output_file, fieldnames=fieldnames) + writer.writeheader() + for pe in os.listdir(pe_dir): + writer.writerow({fieldnames[0]: pe, fieldnames[1]: 5}) + + +def merge_csvs(cs, out): + for i, c in enumerate(cs): + 
if i == 0: + merged = pd.read_csv(c) + else: + merged = pd.merge(pd.read_csv(c), merged, on='Id') + # merged = pd.concat([merged, pd.read_csv(c)]) + + # if 'Class' in merged: + # merged['Class'] = merged['Class'] - 1 + merged.to_csv(out, index=False) + if __name__ == '__main__': - generate_benign_csv() + # find_pe_in_original_set() + # split_data_by_label() + # select_jsonl() + # findlostone() + # generate_csv() + # generate_virusshare_csv() + # merge_csvs([ + # 'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_1_compliment.csv', + # 'D:\\hkn\\infected\\datasets\\virusshare_family.csv', + # 'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_5.csv', + # ], + # 'D:\\hkn\\infected\\datasets\\virusshare_family.csv' + # ) + process_csv() + # generate_benign_csv() # create_pixel_intensity() # create_dir() # change_max_item_lines() @@ -294,7 +367,7 @@ if __name__ == '__main__': # 指定 'standard' or 'benign' or 'one_family' # standard表示处理所有恶意样本 - # split_samples('standard') + # split_samples() # one_family表示仅处理一个家族,仅用于测试原模型的二分类 # split_samples('one_family') # benign表示处理良性样本