Update test.py

This commit is contained in:
TinyCaviar 2023-11-24 09:43:46 +08:00
parent ad2583dba9
commit f82f488bb3

View File

@ -8,21 +8,7 @@ import random
import shutil import shutil
from tqdm import tqdm from tqdm import tqdm
import csv import csv
import pandas as pd
def func():
    """Debug helper: print the numeric node ids found on every edge ('->') line of one .dot file."""
    dot_file = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot"
    with open(dot_file, 'r') as handle:
        # one list of ids per edge line, in file order
        result = [re.findall(r'\b\d+\b', text) for text in handle if '->' in text]
    print(result)
def func1():
    """Debug helper: print each file name in the infected-dot directory with its 4-char extension stripped."""
    directory = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"
    for entry in os.listdir(directory):
        print(entry[:-4])
def create_dir(): def create_dir():
@ -118,13 +104,16 @@ def delete_jsonl():
def delete_all_local(): def delete_all_local():
src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt' data_dirs = ['D:\\hkn\\infected\\datasets\\virusshare_train\\1',
dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign', 'D:\\hkn\\infected\\datasets\\virusshare_train\\2',
'train_malware_backup', 'test_malware_backup', 'valid_malware_backup'] 'D:\\hkn\\infected\\datasets\\virusshare_train\\3',
for d in dirs: 'D:\\hkn\\infected\\datasets\\virusshare_train\\4',
path = os.path.join(src, d) 'D:\\hkn\\infected\\datasets\\virusshare_train\\5',
for f in os.listdir(path): ]
os.remove(os.path.join(path, f)) for d in data_dirs:
path = os.listdir(d)
for f in path:
os.remove(os.path.join(d, f))
# 重命名pt文件使之与代码相符 # 重命名pt文件使之与代码相符
@ -140,35 +129,20 @@ def rename(mal_or_be, postfix):
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index))) os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
def split_samples(flag): def split_data_by_label():
postfix = '' all = 'D:\\hkn\\infected\\datasets\\virusshare_train\\all_pt'
if flag == 'one_family': dest = 'D:\\hkn\\infected\\datasets\\virusshare_train'
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware' csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
tag = 'malware' with open(csv_path, 'r') as label:
elif flag == 'standard': label.readline()
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all' labels = label.readlines()
postfix = '_backup' for lines in labels:
tag = 'malware' name, cls = lines.strip().split(',')
elif flag == 'benign': fpath = os.path.join(all, name + '.pt')
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign' if os.path.exists(fpath):
tag = 'benign' shutil.move(fpath, os.path.join(dest, cls))
else: else:
return print(fpath, 'file not exist.')
out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
os_list = os.listdir(path)
random.shuffle(os_list)
# 8/1/1 分数据
train_len = int(len(os_list) * 0.8)
test_len = int(train_len / 8)
for index, f in enumerate(os_list):
if index < train_len:
shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
elif train_len <= index < train_len + test_len:
shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
else:
shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
rename(tag, postfix)
def half_divide(): def half_divide():
@ -277,8 +251,107 @@ def generate_benign_csv():
writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'}) writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})
def process_csv(csv_path='F:\\kkk\\dataset\\virusshare_AllLabel.csv',
                pe_dir='D:\\hkn\\infected\\datasets\\virusshare_train\\pe'):
    """Filter the label CSV down to samples actually present on disk.

    Reads *csv_path* (must have an 'Id' column), keeps only rows whose Id
    matches a file name in *pe_dir*, drops duplicate Ids, prefixes each Id
    with 'VirusShare_', and writes the result back over *csv_path*.

    Bug fix: the original referenced ``df`` before ever loading the CSV
    (guaranteed NameError); the missing ``pd.read_csv`` call is restored.
    Paths are now parameters with the original values as defaults.
    """
    files = os.listdir(pe_dir)
    print(len(files))
    df = pd.read_csv(csv_path)  # was missing: df was used before assignment
    df = df[df['Id'].isin(files)]
    df = df.drop_duplicates('Id')
    df['Id'] = 'VirusShare_' + df['Id']
    df.to_csv(csv_path, index=False)
def generate_virusshare_csv():
    """Build the family-label CSV for VirusShare samples.

    Scans every report file in ``family_dir``; each line is '<md5>\\t<family>'.
    A row is written for each of the four tracked families, but only when the
    corresponding PE file ('VirusShare_' + md5) actually exists in ``pe_dir``.

    Bug fix: the output was opened in binary mode ('wb'), which makes
    ``csv.DictWriter.writerow`` raise TypeError on Python 3; the csv module
    requires a text-mode handle opened with ``newline=''``.
    """
    index = {'wacatac': 1, 'ulpm': 2, 'fugrafa': 3, 'redcap': 4}  # family name -> class id
    fieldnames = ['Id', 'Class']
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    family_dir = 'D:\\hkn\\infected\\datasets\\virusshare_family'
    csv_out = 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    with open(csv_out, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in tqdm(os.listdir(family_dir)):
            with open(os.path.join(family_dir, f), 'r') as family:
                for line in family:
                    md5, label = line.strip().split('\t')
                    # keep only tracked families, and only samples we actually have on disk
                    if label in index and os.path.exists(os.path.join(pe_dir, 'VirusShare_' + md5)):
                        writer.writerow({fieldnames[0]: 'VirusShare_' + md5, fieldnames[1]: index[label]})
def findlostone():
    """Print every PE sample name that lacks a matching '<name>.asm' entry in the asm directory."""
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    asm_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\asm'
    # snapshot the asm directory once; membership test replaces per-file exists() calls
    present = set(os.listdir(asm_dir))
    for sample in os.listdir(pe_dir):
        if sample + '.asm' not in present:
            print(sample)
def find_pe_in_original_set():
    """Search the 69 virusshare_infected{N}_json directories for one sample and print the N that holds it."""
    target = 'VirusShare_0f07b29873cf503a0fb69fa064ce76a3'
    for workflow in range(69):
        data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        # file names carry a 6-char suffix ('.jsonl'); strip it before comparing
        if any(entry[:-6] == target for entry in os.listdir(data_dir)):
            print(workflow)
            return
def select_jsonl():
    """Copy the .jsonl reports whose sample name appears in the family CSV.

    Performance fix: the original rescanned the entire CSV row list for every
    file (O(files x rows)). All cell values are collected into a set once, so
    each lookup is O(1); membership in the set is equivalent to the original
    per-row ``f[:-6] in row`` test.
    """
    csv_paths = 'F:\\kkk\\dataset\\virusshare_family.csv'
    jsonl_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\malware_jsonl'
    with open(csv_paths, 'r') as csv_path:
        labels = csv.reader(csv_path, delimiter=',')
        data = list(labels)
    # every cell value of every row — same values the nested scan compared against
    wanted = {cell for row in data for cell in row}
    for workflow in range(0, 69):
        data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for f in os.listdir(data_dir):
            if f[:-6] in wanted:  # f[:-6] strips the '.jsonl' suffix
                shutil.copy(os.path.join(data_dir, f), jsonl_dir)
def generate_csv(pe_dir='D:\\hkn\\infected\\datasets\\virusshare_train\\5\\pe',
                 csv_path='D:\\hkn\\infected\\datasets\\virusshare_train\\5\\virusshare_5.csv'):
    """Write a label CSV assigning class 5 to every file found in *pe_dir*.

    Bug fix: the output was opened 'wb'; ``csv.DictWriter`` needs a text-mode
    handle with ``newline=''`` on Python 3, otherwise writerow raises
    TypeError. Paths are now parameters with the original values as defaults.
    """
    fieldnames = ['Id', 'Class']
    with open(csv_path, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pe in os.listdir(pe_dir):
            writer.writerow({fieldnames[0]: pe, fieldnames[1]: 5})
def merge_csvs(cs, out):
    """Inner-join the CSV files listed in *cs* on their 'Id' column and write the result to *out*.

    Each subsequent file is merged as the LEFT operand of the join, matching
    the original accumulation order (later files' columns come first).
    """
    merged = None
    for position, path in enumerate(cs):
        frame = pd.read_csv(path)
        merged = frame if position == 0 else pd.merge(frame, merged, on='Id')
    merged.to_csv(out, index=False)
if __name__ == '__main__': if __name__ == '__main__':
generate_benign_csv() # find_pe_in_original_set()
# split_data_by_label()
# select_jsonl()
# findlostone()
# generate_csv()
# generate_virusshare_csv()
# merge_csvs([
# 'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_1_compliment.csv',
# 'D:\\hkn\\infected\\datasets\\virusshare_family.csv',
# 'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_5.csv',
# ],
# 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
# )
process_csv()
# generate_benign_csv()
# create_pixel_intensity() # create_pixel_intensity()
# create_dir() # create_dir()
# change_max_item_lines() # change_max_item_lines()
@ -294,7 +367,7 @@ if __name__ == '__main__':
# 指定 'standard' or 'benign' or 'one_family' # 指定 'standard' or 'benign' or 'one_family'
# standard表示处理所有恶意样本 # standard表示处理所有恶意样本
# split_samples('standard') # split_samples()
# one_family表示仅处理一个家族仅用于测试原模型的二分类 # one_family表示仅处理一个家族仅用于测试原模型的二分类
# split_samples('one_family') # split_samples('one_family')
# benign表示处理良性样本 # benign表示处理良性样本