Update test.py
parent ad2583dba9
commit f82f488bb3
@@ -8,21 +8,7 @@ import random
import shutil
from tqdm import tqdm
import csv


def func():
    path = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot"
    result = []
    with open(path, 'r') as f:
        for line in f:
            if '->' in line:
                result.append(re.findall(r'\b\d+\b', line))
    print(result)


def func1():
    for f in os.listdir("D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"):
        print(f[:-4])
import pandas as pd


def create_dir():
@@ -118,13 +104,16 @@ def delete_jsonl():


def delete_all_local():
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
            'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
    for d in dirs:
        path = os.path.join(src, d)
        for f in os.listdir(path):
            os.remove(os.path.join(path, f))
    data_dirs = ['D:\\hkn\\infected\\datasets\\virusshare_train\\1',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\2',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\3',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\4',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\5',
                 ]
    for d in data_dirs:
        path = os.listdir(d)
        for f in path:
            os.remove(os.path.join(d, f))


# rename the .pt files so they match the naming the code expects
@@ -140,35 +129,20 @@ def rename(mal_or_be, postfix):
        os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))


def split_samples(flag):
    postfix = ''
    if flag == 'one_family':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
        tag = 'malware'
    elif flag == 'standard':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
        postfix = '_backup'
        tag = 'malware'
    elif flag == 'benign':
        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
        tag = 'benign'
def split_data_by_label():
    all = 'D:\\hkn\\infected\\datasets\\virusshare_train\\all_pt'
    dest = 'D:\\hkn\\infected\\datasets\\virusshare_train'
    csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
    with open(csv_path, 'r') as label:
        label.readline()
        labels = label.readlines()
        for lines in labels:
            name, cls = lines.strip().split(',')
            fpath = os.path.join(all, name + '.pt')
            if os.path.exists(fpath):
                shutil.move(fpath, os.path.join(dest, cls))
            else:
                return

    out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    os_list = os.listdir(path)
    random.shuffle(os_list)
    # split the data 8/1/1 into train/test/valid
    train_len = int(len(os_list) * 0.8)
    test_len = int(train_len / 8)
    for index, f in enumerate(os_list):
        if index < train_len:
            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
        elif train_len <= index < train_len + test_len:
            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
        else:
            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
    rename(tag, postfix)
                print(fpath, 'file not exist.')


def half_divide():
@@ -277,8 +251,107 @@ def generate_benign_csv():
            writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})


def process_csv():
    csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
    files = os.listdir('D:\\hkn\\infected\\datasets\\virusshare_train\\pe')
    print(files.__len__())
    df = pd.read_csv(csv_path)  # assumed: the label CSV has to be loaded first; df was otherwise used before being defined
    df = df[df['Id'].isin(files)]
    df = df.drop_duplicates('Id')
    df['Id'] = 'VirusShare_' + df['Id']
    df.to_csv(csv_path, index=False)


def generate_virusshare_csv():
    index = {'wacatac': 1, 'ulpm': 2, 'fugrafa': 3, 'redcap': 4}
    fieldnames = ['Id', 'Class']
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    family_dir = 'D:\\hkn\\infected\\datasets\\virusshare_family'
    csv_out = 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    with open(csv_out, 'w', newline='') as output_file:  # csv writers need text mode on Python 3; was "wb"
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in tqdm(os.listdir(family_dir)):
            with open(os.path.join(family_dir, f), 'r') as family:
                lines = family.readlines()
                for line in lines:
                    md5, label = line.strip().split('\t')
                    if label in index:
                        if os.path.exists(os.path.join(pe_dir, 'VirusShare_' + md5)):
                            writer.writerow({fieldnames[0]: 'VirusShare_' + md5, fieldnames[1]: index[label]})


def findlostone():
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    asm_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\asm'
    for f in os.listdir(pe_dir):
        if not os.path.exists(os.path.join(asm_dir, f + '.asm')):
            print(f)


def find_pe_in_original_set():
    for workflow in range(0, 69):
        data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for f in os.listdir(data_dir):
            if f[:-6] == 'VirusShare_0f07b29873cf503a0fb69fa064ce76a3':
                print(workflow)
                return


def select_jsonl():
    csv_paths = 'F:\\kkk\\dataset\\virusshare_family.csv'
    jsonl_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\malware_jsonl'

    with open(csv_paths, 'r') as csv_path:
        labels = csv.reader(csv_path, delimiter=',')
        data = list(labels)
        for workflow in range(0, 69):
            data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
            for f in os.listdir(data_dir):
                for line in data:
                    if f[:-6] in line:
                        shutil.copy(os.path.join(data_dir, f), jsonl_dir)
                        break


def generate_csv():
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\pe'
    csv_path = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\virusshare_5.csv'
    fieldnames = ['Id', 'Class']
    with open(csv_path, 'w', newline='') as output_file:  # csv writers need text mode on Python 3; was "wb"
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pe in os.listdir(pe_dir):
            writer.writerow({fieldnames[0]: pe, fieldnames[1]: 5})


def merge_csvs(cs, out):
    for i, c in enumerate(cs):
        if i == 0:
            merged = pd.read_csv(c)
        else:
            merged = pd.merge(pd.read_csv(c), merged, on='Id')
            # merged = pd.concat([merged, pd.read_csv(c)])

    # if 'Class' in merged:
    #     merged['Class'] = merged['Class'] - 1
    merged.to_csv(out, index=False)

if __name__ == '__main__':
    generate_benign_csv()
    # find_pe_in_original_set()
    # split_data_by_label()
    # select_jsonl()
    # findlostone()
    # generate_csv()
    # generate_virusshare_csv()
    # merge_csvs([
    #     'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_1_compliment.csv',
    #     'D:\\hkn\\infected\\datasets\\virusshare_family.csv',
    #     'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_5.csv',
    # ],
    #     'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    # )
    process_csv()
    # generate_benign_csv()
    # create_pixel_intensity()
    # create_dir()
    # change_max_item_lines()
@@ -294,7 +367,7 @@ if __name__ == '__main__':

    # pass 'standard', 'benign' or 'one_family'
    # 'standard' means process all malware samples
    # split_samples('standard')
    # split_samples()
    # 'one_family' means process only a single family; used only to test the original model's binary classification
    # split_samples('one_family')
    # 'benign' means process the benign samples
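
A minimal sketch of the 8/1/1 split arithmetic that split_samples applies, with a made-up file count purely for illustration (os_list, train_len and test_len mirror the names in the diff; valid_len and the sample count are introduced here only for the printout):

    os_list = ['sample_{}.pt'.format(i) for i in range(1000)]  # stand-in for os.listdir(path)
    train_len = int(len(os_list) * 0.8)                        # 800 files go to train_*
    test_len = int(train_len / 8)                              # 100 files go to test_*
    valid_len = len(os_list) - train_len - test_len            # the remaining 100 go to valid_*
    print(train_len, test_len, valid_len)                      # 800 100 100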