Update test.py
This commit is contained in:
parent
ad2583dba9
commit
f82f488bb3
@ -8,21 +8,7 @@ import random
|
|||||||
import shutil
|
import shutil
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import csv
|
import csv
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
def func():
    """Print the integer tokens found on each edge line of one fixed .dot file.

    Every line of the file that contains ``->`` contributes one list holding
    all standalone digit runs on that line; the list of those lists is
    printed once the whole file has been read.
    """
    dot_file = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot"
    with open(dot_file, 'r') as handle:
        edges = [re.findall(r'\b\d+\b', row) for row in handle if '->' in row]
    print(edges)
|
|
||||||
|
|
||||||
|
|
||||||
def func1():
    """Print each entry name of one fixed directory minus its last four characters."""
    dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"
    for entry in os.listdir(dot_dir):
        # Dropping four characters presumably removes a ".dot" suffix.
        trimmed = entry[:-4]
        print(trimmed)
|
|
||||||
|
|
||||||
|
|
||||||
def create_dir():
|
def create_dir():
|
||||||
@ -118,13 +104,16 @@ def delete_jsonl():
|
|||||||
|
|
||||||
|
|
||||||
def delete_all_local():
|
def delete_all_local():
|
||||||
src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
|
data_dirs = ['D:\\hkn\\infected\\datasets\\virusshare_train\\1',
|
||||||
dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
|
'D:\\hkn\\infected\\datasets\\virusshare_train\\2',
|
||||||
'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
|
'D:\\hkn\\infected\\datasets\\virusshare_train\\3',
|
||||||
for d in dirs:
|
'D:\\hkn\\infected\\datasets\\virusshare_train\\4',
|
||||||
path = os.path.join(src, d)
|
'D:\\hkn\\infected\\datasets\\virusshare_train\\5',
|
||||||
for f in os.listdir(path):
|
]
|
||||||
os.remove(os.path.join(path, f))
|
for d in data_dirs:
|
||||||
|
path = os.listdir(d)
|
||||||
|
for f in path:
|
||||||
|
os.remove(os.path.join(d, f))
|
||||||
|
|
||||||
|
|
||||||
# 重命名pt文件使之与代码相符
|
# 重命名pt文件使之与代码相符
|
||||||
@ -140,35 +129,20 @@ def rename(mal_or_be, postfix):
|
|||||||
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
|
os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
|
||||||
|
|
||||||
|
|
||||||
def split_samples(flag):
|
def split_data_by_label():
|
||||||
postfix = ''
|
all = 'D:\\hkn\\infected\\datasets\\virusshare_train\\all_pt'
|
||||||
if flag == 'one_family':
|
dest = 'D:\\hkn\\infected\\datasets\\virusshare_train'
|
||||||
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
|
csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
|
||||||
tag = 'malware'
|
with open(csv_path, 'r') as label:
|
||||||
elif flag == 'standard':
|
label.readline()
|
||||||
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
|
labels = label.readlines()
|
||||||
postfix = '_backup'
|
for lines in labels:
|
||||||
tag = 'malware'
|
name, cls = lines.strip().split(',')
|
||||||
elif flag == 'benign':
|
fpath = os.path.join(all, name + '.pt')
|
||||||
path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
|
if os.path.exists(fpath):
|
||||||
tag = 'benign'
|
shutil.move(fpath, os.path.join(dest, cls))
|
||||||
else:
|
else:
|
||||||
return
|
print(fpath, 'file not exist.')
|
||||||
|
|
||||||
out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
|
|
||||||
os_list = os.listdir(path)
|
|
||||||
random.shuffle(os_list)
|
|
||||||
# 8/1/1 分数据
|
|
||||||
train_len = int(len(os_list) * 0.8)
|
|
||||||
test_len = int(train_len / 8)
|
|
||||||
for index, f in enumerate(os_list):
|
|
||||||
if index < train_len:
|
|
||||||
shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
|
|
||||||
elif train_len <= index < train_len + test_len:
|
|
||||||
shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
|
|
||||||
else:
|
|
||||||
shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
|
|
||||||
rename(tag, postfix)
|
|
||||||
|
|
||||||
|
|
||||||
def half_divide():
|
def half_divide():
|
||||||
@ -277,8 +251,107 @@ def generate_benign_csv():
|
|||||||
writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})
|
writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})
|
||||||
|
|
||||||
|
|
||||||
|
def process_csv(csv_path='F:\\kkk\\dataset\\virusshare_AllLabel.csv',
                pe_dir='D:\\hkn\\infected\\datasets\\virusshare_train\\pe'):
    """Filter a label CSV down to the samples present in a directory, in place.

    The CSV is loaded, rows whose ``Id`` is not the name of an entry in
    ``pe_dir`` are dropped, duplicate ``Id`` rows are removed (first one
    kept), ``'VirusShare_'`` is prepended to every remaining ``Id`` and the
    result overwrites ``csv_path``.

    Args:
        csv_path: CSV file with an ``Id`` column; it is rewritten in place.
        pe_dir: Directory whose entry names are the Ids to keep.
    """
    files = os.listdir(pe_dir)
    print(len(files))
    # The frame has to be loaded before it can be filtered; without this
    # read the function fails with UnboundLocalError on its first use of df.
    df = pd.read_csv(csv_path)
    df = df[df['Id'].isin(files)]
    df = df.drop_duplicates('Id')
    # NOTE(review): Ids are matched against the directory *before* the prefix
    # is added, so a second run on the rewritten file will presumably no
    # longer match the same names -- confirm the intended naming.
    df['Id'] = 'VirusShare_' + df['Id']
    df.to_csv(csv_path, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_virusshare_csv():
    """Write an ``Id,Class`` CSV for samples belonging to four known families.

    Each file in the family directory is read as tab-separated
    ``md5<TAB>label`` lines. A row ``VirusShare_<md5>,<class>`` is written
    when the label is one of the keys of ``index`` and a file named
    ``VirusShare_<md5>`` exists in the PE directory.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    index = {'wacatac': 1, 'ulpm': 2, 'fugrafa': 3, 'redcap': 4}
    fieldnames = ['Id', 'Class']
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    family_dir = 'D:\\hkn\\infected\\datasets\\virusshare_family'
    csv_out = 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    # csv writers emit str, so the file must be opened in text mode ("wb"
    # raises TypeError on Python 3); newline='' leaves line endings to csv.
    with open(csv_out, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in tqdm(os.listdir(family_dir)):
            with open(os.path.join(family_dir, f), 'r') as family:
                for line in family:
                    line = line.strip()
                    if not line:
                        # A blank (e.g. trailing) line carries no record.
                        continue
                    md5, label = line.split('\t')
                    if label not in index:
                        continue
                    sample = 'VirusShare_' + md5
                    if os.path.exists(os.path.join(pe_dir, sample)):
                        writer.writerow({fieldnames[0]: sample, fieldnames[1]: index[label]})
|
||||||
|
|
||||||
|
|
||||||
|
def findlostone():
    """Print every entry of the PE directory lacking a ``<name>.asm`` file in the ASM directory."""
    asm_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\asm'
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    for sample in os.listdir(pe_dir):
        expected_asm = os.path.join(asm_dir, sample + '.asm')
        if os.path.exists(expected_asm):
            continue
        print(sample)
|
||||||
|
|
||||||
|
|
||||||
|
def find_pe_in_original_set():
    """Print the index of the first numbered dataset directory holding one fixed sample.

    Directories 0 through 68 are scanned in order; the search stops at the
    first directory containing an entry whose name, minus its last six
    characters, equals the wanted sample name.
    """
    wanted = 'VirusShare_0f07b29873cf503a0fb69fa064ce76a3'
    for workflow in range(69):
        data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        entries = os.listdir(data_dir)
        # Six characters are cut off each name, presumably a ".jsonl" suffix.
        if any(name[:-6] == wanted for name in entries):
            print(workflow)
            return
|
||||||
|
|
||||||
|
|
||||||
|
def select_jsonl(csv_paths='F:\\kkk\\dataset\\virusshare_family.csv',
                 jsonl_dir='D:\\hkn\\infected\\datasets\\virusshare_train\\malware_jsonl',
                 data_dir_pattern='D:\\hkn\\infected\\datasets\\virusshare_infected{}_json',
                 workflows=range(0, 69)):
    """Copy the files named in a label CSV from the numbered dataset directories.

    A file is copied into ``jsonl_dir`` when its name, minus its last six
    characters, is equal to any field of any row of the CSV.

    Args:
        csv_paths: Comma-delimited CSV listing the wanted sample names.
        jsonl_dir: Existing directory that receives the copies.
        data_dir_pattern: ``str.format`` pattern taking the workflow number
            and yielding a source directory.
        workflows: Iterable of numbers substituted into ``data_dir_pattern``.
    """
    with open(csv_paths, 'r', newline='') as csv_path:
        # Collect every field once so each file needs a single set lookup
        # instead of a scan over all CSV rows.
        wanted = {field
                  for row in csv.reader(csv_path, delimiter=',')
                  for field in row}
    for workflow in workflows:
        data_dir = data_dir_pattern.format(workflow)
        for f in os.listdir(data_dir):
            # Six characters are cut off the name, presumably ".jsonl".
            if f[:-6] in wanted:
                shutil.copy(os.path.join(data_dir, f), jsonl_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_csv(pe_dir='D:\\hkn\\infected\\datasets\\virusshare_train\\5\\pe',
                 csv_path='D:\\hkn\\infected\\datasets\\virusshare_train\\5\\virusshare_5.csv'):
    """Write an ``Id,Class`` CSV that labels every entry of a directory as class 5.

    Args:
        pe_dir: Directory whose entry names become the ``Id`` values.
        csv_path: Output CSV file; overwritten if it exists.
    """
    fieldnames = ['Id', 'Class']
    # csv writers emit str, so the file must be opened in text mode ("wb"
    # raises TypeError on Python 3); newline='' leaves line endings to csv.
    with open(csv_path, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pe in os.listdir(pe_dir):
            writer.writerow({fieldnames[0]: pe, fieldnames[1]: 5})
|
||||||
|
|
||||||
|
|
||||||
|
def merge_csvs(cs, out):
    """Join several CSV files on their ``Id`` column and write the result.

    The first file seeds the result; each later file is merged in with
    ``pd.merge(..., on='Id')``, which by default keeps only Ids present on
    both sides. The newly read file is the left operand, so its columns
    come before the ones accumulated so far.

    Args:
        cs: Iterable of CSV paths, each containing an ``Id`` column.
        out: Path of the CSV to write (without the index column).

    Raises:
        ValueError: if ``cs`` is empty, since there is nothing to merge.
    """
    paths = list(cs)
    if not paths:
        # Previously this case died with UnboundLocalError at the final write.
        raise ValueError('merge_csvs() needs at least one CSV path')
    merged = pd.read_csv(paths[0])
    for c in paths[1:]:
        merged = pd.merge(pd.read_csv(c), merged, on='Id')
    merged.to_csv(out, index=False)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
generate_benign_csv()
|
# find_pe_in_original_set()
|
||||||
|
# split_data_by_label()
|
||||||
|
# select_jsonl()
|
||||||
|
# findlostone()
|
||||||
|
# generate_csv()
|
||||||
|
# generate_virusshare_csv()
|
||||||
|
# merge_csvs([
|
||||||
|
# 'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_1_compliment.csv',
|
||||||
|
# 'D:\\hkn\\infected\\datasets\\virusshare_family.csv',
|
||||||
|
# 'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_5.csv',
|
||||||
|
# ],
|
||||||
|
# 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
|
||||||
|
# )
|
||||||
|
process_csv()
|
||||||
|
# generate_benign_csv()
|
||||||
# create_pixel_intensity()
|
# create_pixel_intensity()
|
||||||
# create_dir()
|
# create_dir()
|
||||||
# change_max_item_lines()
|
# change_max_item_lines()
|
||||||
@ -294,7 +367,7 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
# 指定 'standard' or 'benign' or 'one_family'
|
# 指定 'standard' or 'benign' or 'one_family'
|
||||||
# standard表示处理所有恶意样本
|
# standard表示处理所有恶意样本
|
||||||
# split_samples('standard')
|
# split_samples()
|
||||||
# one_family表示仅处理一个家族,仅用于测试原模型的二分类
|
# one_family表示仅处理一个家族,仅用于测试原模型的二分类
|
||||||
# split_samples('one_family')
|
# split_samples('one_family')
|
||||||
# benign表示处理良性样本
|
# benign表示处理良性样本
|
||||||
|
Loading…
Reference in New Issue
Block a user