detect_rep/data_extract/data_pretreat.py
2023-04-05 10:04:49 +08:00

66 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
def csv_read(malware_csv='../cfg_data/malware_msg.csv'):
    """Load per-sample CFG records from a CSV file.

    The first row is treated as a header and discarded. For every remaining
    non-blank row the first three columns are kept; judging by the names used
    in this module they are the sample file name, the node count and the edge
    count.

    Args:
        malware_csv: Path of the CSV file to read (UTF-8 encoded).

    Returns:
        A list of ``[file_name, nodes_num, edgs_num]`` lists. Every value is
        the string read from the file. The list is empty when the file has no
        data rows (including a completely empty file).

    Raises:
        IndexError: If a non-blank data row has fewer than three columns.
    """
    malware_cfg_list = []
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the csv reader rather than by open().
    with open(malware_csv, 'r', encoding='utf-8', newline='') as f:
        # csv.reader turns the file object into an iterator over parsed rows.
        reader = csv.reader(f)
        # Consume the header row. The default value keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for a blank line; skip those
            # instead of failing on row[0].
            if not row:
                continue
            malware_cfg_list.append([row[0], row[1], row[2]])
    return malware_cfg_list
# Remove CFGs whose node count is below 15 or above 10000.
def remove_small_big_cfg(csv_data, min_nodes=15, max_nodes=10000):
    """Keep only the records whose node count lies within the given bounds.

    Args:
        csv_data: Iterable of records where ``item[1]`` is the node count,
            given as an int or as a string that ``int()`` can parse.
        min_nodes: Smallest node count that is kept (inclusive). Defaults
            to 15.
        max_nodes: Largest node count that is kept (inclusive). Defaults
            to 10000.

    Returns:
        A new list with the records that passed the filter, in their original
        order. The input is not modified.

    Raises:
        ValueError: If a node count cannot be converted with ``int()``.
    """
    # Chained comparison converts each node count only once.
    return [item for item in csv_data
            if min_nodes <= int(item[1]) <= max_nodes]
# Remove duplicate samples.
def remove_repeat_sample(csv_data):
    """Drop records that duplicate an earlier record.

    Two records count as duplicates when both ``item[1]`` and ``item[2]``
    are equal (the node count and edge count, going by the names used
    elsewhere in this module). The first record of each group is kept and the
    original order is preserved.

    Args:
        csv_data: Iterable of records with hashable values at index 1 and 2
            (strings when the data comes from ``csv_read``).

    Returns:
        A new list containing the first occurrence of every distinct
        ``(item[1], item[2])`` pair. The input is not modified.
    """
    # A set of already-seen keys gives O(1) duplicate checks, replacing a
    # rescan of the whole result list for every record.
    seen = set()
    pretreat_data = []
    for sample in csv_data:
        key = (sample[1], sample[2])
        if key in seen:
            continue
        seen.add(key)
        pretreat_data.append(sample)
    return pretreat_data
def write_csv(pretreat_data, csv_save_path="../cfg_data/malware_pretreat_msg.csv", header=('malware_name', 'nodes_num', 'edgs_num')):
    """Write the records to a CSV file, preceded by a header row.

    Args:
        pretreat_data: Iterable of rows; each row is an iterable of values.
        csv_save_path: Destination path. An existing file is overwritten.
        header: Column names written as the first row. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        None.
    """
    # newline='' lets the csv writer control line endings itself.
    with open(csv_save_path, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(header)
        writer.writerows(pretreat_data)
if __name__ == '__main__':
    # Each job: (label printed with the final count, input CSV, output CSV).
    # The same pipeline runs for the malware set first, then the benign set.
    jobs = (
        ("malware_len",
         '../cfg_data_with_feature/malware_msg1.csv',
         "../cfg_data_with_feature/malware_pretreat_msg1.csv"),
        ("benign_len",
         '../cfg_data_with_feature/benign_msg1.csv',
         "../cfg_data_with_feature/benign_pretreat_msg1.csv"),
    )
    for label, source_path, target_path in jobs:
        # Read, filter by node count, drop duplicates, then save the result.
        records = csv_read(source_path)
        records = remove_small_big_cfg(records)
        records = remove_repeat_sample(records)
        write_csv(records, csv_save_path=target_path)
        print(label, len(records))
    print("good")