# detect_rep/data_extract/data_pretreat.py
# Source-viewer header: 66 lines, 2.3 KiB, Python; timestamp 2023-04-05 10:04:49 +08:00
import csv
def csv_read(malware_csv='../cfg_data/malware_msg.csv'):
    """Load per-sample CFG statistics from a CSV file.

    The first row is treated as a header and discarded. From every later row
    the first three columns are taken as file name, node count and edge
    count; any further columns are ignored.

    Args:
        malware_csv: Path of the CSV file to read (UTF-8 encoded).

    Returns:
        A list of ``[file_name, nodes_num, edgs_num]`` lists. All three
        values are strings, exactly as read from the file. The list is empty
        when the file is empty or contains only the header row.
    """
    malware_cfg_list = []
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the parser rather than by the file object.
    with open(malware_csv, 'r', encoding='utf-8', newline='') as f:
        # csv.reader turns the file into an iterator over parsed rows.
        reader = csv.reader(f)
        # Consume the header row; the None default keeps a completely empty
        # file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # Blank lines parse as [] and truncated rows lack columns; neither
            # can supply all three values, so they are skipped.
            if len(row) < 3:
                continue
            file_name, nodes_num, edgs_num = row[:3]
            malware_cfg_list.append([file_name, str(nodes_num), str(edgs_num)])
    return malware_cfg_list
# Drop CFGs whose node count is below 15 or above 10000 (the default bounds).
def remove_small_big_cfg(csv_data, min_nodes=15, max_nodes=10000):
    """Keep only the samples whose node count lies within the given bounds.

    Args:
        csv_data: Iterable of rows; element ``[1]`` of each row is the node
            count and must be convertible with ``int()``.
        min_nodes: Smallest node count to keep (inclusive). Defaults to 15.
        max_nodes: Largest node count to keep (inclusive). Defaults to 10000.

    Returns:
        A new list holding the rows that passed the filter, in their original
        order. The row objects themselves are not copied.

    Raises:
        ValueError: If a node count cannot be converted to ``int``.
    """
    return [
        item for item in csv_data
        if min_nodes <= int(item[1]) <= max_nodes
    ]
# Remove duplicate samples.
def remove_repeat_sample(csv_data):
    """Drop rows that repeat an earlier row's node count and edge count.

    Two rows count as duplicates when elements ``[1]`` and ``[2]`` are both
    equal; element ``[0]`` (the name) is not compared. The first row seen for
    each (node count, edge count) pair is the one that is kept.

    Args:
        csv_data: Iterable of rows with at least three elements each. Elements
            ``[1]`` and ``[2]`` must be hashable (for example, strings).

    Returns:
        A new list of the retained rows in their original order.
    """
    seen = set()
    pretreat_data = []
    for sample in csv_data:
        key = (sample[1], sample[2])
        # A set lookup replaces rescanning every row kept so far.
        if key in seen:
            continue
        seen.add(key)
        pretreat_data.append(sample)
    return pretreat_data
def write_csv(
    pretreat_data,
    csv_save_path="../cfg_data/malware_pretreat_msg.csv",
    header=('malware_name', 'nodes_num', 'edgs_num'),
):
    """Write a header row followed by the given rows to a CSV file.

    The target file is created or overwritten and encoded as UTF-8.

    Args:
        pretreat_data: Iterable of rows; each row is an iterable of values.
        csv_save_path: Path of the CSV file to write.
        header: Column names written as the first row. The default is an
            immutable tuple so that it cannot be altered between calls.
    """
    # newline='' lets the csv writer control line endings itself.
    with open(csv_save_path, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(header)
        writer.writerows(pretreat_data)
if __name__ == '__main__':
    # One entry per dataset: (label printed with the final row count,
    # input CSV path, output CSV path). Processed in the order listed.
    jobs = (
        (
            "malware_len",
            '../cfg_data_with_feature/malware_msg1.csv',
            "../cfg_data_with_feature/malware_pretreat_msg1.csv",
        ),
        (
            "benign_len",
            '../cfg_data_with_feature/benign_msg1.csv',
            "../cfg_data_with_feature/benign_pretreat_msg1.csv",
        ),
    )
    for label, source_path, target_path in jobs:
        # Read -> filter by node count -> drop duplicates -> write.
        rows = csv_read(source_path)
        rows = remove_small_big_cfg(rows)
        rows = remove_repeat_sample(rows)
        write_csv(rows, csv_save_path=target_path)
        print(label, len(rows))
    print("good")