asm_to_csv/main.py

85 lines
3.5 KiB
Python
Raw Normal View History

2024-03-03 18:16:42 +08:00
import csv
import re
import pandas as pd
import os
from tqdm import tqdm
from log_utils import setup_logger
import time
from datetime import datetime
def csv_write(data: list, out_path: str = './out/output.csv', chunksize: int = 1000):
    """Append rows of opcodes to a CSV file.

    The rows are loaded into a single DataFrame, so rows shorter than the
    longest row of the same call are padded with empty fields. The frame is
    then appended to the file in slices of ``chunksize`` rows, without a
    header line and without the index column.

    Args:
        data: List of rows; each row is a list of opcode strings.
        out_path: Destination CSV file; it is opened in append mode.
        chunksize: Number of rows written per ``to_csv`` call.

    Returns:
        True once every slice has been written (also for empty ``data``,
        in which case nothing is written).
    """
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # pandas refuses to write into a directory that does not exist yet,
        # so create it on first use instead of failing the whole batch.
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(data)
    for start in range(0, len(df), chunksize):
        df.iloc[start:start + chunksize].to_csv(
            out_path, mode='a', header=False, index=False
        )
    return True
def findOpcode_in_asm_file(content, logger):
    """Extract per-section opcode sequences from an assembly listing.

    The text is split on blank lines; every resulting section is scanned
    for mnemonics, i.e. a word that follows two tab characters and is
    itself followed by whitespace.

    A section is skipped when it
      * starts with ``;``,
      * yields fewer than two matches,
      * has ``retn`` as its first match, or
      * contains nothing but data directives (``align``, ``dp``, ``dd``,
        ``db``, ``dq``) -- these are stripped from every kept section.

    Sequences longer than 200 opcodes are truncated to the first 200 and
    reported through ``logger.info``.

    Args:
        content: Readable object whose ``read()`` returns the whole
            listing as one string (e.g. an open text file).
        logger: Object providing an ``info(message)`` method.

    Returns:
        A tuple ``(over_num_flag, none_flag, result)``:
        ``over_num_flag`` is True if at least one section was truncated,
        ``none_flag`` is True if no section produced any opcodes, and
        ``result`` is the list of opcode lists, one per kept section.
    """
    # NOTE(review): the trailing ``\s`` means a mnemonic that is the very
    # last token of a section (no whitespace after it) is not matched.
    pattern = r'\t{2}(\w+)\s'
    data_directives = {'align', 'dp', 'dd', 'db', 'dq'}
    max_opcodes = 200

    result = []
    over_num_flag = False
    for section in content.read().split("\n\n"):
        if section.startswith(';'):
            continue
        # TODO: decide whether the function is an external one (an earlier
        # idea was to keep only 'start', 'start_0' and 'sub_*' functions).
        instructions = re.findall(pattern, section)
        if len(instructions) < 2 or instructions[0] == 'retn':
            continue

        opcodes = [op for op in instructions if op not in data_directives]
        if not opcodes:
            # Only data directives were present; appending an empty row
            # would produce a blank CSV line and hide an "empty" file.
            continue

        if len(opcodes) > max_opcodes:
            over_num_flag = True
            logger.info(f"over 200 Opcode is {opcodes},list len {len(opcodes)}")
            result.append(opcodes[:max_opcodes])
        else:
            result.append(opcodes)

    none_flag = not result
    return over_num_flag, none_flag, result
def _flush_opcodes(opcode_rows, done_file_num):
    """Write the buffered opcode rows to the CSV file and empty the buffer."""
    print("*======================start write==================================*")
    csv_write(opcode_rows)
    opcode_rows.clear()
    print("list clear")
    print(f"done {done_file_num} files")
    print("*=================write to csv success==============================*")


if __name__ == '__main__':
    # Batch driver: extract opcode sequences from every file in the input
    # directory and append them to the CSV output in batches.
    start_time = time.time()
    logger = setup_logger('asm_to_csv', './log/asm_to_csv.log')
    asm_file_path = os.path.join("D:/bishe/dataset/infected/infected_asm/")
    file_list = os.listdir(asm_file_path)
    Opcode_list = []
    none_Opcode_list = []
    done_file_num = 0
    for file in file_list:
        try:
            with open(os.path.join(asm_file_path, file), 'r', errors='ignore') as asm_file:
                over_flag, flag, result = findOpcode_in_asm_file(asm_file, logger)
            if flag:
                # Remember the file so the summary log at the end is meaningful.
                none_Opcode_list.append(file)
                logger.warning(f"file {file} Opcode is empty")
                continue
            if over_flag:
                logger.info(f"file {file} Opcode num is over 200")
            Opcode_list.extend(result)
            done_file_num += 1
            # Flush periodically so the in-memory buffer stays bounded.
            if len(Opcode_list) > 50000:
                _flush_opcodes(Opcode_list, done_file_num)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error processing file {file}: {e}")
            logger.error(f"Error processing file {file}: {e}")
    # Write whatever is left after the last periodic flush.
    if len(Opcode_list) > 0:
        _flush_opcodes(Opcode_list, done_file_num)
    logger.debug(f"none Opcode file list {none_Opcode_list} ")
    end_time = time.time()
    print(f"Done processing {done_file_num} files")
    print(f"Total time: {end_time - start_time} "
          f"seconds, start at :{datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')}")