"""Opcode extraction from disassembly listings, with a small logging helper.

Reconstructed from a whitespace-collapsed ``git show`` of commit
cd8ed5d80267dee8eefd947fbc54e9c5a3f0e284 ("first commit", author huihun,
Sun Mar 3 18:16:42 2024 +0800), which added two files:

* ``log_utils.py`` -- ``setup_logger`` and the usage example ``main``.
* ``main.py`` -- reads every file of a directory of ``.asm`` listings,
  extracts the opcode sequence of each section and appends the sequences
  to a CSV file.

Both files are merged into this one module, so the original
``from log_utils import setup_logger`` is no longer required.
"""

import csv
import logging
import os
import re
import time
from datetime import datetime

import pandas as pd

try:
    # Imported (but never used) by the original main.py; kept optional so the
    # module still loads where tqdm is not installed.
    from tqdm import tqdm
except ImportError:
    tqdm = None

# Two tabs, the mnemonic, then whitespace or the end of the section.  The
# end-of-section alternative matters because sections are produced by
# splitting on blank lines, which strips the newline after the last line.
_OPCODE_PATTERN = re.compile(r'\t{2}(\w+)(?:\s|$)')

# Assembler directives that are not executable opcodes.
# NOTE(review): 'dp' may have been meant as 'dw' -- confirm against the
# disassembler's directive list before changing it.
_DATA_DIRECTIVES = frozenset({'align', 'dp', 'dd', 'db', 'dq'})

# Longest opcode sequence kept per section; longer ones are truncated.
_MAX_OPCODES = 200


# --------------------------------------------------------------------------
# log_utils.py
# --------------------------------------------------------------------------

def setup_logger(name, log_file, level=logging.INFO, console=False):
    """Return the logger *name*, writing to a freshly truncated *log_file*.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_file: Path of the log file.  Missing parent directories are
            created; a bare file name (no directory part) is accepted.
        level: Minimum level recorded by the logger.
        console: When true, also attach one console ``StreamHandler``.

    Returns:
        The configured ``logging.Logger``.

    Calling this again with the same *name* and *log_file* replaces the
    previous file handler instead of stacking a second one, so records are
    never duplicated; the file is truncated on every call.
    """
    log_dir = os.path.dirname(log_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Drop handlers left over from an earlier call for the same file.
    target = os.path.abspath(log_file)
    for old in list(logger.handlers):
        if isinstance(old, logging.FileHandler) and old.baseFilename == target:
            logger.removeHandler(old)
            old.close()

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

    # mode='w' starts every run with an empty log file.
    handler = logging.FileHandler(log_file, mode='w')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    if console:
        has_console = any(
            isinstance(h, logging.StreamHandler)
            and not isinstance(h, logging.FileHandler)
            for h in logger.handlers
        )
        if not has_console:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(formatter)
            logger.addHandler(stream_handler)

    return logger


def main():
    """Usage example for ``setup_logger``: log a few records to ``app.log``."""
    log_file = "app.log"
    logger = setup_logger(__name__, log_file)

    logger.info("Application started.")
    # Not written: the logger level defaults to INFO.
    logger.debug("Debug message.")
    logger.warning("Warning message.")
    logger.error("Error occurred.")


# --------------------------------------------------------------------------
# main.py
# --------------------------------------------------------------------------

def csv_write(data: list, output_path='./out/output.csv', chunksize=1000):
    """Append *data* to a CSV file, one inner list per row.

    Args:
        data: List of rows (lists of opcode strings).  Rows shorter than the
            longest row of this call are padded with empty fields, because
            the rows are laid out in a ``pandas.DataFrame`` first.
        output_path: Destination file; its directory is created if missing.
        chunksize: Number of rows handed to ``to_csv`` per call; must be a
            positive integer.

    Returns:
        ``True`` once every row has been written.
    """
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(data)
    for start in range(0, len(df), chunksize):
        df.iloc[start:start + chunksize].to_csv(
            output_path, mode='a', header=False, index=False)
    return True


def findOpcode_in_asm_file(content, logger):
    """Extract the opcode sequence of every section of an assembly listing.

    The text is split into sections on blank lines.  A section is skipped
    when it starts with ``;``, yields fewer than two mnemonics, starts with
    ``retn``, or contains nothing but data directives.

    Args:
        content: Readable text file object; it is read to the end.
        logger: Logger used to report sections that exceed the length cap.

    Returns:
        A tuple ``(over_num_flag, none_flag, result)``: whether any section
        was truncated to 200 opcodes, whether no section produced opcodes,
        and the list of opcode lists.
    """
    # TODO: decide whether a section belongs to an external function
    # (e.g. by its name not being 'start', 'start_0' or 'sub_*') and skip it.
    result = []
    over_num_flag = False
    for section in content.read().split("\n\n"):
        if section.startswith(';'):
            continue
        instructions = _OPCODE_PATTERN.findall(section)
        if len(instructions) < 2 or instructions[0] == 'retn':
            continue
        instructions = [op for op in instructions if op not in _DATA_DIRECTIVES]
        if not instructions:
            # Pure data section: nothing left to record.
            continue
        if len(instructions) > _MAX_OPCODES:
            over_num_flag = True
            logger.info(
                f"over {_MAX_OPCODES} Opcode is {instructions},"
                f"list len {len(instructions)}")
            instructions = instructions[:_MAX_OPCODES]
        result.append(instructions)
    return over_num_flag, not result, result


def _flush_to_csv(opcode_rows, done_file_num, output_path):
    """Write the buffered rows to the CSV file and empty the buffer."""
    print("*======================start write==================================*")
    csv_write(opcode_rows, output_path)
    opcode_rows.clear()
    print("list clear")
    print(f"done {done_file_num} files")
    print("*=================write to csv success==============================*")


def process_asm_directory(asm_dir, logger, flush_threshold=50000,
                          output_path='./out/output.csv'):
    """Extract opcodes from every file in *asm_dir* and append them to a CSV.

    Args:
        asm_dir: Directory whose entries are read as assembly listings.
        logger: Logger receiving per-file warnings and errors.
        flush_threshold: Buffered row count above which rows are written out.
        output_path: CSV file the rows are appended to.

    Returns:
        A tuple ``(done_file_num, none_opcode_files)``: the number of files
        that produced opcodes and the names of files that produced none.
    """
    opcode_rows = []
    none_opcode_files = []
    done_file_num = 0
    for file_name in os.listdir(asm_dir):
        try:
            with open(os.path.join(asm_dir, file_name), 'r',
                      errors='ignore') as asm_file:
                over_flag, none_flag, result = findOpcode_in_asm_file(
                    asm_file, logger)
            if none_flag:
                logger.warning(f"file {file_name} Opcode is empty")
                none_opcode_files.append(file_name)
                continue
            if over_flag:
                logger.info(f"file {file_name} Opcode num is over 200")
            opcode_rows.extend(result)
            done_file_num += 1
            if len(opcode_rows) > flush_threshold:
                _flush_to_csv(opcode_rows, done_file_num, output_path)
        except Exception as e:
            # Top-level boundary: one unreadable file must not stop the run.
            print(f"Error processing file {file_name}: {e}")
            logger.exception(f"Error processing file {file_name}")
    if opcode_rows:
        _flush_to_csv(opcode_rows, done_file_num, output_path)
    return done_file_num, none_opcode_files


if __name__ == '__main__':
    start_time = time.time()
    logger = setup_logger('asm_to_csv', './log/asm_to_csv.log')
    asm_file_path = "D:/bishe/dataset/infected/infected_asm/"
    done_file_num, none_Opcode_list = process_asm_directory(asm_file_path, logger)
    # Logged at INFO: the logger drops DEBUG records at its default level.
    logger.info(f"none Opcode file list {none_Opcode_list} ")
    end_time = time.time()
    print(f"Done processing {done_file_num} files")
    print(f"Total time: {end_time - start_time} "
          f"seconds, start at :{datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')}")