diff --git a/OpcodeGet.py b/OpcodeGet.py
new file mode 100644
index 0000000..0170ec7
--- /dev/null
+++ b/OpcodeGet.py
@@ -0,0 +1,143 @@
+import os
+import re
+from log_utils import setup_logger
+from tqdm import tqdm
+
+import r2pipe
+import pandas as pd
+
+
+def Opcode_to_csv(opcode_list, file_type):
+    logger.info("*======================start write==================================*")
+    csv_write(f'output_{file_type}.csv', opcode_list)
+    logger.info(f"done {done_file_num} files")
+    logger.info("*=================write to csv success==============================*")
+
+
+def csv_write(file_name, data: list):
+    """Write data to csv in chunks to keep memory usage low."""
+    df = pd.DataFrame(data)
+    chunksize = 1000
+    for i in range(0, len(df), chunksize):
+        df.iloc[i:i + chunksize].to_csv(f'./out/{file_name}', mode='a', header=False, index=False)
+    return True
+
+
+def extract_opcode(disasm_text):
+    """
+    Extract the opcode (and optionally the operands) from disassembly text.
+    The regex captures the first token as the opcode and the rest of the line
+    as operands; operands may themselves contain spaces and commas.
+    """
+    match = re.search(r"^\s*(\S+)(?:\s+(.*))?$", disasm_text)
+    if match:
+        opcode = match.group(1)
+        # operands_str = match.group(2) if match.group(2) is not None else ""
+        # split_pattern = re.compile(r",(?![^\[]*\])")  # split operands on commas outside brackets
+        # operands = split_pattern.split(operands_str)
+        # return opcode, [op.strip() for op in operands if op.strip()]
+        return opcode
+    return ""
+
+
+def get_graph_r2pipe(r2pipe_open, file_type):
+    # Collect the opcode sequence of every basic block, function by function.
+    opcode_Sequence = []
+    try:
+        # get the list of functions
+        function_list = r2pipe_open.cmdj("aflj")
+        for function in function_list:
+
+            # probe for external functions, kept for reference:
+            # if function['name'] == 'sub.TNe_U':
+            #     print(function)
+            #     block_list = r2pipe_open.cmdj("afbj @" + str(function['offset']))
+            #     for block in block_list:
+            #         # print(block)
+            #         # get the disassembly of the basic block
+            #         disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"]))
+            #         if disasm:
+            #             for op in disasm:
+            #                 print(extract_opcode(op["opcode"]))
+
+            block_list = r2pipe_open.cmdj("afbj @" + str(function['offset']))
+            block_opcode_Sequence = []
+            for block in block_list:
+                # print(block)
+                # get the disassembly of the basic block
+                disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"]))
+                if disasm:
+                    for op in disasm:
+                        if op["type"] == "invalid":
+                            continue
+                        block_opcode_Sequence.append(extract_opcode(op["opcode"]))
+            opcode_Sequence.append([file_type, file_type, len(block_opcode_Sequence), ' '.join(block_opcode_Sequence)])
+    except Exception as e:
+        logger.error(f"Error: failed to get function list: {e}")
+    return opcode_Sequence
+
+
+if __name__ == '__main__':
+    logger = setup_logger('logger', 'log/opcode_benign.log')
+    file_type = 'benign'
+    file_path = os.path.join('/mnt/d/bishe/dataset/train_benign')
+    file_list = os.listdir(file_path)[:10000]
+    done_file_num = 0
+    process_bar = tqdm(desc='Processing...', leave=True, total=len(file_list))
+    done_list = [['class', 'sub-class', 'size', 'corpus']]
+    for file_name in file_list:
+        r2pipe_open = r2pipe.open(os.path.join(file_path, file_name), flags=['-2'])
+        r2pipe_open.cmd("aaa")
+        done_list.extend(get_graph_r2pipe(r2pipe_open, file_type))
+        r2pipe_open.quit()
+        done_file_num += 1
+        if len(done_list) > 100000:
+            csv_write(f'output_{file_type}.csv', done_list)
+            done_list.clear()
+        process_bar.update(1)
+    else:
+        # the loop completed normally: flush whatever is still buffered
+        csv_write(f'output_{file_type}.csv', done_list)
+
+
+# CFG-construction draft, kept commented out for later work:
+# node_list = []
+# edge_list = []
+# temp_edge_list = []
+# node_info_list = []
+#
+# for function in function_list:
+#     block_list = r2pipe_open.cmdj("afbj @" + str(function['offset']))
+#
+#     for block in block_list:
+#         node_list.append(block["addr"])
+#
+#         # get the disassembly of the basic block
+#         disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"]))
+#         node_info = []
+#         if disasm:
+#             for op in disasm:
+#                 if op["type"] == "invalid":
+#                     continue
+#                 opcode, operands = extract_opcode_and_operands(op["disasm"])
+#                 # record jump instructions as edges
+#                 if "jump" in op and op["jump"] != 0:
+#                     temp_edge_list.append([block["addr"], op["jump"]])
+#                     node_info.append([op["offset"], op["bytes"], opcode, op["jump"]])
+#                 else:
+#                     node_info.append([op["offset"], op["bytes"], opcode, None])
+#         node_info_list.append(node_info)
+#
+# # once the CFG is built, drop edges whose target block does not exist
+# for temp_edge in temp_edge_list:
+#     if temp_edge[1] in node_list:
+#         edge_list.append(temp_edge)
+#
+# # get the original indices of the sorted elements
+# sorted_indices = [i for i, v in sorted(enumerate(node_list), key=lambda x: x[1])]
+# # reorder the lists with those indices
+# node_list = [node_list[i] for i in sorted_indices]
+# node_info_list = [node_info_list[i] for i in sorted_indices]
+#
+# return True, "binary executable parsed successfully", node_list, edge_list, node_info_list
+# except Exception as e:
+#     return False, e, None, None, None
\ No newline at end of file
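As a sanity check on the opcode extraction in OpcodeGet.py above, here is a minimal, self-contained sketch of how extract_opcode behaves on typical radare2 disassembly strings (the sample strings are made up for illustration):

    import re

    def extract_opcode(disasm_text):
        # same regex as in OpcodeGet.py: first token is the opcode, the rest are operands
        match = re.search(r"^\s*(\S+)(?:\s+(.*))?$", disasm_text)
        return match.group(1) if match else ""

    assert extract_opcode("mov eax, dword [esp + 4]") == "mov"
    assert extract_opcode("  ret") == "ret"
    assert extract_opcode("") == ""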
str(block["ninstr"]) + " @" + str(block["addr"])) + # node_info = [] + # if disasm: + # for op in disasm: + # if op["type"] == "invalid": + # continue + # opcode, operands = extract_opcode_and_operands(op["disasm"]) + # # 处理跳转指令 + # if "jump" in op and op["jump"] != 0: + # temp_edge_list.append([block["addr"], op["jump"]]) + # node_info.append([op["offset"], op["bytes"], opcode, op["jump"]]) + # else: + # node_info.append([op["offset"], op["bytes"], opcode, None]) + # node_info_list.append(node_info) + # + # # 完成 CFG 构建后, 检查并清理不存在的出边 + # for temp_edge in temp_edge_list: + # if temp_edge[1] in node_list: + # edge_list.append(temp_edge) + # + # # 获取排序后元素的原始索引 + # sorted_indices = [i for i, v in sorted(enumerate(node_list), key=lambda x: x[1])] + # # 根据这些索引重新排列 + # node_list = [node_list[i] for i in sorted_indices] + # node_info_list = [node_info_list[i] for i in sorted_indices] + # + # return True, "二进制可执行文件解析成功", node_list, edge_list, node_info_list + # except Exception as e: + # return False, e, None, None, None \ No newline at end of file diff --git a/main.py b/main.py index 79c3079..54923cb 100644 --- a/main.py +++ b/main.py @@ -7,16 +7,32 @@ from log_utils import setup_logger import time from datetime import datetime +max_opcode_num = 0 -def csv_write(data: list): + +def csv_write(file_name, data: list): + """write data to csv""" df = pd.DataFrame(data) chunksize = 1000 for i in range(0, len(df), chunksize): - df.iloc[i:i + chunksize].to_csv('./out/output.csv', mode='a', header=False, index=False) + df.iloc[i:i + chunksize].to_csv(f'./out/{file_name}', mode='a', header=False, index=False) return True -def findOpcode_in_asm_file(content, logger): +def findOpcode_in_asm_file(content, logger, file_type): + """ + 在给定的汇编文件内容中查找操作码(opcode)。 + + 参数: + - content: 文件内容的迭代器,预期能逐行读取文件内容。 + - logger: 日志记录器对象,用于记录过程中的信息。 + + 返回值: + - over_num_flag: 布尔值,如果找到的操作码数量超过200,则为True,否则为False。 + - none_flag: 布尔值,如果未找到任何操作码,则为True,否则为False。 + - result: 列表,包含找到的操作码列表。如果找到的数量超过200,则仅包含前200个。 + """ + global max_opcode_num pattern = r'\t{2}(\w+)\s' result = [] sections = content.read().split("\n\n") @@ -27,32 +43,43 @@ def findOpcode_in_asm_file(content, logger): # if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname: # TODO 判断函数是否为外部函数 instructions = re.findall(pattern, item) - if instructions and len(instructions) != 1 and instructions[0] != 'retn': - instructions_remove_Opcode_list = {'align', 'dp', 'dd', 'db', 'dq'} + if len(instructions) > 0 and len(instructions) != 1 and instructions[0] != 'retn': + instructions_remove_Opcode_list = {'align', 'dp', 'dd', 'db', 'dq', 'dw'} if not instructions_remove_Opcode_list.isdisjoint(instructions): instructions[:] = [item for item in instructions if item not in instructions_remove_Opcode_list] + if len(instructions) > 0: + result.append([file_type, file_type, len(instructions), ' '.join(instructions)]) if len(instructions) > 200: + max_opcode_num = len(instructions) if len(instructions) > max_opcode_num else max_opcode_num over_num_flag = True logger.info(f"over 200 Opcode is {instructions},list len {len(instructions)}") - result.append(instructions[:200]) - else: - result.append(instructions) none_flag = True if len(result) == 0 else False return over_num_flag, none_flag, result +def Opcode_to_csv(opcode_list, file_type): + logger.info("*======================start write==================================*") + csv_write(f'output_{file_type}.csv', opcode_list) + logger.info(f"done {done_file_num} files") + 
logger.info("*=================write to csv success==============================*") + + if __name__ == '__main__': start_time = time.time() - logger = setup_logger('asm_to_csv', './log/asm_to_csv.log') + # 文件相关设置 + file_type = 'malware' + logger = setup_logger('asm_to_csv', f'./log/asm_to_csv_{file_type}.log') asm_file_path = os.path.join("D:/bishe/dataset/infected/infected_asm/") + # end file_list = os.listdir(asm_file_path) Opcode_list = [] none_Opcode_list = [] done_file_num = 0 + process_bar = tqdm(desc='Processing...', leave=True, total=len(file_list)) for file in file_list: try: with open(asm_file_path + file, 'r', errors='ignore') as asm_file: - over_flag, flag, result = findOpcode_in_asm_file(asm_file, logger) + over_flag, flag, result = findOpcode_in_asm_file(asm_file, logger, file_type) if flag: logger.warning(f"file {file} Opcode is empty") continue @@ -62,23 +89,20 @@ if __name__ == '__main__': Opcode_list.extend(result) done_file_num += 1 if len(Opcode_list) > 50000: - print("*======================start write==================================*") - write_res = csv_write(Opcode_list) + Opcode_to_csv(Opcode_list, file_type) Opcode_list.clear() - print("list clear") - print(f"done {done_file_num} files") - print("*=================write to csv success==============================*") + except Exception as e: - print(f"Error processing file {file}: {e}") + logger.error(f"Error processing file {file}: {e}") + finally: + process_bar.update(1) if len(Opcode_list) > 0: - print("*======================start write==================================*") - write_res = csv_write(Opcode_list) + Opcode_to_csv(Opcode_list, file_type) Opcode_list.clear() - print("list clear") - print(f"done {done_file_num} files") - print("*=================write to csv success==============================*") logger.debug(f"none Opcode file list {none_Opcode_list} ") + csv_write('none_Opcode_list.csv', none_Opcode_list) end_time = time.time() - print(f"Done processing {done_file_num} files") - print(f"Total time: {end_time - start_time} " + logger.info(f"max_opcode_num is {max_opcode_num}") + logger.info(f"Done processing {done_file_num} files") + logger.info(f"Total time: {end_time - start_time} " f"seconds, start at :{datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')}") diff --git a/ngram.py b/ngram.py new file mode 100644 index 0000000..2e3a6d0 --- /dev/null +++ b/ngram.py @@ -0,0 +1,209 @@ +from collections import defaultdict +from tqdm import tqdm +import pandas as pd +import os +import csv +import argparse +import statistics +import plotly.express as px + +################################################################################################### +## Program shall take two csv files of different classes - benign and malware +## It will compute ngrams for each of the classes seperately and find the delta frequencies +## of each computed ngram. 
+
+#--------------------------------------------------------------------------------------------------
+## Creates ngrams for the corpus list for a given N and filters them on two criteria:
+##   file count >= percent of the total corpus size (percent in [1..100])
+##   ngram frequency >= filterFreq (only when filterFreq is non-zero)
+## Returns both the complete and the filtered dictionary of ngrams
+def filter_N_grams(corpusList, N, percent, filterFreq=0):
+    total = len(corpusList)
+    ngramDictionary = defaultdict(int)
+    ngramFileCount = defaultdict(int)
+    for idx in tqdm(range(0, total), ncols=100, desc="Computing ngrams"):
+        opcodes = corpusList[idx]
+        # skip empty or missing corpus entries (pandas yields NaN for empty cells)
+        if not isinstance(opcodes, str) or len(opcodes) == 0:
+            continue
+        fileNgrams = set()
+        for item in generate_N_grams(opcodes, N):
+            # compute the frequency of every unique ngram
+            ngramDictionary[item] += 1
+            fileNgrams.add(item)
+        # count each ngram once per file it occurs in
+        for item in fileNgrams:
+            ngramFileCount[item] += 1
+
+    filteredNgramDictionary = defaultdict(int)
+    # keep only ngrams that occur in at least `percent` percent of all files
+    filterCnt = (percent * total) // 100
+    for item in ngramFileCount:
+        if ngramFileCount[item] >= filterCnt:
+            filteredNgramDictionary[item] = ngramDictionary[item]
+
+    # drop ngrams that fall below the minimum frequency threshold
+    if filterFreq:
+        for item in ngramDictionary:
+            if ngramDictionary[item] < filterFreq and item in filteredNgramDictionary:
+                filteredNgramDictionary.pop(item)
+
+    #print(f"Total ngrams:{len(ngramDictionary.items())} => filtered: {len(filteredNgramDictionary.items())}\n")
+    return [ngramDictionary, filteredNgramDictionary]
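A toy run of filter_N_grams with the counts worked out by hand (assumes the two helpers above are in scope):

    corpus = ["push mov call", "push mov ret", "mov ret"]
    full, filtered = filter_N_grams(corpus, 1, 100)
    # full:     {'push': 2, 'mov': 3, 'call': 1, 'ret': 2}
    # filtered: {'mov': 3}  -- only 'mov' occurs in 100% of the 3 files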
+
+#--------------------------------------------------------------------------------------------------
+# Calculate a normalization factor between the frequency values of class1 and class2.
+# For a class whose frequencies are high merely because of its sample size, such a factor
+# may be needed to correctly rescale the frequencies of the smaller class.
+# Input: lists of frequencies of class1 and class2.
+def normalization_factor(class1, class2):
+    mean1 = statistics.mean(class1)
+    mean2 = statistics.mean(class2)
+    return mean1 / mean2
+
+#--------------------------------------------------------------------------------------------------
+# Write the data into the given csv file handle
+def WriteCSV(file, csvFields, dataDictionary):
+    writer = csv.DictWriter(file, fieldnames=csvFields)
+    writer.writeheader()
+    writer.writerows(dataDictionary)
+
+#--------------------------------------------------------------------------------------------------
+# Execution starts here
+# Command line arguments
+# CSV header: class, sub-class, size, corpus
+parser = argparse.ArgumentParser(description="ngram analysis on a given corpus csv file.")
+parser.add_argument('malware_csvfile', help='path to the malware corpus csv file')
+parser.add_argument('benign_csvfile', help='path to the benign corpus csv file')
+parser.add_argument('ngram', help='ngram to compute; higher values are compute intensive')
+
+# NOTE: parse_args() is not called yet; the inputs below are hardcoded for now.
+malware_csvfile = os.path.join('./out/output_malware.csv')
+benign_csvfile = os.path.join('./out/output_benign.csv')
+maxgrams = 3
+
+# Error check and exit if the inputs are not files
+if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)):
+    print("Path should be a csv file!")
+    exit(1)
+
+# Read the csv files using pandas into data frames
+try:
+    malwareDF = pd.read_csv(malware_csvfile, encoding="utf8")
+    benignDF = pd.read_csv(benign_csvfile, encoding="utf8")
+except Exception as error:
+    print(error)
+    exit(1)
+
+# Build a frequency list for ngrams
+filePercentFilter = 80   ## select ngrams present in x% of files
+frequencyFilter = 20     ## select ngrams with frequency greater than this value
+
+malwareNgram = defaultdict(int)          ## full list of ngrams in the malware corpus
+benignNgram = defaultdict(int)           ## full list of ngrams in the benign corpus
+filteredMalwareNgram = defaultdict(int)  ## filtered list of ngrams from the malware corpus
+filteredBenignNgram = defaultdict(int)   ## filtered list of ngrams from the benign corpus
+
+## merged list of ngrams from both corpora with relative frequency (benignFreq - malwareFreq)
+filteredMergedNgram = defaultdict(int)
+
+# run only for the configured maxgrams; change the lower bound to 0 to run for all values [1..N]
+for idx in range(maxgrams - 1, maxgrams):
+    print(f"Computing {idx+1}gram on files ...")
+    malwareNgram.clear()
+    filteredMalwareNgram.clear()
+    benignNgram.clear()
+    filteredBenignNgram.clear()
+    filteredMergedNgram.clear()
+
+    # the opcode sequence decoded from each pe file is stored as 'corpus' in the csv
+    [malwareNgram, filteredMalwareNgram] = filter_N_grams(malwareDF['corpus'].values, idx + 1,
+                                                          filePercentFilter, frequencyFilter)
+    [benignNgram, filteredBenignNgram] = filter_N_grams(benignDF['corpus'].values, idx + 1,
+                                                        filePercentFilter, frequencyFilter)
+
+    print(f"Malware: {idx+1}gramCnt={len(malwareNgram)}, filteredCnt={len(filteredMalwareNgram)}")
+    print(f"Benign: {idx+1}gramCnt={len(benignNgram)}, filteredCnt={len(filteredBenignNgram)}")
+
+    ## take the union of the filtered ngram lists of malware and benign
+    mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys()))
+
+    ## Now find the relative frequency between benign and malware files: delta = benign - malware.
+    ## For ngrams present in only one of the filtered lists, fall back to the other class's
+    ## unfiltered count, so the rule stays reusable if a union of classes is taken.
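The branching below reduces to one rule; a simplified, self-contained restatement on made-up counts (it collapses the filtered/unfiltered fallback into a single lookup):

    filteredBenign = {'mov': 30}
    filteredMalware = {'mov': 50, 'xor': 40}
    benignFull = {'mov': 30, 'xor': 5}

    delta = {}
    for key in set(filteredBenign) | set(filteredMalware):
        benign = filteredBenign.get(key, benignFull.get(key, 0))
        delta[key] = benign - filteredMalware.get(key, 0)
    print(sorted(delta.items()))  # [('mov', -20), ('xor', -35)]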
+    for item in mergedList:
+        key = item  # the ngram itself
+        if key in filteredBenignNgram:
+            if key in filteredMalwareNgram:
+                filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key]
+            elif key in malwareNgram:
+                filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key]
+            else:
+                filteredMergedNgram[key] = filteredBenignNgram[key]
+        elif key in filteredMalwareNgram:
+            if key in benignNgram:
+                filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key]
+            else:
+                # malware-only ngram: the benign frequency is 0, so the delta is negative
+                filteredMergedNgram[key] = -filteredMalwareNgram[key]
+
+    print(f"Merged: {idx+1}gramCnt={len(filteredMergedNgram.keys())}")
+    ## get a sorted list of merged ngrams with their relative frequencies
+    sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1])
+
+    # Plot a scatter graph:
+    #   y values: relative frequency (benign - malware)
+    #   x values: max frequency of an ngram, max(malware, benign)
+    #   color labels: 'a' + frequency % 26
+    #   size: frequency / max * 100
+    #   hover name: the ngram itself
+    titlestr = str(idx + 1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")"
+    htmlfile = str(idx + 1) + "gram.html"
+    hovername = [item[0] for item in sortedMergedNgramList]
+    yval = [item[1] for item in sortedMergedNgramList]
+    xval = []
+    for key in hovername:
+        xval.append(max(filteredMalwareNgram[key], filteredBenignNgram[key]))
+    colors = [chr(ord('a') + (value % 26)) for value in xval]
+    maxval = max(xval)
+    sizeval = [(int((val / maxval) * 100) + 1) for val in xval]
+
+    fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors,
+                     size=sizeval, hover_name=hovername, log_x=True,
+                     labels={
+                         "x": "Absolute Frequency",
+                         "y": "Relative Frequency"})
+    fig.show()
+    fig.write_html(htmlfile)
+
+    # write the final ngrams into a file for feature selection
+    ngramDictList = []
+    for item in sortedMergedNgramList:
+        key = item[0]
+        ngramDictList.append({'ngram': key, 'count': max(filteredMalwareNgram[key], filteredBenignNgram[key])})
+
+    csvfields = ['ngram', 'count']
+    csvname = str(idx + 1) + "gram.csv"
+    try:
+        with open(csvname, 'w', newline='') as csvfile:
+            WriteCSV(csvfile, csvfields, ngramDictList)
+    except Exception as err:
+        print(f"Error: writing csvfile {err}")
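One plausible way to consume the generated ngram csv downstream for feature selection (the file name assumes maxgrams=3 as configured above; the top-20 cut-off is arbitrary):

    import pandas as pd

    features = pd.read_csv('3gram.csv')  # columns: ngram, count
    top_ngrams = features.nlargest(20, 'count')['ngram'].tolist()
    print(top_ngrams)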