From 8e9c7e31c4977ed8ada08e82ed2c74782de0675e Mon Sep 17 00:00:00 2001 From: huihun <781165206@qq.com> Date: Wed, 13 Mar 2024 15:09:12 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A4=96=E9=83=A8=E5=87=BD=E6=95=B0=E6=B5=8B?= =?UTF-8?q?=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- OpcodeGet.py | 104 +++++++++++----------------- funNameGet.py | 36 ++++++++++ ngram.py | 188 ++++++++++++++++++++++++++------------------------ 3 files changed, 177 insertions(+), 151 deletions(-) create mode 100644 funNameGet.py diff --git a/OpcodeGet.py b/OpcodeGet.py index 9cb9f2c..0510138 100644 --- a/OpcodeGet.py +++ b/OpcodeGet.py @@ -1,3 +1,4 @@ +import concurrent.futures import os import re from log_utils import setup_logger @@ -7,19 +8,25 @@ import r2pipe import pandas as pd +csv_lock = 0 + + def Opcode_to_csv(opcode_list, file_type): - logger.info("*======================start write==================================*") + csv_write(f'output_{file_type}.csv', opcode_list) logger.info(f"done {done_file_num} files") - logger.info("*=================write to csv success==============================*") + def csv_write(file_name, data: list): """write data to csv""" + logger.info("*======================start write==================================*") df = pd.DataFrame(data) chunksize = 1000 for i in range(0, len(df), chunksize): df.iloc[i:i + chunksize].to_csv(f'./out/{file_name}', mode='a', header=False, index=False) + logger.info(f"done rows {len(df)}") + logger.info("*=================write to csv success==============================*") return True @@ -39,13 +46,15 @@ def extract_opcode(disasm_text): return "" -def get_graph_r2pipe(r2pipe_open, file_type): +def get_graph_r2pipe(file_type, file_name): # 获取基础块内的操作码序列 + r2pipe_open = r2pipe.open(os.path.join(file_path, file_name), flags=['-2']) opcode_Sequence = [] try: # 获取函数列表 + r2pipe_open.cmd("aaa") + r2pipe_open.cmd('e arch=x86') function_list = r2pipe_open.cmdj("aflj") - for function in function_list: # 外部函数测试 @@ -68,74 +77,45 @@ def get_graph_r2pipe(r2pipe_open, file_type): disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"])) if disasm: for op in disasm: - if op["type"] == "invalid": + if op["type"] == "invalid" or op["opcode"] == "invalid": continue block_opcode_Sequence.append(extract_opcode(op["opcode"])) opcode_Sequence.append( [file_type, file_type, len(block_opcode_Sequence), ' '.join(block_opcode_Sequence)]) - except: - print("Error: get function list failed") + except Exception as e: + logger.error(f"Error: get function list failed in {file_name}") + print(f"Error: get function list failed in {file_name} ,error info {e}") + r2pipe_open.quit() return opcode_Sequence if __name__ == '__main__': - logger = setup_logger('logger', './log/opcode_benign.log') - file_type = 'benign' - file_path = os.path.join('/mnt/d/bishe/dataset/train_benign') + file_type = 'malware' + logger = setup_logger('logger', f'./log/opcode_{file_type}.log') + file_path = os.path.join('/mnt/d/bishe/dataset/sample_20230130_458') + print(f"max works {os.cpu_count()}") file_list = os.listdir(file_path)[:10000] done_file_num = 0 - process_bar = tqdm(desc='Processing...', leave=True, total=len(file_list)) done_list = [['class', 'sub-class', 'size', 'corpus']] - for file_name in file_list: - r2pipe_open = r2pipe.open(os.path.join(file_path, file_name), flags=['-2']) - r2pipe_open.cmd("aaa") - done_list.extend(get_graph_r2pipe(r2pipe_open, file_type)) - if len(done_list) > 100000: + process_bar = tqdm(desc=f'Processing {file_type}...', leave=True, total=len(file_list)) + with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # 调整线程池大小 + future_to_args = { + executor.submit(get_graph_r2pipe, file_type, file_name): file_name for file_name in file_list + } + for future in concurrent.futures.as_completed(future_to_args): + try: + tmp = future.result() + done_list.extend(tmp if len(tmp) > 0 else []) + if len(done_list) > 100000: + csv_write(f'output_{file_type}.csv', done_list) + done_file_num += 1 + done_list.clear() + except Exception as e: + logger.error(f"Error: {e}") + print(f"Error: {e}") + finally: + process_bar.update(1) + else: csv_write(f'output_{file_type}.csv', done_list) - done_file_num += 1 - done_list.clear() - process_bar.update(1) - else: - csv_write(f'output_{file_type}.csv', done_list) - # node_list = [] - # edge_list = [] - # temp_edge_list = [] - # node_info_list = [] - # - # for function in function_list: - # block_list = r2pipe_open.cmdj("afbj @" + str(function['offset'])) - # - # for block in block_list: - # node_list.append(block["addr"]) - # - # # 获取基本块的反汇编指令 - # disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"])) - # node_info = [] - # if disasm: - # for op in disasm: - # if op["type"] == "invalid": - # continue - # opcode, operands = extract_opcode_and_operands(op["disasm"]) - # # 处理跳转指令 - # if "jump" in op and op["jump"] != 0: - # temp_edge_list.append([block["addr"], op["jump"]]) - # node_info.append([op["offset"], op["bytes"], opcode, op["jump"]]) - # else: - # node_info.append([op["offset"], op["bytes"], opcode, None]) - # node_info_list.append(node_info) - # - # # 完成 CFG 构建后, 检查并清理不存在的出边 - # for temp_edge in temp_edge_list: - # if temp_edge[1] in node_list: - # edge_list.append(temp_edge) - # - # # 获取排序后元素的原始索引 - # sorted_indices = [i for i, v in sorted(enumerate(node_list), key=lambda x: x[1])] - # # 根据这些索引重新排列 - # node_list = [node_list[i] for i in sorted_indices] - # node_info_list = [node_info_list[i] for i in sorted_indices] - # - # return True, "二进制可执行文件解析成功", node_list, edge_list, node_info_list - # except Exception as e: - # return False, e, None, None, None + diff --git a/funNameGet.py b/funNameGet.py new file mode 100644 index 0000000..15a7020 --- /dev/null +++ b/funNameGet.py @@ -0,0 +1,36 @@ +import concurrent.futures +import os +import r2pipe +from tqdm import tqdm + + +def get_fun_name_list(file_path): + # 读取csv文件 + r2 = r2pipe.open(os.path.join(file_path), flags=['-2']) + r2.cmd('aaa') + r2.cmd('e arch=x86') + function_list = r2.cmdj("aflj") + fun_name_list = [] + for function in function_list: + fun_name_list.append(function['name']) + r2.quit() + return fun_name_list + + +if __name__ == '__main__': + file_path = os.path.join('/mnt/d/bishe/dataset/sample_20230130_458') + file_list = os.listdir(file_path) + fun_name_set = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: + future_to_args = { + executor.submit(get_fun_name_list, os.path.join(file_path, file_name)): file_name + for file_name in file_list + } + for future in tqdm(concurrent.futures.as_completed(future_to_args), total=len(future_to_args)): + fun_name_list = future.result() + for fun_name in fun_name_list: + if fun_name not in fun_name_set: + fun_name_set[fun_name] = 1 + else: + fun_name_set[fun_name] += 1 + print(fun_name_set) diff --git a/ngram.py b/ngram.py index 5fbf5b5..e71c18e 100644 --- a/ngram.py +++ b/ngram.py @@ -1,3 +1,4 @@ +import threading from collections import defaultdict from tqdm import tqdm import pandas as pd @@ -101,6 +102,8 @@ def process_csv_file(csvfile, ngram_type, file_percent_filter, frequency_filter) idx + 1, file_percent_filter, frequency_filter): start for start in range(0, len(dataframe['corpus'].values), 10000) } + + for future in concurrent.futures.as_completed(future_to_args): try: sub_ngram_list, sub_filtered_ngram_list = future.result() @@ -122,11 +125,28 @@ def process_csv_file(csvfile, ngram_type, file_percent_filter, frequency_filter) # Execute the parse_args() method +def build_csv(ngram_list, filter_list, maxgrams, file_type): + ngramDicList = [] + csv_file_header = ['ngram', 'count'] + csv_file = os.path.join('./out', f'{file_type}-{maxgrams}-gram.csv') + for index in tqdm(range(len(ngram_list)), desc=f'Building {maxgrams}-gram csv'): + ngramDicList.append({ + 'ngram': ngram_list[index], + 'count': filter_list[index] + }) + try: + csv_file = open(csv_file, 'w') + except Exception as e: + print(f"Error opening {csv_file} for writing: {e}") + WriteCSV(csv_file, csv_file_header, ngramDicList) + csv_file.close() + + if __name__ == '__main__': # Get user arguments malware_csvfile = os.path.join('./out/output_malware.csv') benign_csvfile = os.path.join('./out/output_benign.csv') - maxgrams = 3 + maxgrams_list = [3,2,1] # Error check and exit if not a file if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)): @@ -136,100 +156,90 @@ if __name__ == '__main__': # Read the csv file using pandas into data frame # Build a frequency list for ngrams - filePercentFilter = 80 ## select ngrams present in x% of files - frequencyFilter = 20 ## select ngrams with frequency greater than this value + for maxgrams in maxgrams_list: + filePercentFilter = 80 ## select ngrams present in x% of files + frequencyFilter = 20 ## select ngrams with frequency greater than this value - malwareNgram = defaultdict(int) ## full list of ngrams in malware corpus - benignNgram = defaultdict(int) ## full list of ngrams in benign corpus - filteredMalwareNgram = defaultdict(int) ## filtered list of ngrams from malware corpus - filteredBenignNgram = defaultdict(int) ## filtered list of ngrams from benign corpus + malwareNgram = defaultdict(int) ## full list of ngrams in malware corpus + benignNgram = defaultdict(int) ## full list of ngrams in benign corpus + filteredMalwareNgram = defaultdict(int) ## filtered list of ngrams from malware corpus + filteredBenignNgram = defaultdict(int) ## filtered list of ngrams from benign corpus - ## common list ngrams from both malware and benign corpus with relative frequency (benignFreq - malwareFreq) - filteredMergedNgram = defaultdict(int) + ## common list ngrams from both malware and benign corpus with relative frequency (benignFreq - malwareFreq) + filteredMergedNgram = defaultdict(int) - # run for only the maxgram provided, change lower value to 0 to run for all values [1..N] - for idx in range(maxgrams - 1, maxgrams): - print(f"Computing {idx + 1}gram on files ...") - print(f"CPU core {os.cpu_count()} on use") - malwareNgram = [] - filteredMalwareNgram = [] - benignNgram = [] - filteredBenignNgram = [] - malwareNgram.clear() - filteredMalwareNgram.clear() - benignNgram.clear() - filteredBenignNgram.clear() - filteredMergedNgram.clear() + # run for only the maxgram provided, change lower value to 0 to run for all values [1..N] + for idx in range(maxgrams - 1, maxgrams): + print(f"Computing {idx + 1}gram on files ...") + print(f"CPU core {os.cpu_count()} on use") + malwareNgram = [] + filteredMalwareNgram = [] + benignNgram = [] + filteredBenignNgram = [] + malwareNgram.clear() + filteredMalwareNgram.clear() + benignNgram.clear() + filteredBenignNgram.clear() + filteredMergedNgram.clear() - # opcodes decoded from pe file in sequence is stored as corpus in the csv - malwareNgram, filteredMalwareNgram = process_csv_file(malware_csvfile, 'malware', filePercentFilter, frequencyFilter) + # opcodes decoded from pe file in sequence is stored as corpus in the csv + malwareNgram, filteredMalwareNgram = process_csv_file(malware_csvfile, 'malware', filePercentFilter, + frequencyFilter) + # build_csv(malwareNgram, filteredMalwareNgram, maxgrams, 'malware') + benignNgram, filteredBenignNgram = process_csv_file(benign_csvfile, 'benign', filePercentFilter, + frequencyFilter) + # build_csv(benignNgram, filteredBenignNgram, maxgrams, 'benign') - benignNgram, filteredBenignNgram = process_csv_file(benign_csvfile, 'benign', filePercentFilter, frequencyFilter) - # creates a sorted list of ngram tuples with their frequency for 1 .. maxgram + # creates a sorted list of ngram tuples with their frequency for 1 .. maxgram - mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys())) - ## Now find the relative frequency b/w benign and malware files. = benign - malware - ## write this for cases where ngrams only present in one of the clases malware or benign - ## for reusability in case a union of classes is taken. - for item in mergedList: - key = item # get the ngram only - if key in filteredBenignNgram: - if key in filteredMalwareNgram: - filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key] - elif item in malwareNgram: - filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key] - else: - filteredMergedNgram[key] = filteredBenignNgram[key] - elif key in filteredMalwareNgram: - if key in benignNgram: - filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key] - else: - filteredMergedNgram[key] = filteredMalwareNgram[key] + mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys())) + ## Now find the relative frequency b/w benign and malware files. = benign - malware + ## write this for cases where ngrams only present in one of the clases malware or benign + ## for reusability in case a union of classes is taken. + for item in mergedList: + key = item # get the ngram only + if key in filteredBenignNgram: + if key in filteredMalwareNgram: + filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key] + elif item in malwareNgram: + filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key] + else: + filteredMergedNgram[key] = filteredBenignNgram[key] + elif key in filteredMalwareNgram: + if key in benignNgram: + filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key] + else: + filteredMergedNgram[key] = filteredMalwareNgram[key] - print(f"Merged: {idx + 1}gramCnt={len(filteredMergedNgram.keys())}") - ## get a sorted list of merged ngrams with relative frequencies - sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1]) + print(f"Merged: {idx + 1}gramCnt={len(filteredMergedNgram.keys())}") + # ## get a sorted list of merged ngrams with relative frequencies + sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1]) - # Plot a scatter graph - - # y values as relative frequency benign-malware - # x values as max frequency of a ngram max(malware, benign) - # color labels as 'a' + frequency % 26 - # size as frequency/max * 100 - # hover name is ngram name - # titlestr = str(idx + 1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")" - # htmlfile = str(idx + 1) + "gram.html" - # hovername = [item[0] for item in sortedMergedNgramList] - # yval = [item[1]/1e10 for item in sortedMergedNgramList] - # xval = [] - # for key in hovername: - # xval.append(max(filteredMalwareNgram[key], filteredBenignNgram[key])) - # colors = [chr(ord('a') + (value % 26)) for value in xval] - # maxval = max(xval) - # sizeval = [(int((val / maxval) * 100) + 1) for val in xval] - # - # fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors, - # size=sizeval, hover_name=hovername, log_x=True, - # labels={ - # "x": "Absolute Frequency", - # "y": "Relative Frequency"}) - # fig.write_html(htmlfile) - - # write the final ngrams into a file for feature selection - ngramDictList = [] - for item in sortedMergedNgramList: - dictItem = {} - key = item[0] - dictItem['ngram'] = key - dictItem['count'] = max(filteredMalwareNgram[key], filteredBenignNgram[key]) - ngramDictList.append(dictItem) - - csvfields = ['ngram', 'count'] - csvname = "./out/"+str(idx + 1) + "gram.csv" - print("*======================start write csv=======================================*") - try: - csvfile = open(csvname, 'w') - except Exception as err: - print(f"Error: writing csvfile {err}") - WriteCSV(csvfile, csvfields, ngramDictList) - csvfile.close() + # write the final ngrams into a file for feature selection + AbsoluteNgramDictList = [] + RelativeNgramDictList = [] + for item in sortedMergedNgramList: + dictItem = {} + key = item[0] + dictItem['ngram'] = key + dictItem['count'] = max(filteredMalwareNgram[key], filteredBenignNgram[key]) + AbsoluteNgramDictList.append(dictItem) + RelativeNgramDictList.append({'ngram': item[0], 'count': item[1]}) + csvfields = ['ngram', 'count'] + AbsoluteCsvName = "./out/" + str(idx + 1) + "gram-absolute.csv" + RelativeCsvName = "./out/" + str(idx + 1) + "gram-relative.csv" + print("*======================start write csv=======================================*") + try: + csvfile = open(AbsoluteCsvName, 'w') + except Exception as err: + print(f"Error: writing csvfile {err}") + WriteCSV(csvfile, csvfields, AbsoluteNgramDictList) + csvfile.close() + try: + csvfile = open(RelativeCsvName, 'w') + except Exception as err: + print(print(f"Error: writing csvfile {err}")) + WriteCSV(csvfile, csvfields, RelativeNgramDictList) + csvfile.close() + print("*======================end write csv=======================================*")