From 3f4bde2989663dff81ee3df7eb9ffb8d3611e42c Mon Sep 17 00:00:00 2001 From: huihun <781165206@qq.com> Date: Sat, 9 Mar 2024 15:26:16 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BA=BF=E7=A8=8B=E6=B1=A0=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- OpcodeGet.py | 22 ++-- ngram.py | 290 ++++++++++++++++++++++++++++----------------------- 2 files changed, 168 insertions(+), 144 deletions(-) diff --git a/OpcodeGet.py b/OpcodeGet.py index 0170ec7..9cb9f2c 100644 --- a/OpcodeGet.py +++ b/OpcodeGet.py @@ -6,12 +6,14 @@ from tqdm import tqdm import r2pipe import pandas as pd + def Opcode_to_csv(opcode_list, file_type): logger.info("*======================start write==================================*") csv_write(f'output_{file_type}.csv', opcode_list) logger.info(f"done {done_file_num} files") logger.info("*=================write to csv success==============================*") + def csv_write(file_name, data: list): """write data to csv""" df = pd.DataFrame(data) @@ -19,6 +21,8 @@ def csv_write(file_name, data: list): for i in range(0, len(df), chunksize): df.iloc[i:i + chunksize].to_csv(f'./out/{file_name}', mode='a', header=False, index=False) return True + + def extract_opcode(disasm_text): """ 从反汇编文本中提取操作码和操作数 @@ -34,6 +38,7 @@ def extract_opcode(disasm_text): return opcode return "" + def get_graph_r2pipe(r2pipe_open, file_type): # 获取基础块内的操作码序列 opcode_Sequence = [] @@ -66,22 +71,21 @@ def get_graph_r2pipe(r2pipe_open, file_type): if op["type"] == "invalid": continue block_opcode_Sequence.append(extract_opcode(op["opcode"])) - opcode_Sequence.append([file_type, file_type, len(block_opcode_Sequence), ' '.join(block_opcode_Sequence)]) + opcode_Sequence.append( + [file_type, file_type, len(block_opcode_Sequence), ' '.join(block_opcode_Sequence)]) except: print("Error: get function list failed") return opcode_Sequence - - if __name__ == '__main__': - logger = setup_logger('logger', 'log/opcode_benign.log') + logger = setup_logger('logger', './log/opcode_benign.log') file_type = 'benign' file_path = os.path.join('/mnt/d/bishe/dataset/train_benign') file_list = os.listdir(file_path)[:10000] done_file_num = 0 process_bar = tqdm(desc='Processing...', leave=True, total=len(file_list)) - done_list = [['class', 'sub-class','size', 'corpus']] + done_list = [['class', 'sub-class', 'size', 'corpus']] for file_name in file_list: r2pipe_open = r2pipe.open(os.path.join(file_path, file_name), flags=['-2']) r2pipe_open.cmd("aaa") @@ -94,12 +98,6 @@ if __name__ == '__main__': else: csv_write(f'output_{file_type}.csv', done_list) - - - - - - # node_list = [] # edge_list = [] # temp_edge_list = [] @@ -140,4 +138,4 @@ if __name__ == '__main__': # # return True, "二进制可执行文件解析成功", node_list, edge_list, node_info_list # except Exception as e: - # return False, e, None, None, None \ No newline at end of file + # return False, e, None, None, None diff --git a/ngram.py b/ngram.py index 2e3a6d0..5fbf5b5 100644 --- a/ngram.py +++ b/ngram.py @@ -6,6 +6,11 @@ import csv import argparse import statistics import plotly.express as px +import concurrent.futures +from functools import partial +import logging +import contextlib + ################################################################################################### ## Program shall take two csv files of different classes - benign and malware @@ -13,54 +18,57 @@ import plotly.express as px ## of each computed ngram. delta_frequencies = (class1 - class2) ################################################################################################### -#-------------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------------- ## Generate ngrams given the corpus and factor n def generate_N_grams(corpus, n=1): + words = [word for word in corpus.split(" ")] + temp = zip(*[words[i:] for i in range(0, n)]) + ngram = [' '.join(n) for n in temp] + return ngram - words = [word for word in corpus.split(" ")] - temp = zip(*[words[i:] for i in range(0, n)]) - ngram = [' '.join(n) for n in temp] - return ngram -#-------------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------------- ## Creates ngrams for the corpus List for given N and Filters it based on following criteria # file count >= percent of Total corpus len (pecent in [1..100]) # Selects high frequency ngram until the mean value # Returns both complete and filtered dictionary of ngrams -def filter_N_grams (corpusList, N, percent, filterFreq=0): +def filter_N_grams(corpusList, N, percent, filterFreq=0): total = len(corpusList) ngramDictionary = defaultdict(int) ngramFileCount = defaultdict(int) - for idx in tqdm(range(0, total), ncols=100, desc="Computing ngrams"): + for idx in range(0, total): opcodes = corpusList[idx] + if type(opcodes) is not str: + continue for item in generate_N_grams(opcodes, N): - #compute frequency of all unique ngrams + # compute frequency of all unique ngrams if len(opcodes) == 0: continue ngramDictionary[item] += 1 - #compute ngram file count + # compute ngram file count for item in ngramDictionary: ngramFileCount[item] += 1 filteredNgramDictionary = defaultdict(int) - #Filter those ngrams which meet percent of Total files criteria - filterCnt = round(int((percent * total)/ 100), 0) + # Filter those ngrams which meet percent of Total files criteria + filterCnt = round(int((percent * total) / 100), 0) for item in ngramFileCount: if ngramFileCount[item] >= filterCnt: - #Add to filtered dictionary the item which meets file count criteria + # Add to filtered dictionary the item which meets file count criteria filteredNgramDictionary[item] = ngramDictionary[item] - #Filter ngram with a minimum frequency + # Filter ngram with a minimum frequency if (filterFreq): - for item in ngramDictionary: + for item in ngramDictionary: if ngramDictionary[item] < filterFreq and item in filteredNgramDictionary: - #Remove the item which below the frequency threshold + # Remove the item which below the frequency threshold filteredNgramDictionary.pop(item) - #print(f"Total ngrams:{len(ngramDictionary.items())} => filtered: {len(filteredNgramDictionary.items())}\n") - return [ngramDictionary, filteredNgramDictionary] + # print(f"Total ngrams:{len(ngramDictionary.items())} => filtered: {len(filteredNgramDictionary.items())}\n") + return ngramDictionary, filteredNgramDictionary -#-------------------------------------------------------------------------------------------------- + +# -------------------------------------------------------------------------------------------------- # Calculate a normalization factor for frequency values of class1 and class2 # For class which are high in frequency due their sample size, a normalization may required to be # factored for correctly resizgin the frequencies of the small class set. @@ -68,142 +76,160 @@ def filter_N_grams (corpusList, N, percent, filterFreq=0): def normalization_factor(class1, class2): mean1 = statistics.mean(class1) mean2 = statistics.mean(class2) - return mean1/mean2 + return mean1 / mean2 -#-------------------------------------------------------------------------------------------------- + +# -------------------------------------------------------------------------------------------------- # Write the data into the given csv file handle -def WriteCSV (file, csvFields, dataDictionary): +def WriteCSV(file, csvFields, dataDictionary): writer = csv.DictWriter(file, fieldnames=csvFields) writer.writeheader() writer.writerows(dataDictionary) -#-------------------------------------------------------------------------------------------------- +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def process_csv_file(csvfile, ngram_type, file_percent_filter, frequency_filter): + """处理CSV文件并并行计算n-gram""" + print(f"start load csv file:{os.path.basename(csvfile)}") + dataframe = pd.read_csv(csvfile, encoding="utf8") + print(f"end load") + ngram_list = defaultdict(int) + filtered_ngram_list = defaultdict(int) + process_bar = tqdm(total=len(dataframe['corpus'].values), desc=f'Computing {ngram_type}-gram on files') + with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # 调整线程池大小 + future_to_args = { + executor.submit(filter_N_grams, dataframe['corpus'].values[start: start + 10000], + idx + 1, file_percent_filter, frequency_filter): start for start in + range(0, len(dataframe['corpus'].values), 10000) + } + for future in concurrent.futures.as_completed(future_to_args): + try: + sub_ngram_list, sub_filtered_ngram_list = future.result() + for i in [sub_ngram_list, ngram_list]: + for key, value in i.items(): + ngram_list[key] += value + for i in [sub_filtered_ngram_list, filtered_ngram_list]: + for key, value in i.items(): + filtered_ngram_list[key] += value + process_bar.update(10000) # 手动更新进度条 + except Exception as exc: + logging.error(f"Error processing {idx + 1}-gram: {exc}") + return ngram_list, filtered_ngram_list + +# -------------------------------------------------------------------------------------------------- # Execution starts here # Add command line arguments # CSV header: class, sub-class, size, corpus -parser = argparse.ArgumentParser(description="ngram analysis on a given corpus csv file.") -parser.add_argument('malware_csvfile', help='path to the malware corpus csv file') -parser.add_argument('benign_csvfile', help='path to the benign corpus csv file') -parser.add_argument('ngram', help='ngram to compute, higher value will be compute intensive') # Execute the parse_args() method +if __name__ == '__main__': + # Get user arguments + malware_csvfile = os.path.join('./out/output_malware.csv') + benign_csvfile = os.path.join('./out/output_benign.csv') + maxgrams = 3 -# Get user arguments -malware_csvfile = os.path.join('./out/output_malware.csv') -benign_csvfile = os.path.join('./out/output_benign.csv') -maxgrams = 3 + # Error check and exit if not a file + if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)): + print(f"Path should be csv file!") + exit(1) -# Error check and exit if not a file -if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)): - print (f"Path should be csv file!") - exit(1) + # Read the csv file using pandas into data frame -# Read the csv file using pandas into data frame -try: - malwareDF = pd.read_csv(malware_csvfile, encoding = "utf8") - benignDF = pd.read_csv(benign_csvfile, encoding="utf8") -except Exception as error: - print(error) + # Build a frequency list for ngrams + filePercentFilter = 80 ## select ngrams present in x% of files + frequencyFilter = 20 ## select ngrams with frequency greater than this value -#Build a frequency list for ngrams -filePercentFilter = 80 ## select ngrams present in x% of files -frequencyFilter = 20 ## select ngrams with frequency greater than this value + malwareNgram = defaultdict(int) ## full list of ngrams in malware corpus + benignNgram = defaultdict(int) ## full list of ngrams in benign corpus + filteredMalwareNgram = defaultdict(int) ## filtered list of ngrams from malware corpus + filteredBenignNgram = defaultdict(int) ## filtered list of ngrams from benign corpus -malwareNgram = defaultdict(int) ## full list of ngrams in malware corpus -benignNgram = defaultdict(int) ## full list of ngrams in benign corpus -filteredMalwareNgram = defaultdict(int) ## filtered list of ngrams from malware corpus -filteredBenignNgram = defaultdict(int) ## filtered list of ngrams from benign corpus + ## common list ngrams from both malware and benign corpus with relative frequency (benignFreq - malwareFreq) + filteredMergedNgram = defaultdict(int) -## common list ngrams from both malware and benign corpus with relative frequency (benignFreq - malwareFreq) -filteredMergedNgram = defaultdict(int) + # run for only the maxgram provided, change lower value to 0 to run for all values [1..N] + for idx in range(maxgrams - 1, maxgrams): + print(f"Computing {idx + 1}gram on files ...") + print(f"CPU core {os.cpu_count()} on use") + malwareNgram = [] + filteredMalwareNgram = [] + benignNgram = [] + filteredBenignNgram = [] + malwareNgram.clear() + filteredMalwareNgram.clear() + benignNgram.clear() + filteredBenignNgram.clear() + filteredMergedNgram.clear() + # opcodes decoded from pe file in sequence is stored as corpus in the csv + malwareNgram, filteredMalwareNgram = process_csv_file(malware_csvfile, 'malware', filePercentFilter, frequencyFilter) -#run for only the maxgram provided, change lower value to 0 to run for all values [1..N] -for idx in range(maxgrams-1, maxgrams): - print(f"Computing {idx+1}gram on files ...") - malwareNgram.clear() - filteredMalwareNgram.clear() - benignNgram.clear() - filteredBenignNgram.clear() - filteredMergedNgram.clear() + benignNgram, filteredBenignNgram = process_csv_file(benign_csvfile, 'benign', filePercentFilter, frequencyFilter) - #opcodes decoded from pe file in sequence is stored as corpus in the csv - [malwareNgram, filteredMalwareNgram] = filter_N_grams(malwareDF['corpus'].values, idx+1, - filePercentFilter, frequencyFilter) + # creates a sorted list of ngram tuples with their frequency for 1 .. maxgram - [benignNgram, filteredBenignNgram] = filter_N_grams(benignDF['corpus'].values, idx+1, - filePercentFilter, frequencyFilter) + mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys())) + ## Now find the relative frequency b/w benign and malware files. = benign - malware + ## write this for cases where ngrams only present in one of the clases malware or benign + ## for reusability in case a union of classes is taken. + for item in mergedList: + key = item # get the ngram only + if key in filteredBenignNgram: + if key in filteredMalwareNgram: + filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key] + elif item in malwareNgram: + filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key] + else: + filteredMergedNgram[key] = filteredBenignNgram[key] + elif key in filteredMalwareNgram: + if key in benignNgram: + filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key] + else: + filteredMergedNgram[key] = filteredMalwareNgram[key] - #creates a sorted list of ngram tuples with their frequency for 1 .. maxgram - print(f"Malware: {idx+1}gramCnt={len(malwareNgram.items())}, filterenCnt={len(filteredMalwareNgram.items())}") - print(f"Benign: {idx+1}gramCnt={len(benignNgram.items())}, filterenCnt={len(filteredBenignNgram.items())}") + print(f"Merged: {idx + 1}gramCnt={len(filteredMergedNgram.keys())}") + ## get a sorted list of merged ngrams with relative frequencies + sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1]) - ## Make a intersection of filtered list between malware and benign ngrams - mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys())) + # Plot a scatter graph - + # y values as relative frequency benign-malware + # x values as max frequency of a ngram max(malware, benign) + # color labels as 'a' + frequency % 26 + # size as frequency/max * 100 + # hover name is ngram name + # titlestr = str(idx + 1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")" + # htmlfile = str(idx + 1) + "gram.html" + # hovername = [item[0] for item in sortedMergedNgramList] + # yval = [item[1]/1e10 for item in sortedMergedNgramList] + # xval = [] + # for key in hovername: + # xval.append(max(filteredMalwareNgram[key], filteredBenignNgram[key])) + # colors = [chr(ord('a') + (value % 26)) for value in xval] + # maxval = max(xval) + # sizeval = [(int((val / maxval) * 100) + 1) for val in xval] + # + # fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors, + # size=sizeval, hover_name=hovername, log_x=True, + # labels={ + # "x": "Absolute Frequency", + # "y": "Relative Frequency"}) + # fig.write_html(htmlfile) - ## Now find the relative frequency b/w benign and malware files. = benign - malware - ## write this for cases where ngrams only present in one of the clases malware or benign - ## for reusability in case a union of classes is taken. - for item in mergedList: - key = item #get the ngram only - if key in filteredBenignNgram: - if key in filteredMalwareNgram: - filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key] - elif item in malwareNgram: - filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key] - else: - filteredMergedNgram[key] = filteredBenignNgram[key] - elif key in filteredMalwareNgram: - if key in benignNgram: - filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key] - else: - filteredMergedNgram[key] = filteredMalwareNgram[key] - - print(f"Merged: {idx+1}gramCnt={len(filteredMergedNgram.keys())}") - ## get a sorted list of merged ngrams with relative frequencies - sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1]) - - #Plot a scatter graph - - # y values as relative frequency benign-malware - # x values as max frequency of a ngram max(malware, benign) - # color labels as 'a' + frequency % 26 - # size as frequency/max * 100 - # hover name is ngram name - titlestr = str(idx+1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")" - htmlfile = str (idx+1) +"gram.html" - hovername = [item[0] for item in sortedMergedNgramList] - yval = [item[1] for item in sortedMergedNgramList] - xval = [] - for key in hovername: - xval.append(max(filteredMalwareNgram[key], filteredBenignNgram[key])) - colors = [chr(ord('a')+ (value %26)) for value in xval] - maxval = max(xval) - sizeval = [(int((val/maxval)*100)+1) for val in xval] - - fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors, - size=sizeval, hover_name=hovername, log_x=True, - labels = { - "x": "Absolute Frequency", - "y": "Relative Frequency"}) - fig.show() - fig.write_html(htmlfile) - - #write the final ngrams into a file for feature selection - ngramDictList = [] - for item in sortedMergedNgramList: - dictItem = {} - key = item[0] - dictItem['ngram'] = key - dictItem['count'] = max(filteredMalwareNgram[key], filteredBenignNgram[key]) - ngramDictList.append(dictItem) - - csvfields = ['ngram', 'count'] - csvname = str(idx+1) + "gram.csv" - try: - csvfile = open(csvname, 'w') - except Exception as err: - print(f"Error: writing csvfile {err}") - WriteCSV(csvfile, csvfields, ngramDictList) - csvfile.close() + # write the final ngrams into a file for feature selection + ngramDictList = [] + for item in sortedMergedNgramList: + dictItem = {} + key = item[0] + dictItem['ngram'] = key + dictItem['count'] = max(filteredMalwareNgram[key], filteredBenignNgram[key]) + ngramDictList.append(dictItem) + csvfields = ['ngram', 'count'] + csvname = "./out/"+str(idx + 1) + "gram.csv" + print("*======================start write csv=======================================*") + try: + csvfile = open(csvname, 'w') + except Exception as err: + print(f"Error: writing csvfile {err}") + WriteCSV(csvfile, csvfields, ngramDictList) + csvfile.close()