from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import os
import csv
import argparse
import statistics
import plotly.express as px

###################################################################################################
## This program takes two csv files of different classes - benign and malware.
## It computes ngrams for each class separately and finds the delta frequency
## of each computed ngram: delta_frequency = (class1 - class2)
###################################################################################################

#--------------------------------------------------------------------------------------------------
## Generate ngrams for the given corpus string and factor n
def generate_N_grams(corpus, n=1):
    words = corpus.split(" ")
    temp = zip(*[words[i:] for i in range(0, n)])
    return [' '.join(gram) for gram in temp]

#--------------------------------------------------------------------------------------------------
## Creates ngrams for the corpus list for a given N and filters them on the following criteria:
# file count >= percent of total corpus length (percent in [1..100])
# frequency >= filterFreq (only applied when a non-zero filterFreq is given)
# Returns both the complete and the filtered dictionary of ngrams
def filter_N_grams(corpusList, N, percent, filterFreq=0):
    total = len(corpusList)
    ngramDictionary = defaultdict(int)
    ngramFileCount = defaultdict(int)
    for idx in tqdm(range(0, total), ncols=100, desc="Computing ngrams"):
        opcodes = corpusList[idx]
        if len(opcodes) == 0:
            continue
        fileNgrams = generate_N_grams(opcodes, N)
        # compute frequency of all ngrams across the corpus
        for item in fileNgrams:
            ngramDictionary[item] += 1
        # compute ngram file count (each ngram counted once per file it appears in)
        for item in set(fileNgrams):
            ngramFileCount[item] += 1
    filteredNgramDictionary = defaultdict(int)
    # Filter those ngrams which meet the percent-of-total-files criteria
    filterCnt = round((percent * total) / 100)
    for item in ngramFileCount:
        if ngramFileCount[item] >= filterCnt:
            # Add the item which meets the file count criteria to the filtered dictionary
            filteredNgramDictionary[item] = ngramDictionary[item]
    # Filter ngrams below the minimum frequency
    if filterFreq:
        for item in ngramDictionary:
            if ngramDictionary[item] < filterFreq and item in filteredNgramDictionary:
                # Remove the item which is below the frequency threshold
                filteredNgramDictionary.pop(item)
    #print(f"Total ngrams:{len(ngramDictionary)} => filtered: {len(filteredNgramDictionary)}\n")
    return [ngramDictionary, filteredNgramDictionary]
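#--------------------------------------------------------------------------------------------------
# A minimal sketch of what the two helpers above produce, assuming a toy opcode corpus; the opcode
# strings below are hypothetical examples, not taken from the real dataset. Uncomment to sanity
# check the helpers in isolation:
#
#   sampleCorpus = ["push mov call ret", "push mov ret"]
#   print(generate_N_grams(sampleCorpus[0], 2))         # ['push mov', 'mov call', 'call ret']
#   full, filtered = filter_N_grams(sampleCorpus, 2, 100)
#   print(full['push mov'], filtered['push mov'])       # 2 2 ('push mov' appears in both files)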
#--------------------------------------------------------------------------------------------------
# Calculate a normalization factor for the frequency values of class1 and class2.
# For a class whose frequencies are high due to its larger sample size, a normalization factor may
# be required to correctly rescale the frequencies of the smaller class set.
# Input: lists of frequencies of class1 and class2
def normalization_factor(class1, class2):
    mean1 = statistics.mean(class1)
    mean2 = statistics.mean(class2)
    return mean1 / mean2

#--------------------------------------------------------------------------------------------------
# Write the data into the given csv file handle
def WriteCSV(file, csvFields, dataDictionary):
    writer = csv.DictWriter(file, fieldnames=csvFields)
    writer.writeheader()
    writer.writerows(dataDictionary)

#--------------------------------------------------------------------------------------------------
# Execution starts here
# Add command line arguments
# CSV header: class, sub-class, size, corpus
parser = argparse.ArgumentParser(description="ngram analysis on a given corpus csv file.")
parser.add_argument('malware_csvfile', help='path to the malware corpus csv file')
parser.add_argument('benign_csvfile', help='path to the benign corpus csv file')
parser.add_argument('ngram', type=int, help='ngram to compute, higher value will be compute intensive')

# Execute the parse_args() method and get the user arguments
args = parser.parse_args()
malware_csvfile = args.malware_csvfile
benign_csvfile = args.benign_csvfile
maxgrams = args.ngram

# Error check and exit if the arguments are not files
if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)):
    print("Arguments should be paths to csv files!")
    exit(1)

# Read the csv files using pandas into data frames
try:
    malwareDF = pd.read_csv(malware_csvfile, encoding="utf8")
    benignDF = pd.read_csv(benign_csvfile, encoding="utf8")
except Exception as error:
    print(error)
    exit(1)

# Build a frequency list for ngrams
filePercentFilter = 80   ## select ngrams present in x% of files
frequencyFilter = 20     ## select ngrams with frequency greater than this value

malwareNgram = defaultdict(int)          ## full list of ngrams in the malware corpus
benignNgram = defaultdict(int)           ## full list of ngrams in the benign corpus
filteredMalwareNgram = defaultdict(int)  ## filtered list of ngrams from the malware corpus
filteredBenignNgram = defaultdict(int)   ## filtered list of ngrams from the benign corpus

## common list of ngrams from both corpora with relative frequency (benignFreq - malwareFreq)
filteredMergedNgram = defaultdict(int)

# run only for the maxgrams provided; change the lower bound to 0 to run for all values [1..N]
for idx in range(maxgrams - 1, maxgrams):
    print(f"Computing {idx+1}gram on files ...")
    malwareNgram.clear()
    filteredMalwareNgram.clear()
    benignNgram.clear()
    filteredBenignNgram.clear()
    filteredMergedNgram.clear()
    # opcodes decoded from the pe file are stored in sequence as the corpus column of the csv
    [malwareNgram, filteredMalwareNgram] = filter_N_grams(malwareDF['corpus'].values, idx + 1,
                                                          filePercentFilter, frequencyFilter)
    [benignNgram, filteredBenignNgram] = filter_N_grams(benignDF['corpus'].values, idx + 1,
                                                        filePercentFilter, frequencyFilter)
    print(f"Malware: {idx+1}gramCnt={len(malwareNgram)}, filteredCnt={len(filteredMalwareNgram)}")
    print(f"Benign: {idx+1}gramCnt={len(benignNgram)}, filteredCnt={len(filteredBenignNgram)}")
    ## Make a union of the filtered malware and benign ngram lists
    mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys()))
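    ## Example with hypothetical counts (not from the real dataset): if 'push mov' occurs 120 times
    ## in the filtered benign set and 90 times in the filtered malware set, its merged value below
    ## is 120 - 90 = +30; an ngram seen only on the malware side ends up with a negative value.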
    ## Now find the relative frequency between benign and malware files: benign - malware.
    ## Written to also handle cases where an ngram is present in only one of the classes
    ## (malware or benign), for reusability when a union of the classes is taken.
    for item in mergedList:
        key = item  # the ngram itself
        if key in filteredBenignNgram:
            if key in filteredMalwareNgram:
                filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key]
            elif key in malwareNgram:
                filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key]
            else:
                filteredMergedNgram[key] = filteredBenignNgram[key]
        elif key in filteredMalwareNgram:
            if key in benignNgram:
                filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key]
            else:
                filteredMergedNgram[key] = -filteredMalwareNgram[key]
    print(f"Merged: {idx+1}gramCnt={len(filteredMergedNgram)}")
    ## get a sorted list of merged ngrams with their relative frequencies
    sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1])

    # Plot a scatter graph:
    # y values: relative frequency (benign - malware)
    # x values: max frequency of an ngram, max(malware, benign)
    # color labels: 'a' + frequency % 26
    # size: frequency / max * 100
    # hover name: the ngram itself
    titlestr = str(idx + 1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")"
    htmlfile = str(idx + 1) + "gram.html"
    hovername = [item[0] for item in sortedMergedNgramList]
    yval = [item[1] for item in sortedMergedNgramList]
    xval = []
    for key in hovername:
        xval.append(max(filteredMalwareNgram[key], filteredBenignNgram[key]))
    colors = [chr(ord('a') + (value % 26)) for value in xval]
    maxval = max(xval)
    sizeval = [int((val / maxval) * 100) + 1 for val in xval]
    fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors, size=sizeval,
                     hover_name=hovername, log_x=True,
                     labels={"x": "Absolute Frequency", "y": "Relative Frequency"})
    fig.show()
    fig.write_html(htmlfile)

    # write the final ngrams into a file for feature selection
    ngramDictList = []
    for item in sortedMergedNgramList:
        dictItem = {}
        key = item[0]
        dictItem['ngram'] = key
        dictItem['count'] = max(filteredMalwareNgram[key], filteredBenignNgram[key])
        ngramDictList.append(dictItem)
    csvfields = ['ngram', 'count']
    csvname = str(idx + 1) + "gram.csv"
    try:
        with open(csvname, 'w', newline='') as csvfile:
            WriteCSV(csvfile, csvfields, ngramDictList)
    except Exception as err:
        print(f"Error: writing csvfile {err}")
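#--------------------------------------------------------------------------------------------------
# Usage sketch (the script name below is a hypothetical placeholder; the csv paths are the ones the
# earlier revision of this script hardcoded as defaults):
#   python ngram_analysis.py ./out/output_malware.csv ./out/output_benign.csv 3
# For ngram=3 this writes 3gram.html (interactive scatter plot) and 3gram.csv (ngram, count)
# for later feature selection.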