asm_to_csv/ngram.py

from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import os
import csv
import argparse
import statistics
import plotly.express as px

###################################################################################################
## Program shall take two csv files of different classes - benign and malware
## It will compute ngrams for each of the classes separately and find the delta frequencies
## of each computed ngram. delta_frequencies = (benign - malware)
###################################################################################################

#--------------------------------------------------------------------------------------------------
## Generate ngrams given the corpus and factor n
def generate_N_grams(corpus, n=1):
    """Return every run of n consecutive tokens in corpus as a list of strings.

    The corpus is split on single space characters, so consecutive spaces
    produce empty tokens. Each ngram is its tokens re-joined with one space.
    An empty list is returned when the corpus has fewer than n tokens or
    when n is not positive.
    """
    tokens = corpus.split(" ")
    # One view of the token list per offset; zipping them walks the sliding window.
    shifted_views = [tokens[offset:] for offset in range(n)]
    return [' '.join(window) for window in zip(*shifted_views)]

#--------------------------------------------------------------------------------------------------
## Creates ngrams for the corpus List for given N and Filters it based on following criteria
# file count >= percent of Total corpus len (percent in [1..100])
# total frequency >= filterFreq (applied only when filterFreq is non-zero)
# Returns both complete and filtered dictionary of ngrams
def filter_N_grams (corpusList, N, percent, filterFreq=0):
    """Count the N-grams of every corpus entry and filter them.

    Args:
        corpusList: sequence of corpus strings, one entry per file.
        N: size of the ngrams to generate.
        percent: an ngram is kept only if the number of files containing it
            is at least int(percent * len(corpusList) / 100).
        filterFreq: when non-zero, ngrams whose total frequency is below
            this value are dropped from the filtered result.

    Returns:
        A two-element list [ngramDictionary, filteredNgramDictionary]; both
        are defaultdict(int) mapping ngram -> total frequency over all files.
    """
    total = len(corpusList)
    ngramDictionary = defaultdict(int)
    ngramFileCount = defaultdict(int)
    for opcodes in tqdm(corpusList, total=total, ncols=100, desc="Computing ngrams"):
        # Skip empty entries, and non-text ones (pandas yields NaN for an
        # empty cell), before trying to split them into tokens.
        if not isinstance(opcodes, str) or not opcodes:
            continue
        seenInFile = set()
        for item in generate_N_grams(opcodes, N):
            #compute frequency of all unique ngrams
            ngramDictionary[item] += 1
            seenInFile.add(item)
        #compute ngram file count: once per file, only for ngrams present in THIS file
        for item in seenInFile:
            ngramFileCount[item] += 1

    #Minimum number of files an ngram must appear in
    filterCnt = int((percent * total) / 100)

    filteredNgramDictionary = defaultdict(int)
    # Iterate the frequency table (insertion order = first occurrence) so the
    # key order of the filtered result is reproducible between runs.
    for item, frequency in ngramDictionary.items():
        if ngramFileCount[item] < filterCnt:
            continue
        #Drop the item when it is below the minimum frequency threshold
        if filterFreq and frequency < filterFreq:
            continue
        filteredNgramDictionary[item] = frequency

    return [ngramDictionary, filteredNgramDictionary]

#--------------------------------------------------------------------------------------------------
# Calculate a normalization factor for frequency values of class1 and class2
# When one class has higher frequencies simply because of its larger sample size, a normalization
# factor may be required to correctly rescale the frequencies of the smaller class set.
# Input: list of frequencies of class1 and list of frequencies of class2
def normalization_factor(class1, class2):
    """Return the ratio of the mean of class1 to the mean of class2.

    Both arguments are sequences of frequency values. statistics.mean raises
    StatisticsError for an empty sequence, and a zero mean for class2 raises
    ZeroDivisionError.
    """
    return statistics.mean(class1) / statistics.mean(class2)

#--------------------------------------------------------------------------------------------------
# Write the data into the given csv file handle
def WriteCSV (file, csvFields, dataDictionary):
    """Write a header row followed by one CSV row per dictionary.

    Args:
        file: writable text file handle.
        csvFields: column names, in output order.
        dataDictionary: iterable of dicts keyed by the names in csvFields.
    """
    dictWriter = csv.DictWriter(file, fieldnames=csvFields)
    dictWriter.writeheader()
    for row in dataDictionary:
        dictWriter.writerow(row)

#--------------------------------------------------------------------------------------------------
# Execution starts here
# Add command line arguments
# CSV header: class, sub-class, size, corpus
parser = argparse.ArgumentParser(description="ngram analysis on a given corpus csv file.")
# Every positional is optional and defaults to the value that used to be
# hard-coded, so running the script without arguments behaves as before.
parser.add_argument('malware_csvfile', nargs='?',
                    default=os.path.join('./out/output_malware.csv'),
                    help='path to the malware corpus csv file (default: %(default)s)')
parser.add_argument('benign_csvfile', nargs='?',
                    default=os.path.join('./out/output_benign.csv'),
                    help='path to the benign corpus csv file (default: %(default)s)')
parser.add_argument('ngram', nargs='?', type=int, default=3,
                    help='ngram to compute, higher value will be compute intensive '
                         '(default: %(default)s)')

# Execute the parse_args() method
args = parser.parse_args()

# Get user arguments
malware_csvfile = args.malware_csvfile
benign_csvfile = args.benign_csvfile
maxgrams = args.ngram

# An ngram size below 1 yields no ngrams at all, so reject it up front
if maxgrams < 1:
    parser.error("ngram must be a positive integer")

# Error check and exit if not a file
if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)):
    print("Path should be csv file!")
    raise SystemExit(1)

# Read the csv file using pandas into data frame
try:
    malwareDF = pd.read_csv(malware_csvfile, encoding="utf8")
    benignDF = pd.read_csv(benign_csvfile, encoding="utf8")
except Exception as error:
    # Without both data frames nothing below can run; report the problem and
    # stop here instead of failing later with an unrelated NameError.
    print(error)
    raise SystemExit(1)

#Build a frequency list for ngrams
filePercentFilter = 80  # keep ngrams that appear in at least this percentage of files
frequencyFilter = 20    # keep ngrams whose total frequency is at least this value

# Per-class ngram frequency tables: the complete table and its filtered subset
malwareNgram, filteredMalwareNgram = defaultdict(int), defaultdict(int)
benignNgram, filteredBenignNgram = defaultdict(int), defaultdict(int)

# Ngrams taken from both the malware and benign corpus, mapped to their
# relative frequency (benignFreq - malwareFreq)
filteredMergedNgram = defaultdict(int)


#run for only the maxgram provided, change lower value to 0 to run for all values [1..N]
for idx in range(maxgrams-1, maxgrams):
    gramSize = idx + 1
    print(f"Computing {gramSize}gram on files ...")
    # The per-class tables are rebound below; only the merged table is reused
    filteredMergedNgram.clear()

    #opcodes decoded from pe file in sequence is stored as corpus in the csv
    malwareNgram, filteredMalwareNgram = filter_N_grams(malwareDF['corpus'].values, gramSize,
                                                        filePercentFilter, frequencyFilter)

    benignNgram, filteredBenignNgram = filter_N_grams(benignDF['corpus'].values, gramSize,
                                                      filePercentFilter, frequencyFilter)

    print(f"Malware: {gramSize}gramCnt={len(malwareNgram)}, filterenCnt={len(filteredMalwareNgram)}")
    print(f"Benign: {gramSize}gramCnt={len(benignNgram)}, filterenCnt={len(filteredBenignNgram)}")

    ## Make a union of the filtered malware and benign ngrams. Sorting the keys
    ## keeps the order of equal-valued entries reproducible between runs.
    mergedList = sorted(set(filteredMalwareNgram) | set(filteredBenignNgram))

    ## Now find the relative frequency b/w benign and malware files = benign - malware.
    ## An ngram that passed the filter for only one class falls back to the other
    ## class's unfiltered frequency, or to 0 when that class never contains it.
    ## A malware-only ngram therefore gets a negative value (0 - malware).
    for key in mergedList:
        if key in filteredBenignNgram:
            benignFreq = filteredBenignNgram[key]
        else:
            benignFreq = benignNgram.get(key, 0)
        if key in filteredMalwareNgram:
            malwareFreq = filteredMalwareNgram[key]
        else:
            malwareFreq = malwareNgram.get(key, 0)
        filteredMergedNgram[key] = benignFreq - malwareFreq

    print(f"Merged: {gramSize}gramCnt={len(filteredMergedNgram.keys())}")
    ## get a list of merged ngrams sorted by ascending relative frequency
    sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1])

    #Plot a scatter graph -
    # y values as relative frequency benign-malware
    # x values as max filtered frequency of a ngram max(malware, benign)
    # color labels as 'a' + frequency % 26
    # size as frequency/max * 100 (+1 so the smallest marker is still visible)
    # hover name is ngram name
    titlestr = f"{gramSize}gram: Total samples({len(sortedMergedNgramList)})"
    htmlfile = f"{gramSize}gram.html"
    hovername = [ngram for ngram, _ in sortedMergedNgramList]
    yval = [delta for _, delta in sortedMergedNgramList]
    # .get() avoids inserting missing keys into the defaultdict tables
    xval = [max(filteredMalwareNgram.get(key, 0), filteredBenignNgram.get(key, 0))
            for key in hovername]

    if xval:
        colors = [chr(ord('a') + (value % 26)) for value in xval]
        maxval = max(xval)
        sizeval = [int((val / maxval) * 100) + 1 for val in xval]

        fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors,
                         size=sizeval, hover_name=hovername, log_x=True,
                         labels = {
                             "x": "Absolute Frequency",
                             "y": "Relative Frequency"})
        fig.show()
        fig.write_html(htmlfile)
    else:
        # max() of an empty list raises, and there is nothing to plot anyway
        print(f"No {gramSize}gram passed the filters, skipping plot")

    #write the final ngrams into a file for feature selection
    ngramDictList = [{'ngram': key, 'count': count} for key, count in zip(hovername, xval)]

    csvfields = ['ngram', 'count']
    csvname = f"{gramSize}gram.csv"
    try:
        # newline='' is what the csv module expects from the file it writes to;
        # the with-block closes the file even when writing fails.
        with open(csvname, 'w', newline='') as csvfile:
            WriteCSV(csvfile, csvfields, ngramDictList)
    except OSError as err:
        print(f"Error: writing csvfile {err}")