asm_to_csv/ngram.py

from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import os
import csv
import argparse
import statistics
import plotly.express as px

###################################################################################################
## Program shall take two csv files of different classes - benign and malware
## It will compute ngrams for each of the classes separately and find the delta frequencies
## of each computed ngram. delta_frequencies = (class1 - class2)
###################################################################################################

#--------------------------------------------------------------------------------------------------
## Generate ngrams given the corpus and factor n
def generate_N_grams(corpus, n=1):
    """Return every run of n consecutive tokens of corpus as a space-joined string.

    corpus is split on single space characters, so consecutive spaces yield
    empty tokens. A corpus with fewer than n tokens (or n == 0) gives an
    empty list.
    """
    tokens = corpus.split(" ")
    # One copy of the token list per position in the window, each shifted by
    # its offset; zipping them lines up the n tokens of every window and stops
    # at the shortest copy.
    shifted = [tokens[offset:] for offset in range(n)]
    return [' '.join(window) for window in zip(*shifted)]

#--------------------------------------------------------------------------------------------------
## Creates ngrams for the corpus list for the given N and filters them on the following criteria
# file count >= percent of total corpus length (percent in [1..100])
# total frequency >= filterFreq (this check is skipped when filterFreq is 0)
# Returns both the complete and the filtered dictionary of ngrams
def filter_N_grams (corpusList, N, percent, filterFreq=0):
    """Count the ngrams of every corpus entry and select the widespread, frequent ones.

    Args:
        corpusList: sequence of corpus strings (space separated tokens), one
            entry per file.
        N: ngram size handed to generate_N_grams.
        percent: an ngram is kept only when it occurs in at least
            int(percent * len(corpusList) / 100) entries.
        filterFreq: when non-zero, ngrams whose total frequency is below this
            value are left out of the filtered result.

    Returns:
        [ngramDictionary, filteredNgramDictionary]: two defaultdict(int)
        objects mapping ngram -> total frequency over all entries. The second
        one only holds the ngrams that pass both filters.
    """
    total = len(corpusList)
    ngramDictionary = defaultdict(int)
    ngramFileCount = defaultdict(int)
    for opcodes in tqdm(corpusList, total=total, ncols=100, desc="Computing ngrams"):
        # Entries without tokens contribute nothing. Non-string entries are
        # skipped as well (e.g. a NaN for an empty csv cell - assumption about
        # the caller's data, confirm against the csv producer).
        if not isinstance(opcodes, str) or not opcodes:
            continue

        seenInFile = set()
        for item in generate_N_grams(opcodes, N):
            # compute frequency of all ngrams
            ngramDictionary[item] += 1
            seenInFile.add(item)

        # compute ngram file count: once per entry that really contains the
        # ngram, not once for every ngram seen in any earlier entry
        for item in seenInFile:
            ngramFileCount[item] += 1

    # Minimum number of entries an ngram has to appear in (fraction truncated)
    filterCnt = int((percent * total) / 100)

    filteredNgramDictionary = defaultdict(int)
    for item, frequency in ngramDictionary.items():
        # Filter those ngrams which meet percent of total files criteria
        if ngramFileCount[item] < filterCnt:
            continue
        # Filter ngrams with a minimum frequency (only when requested)
        if filterFreq and frequency < filterFreq:
            continue
        filteredNgramDictionary[item] = frequency

    return [ngramDictionary, filteredNgramDictionary]

#--------------------------------------------------------------------------------------------------
# Calculate a normalization factor for frequency values of class1 and class2
# For a class whose frequencies are high due to its sample size, a normalization may be required
# to correctly resize the frequencies of the smaller class set.
# Input: list of frequencies of class1 and list of frequencies of class2
def normalization_factor(class1, class2):
    """Return mean(class1) / mean(class2) for two lists of frequencies.

    statistics.mean raises StatisticsError for an empty list, and the division
    raises ZeroDivisionError when the mean of class2 is zero.
    """
    return statistics.mean(class1) / statistics.mean(class2)

#--------------------------------------------------------------------------------------------------
# Write the data into the given csv file handle
def WriteCSV (file, csvFields, dataDictionary):
    """Write a header row followed by one csv row per dictionary.

    file is an open, writable file handle, csvFields the ordered column names
    and dataDictionary an iterable of dicts keyed by those column names.
    """
    dictWriter = csv.DictWriter(file, fieldnames=csvFields)
    dictWriter.writeheader()
    for row in dataDictionary:
        dictWriter.writerow(row)

#--------------------------------------------------------------------------------------------------
# Execution starts here
# Add command line arguments
# CSV header: class, sub-class, size, corpus
# Every argument is optional: running the script without arguments keeps using
# the inputs that used to be hard-coded here.
parser = argparse.ArgumentParser(description="ngram analysis on a given corpus csv file.")
parser.add_argument('malware_csvfile', nargs='?', default='./out/output_malware.csv',
                    help='path to the malware corpus csv file')
parser.add_argument('benign_csvfile', nargs='?', default='./out/output_benign.csv',
                    help='path to the benign corpus csv file')
parser.add_argument('ngram', nargs='?', type=int, default=3,
                    help='ngram to compute, higher value will be compute intensive')

# Execute the parse_args() method
args = parser.parse_args()

# Get user arguments
malware_csvfile = args.malware_csvfile
benign_csvfile = args.benign_csvfile
maxgrams = args.ngram
if maxgrams < 1:
    # parser.error prints the usage text and exits with status 2
    parser.error("ngram must be a positive integer")

# Error check and exit if not a file
if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)):
    print ("Path should be csv file!")
    raise SystemExit(1)

# Read the csv file using pandas into data frame
try:
    malwareDF = pd.read_csv(malware_csvfile, encoding = "utf8")
    benignDF = pd.read_csv(benign_csvfile, encoding="utf8")
except Exception as error:
    # Without both data frames nothing below can run, so stop here instead of
    # failing later with a NameError.
    print(error)
    raise SystemExit(1)

#Build a frequency list for ngrams
filePercentFilter = 80  ## select ngrams present in x% of files
frequencyFilter = 20    ## drop ngrams with a total frequency below this value

malwareNgram = defaultdict(int)  ## full list of ngrams in malware corpus
benignNgram = defaultdict(int)   ## full list of ngrams in benign corpus
filteredMalwareNgram = defaultdict(int)  ## filtered list of ngrams from malware corpus
filteredBenignNgram = defaultdict(int)   ## filtered list of ngrams from benign corpus

## ngrams from the malware and benign corpus with relative frequency (benignFreq - malwareFreq)
filteredMergedNgram = defaultdict(int)


# Run only for the ngram size given by maxgrams; change the lower bound of the
# range to 0 to run for every size in [1..maxgrams]
for idx in range(maxgrams-1, maxgrams):
    print(f"Computing {idx+1}gram on files ...")
    filteredMergedNgram.clear()

    #opcodes decoded from pe file in sequence is stored as corpus in the csv
    [malwareNgram, filteredMalwareNgram] = filter_N_grams(malwareDF['corpus'].values, idx+1,
                                                          filePercentFilter, frequencyFilter)

    [benignNgram, filteredBenignNgram] = filter_N_grams(benignDF['corpus'].values, idx+1,
                                                        filePercentFilter, frequencyFilter)

    print(f"Malware: {idx+1}gramCnt={len(malwareNgram.items())}, filterenCnt={len(filteredMalwareNgram.items())}")
    print(f"Benign: {idx+1}gramCnt={len(benignNgram.items())}, filterenCnt={len(filteredBenignNgram.items())}")

    ## Make a union of the filtered ngrams of the malware and benign corpus
    mergedList = set(filteredMalwareNgram) | set(filteredBenignNgram)

    ## Now find the relative frequency b/w benign and malware files = benign - malware
    ## For each class the filtered frequency is used when the ngram passed the
    ## filter, else the unfiltered frequency, else 0. An ngram seen only in
    ## malware therefore gets a negative value (0 - malware).
    for key in mergedList:
        benignFreq = filteredBenignNgram.get(key, benignNgram.get(key, 0))
        malwareFreq = filteredMalwareNgram.get(key, malwareNgram.get(key, 0))
        filteredMergedNgram[key] = benignFreq - malwareFreq

    print(f"Merged: {idx+1}gramCnt={len(filteredMergedNgram.keys())}")
    ## get a list of merged ngrams sorted by ascending relative frequency
    sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1])

    #Plot a scatter graph -
    # y values as relative frequency benign-malware
    # x values as max frequency of a ngram max(malware, benign)
    # color labels as 'a' + frequency % 26
    # size as frequency/max * 100
    # hover name is ngram name
    titlestr = str(idx+1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")"
    htmlfile = str (idx+1) +"gram.html"
    hovername = [item[0] for item in sortedMergedNgramList]
    yval = [item[1] for item in sortedMergedNgramList]
    # .get() keeps the lookup from inserting missing keys into the filtered dictionaries
    xval = [max(filteredMalwareNgram.get(key, 0), filteredBenignNgram.get(key, 0))
            for key in hovername]

    if xval:
        colors = [chr(ord('a')+ (value %26)) for value in xval]
        maxval = max(xval)
        sizeval = [(int((val/maxval)*100)+1) for val in xval]

        fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors,
                         size=sizeval, hover_name=hovername, log_x=True,
                         labels = {
                             "x": "Absolute Frequency",
                             "y": "Relative Frequency"})
        fig.show()
        fig.write_html(htmlfile)
    else:
        # max() of an empty list would raise; there is nothing to plot anyway
        print(f"No {idx+1}gram passed the filters, skipping the plot")

    #write the final ngrams into a file for feature selection
    ngramDictList = [{'ngram': key, 'count': count} for key, count in zip(hovername, xval)]

    csvfields = ['ngram', 'count']
    csvname = str(idx+1) + "gram.csv"
    try:
        # newline='' lets the csv module control the line endings itself
        with open(csvname, 'w', newline='') as csvfile:
            WriteCSV(csvfile, csvfields, ngramDictList)
    except OSError as err:
        print(f"Error: writing csvfile {err}")
        raise SystemExit(1)
asm提取 2024-03-07 15:08:07 +08:00			`from collections import defaultdict`
			`from tqdm import tqdm`
			`import pandas as pd`
			`import os`
			`import csv`
			`import argparse`
			`import statistics`
			`import plotly.express as px`

			`###################################################################################################`
			`## Program shall take two csv files of different classes - benign and malware`
			`## It will compute ngrams for each of the classes seperately and find the delta frequencies`
			`## of each computed ngram. delta_frequencies = (class1 - class2)`
			`###################################################################################################`

			`#--------------------------------------------------------------------------------------------------`
			`## Generate ngrams given the corpus and factor n`
			`def generate_N_grams(corpus, n=1):`

			`words = [word for word in corpus.split(" ")]`
			`temp = zip(*[words[i:] for i in range(0, n)])`
			`ngram = [' '.join(n) for n in temp]`
			`return ngram`

			`#--------------------------------------------------------------------------------------------------`
			`## Creates ngrams for the corpus List for given N and Filters it based on following criteria`
			`# file count >= percent of Total corpus len (pecent in [1..100])`
			`# Selects high frequency ngram until the mean value`
			`# Returns both complete and filtered dictionary of ngrams`
			`def filter_N_grams (corpusList, N, percent, filterFreq=0):`
			`total = len(corpusList)`
			`ngramDictionary = defaultdict(int)`
			`ngramFileCount = defaultdict(int)`
			`for idx in tqdm(range(0, total), ncols=100, desc="Computing ngrams"):`
			`opcodes = corpusList[idx]`
			`for item in generate_N_grams(opcodes, N):`
			`#compute frequency of all unique ngrams`
			`if len(opcodes) == 0:`
			`continue`
			`ngramDictionary[item] += 1`
			`#compute ngram file count`
			`for item in ngramDictionary:`
			`ngramFileCount[item] += 1`

			`filteredNgramDictionary = defaultdict(int)`
			`#Filter those ngrams which meet percent of Total files criteria`
			`filterCnt = round(int((percent * total)/ 100), 0)`
			`for item in ngramFileCount:`
			`if ngramFileCount[item] >= filterCnt:`
			`#Add to filtered dictionary the item which meets file count criteria`
			`filteredNgramDictionary[item] = ngramDictionary[item]`

			`#Filter ngram with a minimum frequency`
			`if (filterFreq):`
			`for item in ngramDictionary:`
			`if ngramDictionary[item] < filterFreq and item in filteredNgramDictionary:`
			`#Remove the item which below the frequency threshold`
			`filteredNgramDictionary.pop(item)`

			`#print(f"Total ngrams:{len(ngramDictionary.items())} => filtered: {len(filteredNgramDictionary.items())}\n")`
			`return [ngramDictionary, filteredNgramDictionary]`

			`#--------------------------------------------------------------------------------------------------`
			`# Calculate a normalization factor for frequency values of class1 and class2`
			`# For class which are high in frequency due their sample size, a normalization may required to be`
			`# factored for correctly resizgin the frequencies of the small class set.`
			`# input list of frequencies of class1 and class 2`
			`def normalization_factor(class1, class2):`
			`mean1 = statistics.mean(class1)`
			`mean2 = statistics.mean(class2)`
			`return mean1/mean2`

			`#--------------------------------------------------------------------------------------------------`
			`# Write the data into the given csv file handle`
			`def WriteCSV (file, csvFields, dataDictionary):`
			`writer = csv.DictWriter(file, fieldnames=csvFields)`
			`writer.writeheader()`
			`writer.writerows(dataDictionary)`

			`#--------------------------------------------------------------------------------------------------`
			`# Execution starts here`
			`# Add command line arguments`
			`# CSV header: class, sub-class, size, corpus`
			`parser = argparse.ArgumentParser(description="ngram analysis on a given corpus csv file.")`
			`parser.add_argument('malware_csvfile', help='path to the malware corpus csv file')`
			`parser.add_argument('benign_csvfile', help='path to the benign corpus csv file')`
			`parser.add_argument('ngram', help='ngram to compute, higher value will be compute intensive')`

			`# Execute the parse_args() method`


			`# Get user arguments`
			`malware_csvfile = os.path.join('./out/output_malware.csv')`
			`benign_csvfile = os.path.join('./out/output_benign.csv')`
			`maxgrams = 3`

			`# Error check and exit if not a file`
			`if not (os.path.isfile(malware_csvfile) and os.path.isfile(benign_csvfile)):`
			`print (f"Path should be csv file!")`
			`exit(1)`

			`# Read the csv file using pandas into data frame`
			`try:`
			`malwareDF = pd.read_csv(malware_csvfile, encoding = "utf8")`
			`benignDF = pd.read_csv(benign_csvfile, encoding="utf8")`
			`except Exception as error:`
			`print(error)`

			`#Build a frequency list for ngrams`
			`filePercentFilter = 80 ## select ngrams present in x% of files`
			`frequencyFilter = 20 ## select ngrams with frequency greater than this value`

			`malwareNgram = defaultdict(int) ## full list of ngrams in malware corpus`
			`benignNgram = defaultdict(int) ## full list of ngrams in benign corpus`
			`filteredMalwareNgram = defaultdict(int) ## filtered list of ngrams from malware corpus`
			`filteredBenignNgram = defaultdict(int) ## filtered list of ngrams from benign corpus`

			`## common list ngrams from both malware and benign corpus with relative frequency (benignFreq - malwareFreq)`
			`filteredMergedNgram = defaultdict(int)`


			`#run for only the maxgram provided, change lower value to 0 to run for all values [1..N]`
			`for idx in range(maxgrams-1, maxgrams):`
			`print(f"Computing {idx+1}gram on files ...")`
			`malwareNgram.clear()`
			`filteredMalwareNgram.clear()`
			`benignNgram.clear()`
			`filteredBenignNgram.clear()`
			`filteredMergedNgram.clear()`

			`#opcodes decoded from pe file in sequence is stored as corpus in the csv`
			`[malwareNgram, filteredMalwareNgram] = filter_N_grams(malwareDF['corpus'].values, idx+1,`
			`filePercentFilter, frequencyFilter)`

			`[benignNgram, filteredBenignNgram] = filter_N_grams(benignDF['corpus'].values, idx+1,`
			`filePercentFilter, frequencyFilter)`

			`#creates a sorted list of ngram tuples with their frequency for 1 .. maxgram`
			`print(f"Malware: {idx+1}gramCnt={len(malwareNgram.items())}, filterenCnt={len(filteredMalwareNgram.items())}")`
			`print(f"Benign: {idx+1}gramCnt={len(benignNgram.items())}, filterenCnt={len(filteredBenignNgram.items())}")`

			`## Make a intersection of filtered list between malware and benign ngrams`
			`mergedList = list(set().union(filteredMalwareNgram.keys(), filteredBenignNgram.keys()))`

			`## Now find the relative frequency b/w benign and malware files. = benign - malware`
			`## write this for cases where ngrams only present in one of the clases malware or benign`
			`## for reusability in case a union of classes is taken.`
			`for item in mergedList:`
			`key = item #get the ngram only`
			`if key in filteredBenignNgram:`
			`if key in filteredMalwareNgram:`
			`filteredMergedNgram[key] = filteredBenignNgram[key] - filteredMalwareNgram[key]`
			`elif item in malwareNgram:`
			`filteredMergedNgram[key] = filteredBenignNgram[key] - malwareNgram[key]`
			`else:`
			`filteredMergedNgram[key] = filteredBenignNgram[key]`
			`elif key in filteredMalwareNgram:`
			`if key in benignNgram:`
			`filteredMergedNgram[key] = benignNgram[key] - filteredMalwareNgram[key]`
			`else:`
			`filteredMergedNgram[key] = filteredMalwareNgram[key]`

			`print(f"Merged: {idx+1}gramCnt={len(filteredMergedNgram.keys())}")`
			`## get a sorted list of merged ngrams with relative frequencies`
			`sortedMergedNgramList = sorted(filteredMergedNgram.items(), key=lambda x: x[1])`

			`#Plot a scatter graph -`
			`# y values as relative frequency benign-malware`
			`# x values as max frequency of a ngram max(malware, benign)`
			`# color labels as 'a' + frequency % 26`
			`# size as frequency/max * 100`
			`# hover name is ngram name`
			`titlestr = str(idx+1) + "gram: Total samples(" + str(len(sortedMergedNgramList)) + ")"`
			`htmlfile = str (idx+1) +"gram.html"`
			`hovername = [item[0] for item in sortedMergedNgramList]`
			`yval = [item[1] for item in sortedMergedNgramList]`
			`xval = []`
			`for key in hovername:`
			`xval.append(max(filteredMalwareNgram[key], filteredBenignNgram[key]))`
			`colors = [chr(ord('a')+ (value %26)) for value in xval]`
			`maxval = max(xval)`
			`sizeval = [(int((val/maxval)*100)+1) for val in xval]`

			`fig = px.scatter(title=titlestr, y=yval, x=xval, color=colors,`
			`size=sizeval, hover_name=hovername, log_x=True,`
			`labels = {`
			`"x": "Absolute Frequency",`
			`"y": "Relative Frequency"})`
			`fig.show()`
			`fig.write_html(htmlfile)`

			`#write the final ngrams into a file for feature selection`
			`ngramDictList = []`
			`for item in sortedMergedNgramList:`
			`dictItem = {}`
			`key = item[0]`
			`dictItem['ngram'] = key`
			`dictItem['count'] = max(filteredMalwareNgram[key], filteredBenignNgram[key])`
			`ngramDictList.append(dictItem)`

			`csvfields = ['ngram', 'count']`
			`csvname = str(idx+1) + "gram.csv"`
			`try:`
			`csvfile = open(csvname, 'w')`
			`except Exception as err:`
			`print(f"Error: writing csvfile {err}")`
			`WriteCSV(csvfile, csvfields, ngramDictList)`
			`csvfile.close()`