diff --git a/classification_evolvement.py b/classification_evolvement.py new file mode 100644 index 0000000..85a7f17 --- /dev/null +++ b/classification_evolvement.py @@ -0,0 +1,394 @@ +''' +Use the model pool initialized with 2011 apps to detect malware from apps developed in 2012, 2013, 2014, 2015, 2016 +Model pool and feature set (i.e., feature_set.pkl) are evolved during detection. + +''' + +import numpy as np +import scipy +from scipy.stats import logistic +from scipy.special import expit +from numpy import dot +import sklearn +from sklearn.datasets import load_svmlight_file +import os +import sys +import string +from decimal import * +import collections +from classifiers import * +import time +import random +import pickle as pkl +import argparse +import shutil + +class app(object): + def __init__(self, a, y, pl): + self.a = a + self.y = y + self.pl = pl + +def extract_benign(filedir): + + app_feature = pkl.load(open(filedir + '.feature','rb')) + + result = [] + result.append('-1 ') + new = [] + for i in range(len(features)): + if features[i] in app_feature: + result.append(str(i+1) + ':1 ') + + for item in app_feature: + if item not in features: # this is a new feature, store new features in advance to save time + p = 1 + # append the new feature to the data + # the model won't process this new feature unless update + # the model will only process the first |len(features)| features + result.append(str(len(features) + p) + ':1 ') + new.append(item) + p += 1 + + return result, new + + + +def extract_malicious(filedir): + + app_feature = pkl.load(open(filedir + '.feature','rb')) + + result = [] + result.append('1 ') + new = [] + + for i in range(len(features)): + if features[i] in app_feature: + result.append(str(i+1) + ':1 ') + + for item in app_feature: + if item not in features: # this is a new feature + p = 1 + # append the new feature to the data + # the model won't process this new feature unless update + # the model will only process the first |len(features)| features + # if this app is a drifting app, the new identified feature will be added into feature_set.pkl + result.append(str(len(features) + p) + ':1 ') + new.append(item) + p += 1 + + return result, new + + +def evaluation(Y_test, instances): + n = p = tp = fn = tn = fp = right = 0 + print 'evaluating predictions' + + for e in xrange(len(Y_test)): + + if Y_test[e] != 1 and instances[e].pl != 1: # true label, prediction label + n += 1 + tn += 1 + if Y_test[e] != 1 and instances[e].pl == 1: + n += 1 + fp +=1 + if Y_test[e] == 1 and instances[e].pl == 1: + p += 1 + tp += 1 + if Y_test[e] == 1 and instances[e].pl != 1: + p += 1 + fn += 1 + if Y_test[e] == instances[e].pl: + right += 1 + + print type(Y_test), len(Y_test) + print 'all', n+p, 'right', right ,'n', n , 'p:', p, 'tn', tn, 'tp',tp, 'fn',fn, 'fp',fp + accu = (Decimal(tp) + Decimal(tn))*Decimal(100) / (Decimal(n) + Decimal(p)) + tpr = Decimal(tp)*Decimal(100)/Decimal(p) + fpr = Decimal(fp)*Decimal(100)/Decimal(n) + f1 = Decimal(200)*Decimal(tp)/(Decimal(2)*Decimal(tp) + Decimal(fp) + Decimal(fn)) + precision = Decimal(tp)*Decimal(100)/(Decimal(tp) + Decimal(fp)) + print 'model pool f measure: ', float(format(f1, '.2f')), 'precision: ', float(format(precision, '.2f')), 'recall: ', float(format(tpr, '.2f')) + + return float(format(accu, '.2f')), float(format(f1, '.2f')), float(format(precision, '.2f')), float(format(tpr, '.2f')), float(format(fpr, '.2f')) + + +def metric_calculation(i, j, buffer_size): + larger = 0 + if len(app_buffer) <=buffer_size: + app_temp = [item[j] for item in app_buffer] + positive = sum(app_tt > 0 for app_tt in app_temp) + negative = sum(app_tt <= 0 for app_tt in app_temp) + if confidences[i][j] > 0: # prediction label = 1 = malicious + larger = sum(confidences[i][j] >= app_t and app_t > 0 for app_t in app_temp) + p_ratio = float(Decimal(larger)/Decimal(positive)) + + else: # <= 0 = benign + larger = sum(confidences[i][j] <= app_t and app_t <= 0 for app_t in app_temp) + p_ratio = float(Decimal(larger)/Decimal(negative)) + + else: + app_temp = [item[j] for item in app_buffer[len(app_buffer)-buffer_size:]] + positive = sum(app_tt > 0 for app_tt in app_temp) + negative = sum(app_tt <= 0 for app_tt in app_temp) + if confidences[i][j] > 0: # prediction label = 1 = malicious + larger = sum(confidences[i][j] >= app_t and app_t > 0 for app_t in app_temp) + p_ratio = float(Decimal(larger)/Decimal(positive)) + + else: + larger = sum(confidences[i][j] <= app_t and app_t <= 0 for app_t in app_temp) + p_ratio = float(Decimal(larger)/Decimal(negative)) + return p_ratio + + +def all_model_label(i, age_threshold_low, age_threshold_up): + young = aged = a_marker = y_marker = 0 + for j in xrange(len(clfs)): + if age_threshold_low <= p_values[i][j] <= age_threshold_up: # not an aged model, can vote + young += confidences[i][j] + y_marker += 1 # number of young model + + else: # this is an aged model, need to be updated + aged += confidences[i][j] + aged_model.append(j) # record aged model index + a_marker += 1 # num of aged model for this drifting app + + return young, aged, a_marker, y_marker + + +def generate_pseudo_label(aged_marker, young_marker, aged_value, young_value): + if young_marker == 0: # young models are not available; weighted voting using aged model + if aged_value > 0: + temp = app(aged_marker, young_marker, 1.) + else: + temp = app(aged_marker, young_marker, -1.) + fail += 1 + else: # young models are available; weighted voting using young model + if young_value > 0: + temp = app(aged_marker, young_marker, 1.) + else: + temp = app(aged_marker, aged_marker, -1.) + instances.append(temp) + + +def save_model(current_year, checkpoint_dir): + for m in xrange(len(clfs)): + print m, clfs[m] + clfs[m].save( checkpoint_dir + str(current_year) + '_' + str(m) + '.model') + + +def main(): + + # set argument for past year and current year + parser = argparse.ArgumentParser() + parser.add_argument('--past', type=int, help='past year') + parser.add_argument('--current', type=int, help='current year') + parser.add_argument('--starting', type=int, help='starting year') # initialization year = 2011 + parser.add_argument('--low', type=float, help='low threshold value') + parser.add_argument('--high', type=float, help='high threshold value') + parser.add_argument('--buffer', type=int, help = 'buffer size value') + + args = parser.parse_args() + + buffer_size = args.buffer + age_threshold_low = args.low + age_threshold_up = args.high + + + global features + features = pkl.load(open('feature_set.pkl','rb')) + + whole_directory = './'+ str(args.starting) + 'train/' + current_directory = str(age_threshold_low) + '_' + str(age_threshold_up) + '_' + str(buffer_size) + '/' + checkpoint_dir = whole_directory + current_directory + if not os.path.exists(checkpoint_dir): + os.makedirs(checkpoint_dir) + + + global clfs + + clfs = [PA1(), OGD(), AROW(), RDA(), ADA_FOBOS()] + print 'model pool size: ', len(clfs) + + ori_train_acc, ori_test_acc, weights, pool_acc, pool_fscore, pool_precision, pool_tpr, pool_fnr, pool_fpr, pool_difference = ([] for list_number in range(10)) + + print 'Loading trained model from ', args.past + + if args.starting == args.past: # copy the initial detection model into checkpoint_dir + for i in xrange(len(clfs)): + shutil.copy2( whole_directory + str(args.past) + '_' + str(i) + '.model' , checkpoint_dir ) + + + for i in xrange(len(clfs)): # for each model in the model pool + + clfs[i].load( checkpoint_dir + str(args.past) + '_' + str(i) + '.model') + # get original model weight + w = clfs[i].coef_[1:] + + weight = [] # [i][j]: i = model index, j = feature index + for w_num in xrange(len(w)): + weight.append(w[w_num]) + weights.append(weight) + + + print 'original weight size' + for c in xrange(len(weights)): + print c, len(weights[c]) + + print 'App buffer generation' + global app_buffer + app_buffer = [] + + if '2011' in str(args.past): # buffer is not exist + print 'App buffer not exists' + print 'App buffer initialization' + + print 'Loading data from ', args.past, ' to initialize app buffer ...' # load the 2011 data to initialized app buffer + X_train,Y_train=load_svmlight_file( str(args.past) + '.libsvm') + train_size, _ = X_train.shape + + random_app_index = np.random.randint(train_size, size = buffer_size) + X_train_temp = X_train[random_app_index, :] + + for i in xrange(buffer_size): + app_buffer_temp = [] + for j in xrange(len(clfs)): + app_buffer_temp.append(clfs[j].decision_function(X_train_temp[i])[0]) + app_buffer.append(app_buffer_temp) + + else: # load buffer from str(args.past).buffer + print 'App buffer exists' + app_buffer = pkl.load(open( checkpoint_dir + str(args.past) + '_buffer.pkl', 'rb')) + print 'Load app buffer from ', args.past, '_buffer.pkl' + + print 'Start evolving' + global confidences, new_confidences, p_values, instances, model_credits, model_confidences + confidences, new_confidences, p_values, instances, model_credits, model_confidences = ([] for list_number in range(6)) + all_fail = 0 # a special case, all model are aged + num_of_update = num_of_update_model = 0 + wrong_update = 0 + wrong_update_benign = wrong_update_malicious = right_update_benign = right_update_malicious = 0 + + Y_test = [] # save ground truth of test data ; for final evaluation only + + names = ['---list of test app names -----'] # names of apps developed in the current_year, e.g., names of apps developed in 2012 + for i in xrange(len(names)): + + # generate test data + + app_name = names[i] # for each test app + # according to the ground truth to get the true label + # the true label is for evaluation only, won't be processed by the model + data = [] + if 'malicious' in app_name: + + d, new_feature = extract_malicious(app_name) + data.append(d) + else: + d, new_feature = extract_benign(app_name) + data.append(d) + + # skip if do not need to save test data + save_data = open(app_name + '.libsvm', 'w') + for item in data: + save_data.writelines(item) + save_data.writelines('\n') + data_file.close() + + + X_test, y_t=load_svmlight_file(app_name + '.libsvm') + X_testt,y_testt=load_svmlight_file(app_name + '.libsvm') + Y_test.append(y_t) + + print 'X_test data shape', type(X_test), X_test.shape + xtest_dense = scipy.sparse.csr_matrix(X_testt).todense() + print 'X_test', xtest_dense.shape + + + # calculate JI value + + pre, conf, new_conf, app_b, p_value = ([] for list_number in range(5)) + + for j in xrange(len(clfs)): + xtest_current = xtest_dense[ ,:len(weights[j])] + score = xtest_current.dot(weights[j]) + conf.append(score[0,0]) + app_b.append(score[0,0]) + new_conf.append(abs(score[0,0])) + + confidences.append(conf) + new_confidences.append(new_conf) + app_buffer[random.randint(0, buffer_size-1)] = app_b # randomly replace a processed app with the new app + + + for j in xrange(len(clfs)): + pv = metric_calculation(i, j, buffer_size) + p_value.append(pv) + p_values.append(p_value) + + + global aged_model + aged_model = [] # store the index of aged model for current app i + young_value = aged_value = aged_marker = young_marker = 0 + young_value, aged_value, aged_marker, young_marker = all_model_label(i, age_threshold_low, age_threshold_up) + + # generate pseudo label + generate_pseudo_label(aged_marker, young_marker, aged_value, young_value) + + # drifting app is identified and young model exists + if (aged_marker != 0) and (young_marker >= 1): + + update_label = np.array([instances[i].pl]) # update label = pseudo label of the drifting app + + # update aged models + for model_index in aged_model: # update clfs[a] with X_test, update_label; a is the aged model index + # update with drifting app and corresponding pseudo label + train_accuracy,data,err,fit_time=clfs[model_index].fit(X_test,update_label, False) + w = clfs[model_index].coef_[1:] + updated_w = [] + for w_num in xrange(len(w)): + updated_w.append(w[w_num]) + weights[model_index] = updated_w # update weight matrix in the weight matrix list for the next new app + + # updat feature set + for new_identified_feature in new_feature: + features.append(new_identified_feature) + + + a, f, preci, tprr, fprr = evaluation(Y_test, instances) + pool_acc.append(a) + pool_fscore.append(f) + pool_precision.append(preci) + pool_tpr.append(tprr) + pool_fnr.append(100-tprr) + pool_fpr.append(fprr) + + + print buffer_size, len(app_buffer) + print 'pool accuracy', pool_acc + print 'pool fscore', pool_fscore + print 'pool precision', pool_precision + print 'pool tpr', pool_tpr + print 'pool fnr', pool_fnr + print 'pool fpr', pool_fpr + + print 'evolved weight length' + for c in xrange(len(weights)): + print c, len(weights[c]) + + # save evolved model for each year + print 'Save model evolved in Year ', args.current, 'into directory /', checkpoint_dir + current_year = args.current + save_model(current_year, checkpoint_dir) + + + # save feature set + with open('feature_set.pkl','wb') as feature_result: + pkl.dump(features, feature_result) + + print 'Save app buffer evolved in Year', args.current + pkl.dump(app_buffer, open( checkpoint_dir + str(args.current) + '_buffer.pkl', 'wb')) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/feature_extraction.py b/feature_extraction.py new file mode 100644 index 0000000..03420b6 --- /dev/null +++ b/feature_extraction.py @@ -0,0 +1,70 @@ +''' +Extract detection feature for each app according to included Android API + +input: smali files of an app stored under /app_name/ +output: detection features for the app stored in app_name.feature + +''' +import os +import sys +import string +import pickle as pkl +import argparse +import glob +import operator + +def extract_feature(filedir): + feature = [] + for dirpath, dirnames, filenames in os.walk(filedir): + for filename in [f for f in filenames if f.endswith ('.smali')]: + fn = os.path.join(dirpath, filename) # each smali file + lines = open(fn,'r').readlines() + lines = [line.strip() for line in lines] + + for line in lines: + # get all class names in invoke + try: + start = line.index(', ') + len(', ') + end = line.index(';', start) + classes = line[start:end] + except ValueError: + classes = '' + + # get invoking method name + try: + start = line.index(';->') + len(';->') + end = line.index('(', start) + methods = line[start:end] + except ValueError: + methods = '' + + objects = classes.split('/') + a = len(objects) + current_class = classes[:-(len(objects[a-1])+1)] + + if current_class in packages: # android api + + fe = classes + ':' + methods + feature.append(fe) + + with open(filedir + '.feature', 'wb') as result: + pkl.dump(feature, result) + + +def main(): + + family = ['android','google','java','javax', 'xml','apache', 'junit','json', 'dom'] + # correspond to the android.*, com.google.*, java.*, javax.*, org.xml.*, org.apache.*, junit.*, org.json, and org.w3c.dom.* packages + + global packages + packages = open('android_package.name','r').readlines() + packages = [package.strip() for package in packages] # packages correspond to family + print 'official package number:', len(packages) + + names = ['--list of app names ----'] + for app_name in names: + extract_feature(app_name) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/feature_set_initialization.py b/feature_set_initialization.py new file mode 100644 index 0000000..40a7c67 --- /dev/null +++ b/feature_set_initialization.py @@ -0,0 +1,22 @@ +import pickle as pkl +import os +import sys + +def main(): + + feature = [] + + names = ['--list of app names developed in 2011 ----'] + for app_name in names: + app_feature = pkl.load(open(app_name + '.feature', 'rb')) + for item in app_feature: + if item not in feature: + feature.append(item) + + with open('feature_set.pkl','wb') as result: + pkl.dump(feature, result) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model_pool_construction.py b/model_pool_construction.py new file mode 100644 index 0000000..f223e21 --- /dev/null +++ b/model_pool_construction.py @@ -0,0 +1,57 @@ +''' +Construct model pool according to initialization dataset, e.g., apps developed in 2011 + +''' +import numpy as np +import scipy +from scipy.stats import logistic +from scipy.special import expit +from numpy import dot +import sklearn +from sklearn.datasets import load_svmlight_file +import os +import sys +import string +from decimal import * +import collections +from classifiers import * +import time +import random +import argparse + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--starting', type=int, help='initialization dataset') # to use = args.initialization + args = parser.parse_args() + + starting_year = args.starting + + X_train,Y_train=load_svmlight_file(str(starting_year) + '.libsvm') + print 'X_train data shape' , type(X_train), X_train.shape + + global clfs + + clfs = [PA1(), OGD(), AROW(), RDA(), ADA_FOBOS()] + + print 'model pool size: ', len(clfs) # number of models in the model pool + + ori_train_acc = [] + + directory = './' + str(starting_year) + 'train/' + if not os.path.exists(directory): + os.makedirs(directory) + + # initialization process of all models + print 'All model initialization' + for i in xrange(len(clfs)): # i = every model in model pool + print clfs[i] + print 'training' + train_accuracy,data,err,fit_time=clfs[i].fit(X_train,Y_train, False) + ori_train_acc.append(train_accuracy) + clfs[i].save('./' + str(starting_year) + 'train/' + str(starting_year) + '_' + str(i) + '.model') + + print 'original model accuracy', ori_train_acc + +if __name__ == "__main__": + main() diff --git a/vector_generation.py b/vector_generation.py new file mode 100644 index 0000000..4cd575c --- /dev/null +++ b/vector_generation.py @@ -0,0 +1,82 @@ +#!/usr/bin/python +#coding:utf-8 +''' +generate 2011.libsvm (i.e., the initialization dataset) from *.feature developed in 2011 + +label: 1 = malicious, -1 = benign +''' + +import sys +import os +import string +import glob +import re +import string +import pickle as pkl +import argparse + + +def extract_benign(filedir): + + app_feature = pkl.load(open(filedir + '.feature','rb')) + + result = [] + result.append('-1 ') + + for i in range(len(features)): + if features[i] in app_feature: + result.append(str(i+1) + ':1 ') + + data.append(result) + + + +def extract_malicious(filedir): + + app_feature = pkl.load(open(filedir + '.feature','rb')) + + result = [] + result.append('1 ') + + for i in range(len(features)): + if features[i] in app_feature: + result.append(str(i+1) + ':1 ') + + data.append(result) + + +def main(): + + global features + features = [] + features = pkl.load(open('feature_set.pkl','rb')) + features = [feature.strip() for feature in features] + print 'feature size:', len(features) + print type(features) + + + global data + data = [] + + # generate initialization dataset + + benign_names = ['--list of benign apps developed in 2011 ---'] + for benign_app in benign_names: + extract_benign(benign_app, marker) + + malicious_names = ['--list of malicious apps developed in 2011 --'] + for malicious_app in malicious_names: + extract_malicious(malicious_app, marker) + + + data_file = open('2011.libsvm', 'w') # apps developed in 2011 is the initialization dataset + + for item in data: + data_file.writelines(item) + data_file.writelines('\n') + data_file.close() + + + +if __name__ == "__main__": + main()