Add files via upload

DroidEvolver 2019-06-13 18:31:42 +08:00 committed by GitHub
parent 42192eb5a4
commit b644913fbf
5 changed files with 625 additions and 0 deletions

@@ -0,0 +1,394 @@
'''
Use the model pool initialized with 2011 apps to detect malware in apps developed in 2012, 2013, 2014, 2015 and 2016.
The model pool and the feature set (i.e., feature_set.pkl) are evolved during detection.
'''
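# Example invocation (a sketch: the script name is a placeholder and the year/threshold/buffer
# values are illustrative assumptions; only the flag names come from the argparse setup in main()):
#   python <this_script>.py --starting 2011 --past 2011 --current 2012 --low 0.3 --high 0.7 --buffer 500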
import numpy as np
import scipy
from scipy.stats import logistic
from scipy.special import expit
from numpy import dot
import sklearn
from sklearn.datasets import load_svmlight_file
import os
import sys
import string
from decimal import *
import collections
from classifiers import *
import time
import random
import pickle as pkl
import argparse
import shutil
class app(object):
    def __init__(self, a, y, pl):
        self.a = a
        self.y = y
        self.pl = pl
def extract_benign(filedir):
    app_feature = pkl.load(open(filedir + '.feature', 'rb'))
    result = []
    result.append('-1 ')
    new = []
    for i in range(len(features)):
        if features[i] in app_feature:
            result.append(str(i + 1) + ':1 ')
    p = 1  # index offset for new features, so each new feature gets a distinct index
    for item in app_feature:
        if item not in features:  # this is a new feature, store new features in advance to save time
            # append the new feature to the data;
            # the model won't process this new feature unless it is updated,
            # since the model only processes the first len(features) features
            result.append(str(len(features) + p) + ':1 ')
            new.append(item)
            p += 1
    return result, new
def extract_malicious(filedir):
    app_feature = pkl.load(open(filedir + '.feature', 'rb'))
    result = []
    result.append('1 ')
    new = []
    for i in range(len(features)):
        if features[i] in app_feature:
            result.append(str(i + 1) + ':1 ')
    p = 1  # index offset for new features, so each new feature gets a distinct index
    for item in app_feature:
        if item not in features:  # this is a new feature
            # append the new feature to the data;
            # the model won't process this new feature unless it is updated,
            # since the model only processes the first len(features) features;
            # if this app is a drifting app, the newly identified features are added into feature_set.pkl
            result.append(str(len(features) + p) + ':1 ')
            new.append(item)
            p += 1
    return result, new
def evaluation(Y_test, instances):
    n = p = tp = fn = tn = fp = right = 0
    print 'evaluating predictions'
    for e in xrange(len(Y_test)):
        if Y_test[e] != 1 and instances[e].pl != 1:  # true label, prediction label
            n += 1
            tn += 1
        if Y_test[e] != 1 and instances[e].pl == 1:
            n += 1
            fp += 1
        if Y_test[e] == 1 and instances[e].pl == 1:
            p += 1
            tp += 1
        if Y_test[e] == 1 and instances[e].pl != 1:
            p += 1
            fn += 1
        if Y_test[e] == instances[e].pl:
            right += 1
    print type(Y_test), len(Y_test)
    print 'all', n + p, 'right', right, 'n', n, 'p:', p, 'tn', tn, 'tp', tp, 'fn', fn, 'fp', fp
    accu = (Decimal(tp) + Decimal(tn)) * Decimal(100) / (Decimal(n) + Decimal(p))
    tpr = Decimal(tp) * Decimal(100) / Decimal(p)
    fpr = Decimal(fp) * Decimal(100) / Decimal(n)
    f1 = Decimal(200) * Decimal(tp) / (Decimal(2) * Decimal(tp) + Decimal(fp) + Decimal(fn))
    precision = Decimal(tp) * Decimal(100) / (Decimal(tp) + Decimal(fp))
    print 'model pool f measure: ', float(format(f1, '.2f')), 'precision: ', float(format(precision, '.2f')), 'recall: ', float(format(tpr, '.2f'))
    return float(format(accu, '.2f')), float(format(f1, '.2f')), float(format(precision, '.2f')), float(format(tpr, '.2f')), float(format(fpr, '.2f'))
def metric_calculation(i, j, buffer_size):
    # use at most the most recent buffer_size apps in the buffer
    if len(app_buffer) <= buffer_size:
        app_temp = [item[j] for item in app_buffer]
    else:
        app_temp = [item[j] for item in app_buffer[len(app_buffer) - buffer_size:]]
    positive = sum(app_tt > 0 for app_tt in app_temp)
    negative = sum(app_tt <= 0 for app_tt in app_temp)
    if confidences[i][j] > 0:  # prediction label = 1 = malicious
        larger = sum(confidences[i][j] >= app_t and app_t > 0 for app_t in app_temp)
        p_ratio = float(Decimal(larger) / Decimal(positive))
    else:  # <= 0 = benign
        larger = sum(confidences[i][j] <= app_t and app_t <= 0 for app_t in app_temp)
        p_ratio = float(Decimal(larger) / Decimal(negative))
    return p_ratio
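# Illustrative example with made-up numbers: if the buffered scores of model j are
# [0.2, 0.9, -0.4, 1.3] and the new app's score confidences[i][j] is 0.9, then
# positive = 3 and larger = 2 (0.2 and 0.9 are positive and <= 0.9), so p_ratio = 2/3.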
def all_model_label(i, age_threshold_low, age_threshold_up):
    young = aged = a_marker = y_marker = 0
    for j in xrange(len(clfs)):
        if age_threshold_low <= p_values[i][j] <= age_threshold_up:  # not an aged model, can vote
            young += confidences[i][j]
            y_marker += 1  # number of young models
        else:  # this is an aged model, needs to be updated
            aged += confidences[i][j]
            aged_model.append(j)  # record aged model index
            a_marker += 1  # number of aged models for this drifting app
    return young, aged, a_marker, y_marker
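# For instance (illustrative thresholds and p-values): with age_threshold_low = 0.3,
# age_threshold_up = 0.7 and p_values[i] = [0.5, 0.9, 0.4, 0.2, 0.6], models 1 and 3
# fall outside [0.3, 0.7] and are marked as aged (aged_model = [1, 3]), while the
# remaining three young models contribute their confidences to the vote.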
def generate_pseudo_label(aged_marker, young_marker, aged_value, young_value):
    global all_fail
    if young_marker == 0:  # no young model is available; weighted voting using the aged models
        if aged_value > 0:
            temp = app(aged_marker, young_marker, 1.)
        else:
            temp = app(aged_marker, young_marker, -1.)
            all_fail += 1
    else:  # young models are available; weighted voting using the young models
        if young_value > 0:
            temp = app(aged_marker, young_marker, 1.)
        else:
            temp = app(aged_marker, young_marker, -1.)
    instances.append(temp)
def save_model(current_year, checkpoint_dir):
    for m in xrange(len(clfs)):
        print m, clfs[m]
        clfs[m].save(checkpoint_dir + str(current_year) + '_' + str(m) + '.model')
def main():
    # set arguments for the past year and the current year
    parser = argparse.ArgumentParser()
    parser.add_argument('--past', type=int, help='past year')
    parser.add_argument('--current', type=int, help='current year')
    parser.add_argument('--starting', type=int, help='starting year')  # initialization year = 2011
    parser.add_argument('--low', type=float, help='low threshold value')
    parser.add_argument('--high', type=float, help='high threshold value')
    parser.add_argument('--buffer', type=int, help='buffer size value')
    args = parser.parse_args()
    buffer_size = args.buffer
    age_threshold_low = args.low
    age_threshold_up = args.high
    global features
    features = pkl.load(open('feature_set.pkl', 'rb'))
    whole_directory = './' + str(args.starting) + 'train/'
    current_directory = str(age_threshold_low) + '_' + str(age_threshold_up) + '_' + str(buffer_size) + '/'
    checkpoint_dir = whole_directory + current_directory
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    global clfs
    clfs = [PA1(), OGD(), AROW(), RDA(), ADA_FOBOS()]
    print 'model pool size: ', len(clfs)
    ori_train_acc, ori_test_acc, weights, pool_acc, pool_fscore, pool_precision, pool_tpr, pool_fnr, pool_fpr, pool_difference = ([] for list_number in range(10))
    print 'Loading trained model from ', args.past
    if args.starting == args.past:  # copy the initial detection models into checkpoint_dir
        for i in xrange(len(clfs)):
            shutil.copy2(whole_directory + str(args.past) + '_' + str(i) + '.model', checkpoint_dir)
    for i in xrange(len(clfs)):  # for each model in the model pool
        clfs[i].load(checkpoint_dir + str(args.past) + '_' + str(i) + '.model')
        # get the original model weights
        w = clfs[i].coef_[1:]
        weight = []  # weights[i][j]: i = model index, j = feature index
        for w_num in xrange(len(w)):
            weight.append(w[w_num])
        weights.append(weight)
    print 'original weight size'
    for c in xrange(len(weights)):
        print c, len(weights[c])
    print 'App buffer generation'
    global app_buffer
    app_buffer = []
    if '2011' in str(args.past):  # the buffer does not exist yet
        print 'App buffer does not exist'
        print 'App buffer initialization'
        print 'Loading data from ', args.past, ' to initialize app buffer ...'  # load the 2011 data to initialize the app buffer
        X_train, Y_train = load_svmlight_file(str(args.past) + '.libsvm')
        train_size, _ = X_train.shape
        random_app_index = np.random.randint(train_size, size=buffer_size)
        X_train_temp = X_train[random_app_index, :]
        for i in xrange(buffer_size):
            app_buffer_temp = []
            for j in xrange(len(clfs)):
                app_buffer_temp.append(clfs[j].decision_function(X_train_temp[i])[0])
            app_buffer.append(app_buffer_temp)
    else:  # load the buffer from str(args.past)_buffer.pkl
        print 'App buffer exists'
        app_buffer = pkl.load(open(checkpoint_dir + str(args.past) + '_buffer.pkl', 'rb'))
        print 'Load app buffer from ', args.past, '_buffer.pkl'
    print 'Start evolving'
    global confidences, new_confidences, p_values, instances, model_credits, model_confidences, all_fail
    confidences, new_confidences, p_values, instances, model_credits, model_confidences = ([] for list_number in range(6))
    all_fail = 0  # a special case: all models are aged
    num_of_update = num_of_update_model = 0
    wrong_update = 0
    wrong_update_benign = wrong_update_malicious = right_update_benign = right_update_malicious = 0
    Y_test = []  # ground truth of the test data; for final evaluation only
    names = ['---list of test app names -----']  # names of apps developed in the current_year, e.g., names of apps developed in 2012
    for i in xrange(len(names)):
        # generate test data
        app_name = names[i]  # for each test app
        # the true label comes from the ground truth; it is used for evaluation only
        # and is never processed by the models
        data = []
        if 'malicious' in app_name:
            d, new_feature = extract_malicious(app_name)
            data.append(d)
        else:
            d, new_feature = extract_benign(app_name)
            data.append(d)
        # skip this block if the test data does not need to be saved
        save_data = open(app_name + '.libsvm', 'w')
        for item in data:
            save_data.writelines(item)
            save_data.writelines('\n')
        save_data.close()
        X_test, y_t = load_svmlight_file(app_name + '.libsvm')
        X_testt, y_testt = load_svmlight_file(app_name + '.libsvm')
        Y_test.append(y_t)
        print 'X_test data shape', type(X_test), X_test.shape
        xtest_dense = scipy.sparse.csr_matrix(X_testt).todense()
        print 'X_test', xtest_dense.shape
        # calculate JI value
        pre, conf, new_conf, app_b, p_value = ([] for list_number in range(5))
        for j in xrange(len(clfs)):
            xtest_current = xtest_dense[:, :len(weights[j])]
            score = xtest_current.dot(weights[j])
            conf.append(score[0, 0])
            app_b.append(score[0, 0])
            new_conf.append(abs(score[0, 0]))
        confidences.append(conf)
        new_confidences.append(new_conf)
        app_buffer[random.randint(0, buffer_size - 1)] = app_b  # randomly replace a processed app with the new app
        for j in xrange(len(clfs)):
            pv = metric_calculation(i, j, buffer_size)
            p_value.append(pv)
        p_values.append(p_value)
        global aged_model
        aged_model = []  # store the indexes of aged models for the current app i
        young_value = aged_value = aged_marker = young_marker = 0
        young_value, aged_value, aged_marker, young_marker = all_model_label(i, age_threshold_low, age_threshold_up)
        # generate pseudo label
        generate_pseudo_label(aged_marker, young_marker, aged_value, young_value)
        # a drifting app is identified and young models exist
        if (aged_marker != 0) and (young_marker >= 1):
            update_label = np.array([instances[i].pl])  # update label = pseudo label of the drifting app
            # update aged models
            for model_index in aged_model:  # update clfs[model_index] with X_test and update_label
                # update with the drifting app and its pseudo label
                train_accuracy, data, err, fit_time = clfs[model_index].fit(X_test, update_label, False)
                w = clfs[model_index].coef_[1:]
                updated_w = []
                for w_num in xrange(len(w)):
                    updated_w.append(w[w_num])
                weights[model_index] = updated_w  # update the weight list used for the next new app
            # update the feature set with the newly identified features of the drifting app
            for new_identified_feature in new_feature:
                features.append(new_identified_feature)
    a, f, preci, tprr, fprr = evaluation(Y_test, instances)
    pool_acc.append(a)
    pool_fscore.append(f)
    pool_precision.append(preci)
    pool_tpr.append(tprr)
    pool_fnr.append(100 - tprr)
    pool_fpr.append(fprr)
    print buffer_size, len(app_buffer)
    print 'pool accuracy', pool_acc
    print 'pool fscore', pool_fscore
    print 'pool precision', pool_precision
    print 'pool tpr', pool_tpr
    print 'pool fnr', pool_fnr
    print 'pool fpr', pool_fpr
    print 'evolved weight length'
    for c in xrange(len(weights)):
        print c, len(weights[c])
    # save the evolved models for each year
    print 'Save model evolved in Year ', args.current, 'into directory ', checkpoint_dir
    current_year = args.current
    save_model(current_year, checkpoint_dir)
    # save feature set
    with open('feature_set.pkl', 'wb') as feature_result:
        pkl.dump(features, feature_result)
    print 'Save app buffer evolved in Year', args.current
    pkl.dump(app_buffer, open(checkpoint_dir + str(args.current) + '_buffer.pkl', 'wb'))
if __name__ == "__main__":
    main()

feature_extraction.py

@@ -0,0 +1,70 @@
'''
Extract the detection features for each app according to the Android APIs it includes
input: smali files of an app stored under /app_name/
output: detection features for the app, stored in app_name.feature
'''
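# Hedged example of how a smali line is parsed (assuming android_package.name lists
# package prefixes in the same 'Landroid/...' form, so the current_class check matches):
#   invoke-virtual {v0}, Landroid/telephony/TelephonyManager;->getDeviceId()Ljava/lang/String;
# yields classes = 'Landroid/telephony/TelephonyManager' and methods = 'getDeviceId',
# so the stored feature is 'Landroid/telephony/TelephonyManager:getDeviceId'.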
import os
import sys
import string
import pickle as pkl
import argparse
import glob
import operator
def extract_feature(filedir):
    feature = []
    for dirpath, dirnames, filenames in os.walk(filedir):
        for filename in [f for f in filenames if f.endswith('.smali')]:
            fn = os.path.join(dirpath, filename)  # each smali file
            lines = open(fn, 'r').readlines()
            lines = [line.strip() for line in lines]
            for line in lines:
                # get all class names in invoke
                try:
                    start = line.index(', ') + len(', ')
                    end = line.index(';', start)
                    classes = line[start:end]
                except ValueError:
                    classes = ''
                # get invoking method name
                try:
                    start = line.index(';->') + len(';->')
                    end = line.index('(', start)
                    methods = line[start:end]
                except ValueError:
                    methods = ''
                objects = classes.split('/')
                a = len(objects)
                current_class = classes[:-(len(objects[a - 1]) + 1)]
                if current_class in packages:  # android api
                    fe = classes + ':' + methods
                    feature.append(fe)
    with open(filedir + '.feature', 'wb') as result:
        pkl.dump(feature, result)
def main():
    family = ['android', 'google', 'java', 'javax', 'xml', 'apache', 'junit', 'json', 'dom']
    # correspond to the android.*, com.google.*, java.*, javax.*, org.xml.*, org.apache.*, junit.*, org.json, and org.w3c.dom.* packages
    global packages
    packages = open('android_package.name', 'r').readlines()
    packages = [package.strip() for package in packages]  # packages correspond to family
    print 'official package number:', len(packages)
    names = ['--list of app names ----']
    for app_name in names:
        extract_feature(app_name)
if __name__ == "__main__":
    main()

@@ -0,0 +1,22 @@
import pickle as pkl
import os
import sys
def main():
    feature = []
    names = ['--list of app names developed in 2011 ----']
    for app_name in names:
        app_feature = pkl.load(open(app_name + '.feature', 'rb'))
        for item in app_feature:
            if item not in feature:
                feature.append(item)
    with open('feature_set.pkl', 'wb') as result:
        pkl.dump(feature, result)
if __name__ == "__main__":
    main()

@@ -0,0 +1,57 @@
'''
Construct the model pool from the initialization dataset, e.g., apps developed in 2011
'''
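# Example invocation (illustrative; the script name is a placeholder, only the
# --starting flag comes from the argparse setup in main()):
#   python <this_script>.py --starting 2011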
import numpy as np
import scipy
from scipy.stats import logistic
from scipy.special import expit
from numpy import dot
import sklearn
from sklearn.datasets import load_svmlight_file
import os
import sys
import string
from decimal import *
import collections
from classifiers import *
import time
import random
import argparse
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--starting', type=int, help='initialization dataset year')  # accessed as args.starting
    args = parser.parse_args()
    starting_year = args.starting
    X_train, Y_train = load_svmlight_file(str(starting_year) + '.libsvm')
    print 'X_train data shape', type(X_train), X_train.shape
    global clfs
    clfs = [PA1(), OGD(), AROW(), RDA(), ADA_FOBOS()]
    print 'model pool size: ', len(clfs)  # number of models in the model pool
    ori_train_acc = []
    directory = './' + str(starting_year) + 'train/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    # initialization process for all models
    print 'All model initialization'
    for i in xrange(len(clfs)):  # i = every model in the model pool
        print clfs[i]
        print 'training'
        train_accuracy, data, err, fit_time = clfs[i].fit(X_train, Y_train, False)
        ori_train_acc.append(train_accuracy)
        clfs[i].save('./' + str(starting_year) + 'train/' + str(starting_year) + '_' + str(i) + '.model')
    print 'original model accuracy', ori_train_acc
if __name__ == "__main__":
    main()

vector_generation.py

@@ -0,0 +1,82 @@
#!/usr/bin/python
#coding:utf-8
'''
generate 2011.libsvm (i.e., the initialization dataset) from the *.feature files of apps developed in 2011
label: 1 = malicious, -1 = benign
'''
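# Each line written to 2011.libsvm looks like (feature indices made up for illustration):
#   1 3:1 17:1 42:1
# i.e., the label (1 = malicious, -1 = benign) followed by 'index:1' tokens for every
# feature of feature_set.pkl that the app contains (indices are 1-based).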
import sys
import os
import string
import glob
import re
import string
import pickle as pkl
import argparse
def extract_benign(filedir):
    app_feature = pkl.load(open(filedir + '.feature', 'rb'))
    result = []
    result.append('-1 ')
    for i in range(len(features)):
        if features[i] in app_feature:
            result.append(str(i + 1) + ':1 ')
    data.append(result)
def extract_malicious(filedir):
    app_feature = pkl.load(open(filedir + '.feature', 'rb'))
    result = []
    result.append('1 ')
    for i in range(len(features)):
        if features[i] in app_feature:
            result.append(str(i + 1) + ':1 ')
    data.append(result)
def main():
    global features
    features = []
    features = pkl.load(open('feature_set.pkl', 'rb'))
    features = [feature.strip() for feature in features]
    print 'feature size:', len(features)
    print type(features)
    global data
    data = []
    # generate the initialization dataset
    benign_names = ['--list of benign apps developed in 2011 ---']
    for benign_app in benign_names:
        extract_benign(benign_app)
    malicious_names = ['--list of malicious apps developed in 2011 --']
    for malicious_app in malicious_names:
        extract_malicious(malicious_app)
    data_file = open('2011.libsvm', 'w')  # apps developed in 2011 form the initialization dataset
    for item in data:
        data_file.writelines(item)
        data_file.writelines('\n')
    data_file.close()
if __name__ == "__main__":
    main()