# Gencoding_plus/Genius3/search-engine/db.py
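"""Search-engine database layer for Genius function feature vectors.

Stores per-function feature vectors in a nearpy LSH engine backed by Redis,
loads features either from pickled ``*.features`` files or from a local
MongoDB collection, and evaluates retrieval accuracy between two binaries
(see queryBytwo / queryAll below).
"""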
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
from refactoring import *
import pymongo
from pymongo import MongoClient
def initDB():
    # Connect to the local MongoDB instance and open the 'iot-encoding' database.
    client = MongoClient('mongodb://localhost:27017/')
    db = client['iot-encoding']
    return db
db = initDB()
posts = db.posts
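# The 'posts' collection is expected to hold one document per function, with at
# least the fields read in batch_appendDBbyDir below. A representative document
# (field values other than firmware_name are illustrative only) might look like:
#   {"firmware_name": "ddwrt-r21676_result",
#    "binary_name": "busybox",
#    "func_name": "parse_args",
#    "fvector": [0.0, 1.0, ...]}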
class db:
    def __init__(self):
        self.feature_list = {}
        self.engine = None
    def loadHashmap(self, feature_size, result_n):
        # NOTE: feature_size and result_n are currently unused; the engine
        # dimension (192) and the nearest filter size (1000) are hardcoded below.
        # Create redis storage adapter
        redis_object = Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)
        pdb.set_trace()
        try:
            # Get hash config from redis
            config = redis_storage.load_hash_configuration('test')
            # Config exists, create hash with None parameters
            lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            lshash.apply_config(config)
        except:
            # No existing config, create the hash from scratch (0 projections
            # keeps every vector in a single bucket, so lookups are exhaustive)
            lshash = RandomBinaryProjections('test', 0)
        # Create engine for a 192-dimensional feature space with our hash.
        # This sets the dimension of lshash only the first time, not when
        # the configuration was loaded from redis. Redis storage holds the buckets.
        nearest = NearestFilter(1000)
        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        pdb.set_trace()
        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)
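    # A minimal usage sketch (illustrative; 'fvec' stands for any 192-dimensional
    # feature vector produced by the feature-extraction stage, and the binary,
    # function and firmware names are placeholders):
    #   d = db()
    #   d.loadHashmap(192, 1000)
    #   d.appendToDB("busybox", "main", fvec, firmware_name="ddwrt")
    #   neighbours = d.engine.neighbours(np.asarray(fvec))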
    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
        if fvector is None:
            return
        #ftuple = tuple([fvector])
        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name, binary_name, funcname)))
    def batch_appendDB(self, binary_name, features, firmware_name=""):
        for funcname in features:
            feature = features[funcname]
            #pdb.set_trace()
            self.appendToDB(binary_name, funcname, feature, firmware_name)
    def batch_appendDBbyDir(self, base_dir):
        # NOTE: the firmware filter is currently hardcoded; base_dir is unused here.
        cursor = posts.find({"firmware_name": "ddwrt-r21676_result"})
        i = 0
        for v in cursor:
            print i
            i += 1
            binary_name = v['binary_name']
            funcname = v['func_name']
            firmware_name = v['firmware_name']
            feature = v['fvector']
            self.appendToDB(binary_name, funcname, feature, firmware_name)
    def batch_appendDBbyDir1(self, base_dir):
        image_dir = os.path.join(base_dir, "image")
        firmware_features = {}
        bnum = 0
        fnum = 0
        i = 0
        pdb.set_trace()
        for firmware_name in os.listdir(image_dir):
            print firmware_name
            firmware_features[firmware_name] = {}
            firmware_dir = os.path.join(image_dir, firmware_name)
            for binary_name in os.listdir(firmware_dir):
                if binary_name.endswith(".features"):
                    bnum += 1
                    features_dir = os.path.join(firmware_dir, binary_name)
                    features = pickle.load(open(features_dir, "r"))
                    for funcname in features:
                        fnum += 1
                        #pdb.set_trace()
                        feature = features[funcname]
                        self.appendToDB(binary_name, funcname, feature, firmware_name)
                    del features
        print "bnum", bnum
        print "fnum", fnum
    def dump(self, base_dir):
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        pickle.dump(self.feature_list, open(db_dir, 'w'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        pickle.dump(self.engine, open(db_dir, 'w'))
    def loadDB(self, base_dir):
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        self.feature_list = pickle.load(open(db_dir, 'r'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        self.engine = pickle.load(open(db_dir, 'r'))
    def findF(self, binary_name, funcname):
        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
        return x[0]
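    # Note: findF assumes self.feature_list is a nested mapping shaped like
    #   {<key>: {binary_name: {funcname: ...}}}
    # and returns the first key whose entry contains the requested function.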
def retrieveFeaturesByDir(db_instance, n, base_dir):
    # db_instance: a db() object whose appendToDB is used to index the loaded features.
    firmware_features = {}
    i = 0
    for firmware_name in os.listdir(base_dir):
        if firmware_name.endswith(".features"):
            firmware_features[firmware_name] = {}
            firmware_dir = os.path.join(base_dir, firmware_name)
            if i > 0:
                break
            i += 1
            pdb.set_trace()
            for binary_name in os.listdir(firmware_dir):
                features_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
                features = pickle.load(open(features_dir, "r"))
                for funcname in features:
                    feature = features[funcname]
                    db_instance.appendToDB(binary_name, funcname, feature, firmware_name)
                del features
def retrieveFeatures(n, base_dir, filename, funcs):
    feature_dic = {}
    features_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
    features = pickle.load(open(features_dir, "r"))
    #featuresx = retrieveFeaturesx(filename)
    for name in features:
        #if name in funcs:
        x = features[name]
        #+ featuresx[name]
        feature_dic[name] = np.asarray(x)
    return feature_dic
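# The retrieve* helpers below all expect a pickled dict of the form
# {funcname: feature_vector} stored at a path like
#   <base_dir>/5000/<filename>_cb<n>.features
# (the "5000" subdirectory and "_cb<n>" suffix mirror how the feature-extraction
# step names its outputs in this project; adjust the paths if your layout differs).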
def retrieveVuldb(base_input_dir):
    vul_path = os.path.join(base_input_dir, "vul")
    vul_db = pickle.load(open(vul_path, "r"))
    return vul_db
def retrieveFeaturesx(filename):
    ida_input_dir = os.path.join("./data/", filename + ".features")
    featuresx = pickle.load(open(ida_input_dir, "r"))
    return featuresx
def retrieveQueries(n, base_dir, filename1, features_src):
    queries = {}
    features_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
    features = pickle.load(open(features_dir, "r"))
    #featuresx = retrieveFeaturesx(filename1)
    for name in features:
        #if name in features_src:
        x = features[name]
        #+ featuresx[name]
        queries[name] = np.asarray(x)
    return queries
def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
    queries = {}
    features_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
    features = pickle.load(open(features_dir, "r"))
    for name in features:
        #del features[name][5]
        queries[name] = np.asarray(features[name])
    return queries
def retrieveQuery(n, base_dir, filename, funcname):
    features_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
    features = pickle.load(open(features_dir, "r"))
    f = [features[v] for v in features if funcname in v][0]
    return np.asarray(f)
def parse_command():
    parser = argparse.ArgumentParser(description='Index and query function feature vectors.')
    parser.add_argument("--base_input_dir", type=str, help="directory containing the pickled .features files")
    parser.add_argument('--output_dir', type=str, help="output dir")
    parser.add_argument("--filename1", type=str, help="binary whose features are indexed (database side)")
    parser.add_argument("--filename2", type=str, help="binary whose features are used as queries")
    parser.add_argument("--size", type=int, help="the size of each graphlet")
    args = parser.parse_args()
    return args
def loadFuncs(path):
    funcs = {}
    x86_dir = os.path.join(path, "func_candid")
    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
    fp = open(x86_dir, "r")
    for line in fp:
        funcname = line.rstrip("\n")
        funcs[funcname] = 1
    fp.close()
    return funcs
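# loadFuncs expects 'func_candid' to be a plain text file with one function
# name per line, for example (names are illustrative):
#   BIO_write
#   EVP_DigestUpdate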
def dump(path, features, queries):
    fp = open(os.path.join(path, "matrix"), 'w')
    # Each row: architecture tag, function name, then 16 feature values.
    row_fmt = "%s\t%s" + "\t%.3f" * 16 + "\n"
    for name in features:
        row = ["x86", name] + list(features[name])
        fp.write(row_fmt % tuple(row))
    for name in queries:
        row = ["mips", name] + list(queries[name])
        fp.write(row_fmt % tuple(row))
    fp.close()
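# A minimal sketch of how dump() is meant to be driven (file names are
# placeholders; it assumes 16-dimensional feature vectors, which is what the
# row format above encodes):
#   features = retrieveFeatures(n, base_dir, "openssl_x86", funcs)
#   queries = retrieveQueries(n, base_dir, "openssl_mips", funcs)
#   dump(base_dir, features, queries)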
def queryBytwo(base_input_dir, filename1, filename2, n):
    db_instance = db()
    funcs = loadFuncs(base_input_dir)
    db_instance.loadHashmap(n, 50000)
    #pdb.set_trace()
    features = retrieveFeatures(n, base_input_dir, filename1, funcs)
    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
    #queries = refactoring(queries, features)
    vul_db = retrieveVuldb(base_input_dir)
    pdb.set_trace()
    #dump(base_input_dir, features, queries)
    #start = time.time()
    #db_instance.batch_appendDBbyDir(base_input_dir)
    #end = time.time()
    #total = end - start
    #print total
    db_instance.batch_appendDB(filename1, features)
    pdb.set_trace()
    ranks = []
    times = []
    # Sweep the rank threshold and report, for each value, the fraction of
    # queries whose true match is ranked within the threshold.
    for threshold in xrange(1, 210, 10):
        hit = []
        i = 0
        for name in queries:
            #print i
            i += 1
            '''
            if i == 1000:
                print (sum(times)/len(times))
                pdb.set_trace()
                print "s"
            '''
            #if name not in vul_db['openssl']:
            #    continue
            if name not in features:
                continue
            #pdb.set_trace()
            query = queries[name]
            #start = time.time()
            x = db_instance.engine.neighbours(query)
            #end = time.time()
            #total = end - start
            #times.append(total)
            #print total
            #pdb.set_trace()
            try:
                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
                ranks.append((name, rank))
                if rank <= threshold:
                    hit.append(1)
                else:
                    hit.append(0)
            except:
                #pdb.set_trace()
                hit.append(0)
        #pdb.set_trace()
        acc = sum(hit) * 1.0 / len(hit)
        print acc
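# queryBytwo prints one accuracy value per threshold in xrange(1, 210, 10): the
# fraction of query functions whose true counterpart appears within that many
# nearest neighbours.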
def queryAll(base_dir, firmware_name, filename1, n):
    threshold = 155
    db_instance = db()
    db_instance.loadHashmap(n, 50000)
    queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
    start = time.time()
    pdb.set_trace()
    db_instance.batch_appendDBbyDir(base_dir)
    end = time.time()
    dur = end - start
    print dur
    pdb.set_trace()
    hit = []
    i = 0
    times = []
    for name in queries:
        print i
        i += 1
        query = queries[name]
        start = time.clock()
        x = db_instance.engine.neighbours(query)
        end = time.clock()
        dur = end - start
        times.append(dur)
        #pdb.set_trace()
        try:
            rank = [v for v in xrange(len(x)) if name in x[v][1]]
            if len(rank) > 1:
                pdb.set_trace()
                print "stop"
            if rank[0] <= threshold:
                hit.append(1)
            else:
                hit.append(0)
        except:
            hit.append(0)
    acc = sum(hit) * 1.0 / len(hit)
    mean = np.mean(times)
    std = np.std(times)
    #pdb.set_trace()
    print acc
if __name__ == "__main__":
    args = parse_command()
    base_dir = args.base_input_dir
    filename1 = args.filename1
    filename2 = args.filename2
    n = args.size
    pdb.set_trace()
    queryBytwo(base_dir, filename1, filename2, n)
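# Example invocation (a sketch; paths and binary names are placeholders, and a
# local Redis instance plus the MongoDB connection made at import time are assumed):
#   python db.py --base_input_dir ./data --filename1 openssl_x86 --filename2 openssl_mips --size 4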