From 49bae1c6ca8fe3c57d004e884f0e270037aca3b4 Mon Sep 17 00:00:00 2001
From: qian
Date: Wed, 14 Sep 2016 21:57:36 -0400
Subject: [PATCH] update search engine part

---
 search-engine/db.py | 356 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 356 insertions(+)
 create mode 100644 search-engine/db.py

diff --git a/search-engine/db.py b/search-engine/db.py
new file mode 100644
index 0000000..bc6c864
--- /dev/null
+++ b/search-engine/db.py
@@ -0,0 +1,356 @@
+import cPickle as pickle
+from search import *
+from nearpy import Engine
+from nearpy.hashes import RandomDiscretizedProjections
+from nearpy.filters import NearestFilter, UniqueFilter
+from nearpy.distances import EuclideanDistance
+from nearpy.distances import CosineDistance
+from nearpy.hashes import RandomBinaryProjections
+from nearpy.experiments import DistanceRatioExperiment
+from redis import Redis
+from nearpy.storage import RedisStorage
+from feature import *
+import numpy as np
+import os
+import pdb
+import argparse
+import time
+from refactoring import *
+import pymongo
+from pymongo import MongoClient
+
+
+def initDB():
+    # Connect to the local MongoDB instance and return the feature database.
+    client = MongoClient('mongodb://localhost:27017/')
+    db = client['iot-encoding']
+    return db
+
+
+db = initDB()
+posts = db.posts
+
+
+class DB:
+    # Redis-backed nearpy LSH index over function feature vectors.
+
+    def __init__(self):
+        self.feature_list = {}
+        self.engine = None
+
+    def loadHashmap(self, feature_size, result_n):
+        # Create redis storage adapter
+        redis_object = Redis(host='localhost', port=6379, db=0)
+        redis_storage = RedisStorage(redis_object)
+        #pdb.set_trace()
+        try:
+            # Get hash config from redis
+            config = redis_storage.load_hash_configuration('test')
+            # Config exists: create hash with None parameters
+            lshash = RandomBinaryProjections(None, None)
+            # Apply configuration loaded from redis
+            lshash.apply_config(config)
+        except:
+            # Config does not exist: create hash from scratch.
+            # projection_count=0 puts every vector into one bucket, so a
+            # lookup degenerates to a linear scan filtered by NearestFilter.
+            lshash = RandomBinaryProjections('test', 0)
+
+        # Create the engine with our hash and redis-backed bucket storage.
+        # The dimension is hard-coded to 192 (the feature_size argument is
+        # ignored); it only takes effect the first time, not when the
+        # configuration is loaded from redis.
+        nearest = NearestFilter(1000)
+        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
+        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
+
+        # Do some stuff like indexing or querying with the engine...
+
+        # Finally store hash configuration in redis for later use
+        redis_storage.store_hash_configuration(lshash)
+
+    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
+        # Store one function's feature vector under "firmware.binary.function".
+        if fvector is None:
+            return
+        #ftuple = tuple([fvector])
+        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name, binary_name, funcname)))
+
+    def batch_appendDB(self, binary_name, features, firmware_name=""):
+        # Index every entry of a {funcname: feature vector} mapping.
+        for funcname in features:
+            feature = features[funcname]
+            #pdb.set_trace()
+            self.appendToDB(binary_name, funcname, feature, firmware_name)
+
+    def batch_appendDBbyDir(self, base_dir):
+        # Index every function record stored in MongoDB for this firmware image.
+        cursor = posts.find({"firmware_name": "ddwrt-r21676_result"})
+        i = 0
+        for v in cursor:
+            print i
+            i += 1
+            binary_name = v['binary_name']
+            funcname = v['func_name']
+            firmware_name = v['firmware_name']
+            feature = v['fvector']
+            self.appendToDB(binary_name, funcname, feature, firmware_name)
+
+    def batch_appendDBbyDir1(self, base_dir):
+        # Index every pickled ".features" file found under <base_dir>/image.
+        image_dir = os.path.join(base_dir, "image")
+        firmware_featrues = {}
+        bnum = 0
+        fnum = 0
+        #pdb.set_trace()
+        for firmware_name in os.listdir(image_dir):
+            print firmware_name
+            firmware_featrues[firmware_name] = {}
+            firmware_dir = os.path.join(image_dir, firmware_name)
+            for binary_name in os.listdir(firmware_dir):
+                if binary_name.endswith(".features"):
+                    bnum += 1
+                    featrues_dir = os.path.join(firmware_dir, binary_name)
+                    featrues = pickle.load(open(featrues_dir, "r"))
+                    for funcname in featrues:
+                        fnum += 1
+                        #pdb.set_trace()
+                        feature = featrues[funcname]
+                        self.appendToDB(binary_name, funcname, feature, firmware_name)
+                    del featrues
+        print("bnum ", bnum)
+        print("fnum ", fnum)
+
+    def dump(self, base_dir):
+        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
+        pickle.dump(self.feature_list, open(db_dir, 'w'))
+        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
+        pickle.dump(self.engine, open(db_dir, 'w'))
+
+    def loadDB(self, base_dir):
+        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
+        self.feature_list = pickle.load(open(db_dir, 'r'))
+        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
+        self.engine = pickle.load(open(db_dir, 'r'))
+
+    def findF(self, binary_name, funcname):
+        # Return the first firmware whose feature map contains this
+        # binary/function pair.
+        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
+        return x[0]
+
+
+def retrieveFeaturesByDir(db_instance, n, base_dir):
+    # Index the first firmware feature dump found under base_dir into the
+    # given DB instance.
+    firmware_featrues = {}
+    i = 0
+    for firmware_name in os.listdir(base_dir):
+        if firmware_name.endswith(".features"):
+            firmware_featrues[firmware_name] = {}
+            firmware_dir = os.path.join(base_dir, firmware_name)
+            if i > 0:
+                break
+            i += 1
+            for binary_name in os.listdir(firmware_dir):
+                featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
+                featrues = pickle.load(open(featrues_dir, "r"))
+                for funcname in featrues:
+                    feature = featrues[funcname]
+                    db_instance.appendToDB(binary_name, funcname, feature, firmware_name)
+                del featrues
+
+
+def retrieveFeatures(n, base_dir, filename, funcs):
+    # Load one binary's pickled feature vectors as numpy arrays keyed by
+    # function name.
+    feature_dic = {}
+    featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
+    featrues = pickle.load(open(featrues_dir, "r"))
+    #featuresx = retrieveFeaturesx(filename)
+    for name in featrues:
+        #if name in funcs:
+        x = featrues[name]
+        #+ featuresx[name]
+        feature_dic[name] = np.asarray(x)
+    return feature_dic
+
+
+def retrieveVuldb(base_input_dir):
+    vul_path = os.path.join(base_input_dir, "vul")
+    vul_db = pickle.load(open(vul_path, "r"))
+    return vul_db
+
+
+def retrieveFeaturesx(filename):
+    ida_input_dir = os.path.join("./data/", filename + ".features")
+    featuresx = pickle.load(open(ida_input_dir, "r"))
+    return featuresx
+
+
+def retrieveQueries(n, base_dir, filename1, featrues_src):
+    # Load the query-side feature vectors for one binary.
+    queries = {}
+    featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
+    featrues = pickle.load(open(featrues_dir, "r"))
+    #featuresx = retrieveFeaturesx(filename1)
+    for name in featrues:
+        #if name in featrues_src:
+        x = featrues[name]
+        #+ featuresx[name]
+        queries[name] = np.asarray(x)
+    return queries
+
+
+def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
+    queries = {}
+    featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
+    featrues = pickle.load(open(featrues_dir, "r"))
+    for name in featrues:
+        #del featrues[name][5]
+        queries[name] = np.asarray(featrues[name])
+    return queries
+
+
+def retrieveQuery(n, base_dir, filename, funcname):
+    featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
+    featrues = pickle.load(open(featrues_dir, "r"))
+    f = [featrues[v] for v in featrues if funcname in v][0]
+    return np.asarray(f)
+
+
+def parse_command():
+    parser = argparse.ArgumentParser(description='Index binary function features and query for nearest neighbours.')
+    parser.add_argument("--base_input_dir", type=str, help="directory holding the pickled feature files")
+    parser.add_argument('--output_dir', type=str, help="output dir")
+    parser.add_argument("--filename1", type=str, help="binary whose features are indexed (reference side)")
+    parser.add_argument("--filename2", type=str, help="binary whose features are used as queries")
+    parser.add_argument("--size", type=int, help="the size of each graphlet")
+    args = parser.parse_args()
+    return args
+
+
+def loadFuncs(path):
+    # Read the candidate function names, one per line, from <path>/func_candid.
+    funcs = {}
+    x86_dir = os.path.join(path, "func_candid")
+    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
+    fp = open(x86_dir, "r")
+    for line in fp:
+        items = line.split("\n")
+        funcname = items[0]
+        funcs[funcname] = 1
+    return funcs
+
+
+def dump(path, featrues, queries):
+    # Write the x86 and mips feature vectors side by side as a TSV matrix.
+    fp = open(path + "/" + "matrix", 'w')
+    for name in featrues:
+        row = []
+        row.append("x86")
+        row.append(name)
+        row += featrues[name]
+        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
+    for name in queries:
+        row = []
+        row.append("mips")
+        row.append(name)
+        row += queries[name]
+        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
+    fp.close()
+
+
+def queryBytwo(base_input_dir, filename1, filename2, n):
+    # Index filename1's features, query with filename2's features, and report
+    # the hit rate for a range of rank thresholds.
+    db_instance = DB()
+    funcs = loadFuncs(base_input_dir)
+    db_instance.loadHashmap(n, 50000)
+    #pdb.set_trace()
+    featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
+    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
+    #queries = refactoring(queries, featrues)
+    vul_db = retrieveVuldb(base_input_dir)
+    #pdb.set_trace()
+    #dump(base_input_dir, featrues, queries)
+    #start = time.time()
+    #db_instance.batch_appendDBbyDir(base_input_dir)
+    #end = time.time()
+    #total = end - start
+    #print total
+    db_instance.batch_appendDB(filename1, featrues)
+    #pdb.set_trace()
+    ranks = []
+    times = []
+    for threshold in xrange(1, 210, 10):
+        hit = []
+        i = 0
+        for name in queries:
+            #print i
+            i += 1
+            '''
+            if i == 1000:
+                print (sum(times)/len(times))
+                pdb.set_trace()
+                print "s"
+            '''
+            #if name not in vul_db['openssl']:
+            #    continue
+            if name not in featrues:
+                continue
+            #pdb.set_trace()
+            query = queries[name]
+            #start = time.time()
+            x = db_instance.engine.neighbours(query)
+            #end = time.time()
+            #total = end - start
+            #times.append(total)
+            #print total
+            #pdb.set_trace()
+            try:
+                # Rank of the true function among the returned neighbours.
+                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
+                ranks.append((name, rank))
+                if rank <= threshold:
+                    hit.append(1)
+                else:
+                    hit.append(0)
+            except:
+                # The function was not returned at all; count it as a miss.
+                #pdb.set_trace()
+                hit.append(0)
+        #pdb.set_trace()
+        acc = sum(hit) * 1.0 / len(hit)
+        print acc
+
+
+def queryAll(base_dir, firmware_name, filename1, n):
+    # Index the firmware features stored in MongoDB, then query with one
+    # binary's features and report the hit rate at a fixed rank threshold.
+    threshold = 155
+    db_instance = DB()
+    db_instance.loadHashmap(n, 50000)
+    queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
+    start = time.time()
+    #pdb.set_trace()
+    db_instance.batch_appendDBbyDir(base_dir)
+    end = time.time()
+    dur = end - start
+    print dur
+    #pdb.set_trace()
+    hit = []
+    i = 0
+    times = []
+    for name in queries:
+        print i
+        i += 1
+        query = queries[name]
+        start = time.clock()
+        x = db_instance.engine.neighbours(query)
+        end = time.clock()
+        dur = end - start
+        times.append(dur)
+        #pdb.set_trace()
+        try:
+            rank = [v for v in xrange(len(x)) if name in x[v][1]]
+            if len(rank) > 1:
+                print "stop"
+            if rank[0] <= threshold:
+                hit.append(1)
+            else:
+                hit.append(0)
+        except:
+            hit.append(0)
+
+    acc = sum(hit) * 1.0 / len(hit)
+    mean = np.mean(times)
+    std = np.std(times)
+    #pdb.set_trace()
+    print acc
+
+
+if __name__ == "__main__":
+    args = parse_command()
+    base_dir = args.base_input_dir
+    filename1 = args.filename1
+    filename2 = args.filename2
+    n = args.size
+    queryBytwo(base_dir, filename1, filename2, n)
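+
+# Usage sketch (illustrative only): the file names below are assumptions, not
+# part of this patch. They presume feature pickles named
+# "<filename>_cb<n>.features" under "<base_input_dir>/5000/", plus a
+# "func_candid" list and a pickled "vul" database in <base_input_dir>:
+#
+#   python db.py --base_input_dir ./data --filename1 openssl1.0.1a_x86 \
+#       --filename2 openssl1.0.1a_mips --size 5
+#
+# A minimal nearpy round trip mirroring how DB uses the engine (in-memory
+# storage instead of redis; the 192-dim vector is random, purely for
+# illustration):
+#
+#   from nearpy import Engine
+#   from nearpy.hashes import RandomBinaryProjections
+#   import numpy as np
+#   engine = Engine(192, lshashes=[RandomBinaryProjections('demo', 10)])
+#   v = np.random.rand(192)
+#   engine.store_vector(v, "firmware.binary.function")
+#   print engine.neighbours(v)[0][1]   # -> "firmware.binary.function"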