update search engine part

parent 7cd5e55e48
commit 49bae1c6ca

356	search-engine/db.py	Normal file
@@ -0,0 +1,356 @@
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
from refactoring import *
import pymongo
from pymongo import MongoClient

def initDB():
    # Connect to the local MongoDB instance and use the 'iot-encoding' database.
    client = MongoClient('mongodb://localhost:27017/')
    db = client['iot-encoding']
    return db


db = initDB()
posts = db.posts

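# For reference: batch_appendDBbyDir() below reads documents from db.posts and uses
# the fields 'binary_name', 'func_name', 'firmware_name' and 'fvector'.  A minimal
# sketch of such a document (the binary/function names and vector values are
# hypothetical; only the keys and the firmware name come from the code below):
#
#   {
#       "firmware_name": "ddwrt-r21676_result",
#       "binary_name": "busybox",
#       "func_name": "some_function",
#       "fvector": [0.1, 0.3, 0.0, 1.2]
#   }
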
class db:

    def __init__(self):
        self.feature_list = {}
        self.engine = None

    def loadHashmap(self, feature_size, result_n):
        # Create redis storage adapter
        redis_object = Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)
        #pdb.set_trace()
        try:
            # Get hash config from redis
            config = redis_storage.load_hash_configuration('test')
            # Config exists: create hash with None parameters
            lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            lshash.apply_config(config)
        except:
            # Config does not exist: create the hash from scratch
            lshash = RandomBinaryProjections('test', 0)

        # Create engine for the feature space and use our hash.
        # This will set the dimension of the lshash only the first time, not when
        # using the configuration loaded from redis. Use redis storage to store
        # buckets.
        # NOTE: the engine dimension (192) and filter size (1000) are hard-coded;
        # the feature_size and result_n arguments are not used here.
        nearest = NearestFilter(1000)
        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        #pdb.set_trace()
        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)

    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
        if fvector is None:
            return
        #ftuple = tuple([fvector])
        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name, binary_name, funcname)))

    def batch_appendDB(self, binary_name, features, firmware_name=""):
        for funcname in features:
            feature = features[funcname]
            #pdb.set_trace()
            self.appendToDB(binary_name, funcname, feature, firmware_name)

    def batch_appendDBbyDir(self, base_dir):
        # Pull per-function feature vectors out of MongoDB and index them.
        cursor = posts.find({"firmware_name": "ddwrt-r21676_result"})
        i = 0
        for v in cursor:
            print i
            i += 1
            binary_name = v['binary_name']
            funcname = v['func_name']
            firmware_name = v['firmware_name']
            feature = v['fvector']
            self.appendToDB(binary_name, funcname, feature, firmware_name)

    def batch_appendDBbyDir1(self, base_dir):
        # Walk <base_dir>/image/<firmware>/<binary>.features pickles and index them.
        image_dir = os.path.join(base_dir, "image")
        firmware_featrues = {}
        bnum = 0
        fnum = 0
        i = 0
        #pdb.set_trace()
        for firmware_name in os.listdir(image_dir):
            print firmware_name
            firmware_featrues[firmware_name] = {}
            firmware_dir = os.path.join(image_dir, firmware_name)
            for binary_name in os.listdir(firmware_dir):
                if binary_name.endswith(".features"):
                    bnum += 1
                    featrues_dir = os.path.join(firmware_dir, binary_name)
                    featrues = pickle.load(open(featrues_dir, "r"))
                    for funcname in featrues:
                        fnum += 1
                        #pdb.set_trace()
                        feature = featrues[funcname]
                        self.appendToDB(binary_name, funcname, feature, firmware_name)
                    del featrues
        print("bnum ", bnum)
        print("fnum ", fnum)

    def dump(self, base_dir):
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        pickle.dump(self.feature_list, open(db_dir, 'w'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        pickle.dump(self.engine, open(db_dir, 'w'))

    def loadDB(self, base_dir):
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        self.feature_list = pickle.load(open(db_dir, 'r'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        self.engine = pickle.load(open(db_dir, 'r'))

    def findF(self, binary_name, funcname):
        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
        return x[0]

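# A minimal end-to-end sketch (not called anywhere in this module) that mirrors the
# indexing/query flow used by queryBytwo() below.  'features' is assumed to be a
# {funcname: feature_vector} dict such as retrieveFeatures() returns, and
# "example_binary" is a hypothetical label; the 192-dimension engine and the 'test'
# hash name come from loadHashmap() above.
def _example_lookup(features, funcname):
    db_instance = db()
    db_instance.loadHashmap(192, 50000)
    db_instance.batch_appendDB("example_binary", features)
    # neighbours() returns (vector, key, distance) tuples; keys look like
    # "firmware.binary.funcname" (see appendToDB).
    return db_instance.engine.neighbours(np.asarray(features[funcname]))
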
def retrieveFeaturesByDir(n, base_dir):
    # NOTE: this helper is not called anywhere in this module and still references
    # self.appendToDB, which only exists on class db.
    firmware_featrues = {}
    i = 0
    for firmware_name in os.listdir(base_dir):
        if firmware_name.endswith(".features"):
            firmware_featrues[firmware_name] = {}
            firmware_dir = os.path.join(base_dir, firmware_name)
            if i > 0:
                break
            i += 1
            #pdb.set_trace()
            for binary_name in os.listdir(firmware_dir):
                featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
                featrues = pickle.load(open(featrues_dir, "r"))
                for funcname in featrues:
                    feature = featrues[funcname]
                    self.appendToDB(firmware_name, binary_name, funcname, feature)
                del featrues

def retrieveFeatures(n, base_dir, filename, funcs):
    feature_dic = {}
    featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    #featuresx = retrieveFeaturesx(filename)
    for name in featrues:
        #if name in funcs:
        x = featrues[name]
        #+ featuresx[name]
        feature_dic[name] = np.asarray(x)
    return feature_dic

def retrieveVuldb(base_input_dir):
    vul_path = os.path.join(base_input_dir, "vul")
    vul_db = pickle.load(open(vul_path, "r"))
    return vul_db

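# Hedged note: the structure of the pickled "vul" file is not shown in this module.
# The commented-out check in queryBytwo() ("if name not in vul_db['openssl']")
# suggests a dict keyed by library name whose values contain vulnerable function
# names, e.g. {"openssl": ["some_vulnerable_function"]} (entries here are
# hypothetical).
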
def retrieveFeaturesx(filename):
    ida_input_dir = os.path.join("./data/", filename + ".features")
    featuresx = pickle.load(open(ida_input_dir, "r"))
    return featuresx

def retrieveQueries(n, base_dir, filename1, featrues_src):
    queries = {}
    featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    #featuresx = retrieveFeaturesx(filename1)
    for name in featrues:
        #if name in featrues_src:
        x = featrues[name]
        #+ featuresx[name]
        queries[name] = np.asarray(x)
    return queries

def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
    queries = {}
    featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    for name in featrues:
        #del featrues[name][5]
        queries[name] = np.asarray(featrues[name])
    return queries

def retrieveQuery(n, base_dir, filename, funcname):
    featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    f = [featrues[v] for v in featrues if funcname in v][0]
    return np.asarray(f)

def parse_command():
    parser = argparse.ArgumentParser(description='Index binary feature vectors and query them.')
    parser.add_argument("--base_input_dir", type=str, help="base directory holding the feature pickles")
    parser.add_argument('--output_dir', type=str, help="output dir")
    parser.add_argument("--filename1", type=str, help="feature file of the binary to index")
    parser.add_argument("--filename2", type=str, help="feature file of the binary to query with")
    parser.add_argument("--size", type=int, help="the size of each graphlet")
    args = parser.parse_args()
    return args

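# Example invocation (file names and the graphlet size are hypothetical; only the
# flags come from parse_command() above):
#
#   python db.py --base_input_dir ./data --filename1 openssl_x86 --filename2 openssl_mips --size 5
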
def loadFuncs(path):
    # Read the candidate function names (one per line) from <path>/func_candid.
    funcs = {}
    x86_dir = os.path.join(path, "func_candid")
    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
    fp = open(x86_dir, "r")
    for line in fp:
        items = line.split("\n")
        funcname = items[0]
        funcs[funcname] = 1
    return funcs

def dump(path, featrues, queries):
    # Write one tab-separated row per function: architecture label, function name,
    # then the feature vector printed to three decimals.
    fp = open(path + "/" + "matrix", 'w')
    for name in featrues:
        row = []
        row.append("x86")
        row.append(name)
        row += featrues[name]
        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
    for name in queries:
        row = []
        row.append("mips")
        row.append(name)
        row += queries[name]
        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
    fp.close()

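# For illustration only (hypothetical function name and values), one row of the
# "matrix" file produced above looks like the following, tab-separated:
#
#   x86   SSL_read   0.120   0.034   0.000   ...   0.901
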
def queryBytwo(base_input_dir, filename1, filename2, n):
    threthold = 50
    db_instance = db()
    funcs = loadFuncs(base_input_dir)
    db_instance.loadHashmap(n, 50000)
    #pdb.set_trace()
    featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
    #queries = refactoring(queries, featrues)
    vul_db = retrieveVuldb(base_input_dir)
    #pdb.set_trace()
    #dump(base_input_dir, featrues, queries)
    #start = time.time()
    #db_instance.batch_appendDBbyDir(base_input_dir)
    #end = time.time()
    #total = end - start
    #print total
    db_instance.batch_appendDB(filename1, featrues)
    #pdb.set_trace()
    ranks = []
    times = []
    for threthold in xrange(1, 210, 10):
        hit = []
        i = 0
        for name in queries:
            #print i
            i += 1
            '''
            if i == 1000:
                print (sum(times)/len(times))
                pdb.set_trace()
                print "s"
            '''
            #if name not in vul_db['openssl']:
            #    continue
            if name not in featrues:
                continue
            #pdb.set_trace()
            query = queries[name]
            #start = time.time()
            x = db_instance.engine.neighbours(query)
            #end = time.time()
            #total = end - start
            #times.append(total)
            #print total
            #pdb.set_trace()
            try:
                # rank of the true function among the returned neighbours
                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
                ranks.append((name, rank))
                if rank <= threthold:
                    hit.append(1)
                else:
                    hit.append(0)
            except:
                #pdb.set_trace()
                hit.append(0)
                pass
        #pdb.set_trace()
        acc = sum(hit) * 1.0 / len(hit)
        print acc

def queryAll(base_dir, firmware_name, filename1, n):
    threthold = 155
    db_instance = db()
    db_instance.loadHashmap(n, 50000)
    queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
    start = time.time()
    #pdb.set_trace()
    db_instance.batch_appendDBbyDir(base_dir)
    end = time.time()
    dur = end - start
    print dur
    #pdb.set_trace()
    hit = []
    i = 0
    times = []
    for name in queries:
        print i
        i += 1
        query = queries[name]
        start = time.clock()
        x = db_instance.engine.neighbours(query)
        end = time.clock()
        dur = end - start
        times.append(dur)
        #pdb.set_trace()
        try:
            rank = [v for v in xrange(len(x)) if name in x[v][1]]
            if len(rank) > 1:
                #pdb.set_trace()
                print "stop"
            if rank[0] <= threthold:
                hit.append(1)
            else:
                hit.append(0)
        except:
            hit.append(0)

    acc = sum(hit) * 1.0 / len(hit)
    mean = np.mean(times)
    std = np.std(times)
    #pdb.set_trace()
    print acc

if __name__ == "__main__":
    args = parse_command()
    base_dir = args.base_input_dir
    filename1 = args.filename1
    filename2 = args.filename2
    n = args.size
    #pdb.set_trace()
    queryBytwo(base_dir, filename1, filename2, n)
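# queryAll() above is an alternative driver that indexes features pulled from MongoDB
# (via batch_appendDBbyDir) and evaluates every query in one firmware image; it is not
# wired into __main__.  A hedged sketch of a call, with hypothetical firmware and
# file names:
#
#   queryAll(base_dir, "ddwrt-r21676_result", "busybox", n)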