import torch
import torch.nn as nn
import click
import asm2vec
from utils2 import sigmoid, get_batches, compute_pca, get_dict
from matplotlib import pyplot
import numpy as np
import os
import random


def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Computes ``v1 @ v2 / (v1.norm() * v2.norm())`` and unwraps the result
    with ``.item()``, so the product must reduce to a single element.
    Assumes ``v1`` and ``v2`` are 1-D torch tensors of equal length --
    TODO confirm against callers.
    """
    return (v1 @ v2 / (v1.norm() * v2.norm())).item()


# The click command-line interface is currently disabled; ``cli`` uses the
# hard-coded settings below instead.
# @click.command()
# @click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True)
# @click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True)
# @click.option('-m', '--model', 'mpath', help='model path', required=True)
# @click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
# @click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
# @click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
def cli():
    """Plot a 2-D PCA projection of function vectors for a fixed name list.

    Steps:
      1. Load the model from a hard-coded checkpoint path.
      2. Load the listed functions from a hard-coded CSV path via
         ``asm2vec.utils.load_data``.
      3. Pass each function's ``fun2vec`` through ``model.linear_f``.
      4. Reduce the result to two components with ``compute_pca`` and show a
         scatter plot annotated with the (un-prefixed) function names.

    Takes no arguments and returns ``None``; output is printed to stdout and
    shown in a matplotlib window.
    """
    mpath = "./asm2vec_checkpoints/model.pt"
    device = "auto"
    file_dir = "../asm_func/asm_hex/func_bytes.csv"

    if device == 'auto':
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # load model, tokens
    model = asm2vec.utils.load_model(mpath, device=device)

    # Bare function names; these are also the labels drawn on the plot.
    plot_labels = [
        "_fsopen", "_fstat", "_ftime", "_futime", "_fdopen",
        "_getch", "_getche",
        "_strcmpi", "_strupr", "_strnset",
        "fgetwc", "fgetws", "fgetc", "fgets", "fprintf", "fputc", "fputs",
        "fread", "fclose", "free", "fflush",
        "public _class_ostream_____thiscall_ostream operator___void_const",
        "public _class_ostream_____thiscall_ostream operator___unsigned_short_int",
        "public _class_ostream_____thiscall_ostream operator___long_int",
        "vprintf", "vsprintf", "vswprintf",
        "fputs", "fputwc", "fputws",
        "wcslen", "wcsncat", "wcsncmp", "wcsncpy", "wcsrchr", "wcspbrk",
        "strstr", "strncmp", "strncat", "strlen", "strcpy", "strcmp", "strchr",
        "time",
    ]
    print(plot_labels)

    # Names passed to load_data carry the "sym.MSVCRT20.dll_" prefix.
    name_list = ["sym.MSVCRT20.dll_" + name for name in plot_labels]
    functions = asm2vec.utils.load_data(file_dir, name_list)

    model = model.to(device)

    # The asm2vec.utils.train(...) step that used to sit here is disabled;
    # vectors are taken directly from each loaded function's ``fun2vec``.
    fun_vec_list = [fn.fun2vec for fn in functions]
    v_list = model.to('cpu').linear_f(torch.tensor(fun_vec_list))
    print(v_list)

    result_vec = np.array(v_list.tolist())
    print("res")
    result = compute_pca(result_vec, 2)

    pyplot.scatter(result[:, 0], result[:, 1])
    # NOTE(review): labels are matched to points purely by position, which
    # assumes load_data returns exactly one function per requested name, in
    # request order -- confirm against asm2vec.utils.load_data.
    for i, word in enumerate(plot_labels):
        pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
    pyplot.show()


if __name__ == '__main__':
    cli()