# detect_rep/ASM2VEC_base_scripts/compare.py
#
# Repository-viewer header (not part of the program): 124 lines, 3.8 KiB,
# Python, last changed 2023-04-05 10:04:49 +08:00.
import torch
import torch.nn as nn
import click
import asm2vec
from utils2 import sigmoid, get_batches, compute_pca, get_dict
from matplotlib import pyplot
import numpy as np
import os
import random
def cosine_similarity(v1, v2, eps=1e-8):
    """Return the cosine similarity of two tensors as a Python float.

    Computes ``v1 @ v2`` divided by the product of the two norms.  The
    denominator is clamped to at least ``eps`` so that a zero-norm input
    yields ``0.0`` instead of ``nan`` from a 0/0 division.

    Args:
        v1: First tensor (floating point; ``.norm()`` is called on it).
        v2: Second tensor, compatible with ``v1`` under ``@``.
        eps: Lower bound applied to the product of the norms.

    Returns:
        The similarity as a float.  ``.item()`` is used, so ``v1 @ v2`` must
        produce a single-element result (e.g. two 1-D tensors).
    """
    denominator = (v1.norm() * v2.norm()).clamp(min=eps)
    return (v1 @ v2 / denominator).item()
# @click.command()
# @click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True)
# @click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True)
# @click.option('-m', '--model', 'mpath', help='model path', required=True)
# @click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
# @click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
# @click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
def cli(mpath="./asm2vec_checkpoints/model.pt",
        file_dir="../asm_func/asm_hex/func_bytes.csv",
        device="auto"):
    """Plot a 2-D projection of the model's vectors for a fixed set of functions.

    Loads a saved model, loads the listed ``sym.MSVCRT20.dll_*`` functions
    through ``asm2vec.utils.load_data``, feeds each function's ``fun2vec``
    through ``model.linear_f``, passes the result to ``compute_pca(..., 2)``
    and shows a scatter plot of the first two columns, annotated with the
    unprefixed function names.

    Args:
        mpath: Model path passed to ``asm2vec.utils.load_model``.
        file_dir: Data path passed to ``asm2vec.utils.load_data``.
        device: Device string passed to ``load_model``.  ``'auto'`` selects
            ``'cuda:0'`` when CUDA is available and ``'cpu'`` otherwise.
    """
    if device == 'auto':
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # Load the trained model.
    model = asm2vec.utils.load_model(mpath, device=device)

    # NOTE(review): "fputs" appears twice in this list (kept exactly as in the
    # original) -- confirm whether the duplicate is intended.
    name_list = [
        "_fsopen", "_fstat", "_ftime", "_futime", "_fdopen",
        "_getch", "_getche",
        "_strcmpi", "_strupr", "_strnset",
        "fgetwc", "fgetws", "fgetc", "fgets", "fprintf", "fputc", "fputs",
        "fread", "fclose", "free", "fflush",
        "public _class_ostream_____thiscall_ostream operator___void_const",
        "public _class_ostream_____thiscall_ostream operator___unsigned_short_int",
        "public _class_ostream_____thiscall_ostream operator___long_int",
        "vprintf", "vsprintf", "vswprintf",
        "fputs", "fputwc", "fputws",
        "wcslen", "wcsncat", "wcsncmp", "wcsncpy", "wcsrchr", "wcspbrk",
        "strstr", "strncmp", "strncat", "strlen", "strcpy", "strcmp", "strchr",
        "time",
    ]
    # Keep the short names for the plot labels; the prefixed names are only
    # used to look the functions up.
    labels = list(name_list)
    print(labels)
    symbol_names = ["sym.MSVCRT20.dll_" + name for name in name_list]

    functions = asm2vec.utils.load_data(file_dir, symbol_names)

    # The model is only used on the CPU below, so it is moved there directly
    # rather than being sent to `device` first.
    fun_vec_list = [fn.fun2vec for fn in functions]
    # No backward pass is made in this function, so gradient tracking is off.
    with torch.no_grad():
        v_list = model.to('cpu').linear_f(torch.tensor(fun_vec_list))
    print(v_list)

    result_vec = np.array(v_list.tolist())
    print("res")
    result = compute_pca(result_vec, 2)

    pyplot.scatter(result[:, 0], result[:, 1])
    # NOTE(review): labels are paired with points by position; this assumes
    # load_data returns one function per requested name, in the same order --
    # confirm against asm2vec.utils.load_data.
    for i, word in enumerate(labels):
        pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
    pyplot.show()
# Script entry point: run the comparison/plot only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    cli()