detect_rep/ASM2VEC_plus_scripts/asm2vec/get_opcode_vector.py
2023-04-05 10:04:49 +08:00

677 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# Capstone Python bindings, by Nguyen Anh Quynnh <aquynh@gmail.com>
from __future__ import print_function
from capstone import *
from capstone.x86 import *
from .xprint import to_hex, to_x, to_x_32
import lief
import base64
import os
import random
import numpy as np
def softmax(x):
    """Return the softmax of ``x`` taken along axis 0.

    The input is shifted by its maximum along axis 0 before exponentiation.
    The shift cancels out in the ratio, so the result is mathematically
    unchanged, but large inputs no longer overflow ``np.exp`` and produce NaN.

    Args:
        x: Array-like of scores. For 2-D input each column is normalised
            independently (axis 0), as in the original implementation.

    Returns:
        numpy.ndarray with the same shape as ``x`` whose entries along
        axis 0 sum to 1.
    """
    scores = np.asarray(x)
    if scores.dtype.kind not in "fc":
        # Integer/boolean input: promote so that the subtraction below is defined.
        scores = scores.astype(float)
    if scores.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(scores)
    shifted = scores - np.max(scores, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
# Byte values recognised for each legacy x86 prefix slot. msg_to_vector looks
# a prefix byte up in these lists and encodes the matching list position.
legacy_prefix_all_msg = {
    "lock": [0x00, 0xF0],
    "segment": [0x2E, 0x36, 0x3E, 0x26, 0x64, 0x65],
    "oprandsize": [0x00, 0x66],
    "address": [0x00, 0x67],
}
# X86_CODE64 = b"\x55\x48\x8b\x05\xb8\x13\x00\x00\xe9\xea\xbe\xad\xde\xff\x25\x23\x01\x00\x00\xe8\xdf\xbe\xad\xde\x74\xff"
# X86_CODE16 = b"\x8d\x4c\x32\x08\x01\xd8\x81\xc6\x34\x12\x00\x00\x05\x23\x01\x00\x00\x36\x8b\x84\x91\x23\x01\x00\x00\x41\x8d\x84\x39\x89\x67\x00\x00\x8d\x87\x89\x67\x00\x00\xb4\xc6\x66\xe9\xb8\x00\x00\x00\x67\xff\xa0\x23\x01\x00\x00\x66\xe8\xcb\x00\x00\x00\x74\xfc"
# X86_CODE32 = b"\x8d\x4c\x32\x08\x01\xd8\x81\xc6\x34\x12\x00\x00\x05\x23\x01\x00\x00\x36\x8b\x84\x91\x23\x01\x00\x00\x41\x8d\x84\x39\x89\x67\x00\x00\x8d\x87\x89\x67\x00\x00\xb4\xc6\xe9\xea\xbe\xad\xde\xff\xa0\x23\x01\x00\x00\xe8\xdf\xbe\xad\xde\x74\xff"
# Running total of programs that failed the validity check
# (incremented by check_validity each time it rejects a binary).
error_validity_sum=0
# Validate a program; for now only 32-bit, unencrypted programs are used for testing
def check_validity(binary):
    """Decide whether ``binary`` is usable for disassembly.

    A binary is accepted when it exposes a ``.text`` section (or, failing
    that, a ``CODE`` section) and the string form of ``binary.header``
    contains ``CHARA_32BIT_MACHINE``. Every rejection prints a message and
    increments the module-level ``error_validity_sum`` counter.

    Unlike the previous version, a failing fallback lookup of ``CODE`` is
    treated as a rejection instead of propagating the exception to the caller.

    Args:
        binary: Object returned by ``lief.parse`` (anything providing
            ``get_section(name)`` and ``header``).

    Returns:
        True if the binary passed both checks, False otherwise.
    """
    global error_validity_sum

    section = None
    try:
        section = binary.get_section(".text")
    except Exception as e:
        if "No such section with this name" not in str(e):
            # Any other failure is treated as a sign the sample may be packed.
            print("程序可能加壳")
            error_validity_sum += 1
            return False

    if section is None:
        # ".text" is absent; some toolchains name the code section "CODE".
        # NOTE(review): a None result is also treated as "missing" in case the
        # installed LIEF returns None instead of raising - confirm.
        try:
            section = binary.get_section("CODE")
        except Exception:
            section = None

    if section is None:
        print("程序可能加壳")
        error_validity_sum += 1
        return False

    if "CHARA_32BIT_MACHINE" not in str(binary.header):
        print("不是一个32位程序")
        error_validity_sum += 1
        return False
    return True
def str_hex_to_bytes(str_hex):
    """Convert a hexadecimal string into the raw bytes it encodes.

    Replaces the former per-byte hex/base16 round trip, which rebuilt the
    result with repeated ``bytes`` concatenation, by a single built-in call
    with the same parsing rules.

    Args:
        str_hex: String of hexadecimal digit pairs, e.g. ``"6a09e8"``.

    Returns:
        bytes: The decoded byte string (``b""`` for an empty input).

    Raises:
        ValueError: If ``str_hex`` is not valid hexadecimal.
    """
    return bytes.fromhex(str_hex)
def get_asm_text_code(file_name):
    """Load an executable and return the raw bytes of its code section.

    The file is parsed with ``lief.parse`` and vetted by ``check_validity``.
    The code section is looked up as ``.text`` first and ``CODE`` second.
    Its content is converted with a single ``bytes(...)`` call instead of the
    former per-byte hex/base16 round trip with quadratic concatenation.

    Args:
        file_name: Path of the executable to read.

    Returns:
        tuple: ``(True, code_bytes)`` on success, ``(False, "")`` when the
        binary is rejected or no code section object is available.
    """
    binary = lief.parse(file_name)
    if not check_validity(binary):
        return False, ""

    try:
        text = binary.get_section(".text")
    except Exception:
        text = None
    if text is None:
        text = binary.get_section("CODE")
    if text is None:
        # NOTE(review): only reachable if get_section returns None rather than
        # raising for a missing section - confirm against the LIEF version used.
        return False, ""

    # text.content is iterated as integers by the original code; bytes()
    # accepts any such iterable of values in range(256).
    return True, bytes(text.content)
# X86_CODE32 = get_asm_text_code("./TEST")
# all_tests = (
# # (CS_ARCH_X86, CS_MODE_16, X86_CODE16, "X86 16bit (Intel syntax)", None),
# # (CS_ARCH_X86, CS_MODE_32, X86_CODE32, "X86 32 (AT&T syntax)", CS_OPT_SYNTAX_ATT),
# (CS_ARCH_X86, CS_MODE_32, X86_CODE32, "X86 32 (Intel syntax)", None),
# # (CS_ARCH_X86, CS_MODE_64, X86_CODE64, "X86 64 (Intel syntax)", None),
# )
def get_eflag_name(eflag):
    """Map a Capstone ``X86_EFLAGS_*`` constant to its short label.

    Args:
        eflag: Value to compare against the known ``X86_EFLAGS_*`` constants.

    Returns:
        The label (for example ``"MOD_CF"``) of the first constant equal to
        ``eflag``, or None when no constant matches.
    """
    # Kept as an ordered sequence so the first equal constant wins, exactly
    # like the if/elif chain this table replaces.
    known_flags = (
        (X86_EFLAGS_UNDEFINED_OF, "UNDEF_OF"),
        (X86_EFLAGS_UNDEFINED_SF, "UNDEF_SF"),
        (X86_EFLAGS_UNDEFINED_ZF, "UNDEF_ZF"),
        (X86_EFLAGS_MODIFY_AF, "MOD_AF"),
        (X86_EFLAGS_UNDEFINED_PF, "UNDEF_PF"),
        (X86_EFLAGS_MODIFY_CF, "MOD_CF"),
        (X86_EFLAGS_MODIFY_SF, "MOD_SF"),
        (X86_EFLAGS_MODIFY_ZF, "MOD_ZF"),
        (X86_EFLAGS_UNDEFINED_AF, "UNDEF_AF"),
        (X86_EFLAGS_MODIFY_PF, "MOD_PF"),
        (X86_EFLAGS_UNDEFINED_CF, "UNDEF_CF"),
        (X86_EFLAGS_MODIFY_OF, "MOD_OF"),
        (X86_EFLAGS_RESET_OF, "RESET_OF"),
        (X86_EFLAGS_RESET_CF, "RESET_CF"),
        (X86_EFLAGS_RESET_DF, "RESET_DF"),
        (X86_EFLAGS_RESET_IF, "RESET_IF"),
        (X86_EFLAGS_TEST_OF, "TEST_OF"),
        (X86_EFLAGS_TEST_SF, "TEST_SF"),
        (X86_EFLAGS_TEST_ZF, "TEST_ZF"),
        (X86_EFLAGS_TEST_PF, "TEST_PF"),
        (X86_EFLAGS_TEST_CF, "TEST_CF"),
        (X86_EFLAGS_RESET_SF, "RESET_SF"),
        (X86_EFLAGS_RESET_AF, "RESET_AF"),
        (X86_EFLAGS_RESET_TF, "RESET_TF"),
        (X86_EFLAGS_RESET_NT, "RESET_NT"),
        (X86_EFLAGS_PRIOR_OF, "PRIOR_OF"),
        (X86_EFLAGS_PRIOR_SF, "PRIOR_SF"),
        (X86_EFLAGS_PRIOR_ZF, "PRIOR_ZF"),
        (X86_EFLAGS_PRIOR_AF, "PRIOR_AF"),
        (X86_EFLAGS_PRIOR_PF, "PRIOR_PF"),
        (X86_EFLAGS_PRIOR_CF, "PRIOR_CF"),
        (X86_EFLAGS_PRIOR_TF, "PRIOR_TF"),
        (X86_EFLAGS_PRIOR_IF, "PRIOR_IF"),
        (X86_EFLAGS_PRIOR_DF, "PRIOR_DF"),
        (X86_EFLAGS_TEST_NT, "TEST_NT"),
        (X86_EFLAGS_TEST_DF, "TEST_DF"),
        (X86_EFLAGS_RESET_PF, "RESET_PF"),
        (X86_EFLAGS_PRIOR_NT, "PRIOR_NT"),
        (X86_EFLAGS_MODIFY_TF, "MOD_TF"),
        (X86_EFLAGS_MODIFY_IF, "MOD_IF"),
        (X86_EFLAGS_MODIFY_DF, "MOD_DF"),
        (X86_EFLAGS_MODIFY_NT, "MOD_NT"),
        (X86_EFLAGS_MODIFY_RF, "MOD_RF"),
        (X86_EFLAGS_SET_CF, "SET_CF"),
        (X86_EFLAGS_SET_DF, "SET_DF"),
        (X86_EFLAGS_SET_IF, "SET_IF"),
    )
    for constant, label in known_flags:
        if eflag == constant:
            return label
    return None
def vector_to_asm(res_vector):
    """Decode a feature vector built by ``msg_to_vector`` into instruction fields.

    The previous body ignored ``res_vector`` and decoded a hard-coded test
    vector, used slice widths that disagree with ``msg_to_vector`` (5 option
    flags instead of 8, 63 displacement bits instead of 64), indexed
    ``legacy_prefix_all_msg`` with an integer, and returned nothing. This
    version decodes the vector it is given.

    Layout, in the order ``msg_to_vector`` concatenates the parts:
    64 address bits, 256-way one-hot opcode, 8 option flags, 9 prefix slots
    (lock, 6 segment one-hots, operand-size, address-size), 20 ModRM slots
    (4 + 8 + 8 one-hots), 20 SIB slots, 1 displacement sign flag,
    64 displacement bits, 1 immediate sign flag, 64 immediate bits.

    Any non-zero entry counts as a set bit, so vectors produced with
    ``normal=True`` (non-zero entries divided by their sum) decode too.

    Args:
        res_vector: Sequence of 507 numbers laid out as described above.

    Returns:
        dict with keys ``address``, ``opcode``, ``options`` (list of 8 flags),
        ``prefix`` (4 byte values, 0 meaning absent), ``modrm``, ``sib``,
        ``disp`` and ``imme`` (both signed integers).

    Raises:
        ValueError: If ``res_vector`` does not have the expected length.
    """
    layout = (
        ("addr", 64), ("opcode", 256), ("option", 8), ("prefix", 9),
        ("modrm", 20), ("sib", 20), ("disp_flag", 1), ("disp", 64),
        ("imme_flag", 1), ("imme", 64),
    )
    expected_len = sum(width for _, width in layout)
    if len(res_vector) != expected_len:
        raise ValueError(
            "expected a vector of length %d, got %d" % (expected_len, len(res_vector))
        )

    fields = {}
    start = 0
    for field_name, width in layout:
        fields[field_name] = [
            1 if entry != 0 else 0 for entry in res_vector[start:start + width]
        ]
        start += width

    def bits_to_int(bits):
        # Bits are stored most-significant first.
        value = 0
        for bit in bits:
            value = (value << 1) | bit
        return value

    def hot_index(bits):
        # Position of the last set entry, 0 when none is set.
        found = 0
        for position, bit in enumerate(bits):
            if bit:
                found = position
        return found

    def pack_split_byte(bits):
        # Rebuild a ModRM/SIB byte from its 2-bit, 3-bit and 3-bit one-hot groups.
        return (
            (hot_index(bits[:4]) << 6)
            | (hot_index(bits[4:12]) << 3)
            | hot_index(bits[12:])
        )

    prefix_bits = fields["prefix"]
    prefix = [0, 0, 0, 0]
    if prefix_bits[0]:
        prefix[0] = 0xF0
    for position, bit in enumerate(prefix_bits[1:7]):
        if bit:
            prefix[1] = legacy_prefix_all_msg["segment"][position]
    if prefix_bits[7]:
        prefix[2] = 0x66
    if prefix_bits[8]:
        prefix[3] = 0x67

    disp = bits_to_int(fields["disp"])
    if fields["disp_flag"][0]:
        disp = -disp
    imme = bits_to_int(fields["imme"])
    if fields["imme_flag"][0]:
        imme = -imme

    return {
        "address": bits_to_int(fields["addr"]),
        "opcode": hot_index(fields["opcode"]),
        "options": fields["option"],
        "prefix": prefix,
        "modrm": pack_split_byte(fields["modrm"]),
        "sib": pack_split_byte(fields["sib"]),
        "disp": disp,
        "imme": imme,
    }
def msg_to_vector(normal, address, prefix, opcode, modrm, sib, disp, imme, imme_cont):
    """Encode one instruction's fields as a flat 507-element feature vector.

    The result is the concatenation of: 64 address bits, a 256-way one-hot
    opcode, 8 option flags (four prefix slots, ModRM, SIB, displacement,
    immediate present), 9 prefix slots, 20 ModRM slots, 20 SIB slots, a
    displacement sign flag, 64 displacement magnitude bits, an immediate sign
    flag and 64 immediate magnitude bits. Bit fields are most-significant
    first.

    Fixes relative to the previous version: a negative immediate used to be
    overwritten with a slice of the displacement string, so its magnitude
    bits were wrong; and the caller's ``prefix`` list is no longer modified
    in place.

    Args:
        normal: When true, every non-zero entry is divided by the sum of all
            entries.
        address, opcode, modrm, sib, disp, imme: Values accepted by ``int()``
            (the visible caller passes decimal strings).
        prefix: Sequence of at least four prefix byte values
            (lock, segment, operand-size, address-size slots).
        imme_cont: Number of immediate operands; non-zero sets the
            "immediate present" option flag.

    Returns:
        list: The 507-element feature vector.

    Raises:
        IndexError: If a value needs more than 64 bits or ``opcode`` is
            outside ``range(256)``.
    """
    address, opcode, modrm, sib, disp, imme = (
        int(address), int(opcode), int(modrm), int(sib), int(disp), int(imme),
    )
    prefix = [int(item) for item in prefix]  # local copy; caller's list is untouched

    def magnitude_bits(value):
        # 64 slots holding abs(value) in binary, left-padded with zeros.
        slots = [0] * 64
        digits = bin(abs(value))[2:].rjust(64, "0")
        for position, digit in enumerate(digits):
            slots[position] = int(digit)
        return slots

    def split_byte_one_hot(byte):
        # One-hot encode the 2-bit, 3-bit and 3-bit groups of a ModRM/SIB byte.
        slots = [0] * 20
        slots[(byte >> 6) & 0b11] = 1
        slots[4 + ((byte >> 3) & 0b111)] = 1
        slots[12 + (byte & 0b111)] = 1
        return slots

    addr_vec = magnitude_bits(address)

    opcode_vec = [0] * 256
    opcode_vec[opcode] = 1

    option_vec = [
        int(prefix[0] != 0),
        int(prefix[1] != 0),
        int(prefix[2] != 0),
        int(prefix[3] != 0),
        int(modrm != 0),
        int(sib != 0),
        int(disp != 0),
        int(imme_cont != 0),
    ]

    # Slot 0: lock, slots 1-6: segment one-hot, slot 7: operand size,
    # slot 8: address size. Unlisted byte values leave their slot at 0.
    prefix_vec = [0] * 9
    lock_values = legacy_prefix_all_msg["lock"]
    if prefix[0] in lock_values:
        prefix_vec[0] = lock_values.index(prefix[0])
    segment_values = legacy_prefix_all_msg["segment"]
    if prefix[1] in segment_values:
        prefix_vec[segment_values.index(prefix[1]) + 1] = 1
    opsize_values = legacy_prefix_all_msg["oprandsize"]
    if prefix[2] in opsize_values:
        prefix_vec[7] = opsize_values.index(prefix[2])
    addrsize_values = legacy_prefix_all_msg["address"]
    if prefix[3] in addrsize_values:
        prefix_vec[8] = addrsize_values.index(prefix[3])

    moderm_vec = split_byte_one_hot(modrm)
    sib_vec = split_byte_one_hot(sib)

    disp_flag = [1 if disp < 0 else 0]
    disp_vec = magnitude_bits(disp)
    imme_flag = [1 if imme < 0 else 0]
    imme_vec = magnitude_bits(imme)

    result_vec = (
        addr_vec + opcode_vec + option_vec + prefix_vec + moderm_vec
        + sib_vec + disp_flag + disp_vec + imme_flag + imme_vec
    )

    if normal:
        # Scale the non-zero entries so that the whole vector sums to 1.
        total = sum(result_vec)
        result_vec = [
            entry / total if entry != 0 else entry for entry in result_vec
        ]
    return result_vec
# opcode_list = []
def get_asm_msg(insn):
    """Extract the encoding fields of one disassembled instruction.

    The hex dump of the instruction bytes that the previous version built on
    every call and never used has been removed.

    Args:
        insn: Disassembled instruction exposing ``address``, ``prefix``,
            ``opcode``, ``modrm``, ``sib``, ``disp``, ``op_count`` and
            ``op_find`` (a Capstone instruction in the visible caller).

    Returns:
        tuple: ``(address, prefix, opcode, modrm, sib, disp, imme, imme_cont)``
        where ``prefix`` is passed through unchanged, ``imme_cont`` is the
        number of immediate operands, and every other element is the decimal
        string of the corresponding value. ``opcode`` is the first entry of
        ``insn.opcode``; ``imme`` is "0" when there is no immediate operand.
    """
    imme_cont = insn.op_count(X86_OP_IMM)
    if imme_cont != 0:
        # Take the first immediate operand and reparse the hex text that
        # to_x renders for it.
        first_imm = insn.op_find(X86_OP_IMM, 1)
        imme = int("0x" + to_x(first_imm.imm), 16)
    else:
        imme = 0

    return (
        str(insn.address),
        insn.prefix,
        str(insn.opcode[0]),
        str(insn.modrm),
        str(insn.sib),
        str(insn.disp),
        str(imme),
        imme_cont,
    )
def get_asm_msg2(insn):
    """Extract instruction fields and print the operands with constants masked.

    Prints the instruction twice: once as disassembled, and once with the
    displacement and immediate literals in the operand string replaced by
    ``CONST``.

    Fixes relative to the previous version:
      * the numbered-register list was missing two commas, which fused
        "r6"/"r7" and "r12"/"r13" into single entries;
      * the displacement text used the "0x" prefix although the operand
        string had already been rewritten to "ox", so displacements of 10 or
        more were never masked;
      * the textual displacement overwrote the numeric one, so the returned
        ``disp`` could be a hex string that ``int()`` in ``msg_to_vector``
        rejects;
      * a zero displacement or an absent immediate no longer replaces every
        "0" character in the operand string.

    Args:
        insn: Disassembled instruction exposing ``address``, ``prefix``,
            ``opcode``, ``modrm``, ``sib``, ``disp``, ``mnemonic``,
            ``op_str``, ``op_count`` and ``op_find``.

    Returns:
        tuple: Same shape as ``get_asm_msg`` -
        ``(address, prefix, opcode, modrm, sib, disp, imme, imme_cont)`` with
        decimal strings for every element except ``prefix`` and ``imme_cont``.
    """
    imme_cont = insn.op_count(X86_OP_IMM)
    if imme_cont != 0:
        first_imm = insn.op_find(X86_OP_IMM, 1)
        imme = int("0x" + to_x(first_imm.imm), 16)
    else:
        imme = 0

    # The operand text is matched with "ox" in place of "0x"; literals below
    # are rendered the same way.
    op_str = insn.op_str.replace("0x", "ox")

    def literal_text(value):
        if value < 10:
            return str(value)
        return hex(value).replace("0x", "ox")

    r_list = ["r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
              "r10", "r11", "r12", "r13", "r14", "r15"]

    def mask_literal(text, literal):
        if literal not in text:
            return text
        if any(reg in text for reg in r_list):
            # Only replace a free-standing literal so the digits inside a
            # numbered register name are left alone.
            return text.replace(" " + literal, " CONST")
        return text.replace(literal, "CONST")

    print("\t%s\t%s" % (insn.mnemonic, insn.op_str))
    # Replace literals with CONST
    if insn.disp != 0:
        op_str = mask_literal(op_str, literal_text(insn.disp))
    if imme_cont != 0:
        op_str = mask_literal(op_str, literal_text(imme))
    print("\t%s\t%s" % (insn.mnemonic, op_str))

    return (
        str(insn.address),
        insn.prefix,
        str(insn.opcode[0]),
        str(insn.modrm),
        str(insn.sib),
        str(insn.disp),
        str(imme),
        imme_cont,
    )
def get_asm_input_vector(X86_CODE32, normal=False):
    """Disassemble 32-bit x86 code and encode every instruction as a vector.

    Changes relative to the previous version: a hexadecimal string (as passed
    by this module's ``__main__`` block) is accepted and decoded to bytes
    first; the unused ``comment``/``syntax`` locals are gone; and a
    disassembler error now terminates with exit status 1 instead of the
    success status that a bare ``exit()`` reports.

    Args:
        X86_CODE32: Machine code as a bytes-like object, or a string of
            hexadecimal digit pairs.
        normal: Forwarded to ``msg_to_vector``; when true each vector is
            normalised.

    Returns:
        tuple: ``(vectors, listing)`` - one ``msg_to_vector`` result per
        instruction and the matching ``"0x<addr>:<mnemonic> <operands>"``
        strings, in disassembly order.

    Raises:
        SystemExit: After printing the error, if Capstone raises ``CsError``.
        ValueError: If a string argument is not valid hexadecimal.
    """
    code = X86_CODE32
    if isinstance(code, str):
        code = bytes.fromhex(code)

    one_sample_vec_seq = []
    opcode_oprand_seq = []
    try:
        md = Cs(CS_ARCH_X86, CS_MODE_32)
        # Detail mode is switched on because get_asm_msg reads per-instruction
        # fields such as prefix, modrm and the operand list.
        md.detail = True
        for insn in md.disasm(code, 0x0):
            address, prefix, opcode, modrm, sib, disp, imme, imme_cont = get_asm_msg(insn)
            one_sample_vec_seq.append(
                msg_to_vector(normal, address, prefix, opcode, modrm, sib, disp, imme, imme_cont)
            )
            opcode_oprand_seq.append(
                "0x%x:%s %s" % (insn.address, insn.mnemonic, insn.op_str)
            )
        # Return the list of vectors together with the textual listing.
        return one_sample_vec_seq, opcode_oprand_seq
    except CsError as e:
        print("ERROR: %s" % e)
        raise SystemExit(1)
# Randomly pick 50 instructions together with their corresponding vectors
def get_random_asm_seq(seq_num, one_sample_vec_seq, opcode_oprand_seq):
    """Randomly pick ``seq_num`` instructions together with their vectors.

    The two inputs are parallel lists (entry ``i`` of each describes the same
    instruction). The previous version sampled each list independently, so
    the returned vectors no longer matched the returned instruction strings;
    the pairs are now sampled together.

    Args:
        seq_num: Number of instructions to draw (without replacement).
        one_sample_vec_seq: Feature vectors, one per instruction.
        opcode_oprand_seq: Instruction strings, parallel to the vectors.

    Returns:
        tuple: ``(vectors, instructions)`` - two lists of length ``seq_num``
        in the same random order, with entry ``i`` of each still
        corresponding to the same instruction.

    Raises:
        ValueError: If ``seq_num`` exceeds the number of available pairs.
    """
    paired = list(zip(one_sample_vec_seq, opcode_oprand_seq))
    chosen = random.sample(paired, seq_num)
    picked_vectors = [vector for vector, _ in chosen]
    picked_instructions = [text for _, text in chosen]
    return picked_vectors, picked_instructions
# pass
def get_vec_seq():
    """Build the vector sequence for the first usable sample in the data folder.

    Lists the hard-coded sample directory, and for each entry asks
    ``get_asm_text_code`` for the code bytes; the first sample that passes is
    disassembled with ``get_asm_input_vector`` and returned.

    Changes relative to the previous version: the directory literal is a raw
    string (same value, no invalid escape sequences), and empty lists are
    returned instead of raising ``UnboundLocalError`` when no sample is usable.

    Returns:
        tuple: ``(vectors, listing)`` as produced by ``get_asm_input_vector``,
        or ``([], [])`` when no sample was accepted.
    """
    file_dir = r"E:\Data\Data_\malware"
    file_list = sorted(os.listdir(file_dir))

    one_sample_vec_seq, opcode_oprand_seq = [], []
    # Walk over every file in the folder.
    for item_file in file_list:
        # NOTE(review): the loop variable is overridden with one pinned sample
        # name, so every iteration inspects the same file - looks like a
        # debugging leftover; kept to preserve which sample is analysed.
        item_file = "e3d5f6b7189fc7fb5904943f24fb749ccd70c6d2b5b9b3892525a1188310f80a"
        bool_flag, X86_CODE32 = get_asm_text_code(os.path.join(file_dir, item_file))
        # Skip samples rejected by the validity check (the flag is False).
        if not bool_flag:
            continue
        one_sample_vec_seq, opcode_oprand_seq = get_asm_input_vector(X86_CODE32)
        break
    return one_sample_vec_seq, opcode_oprand_seq
if __name__ == '__main__':
    # The sample is a hex dump, so decode it to raw machine code before it is
    # handed to the disassembler.
    get_asm_input_vector(
        str_hex_to_bytes("6a09e8c5fdffff83c404ff742404e89efeffff83c4046a09e80bf3ffff83c404c3"),
        normal=True,
    )
# one_sample_vec_seq, opcode_oprand_seq=get_random_asm_seq()
# print(len(one_sample_vec_seq))
# print(len(opcode_oprand_seq))
# exit()
# sample_vec_seq=get_vec_seq()
# print("有效性错误程序总数:"+str(error_validity_sum))
# print(sample_vec_seq)
# print(len(sample_vec_seq[0]))