#!/usr/bin/env python # Capstone Python bindings, by Nguyen Anh Quynnh from __future__ import print_function from capstone import * from capstone.x86 import * from .xprint import to_hex, to_x, to_x_32 import lief import base64 import os import random import numpy as np def softmax(x): x_exp = np.exp(x) # 如果是列向量,则axis=0 x_sum = np.sum(x_exp, axis=0, keepdims=True) s = x_exp / x_sum return s legacy_prefix_all_msg={"lock":[0x0,0xF0],"segment":[0x2E,0x36,0x3E,0x26,0x64,0x65],"oprandsize":[0x0,0x66],"address":[0x0,0x67]} # X86_CODE64 = b"\x55\x48\x8b\x05\xb8\x13\x00\x00\xe9\xea\xbe\xad\xde\xff\x25\x23\x01\x00\x00\xe8\xdf\xbe\xad\xde\x74\xff" # X86_CODE16 = b"\x8d\x4c\x32\x08\x01\xd8\x81\xc6\x34\x12\x00\x00\x05\x23\x01\x00\x00\x36\x8b\x84\x91\x23\x01\x00\x00\x41\x8d\x84\x39\x89\x67\x00\x00\x8d\x87\x89\x67\x00\x00\xb4\xc6\x66\xe9\xb8\x00\x00\x00\x67\xff\xa0\x23\x01\x00\x00\x66\xe8\xcb\x00\x00\x00\x74\xfc" # X86_CODE32 = b"\x8d\x4c\x32\x08\x01\xd8\x81\xc6\x34\x12\x00\x00\x05\x23\x01\x00\x00\x36\x8b\x84\x91\x23\x01\x00\x00\x41\x8d\x84\x39\x89\x67\x00\x00\x8d\x87\x89\x67\x00\x00\xb4\xc6\xe9\xea\xbe\xad\xde\xff\xa0\x23\x01\x00\x00\xe8\xdf\xbe\xad\xde\x74\xff" #记录有效性错误的程序总数 error_validity_sum=0 #验证程序的有效性,暂时先只用32位未加密程序做测试 def check_validity(binary): global error_validity_sum #是否加壳标志 shell_flag=1 try: text = binary.get_section(".text") except Exception as e: if "No such section with this name" in str(e): text = binary.get_section("CODE") else: shell_flag=0 if shell_flag==0: print("程序可能加壳") error_validity_sum+=1 return False if "CHARA_32BIT_MACHINE" not in str(binary.header): print("不是一个32位程序") error_validity_sum+=1 return False return True def str_hex_to_bytes(str_hex): # print(str_hex) # exit() y = bytearray.fromhex(str_hex) z = list(y) # print(z) asm_hex_str = b'' # test_content = [0x26, 0x66, 0x67, 0xF0, 0x81, 0x84, 0xC8, 0x44, 0x33, 0x22, 0x11, 0x78, 0x56, 0x34, 0x12] #normal for i in z: # for i in test_content: right = str(hex(i))[2:] if right == "0": right = "00" if 
len(right) == 1: right = "0" + right item = base64.b16decode(right.upper()) asm_hex_str += item # print(asm_hex_str) return asm_hex_str def get_asm_text_code(file_name): binary = lief.parse(file_name) if check_validity(binary)==True: try: text = binary.get_section(".text") except Exception as e: text = binary.get_section("CODE") # print(text.content) # exit() asm_hex_str=b'' # test_content = [0x26, 0x66, 0x67, 0xF0, 0x81, 0x84, 0xC8, 0x44, 0x33, 0x22, 0x11, 0x78, 0x56, 0x34, 0x12] for i in text.content: # for i in test_content: right=str(hex(i))[2:] if right=="0": right="00" if len(right)==1: right="0"+right item =base64.b16decode(right.upper()) asm_hex_str+=item return True,asm_hex_str else: return False,"" # X86_CODE32 = get_asm_text_code("./TEST") # all_tests = ( # # (CS_ARCH_X86, CS_MODE_16, X86_CODE16, "X86 16bit (Intel syntax)", None), # # (CS_ARCH_X86, CS_MODE_32, X86_CODE32, "X86 32 (AT&T syntax)", CS_OPT_SYNTAX_ATT), # (CS_ARCH_X86, CS_MODE_32, X86_CODE32, "X86 32 (Intel syntax)", None), # # (CS_ARCH_X86, CS_MODE_64, X86_CODE64, "X86 64 (Intel syntax)", None), # ) def get_eflag_name(eflag): if eflag == X86_EFLAGS_UNDEFINED_OF: return "UNDEF_OF" elif eflag == X86_EFLAGS_UNDEFINED_SF: return "UNDEF_SF" elif eflag == X86_EFLAGS_UNDEFINED_ZF: return "UNDEF_ZF" elif eflag == X86_EFLAGS_MODIFY_AF: return "MOD_AF" elif eflag == X86_EFLAGS_UNDEFINED_PF: return "UNDEF_PF" elif eflag == X86_EFLAGS_MODIFY_CF: return "MOD_CF" elif eflag == X86_EFLAGS_MODIFY_SF: return "MOD_SF" elif eflag == X86_EFLAGS_MODIFY_ZF: return "MOD_ZF" elif eflag == X86_EFLAGS_UNDEFINED_AF: return "UNDEF_AF" elif eflag == X86_EFLAGS_MODIFY_PF: return "MOD_PF" elif eflag == X86_EFLAGS_UNDEFINED_CF: return "UNDEF_CF" elif eflag == X86_EFLAGS_MODIFY_OF: return "MOD_OF" elif eflag == X86_EFLAGS_RESET_OF: return "RESET_OF" elif eflag == X86_EFLAGS_RESET_CF: return "RESET_CF" elif eflag == X86_EFLAGS_RESET_DF: return "RESET_DF" elif eflag == X86_EFLAGS_RESET_IF: return "RESET_IF" elif eflag == 
X86_EFLAGS_TEST_OF: return "TEST_OF" elif eflag == X86_EFLAGS_TEST_SF: return "TEST_SF" elif eflag == X86_EFLAGS_TEST_ZF: return "TEST_ZF" elif eflag == X86_EFLAGS_TEST_PF: return "TEST_PF" elif eflag == X86_EFLAGS_TEST_CF: return "TEST_CF" elif eflag == X86_EFLAGS_RESET_SF: return "RESET_SF" elif eflag == X86_EFLAGS_RESET_AF: return "RESET_AF" elif eflag == X86_EFLAGS_RESET_TF: return "RESET_TF" elif eflag == X86_EFLAGS_RESET_NT: return "RESET_NT" elif eflag == X86_EFLAGS_PRIOR_OF: return "PRIOR_OF" elif eflag == X86_EFLAGS_PRIOR_SF: return "PRIOR_SF" elif eflag == X86_EFLAGS_PRIOR_ZF: return "PRIOR_ZF" elif eflag == X86_EFLAGS_PRIOR_AF: return "PRIOR_AF" elif eflag == X86_EFLAGS_PRIOR_PF: return "PRIOR_PF" elif eflag == X86_EFLAGS_PRIOR_CF: return "PRIOR_CF" elif eflag == X86_EFLAGS_PRIOR_TF: return "PRIOR_TF" elif eflag == X86_EFLAGS_PRIOR_IF: return "PRIOR_IF" elif eflag == X86_EFLAGS_PRIOR_DF: return "PRIOR_DF" elif eflag == X86_EFLAGS_TEST_NT: return "TEST_NT" elif eflag == X86_EFLAGS_TEST_DF: return "TEST_DF" elif eflag == X86_EFLAGS_RESET_PF: return "RESET_PF" elif eflag == X86_EFLAGS_PRIOR_NT: return "PRIOR_NT" elif eflag == X86_EFLAGS_MODIFY_TF: return "MOD_TF" elif eflag == X86_EFLAGS_MODIFY_IF: return "MOD_IF" elif eflag == X86_EFLAGS_MODIFY_DF: return "MOD_DF" elif eflag == X86_EFLAGS_MODIFY_NT: return "MOD_NT" elif eflag == X86_EFLAGS_MODIFY_RF: return "MOD_RF" elif eflag == X86_EFLAGS_SET_CF: return "SET_CF" elif eflag == X86_EFLAGS_SET_DF: return "SET_DF" elif eflag == X86_EFLAGS_SET_IF: return "SET_IF" else: return None def vector_to_asm(res_vector): test_vector=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] test_vector=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] addr_vec=test_vector[:64] opcode_vec=test_vector[64:(64+256)] option_vec=test_vector[(64+256):(64+256+5)] prefix_vec=test_vector[(64+256+5):(64+256+5+9)] modrm_vec=test_vector[(64+256+5+9):(64+256+5+9+20)] sib_vec=test_vector[(64+256+5+9+20):(64+256+5+9+20+20)] disp_flag=test_vector[(64+256+5+9+20+20):(64+256+5+9+20+20+1)] disp_vec=test_vector[(64+256+5+9+20+20+1):(64+256+5+9+20+20+64)] imme_flag=test_vector[(64+256+5+9+20+20+1+64):(64+256+5+9+20+20+1+64+1)] imme_vec=test_vector[(64+256+5+9+20+20+1+64+1):] # print("\taddr_vec : "+ str(addr_vec)) # print("\topcode_vec : " + str(opcode_vec)) # print("\toption_vec : " + str(option_vec)) # print("\tprefix_vec : " + str(prefix_vec)) # print("\tmodrm_vec : " + str(modrm_vec)) # print("\tsib_vec : " + str(sib_vec)) # print("\tdisp_flag : " + str(disp_flag)) # print("\tdisp_vec : " + str(disp_vec)) # print("\timme_flag : " + str(imme_flag)) # print("\timme_vec : " + str(imme_vec)) addr="0b" opcode=0 # option="0b" prefix=[0,0,0,0] modrm="0b" sib="0b" disp_flag_str="0b" disp="0b" imme_flag_str="0b" imme="0b" for i in addr_vec: addr+=str(i) for i in range(len(opcode_vec)): if opcode_vec[i]==1: opcode=i if prefix_vec[0]==1: prefix[0]=0xF0 prefix_vec_2=prefix_vec[1:7] for i in range(len(prefix_vec_2)): if prefix_vec_2[i]==1: prefix[1]=legacy_prefix_all_msg[i] if prefix_vec[7]==1: prefix[2]=0x66 
if prefix_vec[8]==1: prefix[3]=0x67 modrm1=0 modrm2=0 modrm3=0 modrm_vec1=modrm_vec[:4] for i in range(len(modrm_vec1)): if modrm_vec1[i]==1: modrm1=i modrm_vec2 = modrm_vec[4:12] for i in range(len(modrm_vec2)): if modrm_vec2[i] == 1: modrm2 = i modrm_vec3 = modrm_vec[12:] for i in range(len(modrm_vec3)): if modrm_vec3[i] == 1: modrm3 = i modrm1,modrm2,modrm3=bin(modrm1),bin(modrm2),bin(modrm3) modrm1, modrm2, modrm3 = modrm1[2:],modrm2[2:],modrm3[2:] modrm ="0b"+modrm1+modrm2+modrm3 sib1 = 0 sib2 = 0 sib3 = 0 sib_vec1 = sib_vec[:4] for i in range(len(sib_vec1)): if sib_vec1[i] == 1: sib1 = i sib_vec2 = sib_vec[4:12] for i in range(len(sib_vec2)): if sib_vec2[i] == 1: sib2 = i sib_vec3 = sib_vec[12:] for i in range(len(sib_vec3)): if sib_vec3[i] == 1: sib3 = i sib1, sib2, sib3 = bin(sib1), bin(sib2), bin(sib3) sib1, sib2, sib3 = sib1[2:], sib2[2:], sib3[2:] sib = "0b"+sib1 + sib2 + sib3 # print() # print("\taddr : " + str(hex(int(addr,2)))) # print("\topcode : " + str(opcode)) # print("\tmodrm : " + str(modrm)) # print("\tsib : " + str(sib)) pass def msg_to_vector(normal,address,prefix,opcode,modrm,sib,disp,imme,imme_cont): addr_vec=[0]*64 opcode_vec=[0]*256 option_vec=[0]*8 #prefix[0,0,0,0]\modrm\sib\disp\imme prefix_vec=[0]*9 #1/6/1/1 moderm_vec=[0]*20 sib_vec=[0]*20 disp_flag=[0] disp_vec=[0]*64 imme_flag=[0] imme_vec=[0]*64 address, opcode, modrm, sib, disp, imme=int(address),int(opcode),int(modrm),int(sib),int(disp),int(imme) address, modrm, sib, disp, imme= bin(address), bin(modrm), bin(sib), bin(disp), bin(imme) if disp[0]=="-": disp_flag=[1] disp=disp[1:] if imme[0]=="-": imme_flag=[1] imme=disp[1:] address, modrm, sib, disp, imme= address[2:], modrm[2:], sib[2:], disp[2:], imme[2:] #填充address至64位 if len(address)<64: address="0"*(64-len(address))+address for i in range(len(address)): addr_vec[i]=int(address[i]) # print("addr_vec:"+str(addr_vec)) #one-hot表示 opcode_vec[opcode]=1 if prefix[0]!=0 : option_vec[0]=1 if prefix[1]!=0 : option_vec[1] = 1 if 
prefix[2]!=0 : option_vec[2] = 1 if prefix[3]!=0: option_vec[3] = 1 if modrm!="0": option_vec[4]=1 if sib!="0": option_vec[5]=1 if disp!="0": option_vec[6]=1 if imme_cont!=0: option_vec[7]=1 if len(disp)<64: disp="0"*(64-len(disp))+disp if len(imme)<64: imme="0"*(64-len(imme))+imme for i in range(len(disp)): disp_vec[i] = int(disp[i]) for i in range(len(imme)): imme_vec[i] = int(imme[i]) # print("0x%x:\t%s\t%s" % (insn.address, insn.mnemonic, insn.op_str)) # print("\t机器码 : "+ str(asm_code)) for i in range(len(prefix)): prefix[i]=int(prefix[i]) for j in range(len(legacy_prefix_all_msg["lock"])): if prefix[0]==legacy_prefix_all_msg["lock"][j]: prefix_vec[0]=j break for j in range(len(legacy_prefix_all_msg["segment"])): if prefix[1] == legacy_prefix_all_msg["segment"][j]: prefix_vec[j+1] = 1 break for j in range(len(legacy_prefix_all_msg["oprandsize"])): if prefix[2] == legacy_prefix_all_msg["oprandsize"][j]: prefix_vec[7] = j break for j in range(len(legacy_prefix_all_msg["address"])): if prefix[3] == legacy_prefix_all_msg["address"][j]: prefix_vec[8] = j break # print("prefix_vec:" + str(prefix_vec)) if len(modrm)<8: modrm="0"*(8-len(modrm))+modrm # print("modrm:"+modrm) modrm_split1="0b"+modrm[:2] modrm_split2="0b"+modrm[2:5] modrm_split3="0b"+modrm[5:] modrm_int1 = int(modrm_split1,2) modrm_int2 = int(modrm_split2, 2) modrm_int3 = int(modrm_split3, 2) moderm_vec[modrm_int1] = 1 moderm_vec[modrm_int2 + 4] = 1 moderm_vec[modrm_int3 + 12] = 1 if len(sib)<8: sib="0"*(8-len(sib))+sib # print("sib:"+sib) sib_split1="0b"+sib[:2] sib_split2="0b"+sib[2:5] sib_split3="0b"+sib[5:] sib_int1 = int(sib_split1,2) sib_int2 = int(sib_split2, 2) sib_int3 = int(sib_split3, 2) sib_vec[sib_int1] = 1 sib_vec[sib_int2 + 4] = 1 sib_vec[sib_int3 + 12] = 1 # result_vec = opcode_vec + option_vec + prefix_vec + moderm_vec + sib_vec + disp_flag + disp_vec + imme_flag + imme_vec result_vec=addr_vec+opcode_vec+option_vec+prefix_vec+moderm_vec+sib_vec+disp_flag+disp_vec+imme_flag+imme_vec # 
print("addr_vec:"+str(addr_vec)) # print("opcode_vec:" + str(opcode_vec)) # print("option_vec:"+str(option_vec)) # print("disp_vec:"+str(disp_vec)) # print("imme_vec:"+str(imme_vec)) # print("prefix_vec:" + str(prefix_vec)) # print("moderm_vec:" + str(moderm_vec)) # print("sib_vec:" + str(sib_vec)) # print("result_vec:\n"+str(result_vec)) #对数组做归一化处理 if normal==True: not_zero_sum=0 for i in result_vec: if i != 0: not_zero_sum+=i for i in range(len(result_vec)): if result_vec[i] != 0: result_vec[i]=result_vec[i]/not_zero_sum return result_vec # opcode_list = [] def get_asm_msg(insn): # print(dir(insn)) # exit() text1="" for i in range(insn.size): text1 += '%02X ' % insn.bytes[i] address=insn.address prefix=insn.prefix opcode=insn.opcode modrm=insn.modrm disp=insn.disp sib=insn.sib imme_cont=insn.op_count(X86_OP_IMM) if imme_cont!=0: op = insn.op_find(X86_OP_IMM, 1) imme="0x"+to_x(op.imm) imme=int(imme,16) else: imme=0 # if prefix!=[0,0,0,0]: # print("\t%s" % (insn.mnemonic)) # print("\t%s\t%s" % (insn.mnemonic, insn.op_str)) # print(dir(insn.op_count)) # print("\t机器码 : "+ str(asm_code)) # print("\t地址addr : "+ str(address)) # print("\t前缀prefix :" + str(prefix)) # print("\t操作码opcode : "+str(opcode)) # print("\tmodrm : "+str(modrm)) # print("\tsib : "+str(sib)) # print(type(insn.disp)) # print("\tdisp : "+hex(insn.disp)) # print("\timme : "+hex(imme)) # exit() # 打印操作数的REX前缀(非零值与x86_64指令相关) # print("\trex: 0x%x" % (insn.rex)) return str(address),prefix,str(opcode[0]),str(modrm),str(sib),str(disp),str(imme),imme_cont def get_asm_msg2(insn): # print(dir(insn)) # exit() text1="" for i in range(insn.size): text1 += '%02X ' % insn.bytes[i] address=insn.address prefix=insn.prefix opcode=insn.opcode modrm=insn.modrm disp=insn.disp sib=insn.sib imme_cont=insn.op_count(X86_OP_IMM) if imme_cont!=0: op = insn.op_find(X86_OP_IMM, 1) imme="0x"+to_x(op.imm) imme=int(imme,16) else: imme=0 # if prefix!=[0,0,0,0]: # print("\t%s" % (insn.mnemonic)) # print(dir(insn.op_count)) # 
print("\t机器码 : "+ str(asm_code)) # print("\t地址addr : "+ str(address)) # print("\t前缀prefix :" + str(prefix)) # print("\t操作码opcode : "+str(opcode)) # print("\tmodrm : "+str(modrm)) # print("\tsib : "+str(sib)) # print(type(insn.disp)) # print("\tdisp : "+hex(insn.disp)) # print("\timme : "+hex(imme)) op_str=insn.op_str.replace("0x","ox") if insn.disp<10: disp=str(insn.disp) else: disp=hex(insn.disp) if imme<10: imme2=str(imme) else: imme2=hex(imme).replace("0x","ox") r_list=["r1","r2","r3","r4","r5","r6""r7","r8","r9","r10","r11","r12""r13","r14","r15"] print("\t%s\t%s" % (insn.mnemonic, insn.op_str)) #替换为CONST if disp in op_str: flag=0 for i in r_list: if i in op_str: flag=1 break if flag==1: op_str=op_str.replace(" "+disp," CONST") else: op_str = op_str.replace(disp, "CONST") if imme2 in op_str: flag = 0 for i in r_list: if i in op_str: flag = 1 break if flag == 1: op_str = op_str.replace(" " + imme2, " CONST") else: op_str = op_str.replace(imme2, "CONST") print("\t%s\t%s" % (insn.mnemonic, op_str)) # exit() # exit() # 打印操作数的REX前缀(非零值与x86_64指令相关) # print("\trex: 0x%x" % (insn.rex)) return str(address),prefix,str(opcode[0]),str(modrm),str(sib),str(disp),str(imme),imme_cont def get_asm_input_vector(X86_CODE32,normal=False): arch, mode, code, comment, syntax=CS_ARCH_X86, CS_MODE_32, X86_CODE32, "X86 32 (Intel syntax)", None one_sample_vec_seq=[] opcode_oprand_seq=[] try: md = Cs(arch, mode) md.detail = True if syntax is not None: md.syntax = syntax for insn in md.disasm(code, 0x0): address, prefix, opcode, modrm, sib, disp, imme,imme_cont=get_asm_msg( insn) result_vec=msg_to_vector(normal,address, prefix, opcode, modrm, sib, disp, imme,imme_cont) one_sample_vec_seq.append(result_vec) opcode_oprand_seq.append("0x%x:%s %s" % (insn.address, insn.mnemonic, insn.op_str)) # print("one_sample_vec_seq:" + str(one_sample_vec_seq)) # print(len(one_sample_vec_seq)) #返回vector列表, return one_sample_vec_seq,opcode_oprand_seq except CsError as e: print("ERROR: %s" % e) exit() 
# Randomly pick instructions together with their corresponding vectors.
def get_random_asm_seq(seq_num, one_sample_vec_seq, opcode_oprand_seq):
    """Return *seq_num* randomly chosen (vector, instruction text) pairs.

    The same random positions are taken from both sequences, so the i-th
    returned vector still belongs to the i-th returned instruction text.

    Args:
        seq_num: number of instructions to pick.
        one_sample_vec_seq: per-instruction vectors.
        opcode_oprand_seq: per-instruction texts, parallel to the vectors.

    Returns:
        ``(vectors, texts)`` as two lists of length *seq_num*.

    Raises:
        ValueError: if *seq_num* exceeds the length of either sequence.
    """
    population = range(min(len(one_sample_vec_seq), len(opcode_oprand_seq)))
    picked = random.sample(population, seq_num)
    vectors = [one_sample_vec_seq[index] for index in picked]
    texts = [opcode_oprand_seq[index] for index in picked]
    return vectors, texts


def get_vec_seq(file_dir=r"E:\Data\Data_\malware",
                file_name="e3d5f6b7189fc7fb5904943f24fb749ccd70c6d2b5b9b3892525a1188310f80a"):
    """Vectorise the first usable sample found in *file_dir*.

    Args:
        file_dir: directory holding the samples.
        file_name: sample to process. Pass ``None`` to walk the directory in
            sorted order and use the first file that passes validation.

    Returns:
        ``(vectors, texts)`` as returned by get_asm_input_vector(), or two
        empty lists when no file could be used.
    """
    candidates = sorted(os.listdir(file_dir))
    if file_name is not None:
        # A pinned sample replaces the directory walk; an empty directory
        # still yields no candidates.
        candidates = [file_name] if candidates else []

    for item_file in candidates:
        bool_flag, X86_CODE32 = get_asm_text_code(os.path.join(file_dir, item_file))
        # Skip programs that are not 32-bit or that appear to be packed.
        if not bool_flag:
            continue
        return get_asm_input_vector(X86_CODE32)
    return [], []


if __name__ == '__main__':
    # The sample is written as hex text; the disassembler needs raw bytes.
    get_asm_input_vector(
        str_hex_to_bytes("6a09e8c5fdffff83c404ff742404e89efeffff83c4046a09e80bf3ffff83c404c3"),
        normal=True,
    )