detect_rep/ASM2VEC_base_scripts/bin2asm_hex.py
2023-04-05 10:04:49 +08:00

142 lines
3.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import re
import os
import click
import r2pipe
import hashlib
from pathlib import Path
import csv
def sha3(data):
    """Return the SHA3-256 digest of a text string as hexadecimal.

    Args:
        data: Text to hash. It is converted to bytes with ``str.encode()``
            (default encoding) before hashing.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    hasher = hashlib.sha3_256()
    hasher.update(data.encode())
    return hasher.hexdigest()
def validEXE(filename):
    """Report whether a file begins with the accepted executable magic.

    Args:
        filename: Path of the file to inspect; it is opened in binary mode.

    Returns:
        True when the first four bytes of the file are ``4d 5a 90 00``,
        False otherwise (including files shorter than four bytes).
    """
    # NOTE(review): presumably this targets Windows "MZ" images; the check
    # also requires the two bytes after "MZ" to be 0x90 0x00 -- confirm that
    # this stricter match is intended.
    accepted_magics = (bytes.fromhex('4d5a9000'),)
    with open(filename, 'rb') as handle:
        leading = handle.read(4)
    return leading in accepted_magics
def normalize(opcode):
    """Mask literal operands in an instruction string with ``CONST``.

    The rewrites are applied in this fixed order:

    1. every ``' - '`` becomes ``' + '``;
    2. each lowercase hexadecimal literal (``0x...``) becomes ``CONST``;
    3. a single digit directly after ``*`` becomes ``CONST``;
    4. a single digit directly after a space becomes ``CONST`` (any
       following digits are left in place).

    Args:
        opcode: Instruction text to rewrite.

    Returns:
        The rewritten string.
    """
    rewritten = opcode.replace(' - ', ' + ')
    # Order matters: hex literals must be consumed before the single-digit
    # rules, otherwise the leading "0" of "0x..." would be replaced first.
    substitutions = (
        (r'0x[0-9a-f]+', 'CONST'),
        (r'\*[0-9]', '*CONST'),
        (r' [0-9]', ' CONST'),
    )
    for pattern, replacement in substitutions:
        rewritten = re.sub(pattern, replacement, rewritten)
    return rewritten
def fn2asm(pdf, minlen):
    """Concatenate the normalized instruction bytes of one function.

    Args:
        pdf: Parsed JSON result of radare2's ``pdfj`` command for a function,
            i.e. a dict holding an ``ops`` list whose entries carry ``type``
            and ``bytes`` fields. May be None.
        minlen: Minimum number of instructions the function must contain.

    Returns:
        A single string made of every instruction's ``bytes`` field (each
        passed through ``normalize``), in order. None when ``pdf`` is None,
        has no ``ops`` entry, has fewer than ``minlen`` instructions, or
        contains an instruction whose type is ``'invalid'``.
    """
    if pdf is None:
        return None
    ops = pdf.get('ops')
    if ops is None or len(ops) < minlen:
        return None
    # A function containing undecodable instructions is rejected as a whole.
    if any(op.get('type') == 'invalid' for op in ops):
        return None
    # Only the raw bytes are emitted, so no per-offset bookkeeping is needed;
    # join avoids quadratic string growth on long functions.
    return ''.join(normalize(op["bytes"]) for op in ops)
def bin2asm(filename, opath, minlen):
    """Append the byte strings of a binary's functions to the shared CSV.

    The binary is analysed with radare2 (``aaaa``), every function listed by
    ``aflj`` is disassembled with ``pdfj`` and converted by ``fn2asm``. Each
    non-empty byte string that is not already present in
    ``./asm_func/asm_hex/func_bytes.csv`` is appended to that file as a
    ``[function name, bytes]`` row.

    Args:
        filename: Path of the binary to analyse.
        opath: Output directory. Currently unused: the CSV location is fixed.
        minlen: Minimum instruction count, forwarded to ``fn2asm``.

    Returns:
        The number of rows appended to the CSV for this binary.
    """
    csv_path = './asm_func/asm_hex/func_bytes.csv'

    # Collect the byte strings that are already stored. A set gives O(1)
    # duplicate checks; a missing file simply means nothing is stored yet.
    seen = set()
    try:
        with open(csv_path, encoding="utf8", newline='') as f:
            for row in csv.reader(f):
                # Skip blank/short rows and an optional "bytes" header cell.
                if len(row) > 1 and row[1] != "bytes":
                    seen.add(row[1])
    except FileNotFoundError:
        pass

    new_rows = []
    r = r2pipe.open(str(filename))
    try:
        r.cmd('aaaa')
        # aflj can yield nothing when no function was found.
        for fn in r.cmdj('aflj') or []:
            r.cmd(f's {fn["offset"]}')
            asm = fn2asm(r.cmdj('pdfj'), minlen)
            # Record the function only if it is non-empty and not a duplicate,
            # including duplicates found earlier in this same binary.
            if asm and asm not in seen:
                seen.add(asm)
                new_rows.append([fn["name"], asm])
    finally:
        # Always shut down the radare2 session so processes do not pile up
        # when many binaries are scanned.
        r.quit()

    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerows(new_rows)
    print(f'[+] {filename}')
    return len(new_rows)
# Command-line entry point. The docstring below is shown by click as the
# command's help text, so it is kept short.
#   ipath:  a single binary, or a directory whose regular (non-symlink,
#           non-directory) entries are each processed by bin2asm.
#   opath:  output directory, created if it does not exist.
#   minlen: minimum instruction count, forwarded to bin2asm.
@click.command()
@click.option('-i', '--input', 'ipath', help='input directory / file', required=True)
@click.option('-o', '--output', 'opath', default='asm', help='output directory')
@click.option('-l', '--len', 'minlen', default=10,
              help='ignore assembly code with instructions amount smaller than minlen')
def cli(ipath, opath, minlen):
    '''
    Extract assembly functions from binary executable
    '''
    ipath = Path(ipath)
    opath = Path(opath)
    # Create the output directory together with any missing parents;
    # exist_ok covers the directory appearing between the check and the call.
    if not os.path.exists(opath):
        os.makedirs(opath, exist_ok=True)
    fcount, bcount = 0, 0
    if os.path.isdir(ipath):
        # Directory: scan every direct entry that is a plain file.
        for f in os.listdir(ipath):
            candidate = ipath / f
            if os.path.islink(candidate) or os.path.isdir(candidate):
                continue
            fcount += bin2asm(candidate, opath, minlen)
            bcount += 1
    elif os.path.exists(ipath):
        # Single file.
        fcount += bin2asm(ipath, opath, minlen)
        bcount += 1
    else:
        print(f'[Error] No such file or directory: {ipath}')
    print(f'[+] Total scan binary: {bcount} => Total generated assembly functions: {fcount}')
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()