# detect_rep/ASM2VEC_base_scripts/vocab_generator.py
#
# 45 lines
# 1.3 KiB
# Python
# Raw Permalink Normal View History
#
# 2023-04-05 10:04:49 +08:00
import os
import json
def count_tokens(lines, vocab=None):
    """Tally opcode and operand tokens found in assembly text lines.

    Each line is split on its first run of whitespace into an opcode and an
    optional remainder; the remainder is split on "," into operands. Any
    token containing the substring 'LABEL' is ignored.

    Args:
        lines: Iterable of text lines (a trailing newline is allowed).
        vocab: Optional dict mapping token -> count to update in place; a
            new dict is created when omitted.

    Returns:
        The dict mapping token -> occurrence count, in first-seen order.
    """
    if vocab is None:
        vocab = {}
    for line in lines:
        fields = line.strip("\n").split(maxsplit=1)
        if not fields:
            # Blank or whitespace-only line: there is no opcode to index.
            continue
        opcode = fields[0]
        if 'LABEL' not in opcode:
            vocab[opcode] = vocab.get(opcode, 0) + 1
        if len(fields) == 1:
            continue
        # NOTE(review): the file was received with its indentation stripped;
        # operands are counted here for every line that has them, whether or
        # not the opcode itself contains 'LABEL' - confirm against the
        # original layout.
        for operand in fields[1].split(","):
            if operand.startswith(" "):
                operand = operand.lstrip()
            if not operand:
                # Empty operand (e.g. a trailing or doubled comma) is not a
                # vocabulary token.
                continue
            if 'LABEL' not in operand:
                vocab[operand] = vocab.get(operand, 0) + 1
    return vocab


def build_vocab(asm_dir="./asm_base", skip_lines=3):
    """Count tokens across every file listed in *asm_dir*.

    Args:
        asm_dir: Directory whose entries are all opened as text files.
        skip_lines: Number of leading lines dropped from each file before
            counting (presumably a per-file header - confirm).

    Returns:
        A dict mapping token -> total occurrence count over all files.
    """
    vocab = {}
    for file_name in os.listdir(asm_dir):
        # The context manager closes each handle as soon as it has been read.
        with open(os.path.join(asm_dir, file_name), mode='r') as handle:
            count_tokens(handle.readlines()[skip_lines:], vocab)
    return vocab


def rank_tokens(vocab):
    """Assign 1-based ids to tokens in descending order of frequency.

    The sort is stable, so tokens with equal counts keep the relative order
    in which they appear in *vocab*.

    Args:
        vocab: Dict mapping token -> occurrence count.

    Returns:
        A dict mapping token -> rank, where the most frequent token gets 1.
    """
    by_frequency = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
    return {
        token: rank
        for rank, (token, _count) in enumerate(by_frequency, start=1)
    }


if __name__ == '__main__':
    vocab = build_vocab("./asm_base")
    print(len(vocab))
    print(vocab)
    res_json = rank_tokens(vocab)
    print(res_json)