MalGraph/samples/funCount.py

39 lines
1.7 KiB
Python
Raw Permalink Normal View History

2023-12-28 17:01:36 +08:00
import json
2024-01-10 10:32:38 +08:00
import os
from itertools import islice
import heapq
2023-12-28 17:01:36 +08:00
from tqdm import tqdm
def is_external_function_name(fun_name):
    """Return True when *fun_name* should be counted in the vocabulary.

    The entry-point names ``start`` / ``start_0`` and any name containing
    ``sub_`` are excluded.
    """
    return fun_name not in ('start', 'start_0') and 'sub_' not in fun_name


def count_function_names(fun_names, fun_name_dict):
    """Add one occurrence to *fun_name_dict* for every countable name.

    Args:
        fun_names: iterable of function-name strings from one sample.
        fun_name_dict: mapping ``name -> count``; updated in place.

    Returns:
        The same *fun_name_dict* object, for convenience.
    """
    for fun_name in fun_names:
        if is_external_function_name(fun_name):
            fun_name_dict[fun_name] = fun_name_dict.get(fun_name, 0) + 1
    return fun_name_dict


def read_function_names(jsonl_path):
    """Return the ``function_names`` list stored on the first line of a file.

    Only the first line is parsed; any further lines are ignored.

    Raises:
        json.JSONDecodeError: if the first line is empty or not valid JSON.
        KeyError: if the record has no ``function_names`` key.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
        record = json.loads(jsonl_file.readline())
    return record['function_names']


def most_common_function_names(fun_name_dict, limit=10000):
    """Return up to *limit* ``(name, count)`` pairs, highest count first."""
    return heapq.nlargest(limit, fun_name_dict.items(), key=lambda item: item[1])


if __name__ == '__main__':
    mal_file_name = '/home/king/python/data/jsonl/infected_jsonl/'
    ben_file_name = '/home/king/python/data/jsonl/refind_jsonl/'
    vocab_file_name = '/home/king/python/data/processed_dataset/train_external_function_name_vocab.jsonl'

    # Both directories feed one shared counter.
    fun_name_dict = {}
    for jsonl_dir in (mal_file_name, ben_file_name):
        for file in tqdm(os.listdir(jsonl_dir)):
            count_function_names(
                read_function_names(os.path.join(jsonl_dir, file)),
                fun_name_dict,
            )

    # Rank before opening the output so a failure above does not truncate an
    # existing vocabulary file.
    largest_10000_items = most_common_function_names(fun_name_dict, 10000)
    with open(vocab_file_name, 'w') as file:
        for key, value in largest_10000_items:
            temp = {"f_name": key, "count": value}
            file.write(json.dumps(temp) + '\n')
2024-01-10 10:32:38 +08:00