"""Build a function-name frequency vocabulary from per-sample JSONL files.

Every file in two input directories is opened, the JSON object on its first
line is parsed, and the entries of its ``function_names`` list are counted.
The most frequent names are written, one JSON object per line, to a
vocabulary file.
"""
import heapq
import json
import os
from collections import Counter
from itertools import islice  # noqa: F401  (kept: other parts of the file may use it)

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to plain iteration
    # instead of refusing to run when tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable


MAL_DIR = '/home/king/python/data/jsonl/infected_jsonl/'
BEN_DIR = '/home/king/python/data/jsonl/refind_jsonl/'
VOCAB_PATH = '/home/king/python/data/processed_dataset/train_external_function_name_vocab.jsonl'
VOCAB_SIZE = 10000

# Names equal to one of these, or containing the marker below, are not counted.
# NOTE(review): presumably disassembler-generated placeholder names -- confirm.
_SKIPPED_NAMES = ('start', 'start_0')
_SKIPPED_MARKER = 'sub_'


def _is_counted(fun_name) -> bool:
    """Return True when *fun_name* should contribute to the vocabulary."""
    return fun_name not in _SKIPPED_NAMES and _SKIPPED_MARKER not in fun_name


def _count_directory(directory: str, counts: Counter) -> None:
    """Add the function names of every file in *directory* to *counts*.

    Only the first line of each file is read; it must be a JSON object with
    a ``function_names`` list.  A missing key or malformed JSON raises
    (``KeyError`` / ``json.JSONDecodeError``), exactly as before.
    """
    # Sorted so that insertion order -- and therefore the order of equally
    # frequent names in the output -- does not depend on the filesystem.
    for file_name in tqdm(sorted(os.listdir(directory))):
        path = os.path.join(directory, file_name)
        with open(path, 'r', encoding='utf-8') as handle:
            record = json.loads(handle.readline())
        counts.update(
            name for name in record['function_names'] if _is_counted(name)
        )


def main(
    mal_dir: str = MAL_DIR,
    ben_dir: str = BEN_DIR,
    vocab_path: str = VOCAB_PATH,
    vocab_size: int = VOCAB_SIZE,
) -> None:
    """Count function names across both directories and write the vocabulary.

    Args:
        mal_dir: First directory of JSONL files to scan.
        ben_dir: Second directory of JSONL files to scan.
        vocab_path: Output file; receives one ``{"f_name", "count"}`` JSON
            object per line, most frequent name first.
        vocab_size: Maximum number of names written.
    """
    fun_name_counts = Counter()
    for directory in (mal_dir, ben_dir):
        _count_directory(directory, fun_name_counts)

    # nlargest keeps first-seen order among names with equal counts.
    most_frequent = heapq.nlargest(
        vocab_size, fun_name_counts.items(), key=lambda entry: entry[1]
    )
    with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
        for f_name, count in most_frequent:
            vocab_file.write(json.dumps({"f_name": f_name, "count": count}) + '\n')


if __name__ == '__main__':
    main()