"""Count function names found by radare2 in sample binaries and rank them.

``fun_name_count`` analyses every file in the sample directories and writes a
``fun_name,count`` CSV; ``fun_name_sort`` filters that CSV and keeps the most
frequent names.
"""
import concurrent.futures
import os
import re
from collections import Counter

import pandas as pd

# Directories whose files are analysed by default.
DEFAULT_SAMPLE_DIRS = (
    '/mnt/d/bishe/dataset/sample_20230130_458',
    '/mnt/d/bishe/dataset/train_benign',
)
# CSV written by fun_name_count and read back by fun_name_sort.
DEFAULT_COUNT_CSV = './out/fun_name.csv'
# Substrings marking the "local" function names that fun_name_sort discards.
LOCAL_NAME_MARKERS = ('fcn.', 'loc.', 'main', 'entr')


def get_fun_name_list(file_path):
    """Return the names of the functions radare2 reports for *file_path*.

    Analysis is best-effort: any failure is printed and whatever names were
    collected so far (possibly none) are returned, so a single bad sample
    cannot abort a batch run.

    Args:
        file_path: Path of the binary to analyse.

    Returns:
        A list of function-name strings.
    """
    # Imported lazily so the CSV helpers stay usable without radare2 installed.
    import r2pipe

    fun_name_list = []
    r2 = None
    try:
        r2 = r2pipe.open(file_path, flags=['-2'])
        r2.cmd('aaa')
        r2.cmd('e arch=x86')
        # cmdj gives None when the command output is not valid JSON.
        function_list = r2.cmdj("aflj") or []
        fun_name_list.extend(function['name'] for function in function_list)
    except Exception as err:
        print(f'error at {file_path} , {err}')
    finally:
        # r2 stays None when open() itself failed; only quit a live session.
        if r2 is not None:
            r2.quit()
    return fun_name_list


def fun_name_count(sample_dirs=DEFAULT_SAMPLE_DIRS, out_csv=DEFAULT_COUNT_CSV,
                   max_workers=12):
    """Count function-name occurrences over all samples and write them to CSV.

    Args:
        sample_dirs: Directories whose entries are analysed.
        out_csv: Destination CSV with columns ``fun_name`` and ``count``.
            The file is overwritten so re-running never duplicates rows.
        max_workers: Number of analysis threads.
    """
    from tqdm import tqdm

    file_list = [
        os.path.join(sample_dir, file_name)
        for sample_dir in sample_dirs
        for file_name in os.listdir(sample_dir)
    ]

    fun_name_counter = Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_fun_name_list, path) for path in file_list]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            fun_name_counter.update(future.result())

    out_dir = os.path.dirname(out_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame(
        list(fun_name_counter.items()), columns=['fun_name', 'count']
    ).to_csv(out_csv, index=False)


def fun_name_sort(count_csv=DEFAULT_COUNT_CSV, out_csv='fun_name_sort.csv',
                  top_n=10000):
    """Filter the count CSV and keep the *top_n* most frequent function names.

    Names containing any of ``LOCAL_NAME_MARKERS`` are removed.

    Args:
        count_csv: CSV with columns ``fun_name`` and ``count``.
        out_csv: Destination of the filtered, sorted CSV.
        top_n: Maximum number of rows to keep.
    """
    # keep_default_na=False stops names such as "nan" or "null" becoming NaN.
    fun_name_df = pd.read_csv(count_csv, dtype={'fun_name': str}, keep_default_na=False)

    # Files produced by earlier append-mode runs contain repeated header rows
    # and duplicate names: drop the non-numeric rows and merge the duplicates.
    fun_name_df['count'] = pd.to_numeric(fun_name_df['count'], errors='coerce')
    fun_name_df = fun_name_df.dropna(subset=['count'])
    fun_name_df = fun_name_df.groupby('fun_name', as_index=False, sort=False)['count'].sum()
    fun_name_df['count'] = fun_name_df['count'].astype(int)

    # Remove the local functions from the fun_name column.
    marker_pattern = '|'.join(re.escape(marker) for marker in LOCAL_NAME_MARKERS)
    is_local = fun_name_df['fun_name'].str.contains(marker_pattern, regex=True)
    fun_name_df = fun_name_df[~is_local]

    fun_name_df = fun_name_df.sort_values(by='count', ascending=False, kind='stable').head(top_n)
    fun_name_df.to_csv(out_csv, index=False)


if __name__ == '__main__':
    fun_name_count()
    fun_name_sort()