diff --git a/requirement_conda.txt b/requirement_conda.txt index f9d3094..ec5bd75 100644 --- a/requirement_conda.txt +++ b/requirement_conda.txt @@ -1,83 +1,79 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-64 -_libgcc_mutex=0.1=main -antlr4-python3-runtime=4.8=pypi_0 -ase=3.21.1=pypi_0 -ca-certificates=2021.1.19=h06a4308_1 -cached-property=1.5.2=pypi_0 -certifi=2020.12.5=py37h06a4308_0 -cffi=1.14.5=pypi_0 -chardet=4.0.0=pypi_0 -cmake=3.18.4.post1=pypi_0 -cycler=0.10.0=pypi_0 -dataclasses=0.6=pypi_0 -decorator=4.4.2=pypi_0 -future=0.18.2=pypi_0 -googledrivedownloader=0.4=pypi_0 -h5py=3.2.1=pypi_0 -hydra-core=1.0.6=pypi_0 -idna=2.10=pypi_0 -importlib-resources=5.1.2=pypi_0 -intel-openmp=2021.1.2=pypi_0 -isodate=0.6.0=pypi_0 -jinja2=2.11.3=pypi_0 -joblib=1.0.1=pypi_0 -kiwisolver=1.3.1=pypi_0 -ld_impl_linux-64=2.33.1=h53a641e_7 -libedit=3.1.20191231=h14c3975_1 -libffi=3.3=he6710b0_2 -libgcc-ng=9.1.0=hdf63c60_0 -libstdcxx-ng=9.1.0=hdf63c60_0 -llvmlite=0.35.0=pypi_0 -magma-cuda112=2.5.2=1 -markupsafe=1.1.1=pypi_0 -matplotlib=3.3.4=pypi_0 -mkl=2021.1.1=pypi_0 -mkl-include=2021.1.1=pypi_0 -ncurses=6.2=he6710b0_1 -networkx=2.5=pypi_0 -ninja=1.10.0.post2=pypi_0 -numba=0.52.0=pypi_0 -numpy=1.20.1=pypi_0 -omegaconf=2.0.6=pypi_0 -openssl=1.1.1j=h27cfd23_0 -pandas=1.2.3=pypi_0 -pillow=8.1.2=pypi_0 -pip=21.0.1=py37h06a4308_0 -prefetch-generator=1.0.1=pypi_0 -pycparser=2.20=pypi_0 -pyparsing=2.4.7=pypi_0 -python=3.7.9=h7579374_0 -python-dateutil=2.8.1=pypi_0 -python-louvain=0.15=pypi_0 -pytz=2021.1=pypi_0 -pyyaml=5.4.1=pypi_0 -rdflib=5.0.0=pypi_0 -readline=8.1=h27cfd23_0 -requests=2.25.1=pypi_0 -scikit-learn=0.24.1=pypi_0 -scipy=1.6.1=pypi_0 -seaborn=0.11.1=pypi_0 -setuptools=52.0.0=py37h06a4308_0 -six=1.15.0=pypi_0 -sqlite=3.33.0=h62c20be_0 -tbb=2021.1.1=pypi_0 -texttable=1.6.3=pypi_0 -threadpoolctl=2.1.0=pypi_0 -tk=8.6.10=hbc83047_0 -torch=1.8.0+cu111=pypi_0 -torch-cluster=1.5.9=pypi_0 -torch-geometric=1.6.3=pypi_0 -torch-scatter=2.0.6=pypi_0 -torch-sparse=0.6.9=pypi_0 -torch-spline-conv=1.2.1=pypi_0 -torchaudio=0.8.0=pypi_0 -torchvision=0.9.0+cu111=pypi_0 -tqdm=4.59.0=pypi_0 -typing-extensions=3.7.4.3=pypi_0 -urllib3=1.26.3=pypi_0 -wheel=0.36.2=pyhd3eb1b0_0 -xz=5.2.5=h7b6447c_0 -zipp=3.4.1=pypi_0 -zlib=1.2.11=h7b6447c_3 + +antlr4-python3-runtime==4.8 +ase==3.21.1 +ca-certificates==2021.1.19 +cached-property==1.5.2 +certifi==2020.12.5 +cffi==1.14.5 +chardet==4.0.0 +cmake==3.18.4.post1 +cycler==0.10.0 +dataclasses==0.6 +decorator==4.4.2 +future==0.18.2 +googledrivedownloader==0.4 +h5py==3.2.1 +hydra-core==1.0.6 +idna==2.10 +importlib-resources==5.1.2 +intel-openmp==2021.1.2 +isodate==0.6.0 +jinja2==2.11.3 +joblib==1.0.1 +kiwisolver==1.3.1 +ld_impl_linux-64==2.33.1 +libedit==3.1.20191231 +libffi==3.3 +libgcc-ng==9.1.0 +libstdcxx-ng==9.1.0 +llvmlite==0.35.0 +magma-cuda112==2.5.2 +markupsafe==1.1.1 +matplotlib==3.3.4 +mkl==2021.1.1 +mkl-include==2021.1.1 +ncurses==6.2 +networkx==2.5 +ninja==1.10.0.post2 +numba==0.52.0 +numpy==1.20.1 +omegaconf==2.0.6 +openssl==1.1.1j +pandas==1.2.3 +pillow==8.1.2 +pip==21.0.1 +prefetch-generator==1.0.1 +pycparser==2.20 +pyparsing==2.4.7 +python-dateutil==2.8.1 +python-louvain==0.15 +pytz==2021.1 +pyyaml==5.4.1 +rdflib==5.0.0 +readline==8.1 +requests==2.25.1 +scikit-learn==0.24.1 +scipy==1.6.1 +seaborn==0.11.1 +setuptools==52.0.0 +six==1.15.0 +sqlite==3.33.0 +tbb==2021.1.1 +texttable==1.6.3 +threadpoolctl==2.1.0 +tk==8.6.10 +torch==1.8.0+cu111 +torch-cluster==1.5.9 +torch-geometric==1.6.3 +torch-scatter==2.0.6 +torch-sparse==0.6.9 +torch-spline-conv==1.2.1 +torchaudio==0.8.0 +torchvision==0.9.0+cu111 +tqdm==4.59.0 +typing-extensions==3.7.4.3 +urllib3==1.26.3 +wheel==0.36.2 +xz==5.2.5 +zipp==3.4.1 +zlib==1.2.11 diff --git a/samples/PreProcess.py b/samples/PreProcess.py index d04c402..b23d29d 100644 --- a/samples/PreProcess.py +++ b/samples/PreProcess.py @@ -3,10 +3,11 @@ import torch from torch_geometric.data import Data from tqdm import tqdm -from utils.Vocabulary import Vocab +from src.utils.Vocabulary import Vocab def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab): +#def parse_json_list_2_pyg_object(jsonl_file: str): index = 0 with open(jsonl_file, "r", encoding="utf-8") as file: for item in tqdm(file): @@ -35,7 +36,8 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab): if __name__ == '__main__': json_path = "./sample.jsonl" - train_vocab_file = "../ReservedDataCode/processed_dataset/train_external_function_name_vocab.jsonl" + train_vocab_file = "../data/processed_dataset/train_external_function_name_vocab.jsonl" + # train_vocab_file = "./res.jsonl" max_vocab_size = 10000 vocabulary = Vocab(freq_file=train_vocab_file, max_vocab_size=max_vocab_size) - parse_json_list_2_pyg_object(jsonl_file=json_path, label=1, vocab=vocabulary) \ No newline at end of file + parse_json_list_2_pyg_object(jsonl_file=json_path, label=1, vocab=vocabulary) diff --git a/samples/funCount.py b/samples/funCount.py new file mode 100644 index 0000000..165e675 --- /dev/null +++ b/samples/funCount.py @@ -0,0 +1,21 @@ +import json + +from tqdm import tqdm + +if __name__ == '__main__': + file_name = './sample.jsonl' + fil = open(file_name, mode='r') + fun_name_dict = {} + for item in tqdm(fil): + item = json.loads(item) + item_fun_list = item['function_names'] + for fun_name in item_fun_list: + if fun_name_dict.get(fun_name) is not None: + fun_name_dict[fun_name] += 1 + else: + fun_name_dict[fun_name] = 1 + + with open('./res.jsonl','w') as file: + for key,value in fun_name_dict.items(): + temp = {"f_name":key, "count":value} + file.write(json.dumps(temp) + '\n') diff --git a/samples/sample.jsonl b/samples/sample.jsonl deleted file mode 100644 index 02877b7..0000000 --- a/samples/sample.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"function_edges": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]], "acfg_list": [{"block_number": 3, "block_edges": [[0, 0, 1, 1], [0, 2, 0, 2]], "block_features": [[0, 2, 1, 0, 7, 0, 1, 1, 4, 0, 0], [0, 2, 0, 0, 3, 1, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]]}, {"block_number": 29, "block_edges": [[0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 18, 19, 19, 20, 20, 21, 21, 23, 24, 24, 26, 26, 27, 28], [16, 0, 2, 0, 4, 1, 3, 3, 3, 25, 15, 8, 6, 6, 7, 28, 12, 9, 23, 16, 25, 11, 21, 17, 13, 19, 22, 14, 19, 18, 27, 24, 23, 26, 21, 22, 25, 10, 25, 5, 14, 8]], "block_features": [[8, 2, 1, 5, 36, 0, 6, 0, 2, 0, 0], [0, 7, 0, 0, 3, 0, 1, 1, 1, 0, 0], [0, 7, 0, 0, 2, 0, 1, 1, 0, 0, 0], [0, 7, 0, 1, 8, 1, 2, 0, 0, 0, 0], [0, 7, 1, 0, 2, 0, 1, 0, 0, 0, 0], [0, 7, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 18, 0, 1, 9, 0, 2, 1, 1, 0, 0], [1, 21, 1, 0, 3, 0, 1, 1, 0, 0, 0], [0, 21, 0, 1, 4, 1, 2, 0, 0, 0, 0], [0, 24, 0, 2, 12, 1, 3, 0, 0, 0, 0], [1, 26, 0, 3, 16, 0, 4, 1, 4, 0, 0], [1, 2, 0, 5, 22, 0, 5, 0, 1, 0, 0], [5, 4, 1, 3, 21, 0, 4, 1, 3, 0, 0], [4, 11, 0, 2, 17, 1, 2, 0, 1, 0, 0], [2, 14, 0, 1, 12, 0, 2, 1, 1, 0, 0], [3, 17, 0, 0, 10, 0, 1, 0, 1, 0, 0], [1, 1, 0, 1, 5, 0, 2, 0, 0, 0, 0], [0, 14, 0, 0, 1, 0, 0, 0, 0, 0, 0], [3, 17, 0, 0, 7, 0, 0, 0, 0, 0, 0], [0, 17, 0, 1, 5, 0, 2, 1, 1, 0, 0], [2, 28, 1, 1, 11, 1, 2, 1, 1, 0, 0], [0, 11, 0, 1, 8, 1, 2, 0, 0, 0, 0], [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0], [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0], [12, 27, 1, 7, 41, 0, 8, 1, 6, 0, 0], [0, 0, 1, 0, 7, 1, 0, 0, 0, 1, 0], [2, 9, 0, 2, 17, 0, 3, 1, 3, 0, 0], [2, 14, 0, 0, 5, 0, 1, 0, 4, 0, 0], [1, 21, 4, 1, 13, 0, 2, 0, 5, 0, 0]]}], "function_names": ["sub_401000", "start", "GetTempPathW", "GetFileSize", "GetCurrentDirectoryW", "DeleteFileW", "CloseHandle", "WriteFile", "lstrcmpW", "ReadFile", "GetModuleHandleW", "ExitProcess", "HeapCreate", "HeapAlloc", "GetModuleFileNameW", "CreateFileW", "lstrlenW", "ShellExecuteW", "wsprintfW", "HttpSendRequestW", "InternetSetOptionW", "InternetQueryOptionW", "HttpOpenRequestW", "HttpQueryInfoW", "InternetReadFile", "InternetConnectW", "InternetOpenW"], "hash": "316ebb797d5196020eee013cfe771671fff4da8859adc9f385f52a74e82f4e55", "function_number": 27} \ No newline at end of file diff --git a/src/utils/Vocabulary.py b/src/utils/Vocabulary.py index ce9c805..5ee3911 100644 --- a/src/utils/Vocabulary.py +++ b/src/utils/Vocabulary.py @@ -64,6 +64,7 @@ class Vocab: def load_freq_counter_from_file(file_path: str, min_freq: int): freq_dict = {} with open(file_path, 'r') as f: + for line in tqdm(f, desc="Load frequency list from the file of {} ... ".format(file_path)): line = json.loads(line) f_name = line["f_name"]