备份

2024-04-29 17:31:04 +08:00 · 2024-04-29 17:31:04 +08:00 · b4b131fc61
commit b4b131fc61
parent 3df2fe07cb
6 changed files with 105 additions and 96 deletions
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -1,9 +1,9 @@
 Data:
-  preprocess_root: "/home/king/python/data/processed_dataset/DatasetJSON_remake"
-  train_vocab_file: "/home/king/python/data/processed_dataset/train_external_function_name_vocab.jsonl"
+  preprocess_root: "/home/king/python/data/DatasetJSON_remake"
+  train_vocab_file: "/home/king/python/data/fun_name_sort.jsonl"
  max_vocab_size: 10000         # modify according to the result of 1BuildExternalVocab.py
 Training:
-  cuda: True                    # enable GPU training if cuda is available
+  cuda: False                    # enable GPU training if cuda is available
  dist_backend: "nccl"          # if using torch.distribution, the backend to be used
  dist_port: "1234"
  max_epoches: 10
--- a/requirement_conda.txt
+++ b/requirement_conda.txt
@@ -1,83 +1,83 @@
 # This file may be used to create an environment using:
 # $ conda create --name <env> --file <this file>
 # platform: linux-64
-_libgcc_mutex=0.1=main
-antlr4-python3-runtime=4.8=pypi_0
-ase=3.21.1=pypi_0
-ca-certificates=2021.1.19=h06a4308_1
-cached-property=1.5.2=pypi_0
-certifi=2020.12.5=py37h06a4308_0
-cffi=1.14.5=pypi_0
-chardet=4.0.0=pypi_0
-cmake=3.18.4.post1=pypi_0
-cycler=0.10.0=pypi_0
-dataclasses=0.6=pypi_0
-decorator=4.4.2=pypi_0
-future=0.18.2=pypi_0
-googledrivedownloader=0.4=pypi_0
-h5py=3.2.1=pypi_0
-hydra-core=1.0.6=pypi_0
-idna=2.10=pypi_0
-importlib-resources=5.1.2=pypi_0
-intel-openmp=2021.1.2=pypi_0
-isodate=0.6.0=pypi_0
-jinja2=2.11.3=pypi_0
-joblib=1.0.1=pypi_0
-kiwisolver=1.3.1=pypi_0
-ld_impl_linux-64=2.33.1=h53a641e_7
-libedit=3.1.20191231=h14c3975_1
-libffi=3.3=he6710b0_2
-libgcc-ng=9.1.0=hdf63c60_0
-libstdcxx-ng=9.1.0=hdf63c60_0
-llvmlite=0.35.0=pypi_0
-magma-cuda112=2.5.2=1
-markupsafe=1.1.1=pypi_0
-matplotlib=3.3.4=pypi_0
-mkl=2021.1.1=pypi_0
-mkl-include=2021.1.1=pypi_0
-ncurses=6.2=he6710b0_1
-networkx=2.5=pypi_0
-ninja=1.10.0.post2=pypi_0
-numba=0.52.0=pypi_0
-numpy=1.20.1=pypi_0
-omegaconf=2.0.6=pypi_0
-openssl=1.1.1j=h27cfd23_0
-pandas=1.2.3=pypi_0
-pillow=8.1.2=pypi_0
-pip=21.0.1=py37h06a4308_0
-prefetch-generator=1.0.1=pypi_0
-pycparser=2.20=pypi_0
-pyparsing=2.4.7=pypi_0
-python=3.7.9=h7579374_0
-python-dateutil=2.8.1=pypi_0
-python-louvain=0.15=pypi_0
-pytz=2021.1=pypi_0
-pyyaml=5.4.1=pypi_0
-rdflib=5.0.0=pypi_0
-readline=8.1=h27cfd23_0
-requests=2.25.1=pypi_0
-scikit-learn=0.24.1=pypi_0
-scipy=1.6.1=pypi_0
-seaborn=0.11.1=pypi_0
-setuptools=52.0.0=py37h06a4308_0
-six=1.15.0=pypi_0
-sqlite=3.33.0=h62c20be_0
-tbb=2021.1.1=pypi_0
-texttable=1.6.3=pypi_0
-threadpoolctl=2.1.0=pypi_0
-tk=8.6.10=hbc83047_0
-torch=1.8.0+cu111=pypi_0
-torch-cluster=1.5.9=pypi_0
-torch-geometric=1.6.3=pypi_0
-torch-scatter=2.0.6=pypi_0
-torch-sparse=0.6.9=pypi_0
-torch-spline-conv=1.2.1=pypi_0
-torchaudio=0.8.0=pypi_0
-torchvision=0.9.0+cu111=pypi_0
-tqdm=4.59.0=pypi_0
-typing-extensions=3.7.4.3=pypi_0
-urllib3=1.26.3=pypi_0
-wheel=0.36.2=pyhd3eb1b0_0
-xz=5.2.5=h7b6447c_0
-zipp=3.4.1=pypi_0
-zlib=1.2.11=h7b6447c_3
+_libgcc_mutex=0.1
+antlr4-python3-runtime=4.8
+ase=3.21.1
+ca-certificates=2021.1.19
+cached-property=1.5.2
+certifi=2020.12.5
+cffi=1.14.5
+chardet=4.0.0
+cmake=3.18.4.post1
+cycler=0.10.0
+dataclasses=0.6
+decorator=4.4.2
+future=0.18.2
+googledrivedownloader=0.4
+h5py=3.2.1
+hydra-core=1.0.6
+idna=2.10
+importlib-resources=5.1.2
+intel-openmp=2021.1.2
+isodate=0.6.0
+jinja2=2.11.3
+joblib=1.0.1
+kiwisolver=1.3.1
+ld_impl_linux-64=2.33.1
+libedit=3.1.20191231
+libffi=3.3
+libgcc-ng=9.1.0
+libstdcxx-ng=9.1.0
+llvmlite=0.35.0
+magma-cuda112=2.5.2
+markupsafe=1.1.1
+matplotlib=3.3.4
+mkl=2021.1.1
+mkl-include=2021.1.1
+ncurses=6.2
+networkx=2.5
+ninja=1.10.0.post2
+numba=0.52.0
+numpy=1.20.1
+omegaconf=2.0.6
+openssl=1.1.1j
+pandas=1.2.3
+pillow=8.1.2
+pip=21.0.1
+prefetch-generator=1.0.1
+pycparser=2.20
+pyparsing=2.4.7
+python=3.7.9
+python-dateutil=2.8.1
+python-louvain=0.15
+pytz=2021.1
+pyyaml=5.4.1
+rdflib=5.0.0
+readline=8.1
+requests=2.25.1
+scikit-learn=0.24.1
+scipy=1.6.1
+seaborn=0.11.1
+setuptools=52.0.0
+six=1.15.0
+sqlite=3.33.0
+tbb=2021.1.1
+texttable=1.6.3
+threadpoolctl=2.1.0
+tk=8.6.10
+torch=1.8.0+cu111
+torch-cluster=1.5.9
+torch-geometric=1.6.3
+torch-scatter=2.0.6
+torch-sparse=0.6.9
+torch-spline-conv=1.2.1
+torchaudio=0.8.0
+torchvision=0.9.0+cu111
+tqdm=4.59.0
+typing-extensions=3.7.4.3
+urllib3=1.26.3
+wheel=0.36.2
+xz=5.2.5
+zipp=3.4.1
+zlib=1.2.11
--- a/samples/PreProcess.py
+++ b/samples/PreProcess.py
@@ -18,7 +18,7 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab, save
    test_flag = True
    file_len = len(os.listdir(jsonl_file))

-    for file in tqdm(os.listdir(jsonl_file)):
+    for file in tqdm(os.listdir(jsonl_file), desc=file_type):
        if index >= file_len * 0.8 and valid_flag:
            type_index += 1
            valid_flag = False
@@ -36,7 +36,7 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab, save

 def json_to_pt(file: str, label: int, vocab: Vocab, save_path: str, file_type: str, train_type: str, index: int):
    if not os.path.exists(save_path+f"{train_type}_{file_type}/"):
-        os.mkdir(save_path+f"{train_type}_{file_type}/")
+        os.makedirs(save_path+f"{train_type}_{file_type}/")
    with open(file, "r", encoding="utf-8") as item:
        line = item.readline()
        item = json.loads(line)
@@ -67,10 +67,10 @@ def json_to_pt(file: str, label: int, vocab: Vocab, save_path: str, file_type: s


 if __name__ == '__main__':
-    malware_json_path = "/home/king/python/data/jsonl/infected_jsonl/"
-    benign_json_path = "/home/king/python/data/jsonl/refind_jsonl/"
-    train_vocab_file = "/home/king/python/data/processed_dataset/train_external_function_name_vocab.jsonl"
-    save_vocab_file = "/home/king/python/data/processed_dataset/DatasetJSON_remake/"
+    malware_json_path = "/home/king/python/data/jsonl/malware/"
+    benign_json_path = "/home/king/python/data/jsonl/benign/"
+    train_vocab_file = "/home/king/python/data/fun_name_sort.jsonl"
+    save_vocab_file = "/home/king/python/data/DatasetJSON_remake/"
    file_type = ["malware", "benign"]
    max_vocab_size = 10000
    vocabulary = Vocab(freq_file=train_vocab_file, max_vocab_size=max_vocab_size)
--- a/src/DistTrainModel.py
+++ b/src/DistTrainModel.py
@@ -16,7 +16,7 @@ from omegaconf import DictConfig
 from prefetch_generator import BackgroundGenerator
 from sklearn.metrics import roc_auc_score, roc_curve
 from torch import nn
-from torch_geometric.data import DataLoader
+from torch_geometric.loader import DataLoader
 from tqdm import tqdm

 from models.HierarchicalGraphModel import HierarchicalGraphNeuralNetwork
@@ -26,6 +26,10 @@ from utils.PreProcessedDataset import MalwareDetectionDataset
 from utils.RealBatch import create_real_batch_data
 from utils.Vocabulary import Vocab

+os.environ['TORCH_USE_CUDA_DSA'] = "1"
+os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
+
+

 class DataLoaderX(DataLoader):
    def __iter__(self):
@@ -304,8 +308,12 @@ def main_app(config: DictConfig):
        # num_gpus = 1
        num_gpus = torch.cuda.device_count()
        log.info("Total number of GPUs = {}".format(num_gpus))
+        # try:
+        #     torch_mp.spawn(main_train_worker, nprocs=num_gpus, args=(num_gpus, _train_params, _model_params, _optim_params, log, log_result_file,))
+        # except Exception as e:
+        #     print(e)
        torch_mp.spawn(main_train_worker, nprocs=num_gpus, args=(num_gpus, _train_params, _model_params, _optim_params, log, log_result_file,))
-        
+        # main_train_worker(0, num_gpus, _train_params, _model_params, _optim_params, log, log_result_file)
        best_model_file = os.path.join(os.getcwd(), 'LocalRank_{}_best_model.pt'.format(0))
    
    else:
--- a/src/utils/PreProcessedDataset.py
+++ b/src/utils/PreProcessedDataset.py
@@ -24,7 +24,7 @@ class MalwareDetectionDataset(Dataset):
                files.append(name)
        return files
    
-    def __len__(self):
+    def len(self):
        # def len(self):
        # return 201
        return len(self.malware_files) + len(self.benign_files)
--- a/torch_test.py
+++ b/torch_test.py
@@ -1,8 +1,9 @@
 import torch_geometric
 import torch
 if __name__ == '__main__':
-    # print(torch.__version__)
-    # print(torch.cuda.device_count())
-    # print(torch.cuda.get_device_name())
-    print(torch.cuda.nccl.is_available())
+    print(torch.__version__)
+    print(torch.cuda.device_count())
+    print(torch.cuda.get_device_name())
+    print(torch.cuda.is_available())
+    # print(torch.cuda.nccl.is_available())
    print(torch.cuda.nccl.version())