This commit is contained in:
huihun 2024-04-29 17:31:04 +08:00
parent 3df2fe07cb
commit b4b131fc61
6 changed files with 105 additions and 96 deletions

View File

@ -1,9 +1,9 @@
Data: Data:
preprocess_root: "/home/king/python/data/processed_dataset/DatasetJSON_remake" preprocess_root: "/home/king/python/data/DatasetJSON_remake"
train_vocab_file: "/home/king/python/data/processed_dataset/train_external_function_name_vocab.jsonl" train_vocab_file: "/home/king/python/data/fun_name_sort.jsonl"
max_vocab_size: 10000 # modify according to the result of 1BuildExternalVocab.py max_vocab_size: 10000 # modify according to the result of 1BuildExternalVocab.py
Training: Training:
cuda: True # enable GPU training if cuda is available cuda: False # enable GPU training if cuda is available
dist_backend: "nccl" # if using torch.distributed, the backend to be used dist_backend: "nccl" # if using torch.distributed, the backend to be used
dist_port: "1234" dist_port: "1234"
max_epoches: 10 max_epoches: 10

View File

@ -1,83 +1,83 @@
# This file may be used to create an environment using: # This file may be used to create an environment using:
# $ conda create --name <env> --file <this file> # $ conda create --name <env> --file <this file>
# platform: linux-64 # platform: linux-64
_libgcc_mutex=0.1=main _libgcc_mutex=0.1
antlr4-python3-runtime=4.8=pypi_0 antlr4-python3-runtime=4.8
ase=3.21.1=pypi_0 ase=3.21.1
ca-certificates=2021.1.19=h06a4308_1 ca-certificates=2021.1.19
cached-property=1.5.2=pypi_0 cached-property=1.5.2
certifi=2020.12.5=py37h06a4308_0 certifi=2020.12.5
cffi=1.14.5=pypi_0 cffi=1.14.5
chardet=4.0.0=pypi_0 chardet=4.0.0
cmake=3.18.4.post1=pypi_0 cmake=3.18.4.post1
cycler=0.10.0=pypi_0 cycler=0.10.0
dataclasses=0.6=pypi_0 dataclasses=0.6
decorator=4.4.2=pypi_0 decorator=4.4.2
future=0.18.2=pypi_0 future=0.18.2
googledrivedownloader=0.4=pypi_0 googledrivedownloader=0.4
h5py=3.2.1=pypi_0 h5py=3.2.1
hydra-core=1.0.6=pypi_0 hydra-core=1.0.6
idna=2.10=pypi_0 idna=2.10
importlib-resources=5.1.2=pypi_0 importlib-resources=5.1.2
intel-openmp=2021.1.2=pypi_0 intel-openmp=2021.1.2
isodate=0.6.0=pypi_0 isodate=0.6.0
jinja2=2.11.3=pypi_0 jinja2=2.11.3
joblib=1.0.1=pypi_0 joblib=1.0.1
kiwisolver=1.3.1=pypi_0 kiwisolver=1.3.1
ld_impl_linux-64=2.33.1=h53a641e_7 ld_impl_linux-64=2.33.1
libedit=3.1.20191231=h14c3975_1 libedit=3.1.20191231
libffi=3.3=he6710b0_2 libffi=3.3
libgcc-ng=9.1.0=hdf63c60_0 libgcc-ng=9.1.0
libstdcxx-ng=9.1.0=hdf63c60_0 libstdcxx-ng=9.1.0
llvmlite=0.35.0=pypi_0 llvmlite=0.35.0
magma-cuda112=2.5.2=1 magma-cuda112=2.5.2
markupsafe=1.1.1=pypi_0 markupsafe=1.1.1
matplotlib=3.3.4=pypi_0 matplotlib=3.3.4
mkl=2021.1.1=pypi_0 mkl=2021.1.1
mkl-include=2021.1.1=pypi_0 mkl-include=2021.1.1
ncurses=6.2=he6710b0_1 ncurses=6.2
networkx=2.5=pypi_0 networkx=2.5
ninja=1.10.0.post2=pypi_0 ninja=1.10.0.post2
numba=0.52.0=pypi_0 numba=0.52.0
numpy=1.20.1=pypi_0 numpy=1.20.1
omegaconf=2.0.6=pypi_0 omegaconf=2.0.6
openssl=1.1.1j=h27cfd23_0 openssl=1.1.1j
pandas=1.2.3=pypi_0 pandas=1.2.3
pillow=8.1.2=pypi_0 pillow=8.1.2
pip=21.0.1=py37h06a4308_0 pip=21.0.1
prefetch-generator=1.0.1=pypi_0 prefetch-generator=1.0.1
pycparser=2.20=pypi_0 pycparser=2.20
pyparsing=2.4.7=pypi_0 pyparsing=2.4.7
python=3.7.9=h7579374_0 python=3.7.9
python-dateutil=2.8.1=pypi_0 python-dateutil=2.8.1
python-louvain=0.15=pypi_0 python-louvain=0.15
pytz=2021.1=pypi_0 pytz=2021.1
pyyaml=5.4.1=pypi_0 pyyaml=5.4.1
rdflib=5.0.0=pypi_0 rdflib=5.0.0
readline=8.1=h27cfd23_0 readline=8.1
requests=2.25.1=pypi_0 requests=2.25.1
scikit-learn=0.24.1=pypi_0 scikit-learn=0.24.1
scipy=1.6.1=pypi_0 scipy=1.6.1
seaborn=0.11.1=pypi_0 seaborn=0.11.1
setuptools=52.0.0=py37h06a4308_0 setuptools=52.0.0
six=1.15.0=pypi_0 six=1.15.0
sqlite=3.33.0=h62c20be_0 sqlite=3.33.0
tbb=2021.1.1=pypi_0 tbb=2021.1.1
texttable=1.6.3=pypi_0 texttable=1.6.3
threadpoolctl=2.1.0=pypi_0 threadpoolctl=2.1.0
tk=8.6.10=hbc83047_0 tk=8.6.10
torch=1.8.0+cu111=pypi_0 torch=1.8.0+cu111
torch-cluster=1.5.9=pypi_0 torch-cluster=1.5.9
torch-geometric=1.6.3=pypi_0 torch-geometric=1.6.3
torch-scatter=2.0.6=pypi_0 torch-scatter=2.0.6
torch-sparse=0.6.9=pypi_0 torch-sparse=0.6.9
torch-spline-conv=1.2.1=pypi_0 torch-spline-conv=1.2.1
torchaudio=0.8.0=pypi_0 torchaudio=0.8.0
torchvision=0.9.0+cu111=pypi_0 torchvision=0.9.0+cu111
tqdm=4.59.0=pypi_0 tqdm=4.59.0
typing-extensions=3.7.4.3=pypi_0 typing-extensions=3.7.4.3
urllib3=1.26.3=pypi_0 urllib3=1.26.3
wheel=0.36.2=pyhd3eb1b0_0 wheel=0.36.2
xz=5.2.5=h7b6447c_0 xz=5.2.5
zipp=3.4.1=pypi_0 zipp=3.4.1
zlib=1.2.11=h7b6447c_3 zlib=1.2.11

View File

@ -18,7 +18,7 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab, save
test_flag = True test_flag = True
file_len = len(os.listdir(jsonl_file)) file_len = len(os.listdir(jsonl_file))
for file in tqdm(os.listdir(jsonl_file)): for file in tqdm(os.listdir(jsonl_file), desc=file_type):
if index >= file_len * 0.8 and valid_flag: if index >= file_len * 0.8 and valid_flag:
type_index += 1 type_index += 1
valid_flag = False valid_flag = False
@ -36,7 +36,7 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab, save
def json_to_pt(file: str, label: int, vocab: Vocab, save_path: str, file_type: str, train_type: str, index: int): def json_to_pt(file: str, label: int, vocab: Vocab, save_path: str, file_type: str, train_type: str, index: int):
if not os.path.exists(save_path+f"{train_type}_{file_type}/"): if not os.path.exists(save_path+f"{train_type}_{file_type}/"):
os.mkdir(save_path+f"{train_type}_{file_type}/") os.makedirs(save_path+f"{train_type}_{file_type}/")
with open(file, "r", encoding="utf-8") as item: with open(file, "r", encoding="utf-8") as item:
line = item.readline() line = item.readline()
item = json.loads(line) item = json.loads(line)
@ -67,10 +67,10 @@ def json_to_pt(file: str, label: int, vocab: Vocab, save_path: str, file_type: s
if __name__ == '__main__': if __name__ == '__main__':
malware_json_path = "/home/king/python/data/jsonl/infected_jsonl/" malware_json_path = "/home/king/python/data/jsonl/malware/"
benign_json_path = "/home/king/python/data/jsonl/refind_jsonl/" benign_json_path = "/home/king/python/data/jsonl/benign/"
train_vocab_file = "/home/king/python/data/processed_dataset/train_external_function_name_vocab.jsonl" train_vocab_file = "/home/king/python/data/fun_name_sort.jsonl"
save_vocab_file = "/home/king/python/data/processed_dataset/DatasetJSON_remake/" save_vocab_file = "/home/king/python/data/DatasetJSON_remake/"
file_type = ["malware", "benign"] file_type = ["malware", "benign"]
max_vocab_size = 10000 max_vocab_size = 10000
vocabulary = Vocab(freq_file=train_vocab_file, max_vocab_size=max_vocab_size) vocabulary = Vocab(freq_file=train_vocab_file, max_vocab_size=max_vocab_size)

View File

@ -16,7 +16,7 @@ from omegaconf import DictConfig
from prefetch_generator import BackgroundGenerator from prefetch_generator import BackgroundGenerator
from sklearn.metrics import roc_auc_score, roc_curve from sklearn.metrics import roc_auc_score, roc_curve
from torch import nn from torch import nn
from torch_geometric.data import DataLoader from torch_geometric.loader import DataLoader
from tqdm import tqdm from tqdm import tqdm
from models.HierarchicalGraphModel import HierarchicalGraphNeuralNetwork from models.HierarchicalGraphModel import HierarchicalGraphNeuralNetwork
@ -26,6 +26,10 @@ from utils.PreProcessedDataset import MalwareDetectionDataset
from utils.RealBatch import create_real_batch_data from utils.RealBatch import create_real_batch_data
from utils.Vocabulary import Vocab from utils.Vocabulary import Vocab
os.environ['TORCH_USE_CUDA_DSA'] = "1"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
class DataLoaderX(DataLoader): class DataLoaderX(DataLoader):
def __iter__(self): def __iter__(self):
@ -304,8 +308,12 @@ def main_app(config: DictConfig):
# num_gpus = 1 # num_gpus = 1
num_gpus = torch.cuda.device_count() num_gpus = torch.cuda.device_count()
log.info("Total number of GPUs = {}".format(num_gpus)) log.info("Total number of GPUs = {}".format(num_gpus))
# try:
# torch_mp.spawn(main_train_worker, nprocs=num_gpus, args=(num_gpus, _train_params, _model_params, _optim_params, log, log_result_file,))
# except Exception as e:
# print(e)
torch_mp.spawn(main_train_worker, nprocs=num_gpus, args=(num_gpus, _train_params, _model_params, _optim_params, log, log_result_file,)) torch_mp.spawn(main_train_worker, nprocs=num_gpus, args=(num_gpus, _train_params, _model_params, _optim_params, log, log_result_file,))
# main_train_worker(0, num_gpus, _train_params, _model_params, _optim_params, log, log_result_file)
best_model_file = os.path.join(os.getcwd(), 'LocalRank_{}_best_model.pt'.format(0)) best_model_file = os.path.join(os.getcwd(), 'LocalRank_{}_best_model.pt'.format(0))
else: else:

View File

@ -24,7 +24,7 @@ class MalwareDetectionDataset(Dataset):
files.append(name) files.append(name)
return files return files
def __len__(self): def len(self):
# def len(self): # def len(self):
# return 201 # return 201
return len(self.malware_files) + len(self.benign_files) return len(self.malware_files) + len(self.benign_files)

View File

@ -1,8 +1,9 @@
import torch_geometric import torch_geometric
import torch import torch
if __name__ == '__main__': if __name__ == '__main__':
# print(torch.__version__) print(torch.__version__)
# print(torch.cuda.device_count()) print(torch.cuda.device_count())
# print(torch.cuda.get_device_name()) print(torch.cuda.get_device_name())
print(torch.cuda.nccl.is_available()) print(torch.cuda.is_available())
# print(torch.cuda.nccl.is_available())
print(torch.cuda.nccl.version()) print(torch.cuda.nccl.version())