backup

2023-09-01 11:47:19 +08:00 · 2023-09-01 11:47:19 +08:00 · ddf9ff3b59
commit ddf9ff3b59
parent 4637fd0d97
4 changed files with 69 additions and 23 deletions
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -55,7 +55,7 @@ def convert(start, end):
                            elif 'label' in line:
                                functions_list.append(line[line.find('= "') + 3:line.find('",')])

-                    # 没有内部函数被检测到，保险起见还是不要这数据了
+                    # 没有内部函数被检测到，正常来说不应该，保险起见还是不要这数据了
                    if raw_function_edges.__len__() == 0:
                        continue

@@ -113,4 +113,4 @@ def convert(start, end):


 if __name__ == '__main__':
-    convert(0, 35)
+    convert(35, 69)
--- a/Genius3/raw-feature-extractor/func.pyc
+++ b/Genius3/raw-feature-extractor/func.pyc
--- a/Genius3/raw-feature-extractor/ida_batch.py
+++ b/Genius3/raw-feature-extractor/ida_batch.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+import re
 import os
 import subprocess
 import multiprocessing
@@ -10,35 +11,52 @@ import time
 # 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理，若仍然超时则放弃
 TIMEOUT = 60

+# 每个家族最大处理数量
+MAX_FAMILY_PROCESS_NUM = 200
+

 def call_preprocess(cmd_line):
    subprocess.call(cmd_line, shell=True)


 def batch_mode(start, end):
+    # 只选其中这些类的pe进行分析，其他的就直接跳过
+    families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
+                                'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
    for workflow in range(start, end):
-        # workflow = 0
-        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
-        # for test
        # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
+        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
+        family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
        process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)
-        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path,
+                                                                                            'r') as family_file:
            logged = log.readline()
            if logged == '':
                log_index = 0
            else:
                log_index = int(logged)

-            # pe = "VirusShare_bc161e5e792028e8137aa070fda53f82"
+            families = family_file.read()
            for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
                if index < log_index:
                    continue

-                # for test
+                # 匹配文件md5，取出family文件中该md5的家族
+                regex = re.compile(pe[11:] + r'[\t][\S]*')
+                search_result = regex.findall(families)
+                if len(search_result) == 0:
+                    continue
+
+                pe_family = search_result[0].split()[1]
+                if pe_family not in families_need_to_analyze:
+                    continue
+
+                # FOR TEST ONLY
                # cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
                #     workflow, os.path.join(pe_dir, pe))
-                cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(workflow, os.path.join(pe_dir, pe))
+                cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
+                    workflow, os.path.join(pe_dir, pe))

                p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
                p.start()
@@ -53,7 +71,8 @@ def batch_mode(start, end):

                if flag_kill:
                    subprocess.call('taskkill /im idaq64.exe /f')
-                    process_log.write("index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
+                    process_log.write(
+                        "index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
                else:
                    # 正常运行结束
                    log.truncate(0)
@@ -61,6 +80,8 @@ def batch_mode(start, end):
                    log.write(str(index))
                    log.flush()
                    process_log.write("index {}, {} process done.\n".format(index, pe))
+
+                    families_need_to_analyze[pe_family] += 1
        # 一次workflow结束后将所有副产物删除
        delete_output()

@@ -68,10 +89,11 @@ def batch_mode(start, end):
 def delete_output():
    out_dir = 'F:\\iout'
    for f in os.listdir(out_dir):
-        os.remove(f)
+        if os.path.exists(os.path.join(out_dir, f)):
+            os.remove(os.path.join(out_dir, f))


 # 注意：该py文件必须放在IDA的根目录下，且必须使用cmd命令执行，否则无法链接到python库
 # F:\\kkk\\IDA_6.6
 if __name__ == '__main__':
-    batch_mode(20, 35)
+    batch_mode(36, 69)
--- a/Genius3/raw-feature-extractor/test.py
+++ b/Genius3/raw-feature-extractor/test.py
@@ -6,6 +6,7 @@ import time
 import json
 import random
 import shutil
+from tqdm import tqdm


 def func():
@@ -25,21 +26,27 @@ def func1():

 def create_dir():
    parent_dir = "D:\\hkn\\infected\\datasets"
-    for workflow in range(35, 40):
+    for workflow in range(40, 70):
        # 生成raw data文件夹
-        # infected = "virusshare_infected{}".format(workflow)
-        # cfg = "virusshare_infected{}_cfg".format(workflow)
-        # dot = "virusshare_infected{}_dot".format(workflow)
+        infected = "virusshare_infected{}".format(workflow)
+        cfg = "virusshare_infected{}_cfg".format(workflow)
+        dot = "virusshare_infected{}_dot".format(workflow)
        jsonl = "virusshare_infected{}_json".format(workflow)
-        # os.mkdir(os.path.join(parent_dir, infected))
-        # os.mkdir(os.path.join(parent_dir, cfg))
-        # os.mkdir(os.path.join(parent_dir, dot))
-        os.mkdir(os.path.join(parent_dir, jsonl))
+        create(parent_dir, infected)
+        create(parent_dir, cfg)
+        create(parent_dir, dot)
+        create(parent_dir, jsonl)
        # iout = "virusshare_infected{}_iout".format(workflow)
        # os.rmdir(os.path.join(parent_dir, iout))
        # os.rmdir(os.path.join(parent_dir, ida))


+def create(parent_dir, folder):
+    if not os.path.exists(os.path.join(parent_dir, folder)):
+        os.mkdir(os.path.join(parent_dir, folder))
+
+
+
 def change_max_item_lines():
    f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
    s = f.read()
@@ -82,7 +89,7 @@ def delete_error():


 def check_json():
-    for workflow in range(5, 16):
+    for workflow in tqdm(range(0, 69)):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for json_file in os.listdir(json_dir):
            f = open(os.path.join(json_dir, json_file), 'r')
@@ -183,6 +190,22 @@ def read_test():
                print(line[line.find('= "') + 3:line.find('",')])


+# 临时工具，有些pe文件没有经过api分类，直接删掉
+def del_redundant():
+    for workflow in range(0, 68):
+        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
+        family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
+
+        with open(family_file_path, 'r') as f_file:
+            family = f_file.read()
+            for name in os.listdir(pe_dir):
+                if name[11:] in family:
+                    continue
+                else:
+                    # print(name)
+                    os.remove(os.path.join(pe_dir, name))
+
+
 if __name__ == '__main__':
    # create_dir()
    # change_max_item_lines()
@@ -192,8 +215,9 @@ if __name__ == '__main__':
    # delete_jsonl()
    # check_json()
    split_samples()
-    rename()
+    # rename()
    # half_divide()
    # copy_train_data()
    # clear_dot()
    # read_test()
+    # del_redundant()