builder.py
import os
import json
import concurrent.futures

# Project-local modules: config provides Classes (the list of sample classes)
# and Colours (ANSI colour codes); transformer provides the PETransformer
# feature extractor.
import config
import transformer


def Builder(typeClass):
"""
    Iterates over all samples of a class and dumps their raw features into a json file.
    :param typeClass: the sample class whose dataset should be built.
"""
print(f"{config.Colours.INFO}[*] Building dataset for {typeClass}.{config.Colours.ENDC}")
    # Set the path and clear typeClass' stale download queue and json dump.
    # Each file is removed in its own try block so that a missing dump does
    # not leave a stale queue behind (and vice versa).
    path = f"dataset/{typeClass}"
    for stale in (f"{path}/dump.json", f"{path}/queue.txt"):
        try:
            os.remove(stale)
        except FileNotFoundError:
            pass
    # Use PETransformer to extract raw features for each PE and dump them,
    # one JSON object per line, into typeClass' local folder.
    with open(f"{path}/dump.json", 'w') as buildFile:
        for sample in os.listdir(path):
            # Skip the freshly created dump file itself (and any leftover
            # queue file); everything else in the folder is a PE sample.
            if sample in ("dump.json", "queue.txt"):
                continue
            print(f"[~] Building {path}/{sample}")
            transformed = transformer.PETransformer(f"{path}/{sample}")
            buildFile.write(json.dumps(transformed.feature_dict))
            buildFile.write('\n')
print(f"{config.Colours.SUCCESS}[+] Dataset build for {typeClass} complete.{config.Colours.ENDC}")
return
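
# Each line of dump.json holds one sample's raw features as a single JSON
# object, so the file can be read back line by line. Illustrative layout
# only; the real keys come from PETransformer.feature_dict:
#
#   {"feature_a": 1, "feature_b": [0, 2, 4]}
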
def Reader():
"""
    Reads the dump files for all classes and returns the feature vectors
    as a dictionary of lists keyed by class.
"""
print(f"{config.Colours.HEADER}[+] Initiated dataset read.{config.Colours.ENDC}")
# Iterate over all classes.
data = {}
for typeClass in config.Classes:
print(f"{config.Colours.INFO}[*] Reading dataset for {typeClass}.{config.Colours.ENDC}")
# Load typeClass' json dump into memory and append to a dictionary.
path = f"dataset/{typeClass}/dump.json"
data[typeClass] = []
try:
            with open(path, 'r') as buildFile:
                for line in buildFile:
                    data[typeClass].append(transformer.PETransformer(raw_features=line.strip()).vector)
except FileNotFoundError:
print(f"{config.Colours.ERROR}[!] Dump file not found for {typeClass}!{config.Colours.ENDC}")
return
print(f"{config.Colours.SUCCESS}[+] Dataset fetch for {typeClass} complete.{config.Colours.ENDC}")
print(f"{config.Colours.SUCCESS}[+] Dataset loading complete.{config.Colours.ENDC}")
return data
def Build_Dataset():
"""
    Builds the json dumps for all classes concurrently, one worker thread per class.
"""
print(f"{config.Colours.HEADER}[+] Initiated dataset build.{config.Colours.ENDC}")
    # Multi-threaded building process for the json dumps.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(config.Classes))
for typeClass in config.Classes:
        try:
            executor.submit(Builder, typeClass)
            print(f"[+] Thread started for {typeClass}.")
        except Exception:
            print(f"{config.Colours.ERROR}[!] Unable to start thread for {typeClass}.{config.Colours.ENDC}")
    # Block until every worker thread has finished, then release the executor.
    executor.shutdown(wait=True)
print(f"{config.Colours.SUCCESS}[+] Dataset build complete.{config.Colours.ENDC}")
return
if __name__ == "__main__":
    # Build the dataset when this file is run directly.
Build_Dataset()
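
# Example downstream use of Reader(): a minimal sketch, not part of this
# module. It flattens the per-class vector lists into a feature matrix X
# and a matching label list y; numpy is an assumed extra dependency here.
#
#   import numpy as np
#   import builder
#
#   data = builder.Reader()
#   X = np.array([vec for vectors in data.values() for vec in vectors])
#   y = [label for label, vectors in data.items() for _ in vectors]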