h2_preprocess.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 29 16:52:20 2017
@author: Samuli
"""
import numpy as np
import pandas
from sklearn.preprocessing import normalize # , LabelEncoder
# Read the info about features
print('Reading feature info...')
data_info = pandas.read_csv("NUSW-NB15_features.csv", encoding="ISO-8859-1", header=None).values
features = data_info[1:-2, :] # Drop the header row and the last two rows (the attack_cat and Label targets)
feature_names = features[:, 1] # Names of the features in a list
feature_types = np.array([item.lower() for item in features[:, 2]]) # The types of the corresponding features in 'feature_names'
# index arrays for different types of features
print('Finding column indices for feature types...')
nominal_cols = np.where(feature_types == "nominal")[0]
integer_cols = np.where(feature_types == "integer")[0]
binary_cols = np.where(feature_types == "binary")[0]
float_cols = np.where(feature_types == "float")[0]
# arrays for names of the different types of features
nominal_names = feature_names[nominal_cols]
integer_names = feature_names[integer_cols]
binary_names = feature_names[binary_cols]
float_names = feature_names[float_cols]
print('Reading csv files...')
dataframe1 = pandas.read_csv("UNSW-NB15_1.csv", header=None)
dataframe2 = pandas.read_csv("UNSW-NB15_2.csv", header=None)
dataframe3 = pandas.read_csv("UNSW-NB15_3.csv", header=None)
dataframe4 = pandas.read_csv("UNSW-NB15_4.csv", header=None)
print('Concatenating...')
dataframe = pandas.concat([dataframe1, dataframe2, dataframe3, dataframe4], ignore_index=True) # ignore_index avoids duplicate row labels from the four parts
del dataframe1
del dataframe2
del dataframe3
del dataframe4
print('Preprocessing...')
print('Converting data...')
# DataFrame.convert_objects was removed from pandas; to_numeric with errors='coerce' does the same job
dataframe[integer_cols] = dataframe[integer_cols].apply(pandas.to_numeric, errors='coerce')
dataframe[binary_cols] = dataframe[binary_cols].apply(pandas.to_numeric, errors='coerce')
dataframe[float_cols] = dataframe[float_cols].apply(pandas.to_numeric, errors='coerce')
dataframe[48] = pandas.to_numeric(dataframe[48], errors='coerce') # Label column: 1 = attack, 0 = normal
#dataframe[nominal_cols] = dataframe[nominal_cols].astype(str)
print('Replacing NaNs...')
dataframe.loc[:,47] = dataframe.loc[:,47].fillna('normal').apply(lambda x: x.strip().lower()) # column 47 is attack_cat; NaN means normal traffic
dataframe.loc[:,binary_cols] = dataframe.loc[:,binary_cols].fillna(0)
dataframe.loc[:,37:39] = dataframe.loc[:,37:39].fillna(0) # ct_flw_http_mthd, is_ftp_login, ct_ftp_cmd
# dataframe.loc[:,float_cols] = dataframe.loc[:,float_cols].fillna(0)
print('Stripping nominal columns and setting them lower case...')
dataframe.loc[:,nominal_cols] = dataframe.loc[:,nominal_cols].applymap(lambda x: x.strip().lower())
print('Changing targets \'backdoors\' to \'backdoor\'...')
dataframe.loc[:,47] = dataframe.loc[:,47].replace('backdoors', 'backdoor', regex=True) # column 47 is already stripped and lower-cased above
dataset = dataframe.values
del dataframe
# Subsets of the dataset which have data that is only of the corresponding data type (nominal, integer etc)
# Columns don't include the target classes (the two last columns of the dataset)
print('Slicing dataset...')
nominal_x = dataset[:, nominal_cols]
integer_x = dataset[:, integer_cols].astype(np.float32)
binary_x = dataset[:, binary_cols].astype(np.float32)
float_x = dataset[:, float_cols].astype(np.float32)
# aon_x = dataset[:, 48][np.newaxis,:].astype(np.float32).transpose() # Attack or not (binary)
# Make nominal (textual) data binary vectors
print('Vectorizing nominal data...')
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
# D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
D = map(lambda dataline: dict(zip(nominal_names, dataline)), nominal_x)
labeled_nominal_x = v.fit_transform(D).astype(np.float32)
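# DictVectorizer one-hot encodes string values: a record with proto='tcp' gets a 1.0 in a
# generated 'proto=tcp' column, and every distinct srcip/dstip value likewise becomes its own
# column, so the encoded width can be large. The generated column names are available via
# v.get_feature_names_out() (v.get_feature_names() on older scikit-learn releases).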
del nominal_x
print('Concatenating X...')
X = np.concatenate((integer_x, labeled_nominal_x, float_x, binary_x), axis=1)
del integer_x
del labeled_nominal_x
del float_x
del binary_x
# Find rows that have NaNs
print('Removing NaNs if any...')
nan_indices = []
for feature_i in range(X.shape[1]):
    nan_indices.extend(list(np.where(np.isnan(X[:, feature_i]))[0]))
nan_indices = np.unique(nan_indices)
# Remove rows that have NaNs
X_no_nans = np.delete(X, nan_indices, axis=0)
del X
print('Normalizing X...')
normalized_X = normalize(X_no_nans, copy=False)
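# sklearn's normalize() defaults to norm='l2', axis=1: each row (flow record) is scaled to unit
# Euclidean length. Use axis=0 or a scaler such as MinMaxScaler for per-feature scaling instead.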
del X_no_nans
data_dim = normalized_X.shape
print('Data dimensions are', data_dim)
print('Creating target Y matrix...')
Y = np.delete(dataset[:, -2], nan_indices)
Y_A = np.delete(dataset[:, -1], nan_indices).astype(np.int16) # Is attack or not
del dataset
'''
# Remove same rows as in X to have correct y's
Y_no_nans = np.delete(Y, nan_indices, axis=0)
'''
print('Vectorizing Y labels...')
D = [{'attack_cat': y} for y in Y]
labeled_Y = v.fit_transform(D)
del D
print('Saving normalized X and labeled Y to HDF5')
import h5py
h5f = h5py.File('data.h5', 'w')
h5f.create_dataset('normalized_X', data=normalized_X)
h5f.create_dataset('labeled_Y', data=labeled_Y)
dt = h5py.special_dtype(vlen=str)
h5f.create_dataset('Y', data=Y, dtype=dt)
h5f.create_dataset('Y_A', data=Y_A)
h5f.close()
del Y
del normalized_X
del labeled_Y
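# A minimal sketch (not part of the original pipeline) of reading the saved arrays back, e.g. in a
# training script; it assumes data.h5 sits in the current working directory as written above.
with h5py.File('data.h5', 'r') as h5check:
    for name in ('normalized_X', 'labeled_Y', 'Y', 'Y_A'):
        print(name, h5check[name].shape, h5check[name].dtype)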