# dataset_preparation.py
import pandas as pd
import numpy as np
import copy
import os
from uah_dataset.pandas_importer import UAHDataset
from uah_dataset.image_process import add_pointers_to_window, dict_with_all_frames_pointed, video_to_frames, create_windowed_frames
import shutil
import pickle
from datetime import datetime
import argparse
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

def windowing(dictionary: dict, rows_per_minute: int = 360, initial_threshold: int = 60, increment: int = 10) -> dict:
    """
    Creates windows for every driver's trip.
    :param dictionary: nested dict, road type -> mood -> dataframe
    :param rows_per_minute: number of rows that on average constitute one minute
    :param initial_threshold: timestamp (in seconds) after which windows start being accepted
    :param increment: difference (in seconds) between the end timestamps of adjacent windows
    :return: dictionary: road type -> mood -> window_index -> dataframe
    """
    windowed_dic = copy.deepcopy(dictionary)
    window_number = 0
    window_time = 0
    time_difference = []
    for road, road_dic in dictionary.items():
        i = 0
        print(f"______________________Road: {road} _________________________________")
        for mood, mood_df in road_dic.items():
            windowed = {}
            t = initial_threshold  # the first window ends once more than t seconds have passed
            print(f"Mood: {mood}")
            for j in mood_df.index:
                window = mood_df[j:j + rows_per_minute]
                if window.shape != (rows_per_minute, 41):
                    # incomplete window at the end of the dataframe; skip it
                    continue
                if window.iloc[-1, 0] < window.iloc[0, 0]:
                    # the timestamp dropped, meaning one driver trip ended and the next began
                    # TODO keep an eye on this
                    t = initial_threshold
                elif int(window.iloc[-1, 0]) > t:
                    # TODO time does not begin from zero
                    windowed[i] = window
                    i += 1
                    t += increment  # makes adjacent windows end `increment` seconds apart
                    window_number += 1
                    time = (window.iloc[-1, 0] - window.iloc[0, 0]) + 1
                    window_time += time
                    time_difference.append(time)
            windowed_dic[road][mood] = windowed
    print(f"Number of windows: {window_number}")
    return windowed_dic
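
# A minimal usage sketch for windowing() (toy input with hypothetical values; the
# first column of each leaf dataframe is assumed to hold the in-trip timestamp):
#   toy = {"MOTORWAY": {"NORMAL": pd.DataFrame(np.random.rand(720, 41))}}
#   toy["MOTORWAY"]["NORMAL"].iloc[:, 0] = np.arange(720) / 6  # ~6 rows per second
#   windows = windowing(toy, rows_per_minute=360)
#   windows["MOTORWAY"]["NORMAL"][0].shape  # -> (360, 41)
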
def read(path_to_uah_folder: str = f"{os.path.dirname(__file__)}/uah_dataset/UAH-DRIVESET-v1/"):
    """
    Note: we ended up not using this loading system.
    Mainly copied from the original reader.
    :param path_to_uah_folder: root folder of the UAH-DRIVESET
    :return: online semantics as a nested dictionary: road type -> mood -> dataframe for all drivers with this mood and road type
    """
    drivers = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6']
    roads = ["MOTORWAY", "SECONDARY"]
    headers = ["time", "Latitude", "Longitude", "Total", "Accel", "Braking", "Turning", "Weaving",
               "Drifting", "Oversspeed", "Carfollow", "Normal", "Drowsy", "Aggressive", "Unknown",
               "Total_last_minute", "Accel_last_minute", "Braking_last_minute", "Turning_last_minute",
               "Weaving_last_minute", "Drifting_last_minute", "Oversspeed_last_minute", "Carfollow_last_minute",
               "Normal_last_minute", "Drowsy_last_minute", "Aggressive_last_minute", "Unknown_last_minute"
               ]
    online_semantics = {}
    for driver in drivers:
        folder = f"{path_to_uah_folder}{driver}"
        for direc in os.listdir(folder):
            # road and mood are the last two '-' separated tokens of the trip folder name
            splitted_string = direc.split('-')
            road = splitted_string[-1]
            mood = splitted_string[-2]
            scoresFileName = folder + '/' + direc + '/' + 'SEMANTIC_ONLINE.txt'
            scoresData = np.genfromtxt(scoresFileName, dtype=np.float64, delimiter=' ')
            df = pd.DataFrame(scoresData, columns=headers)
            df['Driver'] = driver
            if road in online_semantics:
                if mood in online_semantics[road]:
                    online_semantics[road][mood] = pd.concat([online_semantics[road][mood], df])
                else:
                    online_semantics[road][mood] = df
            else:
                online_semantics[road] = {mood: df}
    return online_semantics
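
# A minimal sketch of the folder layout read() expects (the trip folder name below
# is illustrative; only the last two '-' separated tokens, mood and road, matter):
#   UAH-DRIVESET-v1/D1/20151110175712-16km-D1-NORMAL-MOTORWAY/SEMANTIC_ONLINE.txt
#   semantics = read()
#   semantics["MOTORWAY"]["NORMAL"].head()
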
def reshaping_to_numpy(dataf: dict, window_size: int):
    """
    Converts the windowed dataframes in the dictionary into numpy arrays.
    :param dataf: dictionary: road type -> mood -> window_index -> dataframe
    :param window_size: the size of the window
    :return: numpy arrays of shapes (number_of_windows, window_size, number_of_features) and (number_of_windows, 3)
    """
    # peek at the first window to learn the feature dimension  TODO change that again
    first_road = next(iter(dataf.values()))
    first_mood = next(iter(first_road.values()))
    feature_size = next(iter(first_mood.values())).shape[1]
    train = np.empty([0, window_size, feature_size])
    labels = np.empty([0, 3], dtype=int)
    for road, road_dic in dataf.items():
        for mood, mood_df in road_dic.items():
            if "NORMAL" in mood:
                label = np.array([1, 0, 0], dtype=int)
            elif "AGGRESSIVE" in mood:
                label = np.array([0, 1, 0], dtype=int)
            elif "DROWSY" in mood:
                label = np.array([0, 0, 1], dtype=int)
            else:
                raise RuntimeError(f"{mood} does not correspond to any existing label")
            for i in mood_df:
                train = np.concatenate((train, mood_df[i].values[np.newaxis, ...]), axis=0)
                labels = np.concatenate((labels, label[np.newaxis, ...]), axis=0)
            print('Iteration passed')
    print(train.shape)
    print(labels.shape)
    return train, labels
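
# Label convention used above (one-hot, matched by substring of the mood key):
#   NORMAL -> [1, 0, 0], AGGRESSIVE -> [0, 1, 0], DROWSY -> [0, 0, 1]
# A minimal sketch (reusing the toy `windows` dict from the windowing example):
#   X, y = reshaping_to_numpy(windows, window_size=360)
#   X.shape  # -> (n_windows, 360, 41); y.shape -> (n_windows, 3)
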
def sensor_data_prepare(window_size: int):
    """
    Prepares the sensor data. Takes the desired window size, loads the data and calls
    the windowing function. It then transforms the windows into numpy arrays, performs
    some preprocessing and saves the values as a memory map.
    :param window_size: the desired size of the window (in rows)
    :return: indexing: a permutation with length equal to the number of windows; we use it to match the video
             windows to the correct sensor windows
    :return: n_samples: the number of windows
    :return: dictionary: road type -> mood -> window_index -> dataframe
    """
    now = datetime.now()  # current date and time
    # Read data from files and store into pandas frames
    dataset = UAHDataset()
    road_type_dict = dataset.dataframe(skip_missing_headers=True, suppress_warings=True)
    # Window the dataset (windowing() deep-copies its input, so no extra copy is needed)
    online_semantic = windowing(road_type_dict, rows_per_minute=window_size)
    # Reshape to numpy and shuffle windows with a random permutation
    train, labels = reshaping_to_numpy(online_semantic, window_size)
    n_samples = len(train)
    indexing = np.random.permutation(n_samples)
    train, labels = train[indexing], labels[indexing]
    npy_new_dir = './uah_dataset/processed_dataset/sensor/npy/window_' + str(window_size)
    if os.path.exists(npy_new_dir):
        shutil.rmtree(npy_new_dir)  # start from a clean directory for this window size
    os.makedirs(npy_new_dir)
    np.save(npy_new_dir + '/train_' + now.strftime("%m_%d_%Y-%H_%M_%S"), train)
    np.save(npy_new_dir + '/labels_' + now.strftime("%m_%d_%Y-%H_%M_%S"), labels)
    # read the data back (file names carry a timestamp, so locate them by substring)
    parent_dir = npy_new_dir
    train_path = labels_path = None
    for file in os.listdir(parent_dir):
        if "train" in file:
            train_path = file
        else:
            labels_path = file
    train = np.load(parent_dir + "/" + train_path, allow_pickle=True)
    labels = np.load(parent_dir + "/" + labels_path, allow_pickle=True)
    train_processed = train
    # Column index map; (OUT) marks columns dropped below:
    #  0: (OUT) Timestamp
    #  1: Car position from lane center (m)
    #  2: Phi
    #  3: (OUT) Road width
    #  4: State of lane estimator
    #  5: Current road max speed
    #  6: (OUT) Max speed reliability
    #  7: (OUT) Road type
    #  8: Number of lanes in road
    #  9: Estimated current lane
    # 10: Latitude used to query OSM
    # 11: (OUT) Longitude used to query OSM
    # 12: (OUT) Delay of OSM query answer (s)
    # 13: (OUT) Speed (km/h)
    # 14: Distance to vehicle ahead
    # 15: Time of impact to vehicle ahead (s)
    # 16: Detected number of vehicles
    # 17: GPS speed (km/h)
    # 18: Activation boolean (speed > 50 km/h)
    # 19: X acceleration (Gs)
    # 20: Y acceleration (Gs)
    # 21: (OUT) Z acceleration (Gs)
    # 22: X acceleration (Gs) (Kernel filter)
    # 23: Y acceleration (Gs) (Kernel filter)
    # 24: Z acceleration (Gs) (Kernel filter)
    # 25: Roll
    # 26: (OUT) Pitch
    # 27: (OUT) Yaw
    # 28: (OUT) Speed
    # 29: (OUT) Latitude
    # 30: (OUT) Longitude
    # 31: (OUT) Altitude
    # 32: Vertical accuracy
    # 33: Horizontal accuracy
    # 34: Course
    # 35: Difcourse: course variation
    # 36: (OUT) Position state
    # 37: (OUT) Lanex dist state
    # 38: (OUT) Lanex history
    # 39: (OUT) Unknown
    # 40: (OUT) Driver
    idx_OUT_columns = [0, 3, 6, 7, 11, 12, 13, 21, 26, 27, 28, 29, 30, 31, 36, 37, 38, 39, 40]
    idx_IN_columns = [i for i in range(np.shape(train_processed)[2]) if i not in idx_OUT_columns]
    extractedData = train_processed[:, :, idx_IN_columns]
    extractedData = extractedData.astype(np.float32)
    # Remove NaN values
    extractedData = np.nan_to_num(extractedData, copy=True, nan=0.0, posinf=None, neginf=None)
    # Normalize the train data by feature (column); `scaler` is a module-level setting
    scalers = {}
    for i in range(extractedData.shape[2]):
        scalers[i] = StandardScaler() if scaler == "standard" else MinMaxScaler()
        extractedData[:, :, i] = scalers[i].fit_transform(extractedData[:, :, i])
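    # Note: the scalers fit column-wise, so the call above treats each of the
    # window_size timestep positions as its own feature, normalized across windows.
    # To scale each sensor channel over all timesteps instead, one could reshape
    # first (an alternative sketch, not what this script does):
    #   flat = extractedData[:, :, i].reshape(-1, 1)
    #   extractedData[:, :, i] = scalers[i].fit_transform(flat).reshape(extractedData.shape[:2])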
    # save data in .dat format
    dat_new_dir = './uah_dataset/processed_dataset/sensor/dat/window_' + str(window_size)
    if os.path.exists(dat_new_dir):
        shutil.rmtree(dat_new_dir)  # start from a clean directory for this window size
    os.makedirs(dat_new_dir)
    # save train data as a memory map
    fp = np.memmap(dat_new_dir + '/train_' + now.strftime("%m_%d_%Y-%H_%M_%S") + ".dat", dtype='float32', mode='w+',
                   shape=extractedData.shape)
    fp[:] = extractedData[:]
    fp.flush()
    del fp
    print(extractedData.shape)
    # save label data
    labels_processed = labels
    dp = np.memmap(dat_new_dir + '/labels_' + now.strftime("%m_%d_%Y-%H_%M_%S") + ".dat", dtype='int', mode='w+',
                   shape=labels_processed.shape)
    dp[:] = labels_processed[:]
    dp.flush()
    del dp
    # save the array shapes so the memory maps can be reopened later
    shapes = {'sensor': extractedData.shape, 'labels': labels_processed.shape}
    with open(dat_new_dir + '/shape.txt', 'wb') as file:
        pickle.dump(shapes, file)
    return indexing, n_samples, online_semantic
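
# A minimal read-back sketch (hypothetical consumer code; the .dat file names carry
# a timestamp, so locate them with os.listdir as done above for the .npy files):
#   with open(dat_dir + '/shape.txt', 'rb') as f:
#       shapes = pickle.load(f)
#   X = np.memmap(dat_dir + '/train_<timestamp>.dat', dtype='float32', mode='r', shape=shapes['sensor'])
#   y = np.memmap(dat_dir + '/labels_<timestamp>.dat', dtype='int', mode='r', shape=shapes['labels'])
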
# Default scaler choice; overwritten by the --scaler CLI argument below and read
# as a module-level global inside sensor_data_prepare().
scaler = "minmax"
if __name__ == "__main__":
    # Parse the command line arguments
    parser = argparse.ArgumentParser(description='Preprocessing stage')
    parser.add_argument('--window_size', type=int, help='window_size', required=True)
    parser.add_argument('--scaler', type=str, help='either minmax or standard', required=True)
    args = parser.parse_args()
    window_size = args.window_size
    scaler = args.scaler
    indexing, n_samples, online_semantic = sensor_data_prepare(window_size)
    # one window spans roughly one minute, so window_size / 60 rows per second
    # is the frame rate at which the matching video frames are extracted
    fps = window_size / 60
    video_to_frames(fps)
    create_windowed_frames(window_size, indexing, n_samples, online_semantic)
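
# Example invocation (assumes the dataset is unpacked under uah_dataset/UAH-DRIVESET-v1/):
#   python dataset_preparation.py --window_size 360 --scaler minmax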