-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathraw_data_processing.py
96 lines (70 loc) · 3.71 KB
/
raw_data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import glob
import re
import os
import pandas as pd
import numpy as np
__author__ = "C. I. Tang"
__copyright__ = "Copyright (C) 2020 C. I. Tang"
"""
Based on work of Tang et al.: https://arxiv.org/abs/2011.11542
Contact: [email protected]
License: GNU General Public License v3.0
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
def process_motion_sense_accelerometer_files(accelerometer_data_folder_path):
    """
    Load the MotionSense accelerometer CSV files and group the trials by participant

    Data files can be found at https://github.com/mmalekzadeh/motion-sense/tree/master/data

    Parameters:
        accelerometer_data_folder_path (str):
            path to the folder holding the unzipped data files
            e.g. motionSense/B_Accelerometer_data/
            every trial folder must sit directly inside it (e.g. motionSense/B_Accelerometer_data/dws_1/)

    Return:
        user_datasets (dict of {user_id: [(sensor_values, activity_labels)]})
            the key is the participant id (int), parsed from the digits right before '.csv' in the file name
            the value is a list with one (sensor_values, activity_labels) pair per trial file
                sensor_values is a 2D numpy array of shape (length, channels=3) built from the x, y, z columns,
                    after rows containing any missing value have been dropped
                activity_labels is a 1D numpy array of shape (length), holding the trial label repeated for every row
            trials are kept as separate pairs because time is not contiguous between them,
            which makes it easy to build sliding windows that never span two trials
    """
    user_id_pattern = r'(?P<user_id>[0-9]+)\.csv'
    trials_by_user = {}

    # Trial folders are visited in sorted order so the output is deterministic
    for folder in sorted(glob.glob(accelerometer_data_folder_path + "/*")):
        # The activity label is the part of the folder name before the first underscore
        activity = os.path.split(folder)[-1].split("_")[0]
        print(folder)

        # One csv file per participant inside each trial folder
        for csv_path in sorted(glob.glob(folder + "/*.csv")):
            matched = re.search(user_id_pattern, os.path.split(csv_path)[-1])
            if matched is None:
                # File name carries no numeric id: report it and move on
                print("[ERR] User id not found", csv_path)
                continue

            participant = int(matched.group('user_id'))

            # Read the trial and discard every row that has a missing value
            frame = pd.read_csv(csv_path)
            frame.dropna(how="any", inplace=True)

            # Keep only the three accelerometer channels
            xyz = frame[["x", "y", "z"]].values

            # A trial has a single label, so repeat it once per remaining row
            repeated_labels = np.repeat(activity, xyz.shape[0])

            trials_by_user.setdefault(participant, []).append((xyz, repeated_labels))

    return trials_by_user