# preprocessing.py
# Load packages
import pandas as pd
import numpy as np
from missingpy import MissForest  # random-forest-based missing-value imputation
from datetime import timedelta
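# Note (an assumption, not part of the original pipeline): missingpy is
# unmaintained, and its import can fail with recent scikit-learn releases,
# which removed the sklearn.neighbors.base module it expects. A commonly used
# workaround is to alias the private module before importing missingpy:
#   import sys
#   import sklearn.neighbors._base
#   sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base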
def merge_data(dailyactivity, wear_time, dailyFitbitActiveZoneMinutes, dailyHRV,
               fitbitBreathingRate, fitbitSkinTemperature, sleepDay,
               sleepStageLogInfo, heartrate_15min):
    # Standardize the date column name across all data files
    dailyactivity.rename(columns={'ActivityDate': 'date'}, inplace=True)
    wear_time.rename(columns={'Day': 'date'}, inplace=True)
    dailyFitbitActiveZoneMinutes.rename(columns={'Date': 'date'}, inplace=True)
    dailyHRV.rename(columns={'SleepDay': 'date'}, inplace=True)
    fitbitBreathingRate.rename(columns={'SleepDay': 'date'}, inplace=True)
    fitbitSkinTemperature.rename(columns={'SleepDay': 'date'}, inplace=True)
    sleepDay.rename(columns={'SleepDay': 'date'}, inplace=True)
    sleepStageLogInfo.rename(columns={'SleepDay': 'date'}, inplace=True)
    heartrate_15min.rename(columns={'Time': 'date'}, inplace=True)
    # Parse dates consistently across all files
    total_data = [wear_time, dailyactivity, dailyFitbitActiveZoneMinutes, dailyHRV,
                  fitbitBreathingRate, fitbitSkinTemperature, sleepDay,
                  sleepStageLogInfo, heartrate_15min]
    for d in total_data:
        d['date'] = pd.to_datetime(d['date'], errors='coerce')
        d['date'] = d['date'].dt.date
    # Aggregate the 15-minute heart rate readings to daily means
    heartrate_15min = heartrate_15min.groupby(by=['Id', 'date']).mean().reset_index()
    # Merge everything onto the wear-time table; left joins keep every
    # wear-time day even when a source has no record for that day
    merged1 = pd.merge(wear_time, dailyactivity, on=['Id', 'date'], how='left')
    merged2 = pd.merge(merged1, sleepDay, on=['Id', 'date'], how='left')
    merged3 = pd.merge(merged2, sleepStageLogInfo, on=['Id', 'date'], how='left')
    merged4 = pd.merge(merged3, heartrate_15min, on=['Id', 'date'], how='left')
    merged5 = pd.merge(merged4, dailyFitbitActiveZoneMinutes, on=['Id', 'date'], how='left')
    merged6 = pd.merge(merged5, dailyHRV, on=['Id', 'date'], how='left')
    merged7 = pd.merge(merged6, fitbitBreathingRate, on=['Id', 'date'], how='left')
    merged_final = pd.merge(merged7, fitbitSkinTemperature, on=['Id', 'date'], how='left')
    # Filter: keep days with at least 10 hours (600 minutes) of wear time
    # TODO: clarify whether the 4-days-per-week wear criterion refers to a
    # calendar week or to four consecutive days
    merged_final_10 = merged_final[merged_final['TotalMinutesWearTime'] >= 10 * 60]
    # Also keep days where wear time is recorded as 0 but the step count
    # suggests the device was actually worn
    merged_0_3000steps = merged_final[(merged_final['TotalMinutesWearTime'] == 0)
                                      & (merged_final['TotalSteps'] >= 3000)]
    merged_final = pd.concat([merged_final_10, merged_0_3000steps])
    merged_final = merged_final.sort_values(by=['Id', 'date'])
    # Recode Id: keep the last three characters and cast to int
    merged_final['Id'] = merged_final['Id'].str[-3:].astype(int)
    return merged_final
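# Example call (a sketch; the CSV file names below are hypothetical and should
# be replaced with the actual Fitbit export paths):
#   merged = merge_data(pd.read_csv('dailyActivity.csv'),
#                       pd.read_csv('wearTime.csv'),
#                       pd.read_csv('dailyFitbitActiveZoneMinutes.csv'),
#                       pd.read_csv('dailyHRV.csv'),
#                       pd.read_csv('fitbitBreathingRate.csv'),
#                       pd.read_csv('fitbitSkinTemperature.csv'),
#                       pd.read_csv('sleepDay.csv'),
#                       pd.read_csv('sleepStageLogInfo.csv'),
#                       pd.read_csv('heartrate_15min.csv'))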
def select_survey_data(survey_data):
    # Preprocess the survey data: select people who completed the health coach
    # session and the second follow-up survey
    survey_data_clean = survey_data[survey_data['weeks_followup_survey_96ac_complete'] == 2]  # 87 participants
    # Variables to keep from the survey
    uses_vars_of_survey = ['record_id', 'current_status',
                           'demographics_age', 'demographics_sex', 'demographics_sexorient',
                           'demographics_ethnicity', 'demographics_immigration',
                           'demographics_race___1', 'demographics_race___2', 'demographics_race___3',
                           'demographics_race___4', 'demographics_race___5',
                           'demographics_race___6', 'demographics_race___7',
                           'demographics_education', 'demographics_sorority',
                           'nervous_v1', 'down_v1', 'calm_v1', 'blue_v1', 'happy_v1',
                           'nervous_v2', 'down_v2', 'calm_v2', 'blue_v2', 'happy_v2',
                           'weeks_followup_survey_complete', 'weeks_followup_survey_96ac_complete',
                           'weeks_followup_survey_timestamp', 'weeks_followup_survey_96ac_timestamp'
                           ]
    # Demographic information; .copy() avoids SettingWithCopyWarning below
    select_survey = survey_data_clean[uses_vars_of_survey].copy()
    # Rename record_id to Id to match the Fitbit data
    select_survey.rename(columns={'record_id': 'Id'}, inplace=True)
    # Normalize the follow-up timestamps to plain dates
    select_survey['weeks_followup_survey_timestamp'] = pd.to_datetime(
        select_survey['weeks_followup_survey_timestamp'], errors='coerce').dt.date
    select_survey['weeks_followup_survey_96ac_timestamp'] = pd.to_datetime(
        select_survey['weeks_followup_survey_96ac_timestamp'], errors='coerce').dt.date
    return select_survey
def impute_missing(data):
    # Impute missing values per participant with MissForest
    total_ids = data['Id'].unique()
    impute_data_frame = pd.DataFrame()
    for i in total_ids:
        imputed_data = data[data['Id'] == i]
        imputed_data_date = imputed_data.set_index('date')
        # Drop columns whose values are all missing; MissForest cannot impute them
        non_missing_columns = imputed_data_date.dropna(axis=1, how='all')
        # Apply MissForest if any columns with data remain
        if not non_missing_columns.empty:
            mf = MissForest()
            imputed_data_transformed = pd.DataFrame(mf.fit_transform(non_missing_columns),
                                                    index=non_missing_columns.index,
                                                    columns=non_missing_columns.columns)
            # Re-add the dropped all-missing columns, filled with 0
            for col in imputed_data_date.columns:
                if col not in imputed_data_transformed.columns:
                    imputed_data_transformed[col] = 0
            # Reorder columns to match the original order and restore the date column
            imputed_data_transformed = imputed_data_transformed[imputed_data_date.columns]
            last_data_frame = imputed_data_transformed.reset_index()
            # Append this participant to the final DataFrame
            impute_data_frame = pd.concat([impute_data_frame, last_data_frame])
    return impute_data_frame
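# A hedged alternative to impute_missing above: scikit-learn's IterativeImputer
# with a random-forest estimator approximates MissForest without the missingpy
# dependency. This is a minimal sketch under the same per-participant scheme,
# not the method used in the original pipeline.
def impute_missing_sklearn(data):
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer
    from sklearn.ensemble import RandomForestRegressor
    imputed_frames = []
    for i in data['Id'].unique():
        individual = data[data['Id'] == i].set_index('date')
        # Drop all-missing columns, as impute_missing does
        observed = individual.dropna(axis=1, how='all')
        if observed.empty:
            continue
        imputer = IterativeImputer(estimator=RandomForestRegressor(n_estimators=100),
                                   random_state=0)
        filled = pd.DataFrame(imputer.fit_transform(observed),
                              index=observed.index, columns=observed.columns)
        # Restore the dropped columns as 0, matching impute_missing
        for col in individual.columns:
            if col not in filled.columns:
                filled[col] = 0
        imputed_frames.append(filled[individual.columns].reset_index())
    return pd.concat(imputed_frames, ignore_index=True)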
def extend_time(impute_data_frame, select_survey):
    # Extend each participant's records to a continuous daily date range that
    # ends the day before the second follow-up survey
    time_sequence = impute_data_frame.copy()
    all_fitbit_id = time_sequence['Id'].unique()
    new_data_frame = pd.DataFrame()
    for fitbit_id in all_fitbit_id:
        sorted_individual = time_sequence[time_sequence['Id'] == fitbit_id].sort_values(by='date')
        # Outcome variable: second follow-up completion date
        individual_survey = select_survey[select_survey['Id'] == fitbit_id]
        followup2_time = individual_survey['weeks_followup_survey_96ac_timestamp'].iloc[0] if not individual_survey['weeks_followup_survey_96ac_timestamp'].empty else pd.NaT
        if pd.isna(followup2_time):
            continue  # skip participants without a second follow-up date
        start_date = sorted_individual['date'].iloc[0]
        end_date = followup2_time - timedelta(days=1)  # the day before completing the second follow-up
        full_date_range = pd.date_range(start=start_date, end=end_date)
        sorted_individual.set_index('date', inplace=True)
        # Reindex to the full range; days without records become NaN rows
        # (handle the missing values before any fill)
        sorted_individual_reindexed = sorted_individual.reindex(full_date_range).reset_index()
        sorted_individual_reindexed['Id'] = fitbit_id
        sorted_individual_reindexed.rename(columns={'index': 'date'}, inplace=True)
        new_data_frame = pd.concat([new_data_frame, sorted_individual_reindexed])
    return new_data_frame
def assign_survey_date(time, followup1_time, followup2_time):
    # Code each day by study period: 1 = before the first follow-up,
    # 2 = the first follow-up day, 3 = between follow-ups,
    # 4 = the (shifted) second follow-up day
    if time < followup1_time:
        return 1
    elif time == followup1_time:
        return 2
    elif followup1_time < time < followup2_time:
        return 3
    elif time == followup2_time:
        return 4
    return None
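# Illustration with hypothetical dates (followup2_time here is already shifted
# back one day, as recode_survey_time below does):
#   from datetime import date
#   assign_survey_date(date(2023, 2, 20), date(2023, 3, 1), date(2023, 3, 28))  # -> 1
#   assign_survey_date(date(2023, 3, 1),  date(2023, 3, 1), date(2023, 3, 28))  # -> 2
#   assign_survey_date(date(2023, 3, 10), date(2023, 3, 1), date(2023, 3, 28))  # -> 3
#   assign_survey_date(date(2023, 3, 28), date(2023, 3, 1), date(2023, 3, 28))  # -> 4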
def recode_survey_time(new_data_frame, select_survey):
    new_used_fitbit_data = new_data_frame.copy()
    all_fitbit_id = new_used_fitbit_data['Id'].unique()
    for fitbit_id in all_fitbit_id:
        # Follow-up dates for this participant
        individual_survey = select_survey[select_survey['Id'] == fitbit_id]
        followup1_time = individual_survey['weeks_followup_survey_timestamp'].iloc[0] if not individual_survey['weeks_followup_survey_timestamp'].empty else pd.NaT
        followup2_time = individual_survey['weeks_followup_survey_96ac_timestamp'].iloc[0] if not individual_survey['weeks_followup_survey_96ac_timestamp'].empty else pd.NaT
        followup2_time = followup2_time - timedelta(days=1)  # shift to the last Fitbit day
        if pd.isna(followup1_time) or pd.isna(followup2_time):
            print(f"Missing follow-up times for Id {fitbit_id}")
            continue
        # Predictors: label each Fitbit day with its study period
        individual_fitbit = new_used_fitbit_data[new_used_fitbit_data['Id'] == fitbit_id].copy()
        individual_fitbit['survey_date'] = individual_fitbit.apply(
            lambda row: assign_survey_date(row['date'].date(), followup1_time, followup2_time), axis=1
        )
        # Write the new labels back into the full DataFrame
        new_used_fitbit_data.loc[new_used_fitbit_data['Id'] == fitbit_id, 'survey_date'] = individual_fitbit['survey_date']
    return new_used_fitbit_data
def add_gaussian_noise(time_series, mean=0.0, stddev=1.0):
    # Generate Gaussian noise matching the series length
    noise = np.random.normal(mean, stddev, len(time_series))
    # Add the noise to the original time series (e.g., for data augmentation)
    noisy_series = time_series + noise
    return noisy_series
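# Minimal demo of the noise augmentation on a synthetic series (hypothetical
# values standing in for a daily step count):
if __name__ == '__main__':
    fake_steps = pd.Series(8000 + 1000 * np.sin(np.linspace(0, 6, 30)))
    noisy_steps = add_gaussian_noise(fake_steps, mean=0.0, stddev=50.0)
    print(noisy_steps.head())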