-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatures_gen.py
111 lines (85 loc) · 3.88 KB
/
features_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
# Location of the input CSV, relative to the working directory.
data_path = './data/Full_data_Trentino.csv'

# Default taxon columns for which features are generated.
taxa = [
    'Ambrosia',
    'Artemisia',
    'Betula',
    'Corylus',
    'Cupressaceae, Taxaceae',
    'Fraxinus',
    'Olea europaea',
    'Ostrya carpinifolia',
    'Poaceae',
    'Urticaceae',
]
def days_convolution(num_days):
    """Return a uniform averaging kernel with ``num_days`` equal weights.

    Each weight is ``1 / num_days``, so the weights sum to 1 and convolving
    a series with the result produces a moving average over ``num_days``
    samples.
    """
    weights = np.ones(num_days)
    weights /= num_days
    return weights
def get_features(taxa: list[str] = taxa):
    """Load the dataset and derive rolling and smoothed input features.

    Reads the CSV at the module-level ``data_path``, replaces the
    missing-value placeholders ``'--'``, ``''`` and ``' '`` by the previous
    row's value, casts every column except ``datetime`` to ``float32`` and
    then appends, in this order:

    1. for every meteorological column and every taxon, the rolling mean and
       rolling variance over windows of 7, 14, 30, 90 and 180 rows;
    2. for each of those statistics, its difference from the value one full
       window earlier (``..._delta``) and, for taxa only, its difference
       from the value 2, 3, 4 and 5 rows earlier (``..._delta_{i}w``);
    3. for every taxon, moving averages over 7..47 rows computed with
       ``np.convolve(..., mode="same")`` (``..._convolution_{i}_days``).

    Rows that still contain a NaN after feature construction are dropped.

    Args:
        taxa: Names of the taxon columns to build features for. Defaults to
            the module-level ``taxa`` list; the list is not modified.

    Returns:
        A tuple ``(data, features)``. ``data`` is the DataFrame with the
        original, calendar and derived columns; ``features`` lists every
        column name of ``data`` except ``datetime``, ``year``, ``month`` and
        ``day``.
    """
    # Loading data and deriving calendar columns.
    data = pd.read_csv(data_path)
    data['datetime'] = pd.to_datetime(data['datetime'])
    data['year'] = data['datetime'].dt.year
    data['month'] = data['datetime'].dt.month
    data['day'] = data['datetime'].dt.dayofyear

    # Meteorological features
    meteo_features = ['temp_max', 'temp_min', 'temp_mean', 'rain', 'humidity',
                      'wind_dir', 'wind_speed', 'wind_gusts', 'rad',
                      'sun_hours', 'pressure']

    # Time windows (1/2 weeks, 1/3/6 months), expressed in rows.
    # NOTE(review): the labels assume one row per day - confirm against the
    # CSV.
    time_windows = {
        '1w': 7,
        '2w': 14,
        '1m': 30,
        '3m': 90,
        '6m': 180
    }

    # Turn the missing-value placeholders into NaN, then carry the previous
    # row's value forward. Leading NaNs have nothing to copy from and are
    # removed by the final dropna().
    data = data.replace(['--', '', ' '], np.nan).ffill()

    # Change all columns to float32 except datetime.
    data = data.astype(
        {column: 'float32' for column in data.columns if column != 'datetime'})

    # Derived columns are collected in a dict and concatenated once at the
    # end; insertion order defines the order of the output columns.
    features_data = {}
    feature_columns = meteo_features + taxa

    for f in feature_columns:
        for window_name, window_size in time_windows.items():
            rolling = data[f].rolling(window=window_size, min_periods=1)
            features_data[f'{f}_rolling_mean_{window_name}'] = rolling.mean()
            features_data[f'{f}_rolling_var_{window_name}'] = rolling.var()

    for f in feature_columns:
        is_taxon = f in taxa
        for window_name, window_size in time_windows.items():
            # Keys are built on single lines: replacement fields that span
            # several physical lines only parse on Python >= 3.12.
            mean_key = f'{f}_rolling_mean_{window_name}'
            var_key = f'{f}_rolling_var_{window_name}'
            rolling_mean = features_data[mean_key]
            rolling_var = features_data[var_key]

            # Change with respect to one full window earlier.
            features_data[f'{mean_key}_delta'] = (
                rolling_mean - rolling_mean.shift(window_size))
            features_data[f'{var_key}_delta'] = (
                rolling_var - rolling_var.shift(window_size))

            if not is_taxon:
                # Adding this type of meteo features lowers the performance
                # of the model.
                # TODO: find new meteo features to add
                continue

            # NOTE(review): shift(i) moves by i rows, although the column
            # suffix is "w"; the names are kept so downstream code that
            # refers to them keeps working.
            for i in range(2, 6):
                features_data[f'{mean_key}_delta_{i}w'] = (
                    rolling_mean - rolling_mean.shift(i))
                features_data[f'{var_key}_delta_{i}w'] = (
                    rolling_var - rolling_var.shift(i))

    # Moving averages of the taxa columns over 7..47 rows.
    # NOTE(review): mode="same" centres the kernel, so each output value also
    # averages rows that follow it, and the series is implicitly zero-padded
    # at both ends - confirm this is intended.
    for i in range(7, 48):
        kernel = days_convolution(i)  # identical for every taxon
        for t in taxa:
            features_data[f'{t}_convolution_{i}_days'] = np.convolve(
                data[t], kernel, mode="same")

    final_features_df = pd.DataFrame(features_data)
    data = pd.concat([data, final_features_df], axis=1)
    data = data.dropna()

    non_feature_columns = {'datetime', 'year', 'month', 'day'}
    features = [f for f in data.keys() if f not in non_feature_columns]
    return data, features