-
Notifications
You must be signed in to change notification settings - Fork 12
/
workshop_solutions.py
123 lines (83 loc) · 3.44 KB
/
workshop_solutions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""Workshop main flow."""
# pylint: disable=invalid-name, fixme
# %% [1] Importing Libraries and Data
# Third-party imports
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split
# Local imports
from utils import aggregate_by_year, plot_correlations, plot_color_gradients
# %% [2] Exploring the Data
# Load the weather observations from the CSV file
weather_data = pd.read_csv('data/weatherHistory.csv')
# Print the number of rows. print() is used explicitly because a bare
# expression produces no output when this file is run as a script.
print(len(weather_data))
# Print the first three rows of the DataFrame
print(weather_data.head(3))
# TODO: Print the last three rows of the DataFrame
print(weather_data.tail(3))
# %% [3] Visualization
# Re-read the file so this cell always starts from the original string
# dates (the column is overwritten with parsed datetimes just below).
weather_data = pd.read_csv('data/weatherHistory.csv')
# Drop the last six characters of every date string before parsing it
# (presumably a UTC-offset suffix such as "+02:00" -- confirm against the CSV).
weather_data['Formatted Date'] = pd.to_datetime(
    weather_data['Formatted Date'].str[:-6])
# Order rows by date, renumber the index to follow the new order, and drop
# the columns listed below. sort_values(), reset_index() and drop() each
# return a new DataFrame, so the chained result has to be assigned; calling
# them as bare statements leaves the data unchanged.
# NOTE(review): 'Loud Cover' is kept exactly as written -- presumably it
# matches the CSV header; confirm before "correcting" the spelling.
weather_data_ordered = (
    weather_data
    .sort_values(by='Formatted Date')
    .reset_index(drop=True)
    .drop(columns=['Summary', 'Precip Type', 'Loud Cover', 'Daily Summary'])
)
# Plot temperature vs. date
weather_data_ordered.plot(
    x='Formatted Date', y='Temperature (C)', color='red', figsize=(15, 8))
# TODO: Plot temperature vs date using only the data from 2006
weather_data_ordered.loc[
    weather_data_ordered["Formatted Date"].dt.year == 2006, :].plot(
        x='Formatted Date', y='Temperature (C)', color='red')
# Plot temperature and humidity against date as separate subplots
weather_data_ordered.plot(
    subplots=True, x='Formatted Date', y=['Temperature (C)', 'Humidity'],
    figsize=(15, 8))
# TODO: Plot different combinations of the variables, and for different years
# %% [4] Data summarization and aggregation
# Aggregate the date-ordered observations per year with the local helper,
# telling it which column holds the timestamps.
weather_data_by_year = aggregate_by_year(
    weather_data_ordered,
    date_column='Formatted Date',
)
# TODO: Create and use a function to average the weather data by month
# %% [5] Data Analysis and Interpretation
# Plot correlations
plot_correlations(weather_data_ordered, size=15)
# Plot gradient colormaps
plot_color_gradients(
    cmap_category='Plot gradients convention', cmap_list=['viridis', ])
# Compute the pairwise correlation matrix over the numeric columns only.
# Selecting them explicitly keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer skips non-numeric columns by default.
weather_correlations = weather_data_ordered.select_dtypes(
    include='number').corr()
# Correlation between temperature and humidity. It is printed because a
# bare expression shows nothing when the file is run as a script.
print(weather_data_ordered['Temperature (C)'].corr(
    weather_data_ordered['Humidity']))
# TODO: Get the correlation for different combinations of variables.
# Contrast them with the weather_correlations dataframe
# %% [6] Data Modeling and Prediction
# Split into train and test sets with a quarter of the rows held out;
# humidity is the predictor and temperature the response. No random_state
# is passed, so the split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(
    weather_data_ordered['Humidity'],
    weather_data_ordered['Temperature (C)'],
    test_size=0.25,
)
# Fit a linear regression. reshape(-1, 1) turns each 1-D Series into a
# single-column 2-D array before it is handed to fit().
regression = linear_model.LinearRegression().fit(
    x_train.values.reshape(-1, 1),
    y_train.values.reshape(-1, 1),
)
# Print the fitted intercept (beta_0) and slope (beta_1)
print(regression.intercept_, regression.coef_)
# %% [7] Predictive Model Testing and Evaluation
# Predict temperature for the held-out humidity values
y_predict = regression.predict(x_test.values.reshape(-1, 1))
# Overlay the observed test points (red) and the model predictions (blue)
plt.scatter(x_test, y_test, c='red', label='Observation', s=1)
plt.scatter(x_test, y_predict, c='blue', label='Model')
plt.xlabel('Humidity')
plt.ylabel('Temperature (C)')
plt.legend()
plt.show()
# TODO: Using the model, predict the temperature for a given level of humidity
# Evaluate model numerically. The score is printed because a bare
# expression shows nothing when the file is run as a script.
print(explained_variance_score(y_test, y_predict))