2- data_preprocessing.py
# -*- coding: utf-8 -*-
"""Data Preprocessing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1A2Uw-sUxi6iu-OmeodPQaW5vRTfpMQwS
#**3 Data Preprocessing**
##**3.1 Data Formatting**
"""
import pandas as pd
from datetime import datetime

# Load the CSV again, skipping the first row (website domain); `url` is defined in an earlier section
df = pd.read_csv(url, header=1)
# Extract rows whose date string is not in the standard 19-character 'YYYY-MM-DD HH:MM:SS' format
nan_data = df[df.date.str.len() < 19].copy()
# Extract rows that don't need extra modification
eth_data = df[df.date.str.len() >= 19]
# Parse the non-standard date strings (hour with an AM/PM suffix) into datetime objects
nan_data['date'] = nan_data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %I-%p'))
# Concatenate the two DataFrames back into a single one
eth = pd.concat([eth_data, nan_data], axis=0)
# eth.to_csv('eth.csv')
# Convert the date column to datetime objects
eth['date'] = pd.to_datetime(eth.date)
eth.set_index('date', inplace=True)
# Drop columns that are not needed (unix timestamp, symbol, trade count)
eth.drop(columns=['unix', 'symbol', 'tradecount'], inplace=True)
eth.sort_index(ascending=True, inplace=True)
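# For reference, a minimal sketch of why the 19-character length split separates the two
# date formats. The sample strings below are illustrative only, not taken from the dataset.
_standard_sample = '2020-01-01 13:00:00'   # 'YYYY-MM-DD HH:MM:SS' -> 19 characters
_nonstandard_sample = '2020-01-01 01-PM'   # hour-with-AM/PM format -> fewer than 19 characters
print(len(_standard_sample), len(_nonstandard_sample))           # 19 16
print(datetime.strptime(_nonstandard_sample, '%Y-%m-%d %I-%p'))  # 2020-01-01 13:00:00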
"""##**3.2 Handling Missing Values**"""
# Find rows that contain missing values
nan = eth[eth.isnull().any(axis=1)]
# Select the records immediately before and after the missing one
rows_toavg = eth.loc[['2019-11-07 02:00:00','2019-11-07 04:00:00'], 'Volume USDT']
# Fill the missing 'Volume USDT' value with the average of the two neighbouring records
eth['Volume USDT'] = eth['Volume USDT'].fillna(rows_toavg.mean())
# Confirm that no missing values remain
eth.isnull().sum()
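# Optional sanity check, a sketch assuming the missing record is the 03:00 row that sits
# between the two neighbours selected above:
# assert eth.loc['2019-11-07 03:00:00', 'Volume USDT'] == rows_toavg.mean()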