2- data_preprocessing.py
# -*- coding: utf-8 -*-
"""Data Preprocessing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1A2Uw-sUxi6iu-OmeodPQaW5vRTfpMQwS
#**3 Data Preprocessing**
##**3.1 Data Formatting**
"""
import pandas as pd
from datetime import datetime

# Load the CSV again, skipping the first row (website domain); `url` is defined in an earlier section
df = pd.read_csv(url, header=1)
# Extract rows whose date string is not in the standard 19-character 'YYYY-MM-DD HH:MM:SS' format
nan_data = df[df.date.str.len() < 19].copy()
# Extract rows that don't need extra modification
eth_data = df[df.date.str.len() >= 19]
# Parse the non-standard date strings (hour with an AM/PM suffix) into datetime objects
nan_data['date'] = nan_data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %I-%p'))
# Concatenate the two DataFrames back into a single one
eth = pd.concat([eth_data, nan_data], axis=0)
# eth.to_csv('eth.csv')
# Convert the date column to datetime objects
eth['date'] = pd.to_datetime(eth.date)
eth.set_index('date', inplace=True)
# Drop columns that are not needed (unix timestamp, symbol, trade count)
eth.drop(columns=['unix', 'symbol', 'tradecount'], inplace=True)
eth.sort_index(ascending=True, inplace=True)
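# For reference, a minimal sketch of why the 19-character length split separates the two
# date formats. The sample strings below are illustrative only, not taken from the dataset.
_standard_sample = '2020-01-01 13:00:00'   # 'YYYY-MM-DD HH:MM:SS' -> 19 characters
_nonstandard_sample = '2020-01-01 01-PM'   # hour-with-AM/PM format -> fewer than 19 characters
print(len(_standard_sample), len(_nonstandard_sample))           # 19 16
print(datetime.strptime(_nonstandard_sample, '%Y-%m-%d %I-%p'))  # 2020-01-01 13:00:00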
"""##**3.2 Handling Missing Values**"""
# Find rows that contain missing values
nan = eth[eth.isnull().any(axis=1)]
# Select the records immediately before and after the missing one
rows_toavg = eth.loc[['2019-11-07 02:00:00','2019-11-07 04:00:00'], 'Volume USDT']
# Fill the missing 'Volume USDT' value with the average of the two neighbouring records
eth['Volume USDT'] = eth['Volume USDT'].fillna(rows_toavg.mean())
# Confirm that no missing values remain
eth.isnull().sum()
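# Optional sanity check, a sketch assuming the missing record is the 03:00 row that sits
# between the two neighbours selected above:
# assert eth.loc['2019-11-07 03:00:00', 'Volume USDT'] == rows_toavg.mean()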