-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbanks_project.py
104 lines (69 loc) · 3.24 KB
/
banks_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Code for ETL operations on Country-GDP data
from io import StringIO
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from datetime import datetime
from icecream import ic
def log_progress(message):
"""This function logs the mentioned message of a given stage of the
code execution to a log file. Function returns nothing"""
with open('code_log.txt', 'a') as f:
f.write(f'{datetime.now()}: {message}\n')
def extract(url, table_attribs):
""" This function aims to extract the required
information from the website and save it to a data frame. The
function returns the data frame for further processing. """
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
table = soup.find('span', string=table_attribs).find_next('table')
df = pd.read_html(StringIO(str(table)))[0]
log_progress('Data extraction complete. Initiating Transformation process')
return df
def transform(df, csv_path):
""" This function accesses the CSV file for exchange rate
information, and adds three columns to the data frame, each
containing the transformed version of Market Cap column to
respective currencies"""
exchange_rate = pd.read_csv(csv_path, index_col=0).to_dict()['Rate']
df['MC_GBP_Billion'] = round(df['Market cap (US$ billion)'] * exchange_rate['GBP'], 2)
df['MC_EUR_Billion'] = round(df['Market cap (US$ billion)'] * exchange_rate['EUR'], 2)
df['MC_INR_Billion'] = round(df['Market cap (US$ billion)'] * exchange_rate['INR'], 2)
ic(df['MC_EUR_Billion'][4])
log_progress('Data transformation complete. Initiating Loading process')
return df
def load_to_csv(df, output_path):
""" This function saves the final data frame as a CSV file in
the provided path. Function returns nothing."""
df.to_csv(output_path)
log_progress('Data saved to CSV file')
def load_to_db(df, sql_connection, table_name):
""" This function saves the final data frame to a database
table with the provided name. Function returns nothing."""
df.to_sql(table_name, sql_connection, if_exists='replace', index=False)
log_progress('Data loaded to Database as a table, Executing queries')
def run_query(query_statement, sql_connection):
""" This function runs the query on the database table and
prints the output on the terminal. Function returns nothing. """
cursor = sql_connection.cursor()
cursor.execute(query_statement)
result = cursor.fetchall()
# for row in result:
# ic(row)
log_progress('Process Complete')
return result
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
output_csv_path = './Largest_banks_data.csv'
database_name = 'Banks.db'
table_name = 'Largest_banks'
#
log_progress('Preliminaries complete. Initiating ETL process')
#
df = ic(extract(url, 'By market capitalization'))
transform(df, 'exchange_rate.csv')
load_to_csv(df, output_csv_path)
with sqlite3.connect(database_name) as conn:
load_to_db(df, conn, table_name)
ic(run_query('SELECT * FROM Largest_banks', conn))
ic(run_query('SELECT AVG(MC_GBP_Billion) FROM Largest_banks', conn))
ic(run_query('SELECT "Bank name" FROM Largest_banks LIMIT 5', conn))