# Author: Saeesh Mangwani
# Date: 2021-06-02
# Description: A script that scrapes and organizes observation well data
# published by the B.C. Ministry of Environment at
# https://www.env.gov.bc.ca/wsd/data_searches/obswell/map/data/
# %% ==== Loading libraries ====
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from io import StringIO
from sqlalchemy import create_engine
from json import load
# %% ==== Initializing global variables ====

# Reading database credentials from file
with open('credentials.json') as credfile:
    creds = load(credfile)

# Setting the default schema to 'obswell' unless another was specified in the file
if 'schema' not in creds:
    creds['schema'] = 'obswell'
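
# A minimal sketch of the expected 'credentials.json', inferred from the
# creds[...] lookups in this script; the values shown are placeholders, not
# real connection details:
#
# {
#     "user": "postgres",
#     "password": "<password>",
#     "host": "localhost",
#     "port": "5432",
#     "dbname": "<database>",
#     "schema": "obswell"
# }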

# Database connection
db = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}?options=-csearch_path%3D{}'.format(
    creds['user'],
    creds['password'],
    creds['host'],
    creds['port'],
    creds['dbname'],
    creds['schema'],
))
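# Note: 'options=-csearch_path%3D{}' is the URL-encoded libpq form of
# 'options=-c search_path=<schema>'; it sets the Postgres search_path so that
# unqualified table names like 'hourly' below resolve to the chosen schema.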
conn = db.raw_connection()
cursor = conn.cursor()

# The path to the file where the update report from each run should be
# stored. Defaults to the working directory
path_to_report = 'update_report.txt'

# A dictionary giving the datatype of each of the 4 columns in every data file
dtype_dict = {
    'Time': 'datetime64[ns]',
    'Value': 'float64',
    'Approval': 'str',
    'myLocation': 'str'
}
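
# For reference, the assumed layout of each station CSV is a header row of
# the form below (inferred from the dtype keys above, not verified against
# every file):
#
#   Time,Value,Approval,myLocation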

# Today's date as a string
thisDate = str(datetime.today().date())

# %% ==== Preparing URL names ====

# Requesting the page that lists all data files
page = requests.get(
    'https://www.env.gov.bc.ca/wsd/data_searches/obswell/map/data')
# Parsing with BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
# Using BeautifulSoup to find the file-listing table element by tag name
dat_tab = soup.find(name='table')
# Converting it to a pandas dataframe
df = pd.read_html(str(dat_tab), na_values=' ', keep_default_na=False)[0]
# Getting only the filename column as a list
nameList = df.Name.dropna().to_list()
# Keeping only those names that end with the '-data.csv' suffix
datNames = [name for name in nameList if re.search(r'-data\.csv$', name)]
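# For illustration, a hypothetical listing entry such as 'OW001-data.csv'
# would pass this filter, while companion files lacking the '-data.csv'
# suffix would be dropped.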

# %% ==== Creating dictionaries to store request status + error messages ====
datStatus = {name: '' for name in datNames}
datError = {name: '' for name in datNames}

# %% ==== Iterating over urls to download and format data ====

# For each name in the list of names
for name in datNames:
    try:
        # Initializing an empty string buffer
        sio = StringIO()
        # Creating the url for this dataset
        url = 'https://www.env.gov.bc.ca/wsd/data_searches/obswell/map/data/' + name
        # Getting the data
        page = requests.get(url)
        # Storing the response status
        datStatus[name] = page.status_code
        # Splitting the csv text by lines
        lines = page.text.splitlines()
        # Since each line is a single string, further splitting each line
        # into a list of values (also stripping any quoting)
        text_list = [line.replace('"', '').split(',') for line in lines]
        # Converting the list of lists to a dataframe, using the first row
        # as the header
        df = pd.DataFrame(text_list[1:], columns=text_list[0])
        # Ensuring type consistency
        df = df.astype(dtype_dict)
        # If this is the first table in the list of names, recreating the
        # database table with matching columns (replacing any previous version)
        if name == datNames[0]:
            df.head(0).to_sql('hourly', db, if_exists='replace', index=False)
        # Writing the data to the buffer without a header row, since
        # copy_from expects raw rows only
        df.to_csv(sio, sep=',', index=False, header=False, columns=df.columns.array)
        sio.seek(0)
        # Copying from the buffer to the database
        cursor.copy_from(sio, 'hourly', sep=',')
        conn.commit()
        # Status print
        print('Completed download for ' + name)
    except Exception as e:
        # Printing a message in case of an error
        print('Download failed for station', name)
        print('Error message:', str(e))
        # Saving the error message in the error dictionary
        datError[name] = str(e)

# Status print when the iteration completes
print('All downloads completed')
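
# Optional sanity check (a sketch, not part of the scraping loop itself):
# counting the rows now present in the 'hourly' table
cursor.execute('SELECT count(*) FROM hourly')
print('Rows loaded into hourly:', cursor.fetchone()[0])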

# %% ==== Writing a status report ====

# Checking whether all links were valid
all_valid = all(
    [status == 200 for status in datStatus.values()]
)
# Checking whether all downloads were error free
all_errorFree = all(
    [error == '' for error in datError.values()]
)

# Opening a report file
with open(path_to_report, 'w') as f:
    # Printing a header and description
    print('===== BC Observation Wells Data Scraper =====', file=f)
    print('', file=f)
    print('This file gives a summary of the most recent BC\nObservation Well dataset update, run via\nthe script "obswell_scraping_*.py"', file=f)
    print('', file=f)
    # Date of scraping attempt
    print('Last scrape: ', str(datetime.now()), file=f)
    print('', file=f)
    # Whether all links were valid
    print('All data links valid:', all_valid, file=f)
    print('', file=f)
    if not all_valid:
        print('Invalid links were:', file=f)
        # Getting the invalid links (i.e. those whose response code was not 200)
        invalid = {key: value for key, value in datStatus.items()
                   if value != 200}
        # Printing them as a dataframe
        print(pd.DataFrame(data=list(invalid.items()),
                           columns=['station', 'response_code']), file=f)
    # Whether there were download or other errors
    print('All downloads and data formatting were error free:', all_errorFree, file=f)
    if not all_errorFree:
        print('Links that threw errors were:', file=f)
        # Getting the links that threw errors
        wError = {key: value for key, value in datError.items()
                  if value != ''}
        # Printing them as a dataframe
        print(pd.DataFrame(data=list(wError.items()),
                           columns=['station', 'error_message']), file=f)
    print('', file=f)
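
# Finally, releasing the database cursor and connection; a small optional
# tidy-up, since the interpreter would otherwise release them on exit
cursor.close()
conn.close()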