forked from Karlheinzniebuhr/the-weather-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
weather_scraper.py
90 lines (71 loc) · 3.69 KB
/
weather_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Made with love by Karl
# Contact me on Telegram: @karlpy
import requests
import csv
import lxml.html as lh
import config
from util.UnitConverter import ConvertToSystem
from util.Parser import Parser
from util.Utils import Utils
# configuration
# Station URLs live one per line in stations.txt; read them all up front.
# Use a context manager so the file handle is closed (the original left it open).
with open('stations.txt', 'r') as stations_file:
    URLS = stations_file.readlines()
# Date format: YYYY-MM-DD
START_DATE = config.START_DATE
END_DATE = config.END_DATE
# set to "metric" or "imperial"
UNIT_SYSTEM = config.UNIT_SYSTEM
# find the first data entry automatically
FIND_FIRST_DATE = config.FIND_FIRST_DATE
def scrap_station(weather_station_url):
    """Scrape historical weather data for one station and append it to
    '<station_name>.csv' in the current directory.

    The station name is taken from the last path segment of the URL.
    A unit-appropriate header row is written on every call (the file is
    opened in append mode), then one row per parsed observation.

    :param weather_station_url: Weather Underground station history URL.
    :raises Exception: if UNIT_SYSTEM is neither "metric" nor "imperial".
    """
    session = requests.Session()
    timeout = 5

    # Work on local copies so one station's discovered first-data date
    # does not leak into the scrape of subsequent stations (the original
    # mutated the module-level START_DATE via `global`).
    start_date = START_DATE
    end_date = END_DATE

    if FIND_FIRST_DATE:
        # Utils.find_first_data_entry returns -1 when no entry is found.
        first_date_with_data = Utils.find_first_data_entry(
            weather_station_url=weather_station_url, start_date=start_date)
        if first_date_with_data != -1:
            start_date = first_date_with_data

    url_gen = Utils.date_url_generator(weather_station_url, start_date, end_date)
    station_name = weather_station_url.split('/')[-1]
    file_name = f'{station_name}.csv'

    with open(file_name, 'a+', newline='') as csvfile:
        fieldnames = ['Date', 'Time', 'Temperature', 'Dew_Point', 'Humidity', 'Wind', 'Speed', 'Gust', 'Pressure', 'Precip_Rate', 'Precip_Accum', 'UV', 'Solar']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Write the correct headers to the CSV file
        if UNIT_SYSTEM == "metric":
            # 12:04 AM 24.4 C 18.3 C 69 % SW 0.0 km/h 0.0 km/h 1,013.88 hPa 0.00 mm 0.00 mm 0 0 w/m²
            writer.writerow({'Date': 'Date', 'Time': 'Time', 'Temperature': 'Temperature_C', 'Dew_Point': 'Dew_Point_C', 'Humidity': 'Humidity_%', 'Wind': 'Wind', 'Speed': 'Speed_kmh', 'Gust': 'Gust_kmh', 'Pressure': 'Pressure_hPa', 'Precip_Rate': 'Precip_Rate_mm', 'Precip_Accum': 'Precip_Accum_mm', 'UV': 'UV', 'Solar': 'Solar_w/m2'})
        elif UNIT_SYSTEM == "imperial":
            # 12:04 AM 75.9 F 65.0 F 69 % SW 0.0 mph 0.0 mph 29.94 in 0.00 in 0.00 in 0 0 w/m²
            writer.writerow({'Date': 'Date', 'Time': 'Time', 'Temperature': 'Temperature_F', 'Dew_Point': 'Dew_Point_F', 'Humidity': 'Humidity_%', 'Wind': 'Wind', 'Speed': 'Speed_mph', 'Gust': 'Gust_mph', 'Pressure': 'Pressure_in', 'Precip_Rate': 'Precip_Rate_in', 'Precip_Accum': 'Precip_Accum_in', 'UV': 'UV', 'Solar': 'Solar_w/m2'})
        else:
            raise Exception("please set 'unit_system' to either \"metric\" or \"imperial\"! ")

        for date_string, url in url_gen:
            try:
                print(f'Scraping data from {url}')
                history_table = False
                # The table is rendered client-side and sometimes missing from
                # the response; retry with a fresh session until it appears.
                while not history_table:
                    response = session.get(url, timeout=timeout)
                    doc = lh.fromstring(response.content)
                    history_table = doc.xpath('//*[@id="main-page-content"]/div/div/div/lib-history/div[2]/lib-history-table/div/div/div/table/tbody')
                    if not history_table:
                        print("refreshing session")
                        session = requests.Session()
                # parse html table rows
                data_rows = Parser.parse_html_table(date_string, history_table)
                # convert values to the configured unit system
                converter = ConvertToSystem(UNIT_SYSTEM)
                data_to_write = converter.clean_and_convert(data_rows)
                print(f'Saving {len(data_to_write)} rows')
                writer.writerows(data_to_write)
            except Exception as e:
                # Best-effort: log the failure (with the URL for context)
                # and keep scraping the remaining dates.
                print(f'Failed to scrape {url}: {e}')
# Scrape every station listed in stations.txt, one after another.
for station_line in URLS:
    station_url = station_line.strip()
    print(station_url)
    scrap_station(station_url)