diff --git a/.gitignore b/.gitignore
index 7bbc71c..9868b4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -99,3 +99,10 @@ ENV/
 # mypy
 .mypy_cache/
+
+# data
+/data*
+
+# credentials
+api_info.py
+db_info.py
diff --git a/Get_htmls.py b/Get_htmls.py
new file mode 100644
index 0000000..3527de1
--- /dev/null
+++ b/Get_htmls.py
@@ -0,0 +1,99 @@
+
+# coding: utf-8
+
+# In[24]:
+
+
+from requests import get
+from requests.exceptions import RequestException
+from contextlib import closing
+from bs4 import BeautifulSoup
+from datetime import datetime
+from datetime import timedelta
+import pandas as pd
+import urllib3
+import pickle
+
+
+# In[5]:
+
+
+def simple_get(url):
+    """
+    Attempts to get the content at `url` by making an HTTP GET request.
+    If the content-type of the response is some kind of HTML/XML, return the
+    text content, otherwise return None.
+    """
+    try:
+        with closing(get(url, stream=True)) as resp:
+            if is_good_response(resp):
+                return resp.content
+            else:
+                return None
+
+    except RequestException as e:
+        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
+        return None
+
+
+def is_good_response(resp):
+    """
+    Returns true if the response seems to be HTML, false otherwise.
+    """
+    content_type = resp.headers['Content-Type'].lower()
+    return (resp.status_code == 200
+            and content_type is not None
+            and content_type.find('html') > -1)
+
+
+def log_error(e):
+    """
+    It is always a good idea to log errors.
+    This function just prints them, but you can
+    make it do anything.
+    """
+    print(e)
+
+
+# In[6]:
+
+
+cities=['Berlin', 'Hamburg', 'Munich', 'Cologne', 'Frankfurt_am_Main']
+
+base_url=['https://www.wetter.de/deutschland/wetter-berlin-18228265/','https://www.wetter.de/deutschland/wetter-hamburg-18219464/','https://www.wetter.de/deutschland/wetter-muenchen-18225562/','https://www.wetter.de/deutschland/wetter-koeln-18220679/','https://www.wetter.de/deutschland/wetter-frankfurt-18221009/']
+
+
+# In[18]:
+
+
+def collect_htmls(city_base_url):
+    raw_html=[]
+    days_to_predict = 15
+    http = urllib3.PoolManager()
+    url_hourly_base = city_base_url
+    tag_tags = ['tag-'+str(tag) for tag in range(9,days_to_predict+1)]
+    hourly_website_tags = ['wetterbericht-aktuell', 'wetterbericht-morgen', 'wetterbericht-uebermorgen','wetter-bericht','wettervorhersage','wetter-vorhersage','wettervorschau','wetter-vorschau']
+    hourly_website_tags.extend(tag_tags)
+    for i, tag in enumerate(hourly_website_tags):
+        url = url_hourly_base+tag+'.html'
+        raw_html.append(simple_get(url))
+
+    return raw_html
+
+
+# In[26]:
+
+
+for i,city in enumerate(cities):
+    html_dict = {}
+    current_time = pd.Timestamp(datetime.now())
+
+    html_dict['website'] = 'www.wetter.de'
+    html_dict['city'] = city
+    html_dict['date_of_acquisition'] = current_time
+    html_dict['htmls'] = collect_htmls(base_url[i])
+    pkl_name='./wetter_de/wetter_de_'+city+'_'+str(current_time)[:10]+'.pkl'
+    f = open(pkl_name,"wb")
+    pickle.dump(html_dict,f)
+    f.close()
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..464453f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# webscraping_2018
+This repository contains the scripts that gather weather information from the websites bild.de and wetter.de by web scraping, and from The Weather Channel via RESTful API calls.
+The scripts that __gather the data__ run on a server as cron jobs. Their schedule is described in `crontab_info.txt`.
+
+The RESTful API scripts are organized as follows:
+ - `api_info.py` has the necessary information to access the Wunderground API.
+ + - `constants.py` has the global constants used across API scripts. + + - `city_location.py` is the script that gets the coordinates of specified named cities. + + - `daily_structured.py` is the script that __gathers daily data__. + + - `hourly_structured.py` is the script that __gathers hourly data__. + diff --git a/Web_Scraping_wetter_de_day_periods.py b/Web_Scraping_wetter_de_day_periods.py new file mode 100644 index 0000000..a09d9c1 --- /dev/null +++ b/Web_Scraping_wetter_de_day_periods.py @@ -0,0 +1,177 @@ + +# coding: utf-8 + +# In[51]: + + +from requests import get +from requests.exceptions import RequestException +from contextlib import closing +from bs4 import BeautifulSoup +#from datetime import datetime +from datetime import timedelta +import pandas as pd +import urllib3 +import datetime +import time +import os +import db_manager + +# -*- coding: utf -*- + + +# In[52]: + + +def simple_get(url): + """ + Attempts to get the content at `url` by making an HTTP GET request. + If the content-type of response is some kind of HTML/XML, return the + text content, otherwise return None + """ + try: + with closing(get(url, stream=True)) as resp: + if is_good_response(resp): + return resp.content + else: + return None + + except RequestException as e: + log_error('Error during requests to {0} : {1}'.format(url, str(e))) + return None + + +def is_good_response(resp): + """ + Returns true if the response seems to be HTML, false otherwise + """ + content_type = resp.headers['Content-Type'].lower() + return (resp.status_code == 200 + and content_type is not None + and content_type.find('html') > -1) + + +def log_error(e): + """ + It is always a good idea to log errors. + This function just prints them, but you can + make it do anything. + """ + print(e) + +def find_between(s, first, last): + try: + start = s.index(first) + len(first) + end = s.index(last, start) + return s[start:end] + except ValueError: + return "" + +def cut_string(s, cut): + try: + cut_from = s.index(cut) + len(cut) + return s[cut_from:] + except ValueError: + return "" + + +# In[53]: + + +class forecast(object): + def __init__(max_temp, min_temp, proc_date, acc_date): + self.max_temp = max_temp + self.min_temp = min_temp + self.proc_date = proc_date + self.acc_date = acc_date + +def create_weather_df(url, http, current_time): + + data = {} + soup = BeautifulSoup(http.request('GET',url).data,'lxml') + daily_periods_dict = {} + + proc_date = [] + temp = [] + rain = [] + wind = [] + condition = [] + rain_l = [] + + for day in range(15): + for h in range(4): + dt = (current_time + timedelta(days=day)).date() + proc_date.append(datetime.datetime.combine(dt,datetime.time(h*6+2)).strftime('%Y%m%d%H')) + + period_forcast = soup.findAll("div", {"class":'forecast-column column-1 wt-border-radius-6'}) + for period in period_forcast: + + temp.append(int(period.find('div', {'class':"forecast-text-temperature wt-font-light"}).text[:-1])) + condition.append(period.find('div', {'class':"forecast-column-condition"}).text) + + rain_html = period.find("div", {"class":'forecast-column-rain'}) + + r = rain_html.findAll('span', {'class':"wt-font-semibold"}) + if len(r) > 1: + rain.append(int(r[0].text[:-1])) + rain_l.append(r[1].text[:-4]) + else: + rain.append(int(rain_html.find('span', {'class':"wt-font-semibold"}).text[:-1])) + rain_l.append(None) + + wind_html = period.find("div", {"class":'forecast-column-wind'}) + wind.append(int(wind_html.find('span', {'class':"wt-font-semibold"}).text[1:-5])) + + 
daily_periods_dict['date_for_which_weather_is_predicted'] = proc_date + + daily_periods_dict['temperature'] = temp + daily_periods_dict['wind_speed'] = wind + daily_periods_dict['precipitation_per'] = rain + + daily_periods_dict['precipitation_l'] = rain_l + daily_periods_dict['condition'] = condition + + daily = pd.DataFrame(daily_periods_dict) + return daily + + +# In[54]: + + +cities=['Berlin', 'Hamburg', 'Munich', 'Cologne', 'Frankfurt_am_Main'] + +urls=['https://www.wetter.de/deutschland/wetter-berlin-18228265/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-hamburg-18219464/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-muenchen-18225562/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-koeln-18220679/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-frankfurt-18221009/wetterprognose.html'] + +http = urllib3.PoolManager() +current_time = pd.Timestamp(datetime.datetime.now()) +df = pd.DataFrame() + +for i,city in enumerate(cities): + url = urls[i] + cdf = create_weather_df(url,http,current_time) + cdf['city'] = city + df = df.append(cdf) + +df['website'] = 'https://www.wetter.de' +df['wind_direction'] = None +df['date_of_acquisition'] = current_time.strftime('%Y%m%d%H') + +# pkl_name='./wetter_de/day_periods/'+current_time.strftime('%Y%m%d%H')+'.pkl' +df.date_of_acquisition = df.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H').date()) +df.date_for_which_weather_is_predicted = df.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H%M').date()) + +#pkl_name='./wetter_de/daily/'+current_time.strftime('%Y%m%d%H')+'.pkl' +try: + db_manager.insert_df("DailyPeriodPrediction", df) +finally: + filename = os.path.expanduser('~/Documents/webscraping_2018/data_wetter_de/day_periods') + timestamp = datetime.datetime.now().strftime('%Y%m%d%H') + filename += timestamp + ".pkl" + df.to_pickle(filename) + + + diff --git a/Web_Scraping_wetter_de_full_day.py b/Web_Scraping_wetter_de_full_day.py new file mode 100644 index 0000000..be30596 --- /dev/null +++ b/Web_Scraping_wetter_de_full_day.py @@ -0,0 +1,163 @@ + +# coding: utf-8 + +# In[23]: + + +from requests import get +from requests.exceptions import RequestException +from contextlib import closing +from bs4 import BeautifulSoup +from datetime import timedelta +import pandas as pd +import urllib3 +import datetime +import time +import os +import db_manager + +# -*- coding: utf -*- + + +# In[24]: + + +def simple_get(url): + """ + Attempts to get the content at `url` by making an HTTP GET request. + If the content-type of response is some kind of HTML/XML, return the + text content, otherwise return None + """ + try: + with closing(get(url, stream=True)) as resp: + if is_good_response(resp): + return resp.content + else: + return None + + except RequestException as e: + log_error('Error during requests to {0} : {1}'.format(url, str(e))) + return None + + +def is_good_response(resp): + """ + Returns true if the response seems to be HTML, false otherwise + """ + content_type = resp.headers['Content-Type'].lower() + return (resp.status_code == 200 + and content_type is not None + and content_type.find('html') > -1) + + +def log_error(e): + """ + It is always a good idea to log errors. + This function just prints them, but you can + make it do anything. 
+ """ + print(e) + +def find_between(s, first, last): + try: + start = s.index(first) + len(first) + end = s.index(last, start) + return s[start:end] + except ValueError: + return "" + +def cut_string(s, cut): + try: + cut_from = s.index(cut) + len(cut) + return s[cut_from:] + except ValueError: + return "" + + +# In[25]: + + +class forecast(object): + def __init__(max_temp, min_temp, proc_date, acc_date): + self.max_temp = max_temp + self.min_temp = min_temp + self.proc_date = proc_date + self.acc_date = acc_date + +def create_weather_df(url, http, current_time): + + data = {} + soup = BeautifulSoup(http.request('GET',url).data,'lxml') + daily_periods_dict = {} + + proc_date = [] + temp_min = [] + temp_max = [] + condition = [] + + for day in range(15): + dt = (current_time + timedelta(days=day)).date() + proc_date.append(dt.strftime('%Y%m%d%H')) + + day_forcast = soup.findAll("div", {"class":'forecast-day'}) + for day in day_forcast: + + temps = day.find('div', {"class":'forecast-day-temperature'}) + temp_min.append(int(temps.find('span', {'class':"wt-color-temperature-max"}).text[:-1])) + temp_max.append(int(temps.find('span', {'class':"wt-color-temperature-min"}).text[:-1])) + + cond = str(day.find('div', {'class':"forecast-day-image"})) + condition.append(find_between(cond,'')) + + daily_periods_dict['date_for_which_weather_is_predicted'] = proc_date + + daily_periods_dict['temperature_min'] = temp_min + daily_periods_dict['temperature_max'] = temp_max + daily_periods_dict['condition'] = condition + + daily = pd.DataFrame(daily_periods_dict) + return daily + + +# In[26]: + + +cities=['Berlin', 'Hamburg', 'Munich', 'Cologne', 'Frankfurt_am_Main'] + +urls=['https://www.wetter.de/deutschland/wetter-berlin-18228265/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-hamburg-18219464/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-muenchen-18225562/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-koeln-18220679/wetterprognose.html', + 'https://www.wetter.de/deutschland/wetter-frankfurt-18221009/wetterprognose.html'] + +http = urllib3.PoolManager() +current_time = pd.Timestamp(datetime.datetime.now()) +df = pd.DataFrame() + +for i,city in enumerate(cities): + url = urls[i] + cdf = create_weather_df(url,http,current_time) + cdf['city'] = city + df = df.append(cdf) + +df['wind_speed'] = None +df['humidity'] = None +df['precipitation_per'] = None +df['precipitation_l'] = None +df['wind_direction'] = None +df['snow'] = None +df['uvi'] = None + +df['website'] = 'https://www.wetter.de' +df['date_of_acquisition'] = current_time.strftime('%Y%m%d%H') +df.date_of_acquisition = df.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H').date()) +df.date_for_which_weather_is_predicted = df.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H%M').date()) + +#pkl_name='./wetter_de/daily/'+current_time.strftime('%Y%m%d%H')+'.pkl' +try: + db_manager.insert_df("DailyPrediction", df) +finally: + filename = os.path.expanduser('~/Documents/webscraping_2018/data_wetter_de/daily') + timestamp = datetime.datetime.now().strftime('%Y%m%d%H') + filename += timestamp + ".pkl" + df.to_pickle(filename) diff --git a/Wetter_de_scraping.py b/Wetter_de_scraping.py new file mode 100644 index 0000000..76e5282 --- /dev/null +++ b/Wetter_de_scraping.py @@ -0,0 +1,109 @@ +from bs4 import BeautifulSoup +import urllib3 +import time +import datetime +import pandas as pd +import numpy as np +import pickle +import os +import 
db_manager + + +days_to_predict = 15 +http = urllib3.PoolManager() +cities = ['Berlin','Hamburg', 'Munich', 'Cologne', 'Frankfurt'] +cities_tags = ['berlin-18228265/' ,'hamburg-18219464/', 'muenchen-18225562/', 'koeln-18220679/', 'frankfurt-18221009/'] +url_hourly_base = 'https://www.wetter.de/deutschland/wetter-' +tag_tags = ['tag-'+str(tag) for tag in range(9,days_to_predict+1)] +hourly_website_tags = ['wetterbericht-aktuell', 'wetterbericht-morgen', 'wetterbericht-uebermorgen','wetter-bericht','wettervorhersage','wetter-vorhersage','wettervorschau','wetter-vorschau'] +hourly_website_tags.extend(tag_tags) + +wind_mapping = { 'Nord': 'N', 'Ost':'E', 'West':'W', 'Süd':'S', + 'Nordost':'NE','Nordnordost':'NNE', 'Nordostost':'NEE', + 'Südost':'SE','Südsüdost':'SSE', 'Südostost':'SEE', + 'Ostnordost':'ENE', 'Ostsüdost':'ESE', + 'Nordwest':'NW', 'Nordnordwest':'NNW', 'Nordwestwest':'NWW', + 'Südwest':'SW', 'Südsüdwest':'SSW', 'Südwestwest':'SWW', + 'Westnordwest':'WNW', 'Westsüdwest':'WSW', + 'Ostnord':'EN', 'Ostostnord':'EEN', 'Ostnordnord':'ENN', + 'Westnord':'WN','Westwestnord':'WWN', 'Westnordnord':'WNN', + 'Nordostnord':'NEN', 'Nordwestnord':'NWN', + 'Ostsüd':'ES', 'Ostostsüd':'EES', 'Ostsüdsüd':'ESS', + 'Westsüd':'WS','Westwestsüd':'WWS', 'Westsüdsüd':'WSS', + 'Südostsüd':'SES', 'Südwestsüd':'SWS', + } +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +number_of_features = 9 #e.g. date_for_which_weather_is_predicted, cities, temperature, wind ect. +number_of_cities = len(cities) +number_of_predictions = number_of_cities*len(hourly_website_tags)*25 + +current_time_date = datetime.datetime.now().strftime('%Y%m%d%H') +hourly_dict = {} +hourly_dict['website'] = ['Wetter.de']*number_of_predictions +hourly_dict['date_of_acquisition'] = [current_time_date]*number_of_predictions + +all_features = np.empty((number_of_cities,len(hourly_website_tags),25,number_of_features), dtype=object) +for ci, city in enumerate(cities): + url_hourly_base_city = url_hourly_base+cities_tags[ci] + for i, tag in enumerate(hourly_website_tags): + url = url_hourly_base_city+tag+'.html' + soup = BeautifulSoup(http.request('GET',url).data, "html5lib") + dates_for_predicted_days = [str(datetime.date.today() + datetime.timedelta(days=i)) for i in range(days_to_predict)] + day_to_predict = dates_for_predicted_days[i].replace("-","") + hourly_info = soup.findAll('div',class_="column column-4 forecast-detail-column-1h") + for hi, info in enumerate(hourly_info): + all_features[ci][i][hi][0] = city + hour = info.find('div',class_="forecast-date").text[0:2] + prediction_for = str(day_to_predict)+str(hour) + all_features[ci][i][hi][1] = prediction_for + temp_info = info.find('div', class_="forecast-temperature") + temp = temp_info.find('span',class_="temperature").text.replace("°","") + all_features[ci][i][hi][2] = temp + wind_info = info.find('div',class_="forecast-wind") + wind = wind_info.find('span',class_="wt-font-semibold").text.split("/")[0][1:-3] + all_features[ci][i][hi][3] = wind + humidity_info = info.find('div',class_="forecast-humidity-text") + humidity = humidity_info.find('span',class_="wt-font-semibold").text.replace("%","") + all_features[ci][i][hi][4] = humidity + rain_info = info.find('div',class_="forecast-rain") + rain_perecnt = rain_info.find('span',class_="wt-font-semibold").text.replace("%","") + all_features[ci][i][hi][5] = rain_perecnt + if int(rain_perecnt) > 0: + rain_liter = rain_info.find_all('span',class_="wt-font-semibold")[-1].text.split("/")[0][0:-2] + 
all_features[ci][i][hi][6] = float(rain_liter.replace(",",".")) + else: + all_features[ci][i][hi][6] = None + wind_text_ger = wind_info.find('div',class_="forecast-wind-text").text.split("aus")[1].split("\n")[0].replace(" ","") + if wind_text_ger in wind_mapping: + wind_text = wind_mapping[wind_text_ger] + else: + wind_text = None + all_features[ci][i][hi][7] = wind_text + temp_condition = temp_info.find('span',class_="temperature-condition").text + all_features[ci][i][hi][8] = temp_condition +all_features = all_features.reshape(number_of_predictions,number_of_features) + +hourly_dict['city'] = list(all_features[:,0]) +hourly_dict['date_for_which_weather_is_predicted'] = list(all_features[:,1]) +hourly_dict['temperature'] = list(all_features[:,2]) +hourly_dict['wind_speed'] = list(all_features[:,3]) +hourly_dict['humidity'] = list(all_features[:,4]) +hourly_dict['precipitation_per'] = list(all_features[:,5]) +hourly_dict['precipitation_l'] = list(all_features[:,6]) +hourly_dict['wind_direction'] = list(all_features[:,7]) +hourly_dict['condition'] = list(all_features[:,8]) +hourly_dict['snow'] = [None]*number_of_predictions +hourly_dict['uvi'] = [None]*number_of_predictions + +df = pd.DataFrame(data=hourly_dict) +df.date_of_acquisition = df.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H').date()) +df.date_for_which_weather_is_predicted = df.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H%M').date()) +try: + db_manager.insert_df("HourlyPrediction", df) +finally: + filename = os.path.expanduser('~/Documents/webscraping_2018/data_wetter_de/hourly_period_') + timestamp = datetime.datetime.now().strftime('%Y%m%d%H') + filename += timestamp + ".pkl" + df.to_pickle(filename) diff --git a/bild_scraping.py b/bild_scraping.py new file mode 100644 index 0000000..0b8ccd5 --- /dev/null +++ b/bild_scraping.py @@ -0,0 +1,232 @@ +# coding: utf-8 +# +# Created by Pooja Subramaniam and Marc Aurel Vischer on Tue, May 8. +# Temperature is given as a tuple of daily high and low value, both in degrees Celsius as ints. +# Precipitation is given as "probability" as float. +# Wind is given as a tuple of strength in Bft (int) and direction +#(e.g. "NE" if wind _comes from_ north east). 
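# A minimal sketch of what one day's entries in the temp_dicts / prec_dicts / wind_dicts
# dictionaries built below would look like under this encoding; the city name and values
# are placeholder assumptions, not scraped data:
#
#     example_temp_day = {"Berlin": (21, 12)}    # (daily high, daily low) in degrees Celsius as ints
#     example_prec_day = {"Berlin": 0.35}        # precipitation "probability" as a float
#     example_wind_day = {"Berlin": (4, "NE")}   # (strength in Bft, direction the wind comes from)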
+ + +import urllib3 +from bs4 import BeautifulSoup +import pandas as pd +import warnings +import os +import datetime +import db_manager + +#FIRST PART: ONCE-A-DAY PREDICTIONS +#These are the urls referring directly to high, low temperature +hi_lo_url = "https://wetter.bild.de/web2014/ifr-wetter-deutschland.asp" +prec_url = "https://wetter.bild.de/web2014/ifr-niederschlag-deutschland.asp" +wind_url = "https://wetter.bild.de/web2014/ifr-windstaerken-deutschland.asp" + +#load and parse page +http = urllib3.PoolManager() +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category = urllib3.exceptions.InsecureRequestWarning) + hi_lo_bs = BeautifulSoup(http.request('GET', hi_lo_url).data, "html.parser") + prec_bs = BeautifulSoup(http.request('GET',prec_url).data, "html.parser") + wind_bs = BeautifulSoup(http.request('GET',wind_url).data, "html.parser") +#print(hi_lo.prettify()) + +#EXTRACT DATA AND SAVE INTO DICTIONARIES: +#TEMPERATURE HIGH/LOW, bild has today + 5 days forecast for that +#iterate over days, extract day layer for each +temp_dicts = [] +for day in range(6): + # extract current day layer + day_layer = hi_lo_bs.find_all('div', id="wk_layer_wr{}".format(day)) + #print(day_layer[0]['id']) + if len(day_layer)!=1: + raise Exception("Found more than one layer for single day.") + + # extract all the cities from that layer + day_cities = day_layer[0].find_all('div', class_="wk_map_text") + day_dict = {} + for city in day_cities: + hi_lo_str = city.nobr.next_sibling.next_sibling + high = int(hi_lo_str.split('|')[0].split('°')[0]) + low = int(hi_lo_str.split('|')[1].split('°')[0]) + day_dict[city.nobr.string] = (high, low) + temp_dicts.append(day_dict) + +#PRECIPITATION, bild has only today + 2 days forecast for that +#iterate over days, extract day layer for each +prec_dicts = [] +for day in range(1,4): #layer 0 corresponds to next 6 hrs, layer 1 to entire current day + # extract current day layer + day_layer = prec_bs.find_all('div', id="wk_layer_wr{}".format(day)) + #print(day_layer[0]['id']) + if len(day_layer)!=1: + raise Exception("Found more than one layer for single day.") + + # extract all the cities from that layer + day_cities = day_layer[0].find_all('div', class_="wk_map_text") + day_dict = {} + for city in day_cities: + prec_str = city.nobr.next_sibling.next_sibling + prec_value = int(prec_str.split()[0])/100 + day_dict[city.nobr.string] = prec_value + prec_dicts.append(day_dict) + +#WIND, bild again has today + 5 days forecast +WIND_GER_ENG = {"w":"W", "nw":"NW", "n":"N", "no":"NE", "o":"E", "so":"SE", "s":"S", "sw":"SW"} +#iterate over days, extract day layer for each +wind_dicts = [] +for day in range(6): + # extract current day layer + day_layer = wind_bs.find_all('div', id="wk_layer_wr{}".format(day)) + #print(day_layer[0]['id']) + if len(day_layer)!=1: + raise Exception("Found more than one layer for single day.") + + # extract all the cities from that layer + day_cities = day_layer[0].find_all('div', class_="wk_map_text") + day_dict = {} + for city in day_cities: + wind_str = city.nobr.next_sibling.next_sibling + wind_strength = int(wind_str.split()[0]) + wind_symbol_url = city.parent.img['src'] + wind_direction_raw = wind_symbol_url.split('.')[0].split('/')[-1] + wind_direction = WIND_GER_ENG[wind_direction_raw] + day_dict[city.nobr.string] = (wind_strength,wind_direction) + wind_dicts.append(day_dict) + +#BUNDLE THE INDIVIDUAL DICTIONARIES INTO A SINGLE DICT, SAVE AS PD DATAFRAME +date_of_acquisition = datetime.datetime.now() #for timestamp +website = 
['Bild.de'] +#storing cities as a dictionary of german name : english name, +#so .keys() and .values() gives the list of cities in german and english respectively +cities = {"Berlin":"Berlin", "Frankfurt":"Frankfurt", "Hamburg":"Hamburg", + "Köln":"Cologne", "München":"Munich"} + +daily_dict = {'website':[], 'date_for_which_weather_is_predicted':[], 'city':[], + 'date_of_acquisition':[], 'temperature_max':[], 'temperature_min':[], + 'wind_speed':[], 'humidity':[], 'precipitation_per':[], + 'precipitation_l':[], 'wind_direction':[], 'condition':[], 'snow':[], 'uvi':[]} + + +for i,city in enumerate(cities): + for days in range(6): + daily_dict['website'].append(website) + daily_dict['date_for_which_weather_is_predicted'].append( + datetime.datetime.now().strftime('%Y%m%d%H')) + daily_dict['city'].append(cities[city]) + print((date_of_acquisition+datetime.timedelta(days))) + daily_dict['date_of_acquisition'].append( + (date_of_acquisition+datetime.timedelta(days)).strftime('%Y%m%d%H')) + + daily_dict['temperature_max'].append(temp_dicts[days][city][0]) + daily_dict['temperature_min'].append(temp_dicts[days][city][1]) + daily_dict['wind_speed'].append(wind_dicts[days][city][0]) + daily_dict['wind_direction'].append(wind_dicts[days][city][1]) + daily_dict['humidity'].append(None) + + #bild has precipitation forecasts only for the next 2 days + if days<2: + daily_dict['precipitation_per'].append(prec_dicts[days+1][city]*100) + else: + daily_dict['precipitation_per'].append(None) + + daily_dict['precipitation_l'].append(None) + daily_dict['condition'].append(None) + daily_dict['snow'].append(None) + daily_dict['uvi'].append(None) + +#convert to dataframe and save to file +df_daily = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in daily_dict.items() ])) +print(df_daily) +df_daily.date_of_acquisition = df_daily.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d%H').date()) +print(df_daily.date_for_which_weather_is_predicted) +df_daily.date_for_which_weather_is_predicted = df_daily.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d%H').date()) +try: + db_manager.insert_df("DailyPrediction", df_daily) +finally: + filename = os.path.expanduser('~/Documents/webscraping_2018/data_bild/daily/daily_') + timestamp = datetime.datetime.now().strftime('%Y%m%d%H') + filename += timestamp + ".pkl" + df_daily.to_pickle(filename) + +#SECOND PART: FOUR-TIMES-A-DAY PREDICTIONS +#scrape specified cities for morning, noon, afternoon, night, extract temperature, +# precipitation in percent and condition + +PREDICTION_TIMES = [datetime.timedelta(days=0, hours=8), #morning + datetime.timedelta(days=0, hours=14), #afternoon + datetime.timedelta(days=0, hours=20), #evening + datetime.timedelta(days=1, hours=2)] #night (tomorrow) + + +#first we need the specific url for each city +city_query_url = 'https://wetter.bild.de/web2014/vorhersage-ort.asp?id=' +city_ids_dict = {'Berlin': '10115-berlin', + 'Frankfurt': '65931-frankfurt-am-main', + 'Hamburg': '22305-hamburg', + 'Köln' : '50668-koeln', + 'München' : '80331-muenchen'} + + +#for the sake of clarity, i tried to be as consistent as possible with +#Pooja's code (daily_dict above) when it comes to saving the data as a dataframe +# +#data will be saved into this dictionary before being converted to a dataframe +daily_periods_dict = {'website':[],'date_for_which_weather_is_predicted':[], + 'city':[], 'date_of_acquisition':[], + 'temperature':[],'wind_speed':[],'precipitation_per':[], + 
'precipitation_l':[],'wind_direction':[],'condition':[]} + +for city in cities: + #parse html for each city + city_url = city_query_url + city_ids_dict[city] + city_html = http.request('GET', city_url).data.decode('utf-8') + #CAREFUL!!! there is a mistake in the website: there is a /span that doesn't have a match + #we need to remove it manually before parsing + city_html_fixed = city_html.replace("VORMITTAG","VORMITTAG") + city_bs = BeautifulSoup(city_html_fixed, "html.parser") + + #get the table containing the four-times-a-day forecast and extract the data + four_table = city_bs.find_all('table', class_='wk_forecast_tbl')[1] + # using the magic number here to index this is a bit shitty but there are several + #tables that are all of the class 'wk_forecast_tbl' + + daytimes = four_table.find_all('td', class_="wk_bottomline wk_subheader") + for i,daytime in enumerate(daytimes): + siblings = [sibling for sibling in daytime.next_siblings] + temp_raw = siblings[3] + temp = int(temp_raw.text.split('°')[0]) + condition = siblings[5].text + precip_raw = siblings[7].span.next_sibling.next_sibling.next_sibling.next_sibling + precip = int(precip_raw.split('%')[0]) + #a bit of date arithmetic here: + today_00 = datetime.datetime.combine( + datetime.date.today(), datetime.time(0,0,0)) #gives today at 00 + prediction_datetime = today_00 + PREDICTION_TIMES[i] #time delta from today 00:00 + + daily_periods_dict['website'].append(city_url) + daily_periods_dict['date_for_which_weather_is_predicted'].append( + prediction_datetime.strftime('%Y%m%d%H')) + daily_periods_dict['city'].append(city) + daily_periods_dict['date_of_acquisition'].append( + datetime.datetime.now().strftime('%Y%m%d%H')) + daily_periods_dict['temperature'].append(temp) + daily_periods_dict['wind_speed'].append(None) + daily_periods_dict['precipitation_per'].append(precip) + daily_periods_dict['precipitation_l'].append(None) + daily_periods_dict['wind_direction'].append(None) + daily_periods_dict['condition'].append(condition) + + +#convert to dataframe and save to file +df = pd.DataFrame(daily_periods_dict) +df.date_of_acquisition = df.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H').date()) +df.date_for_which_weather_is_predicted = df.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H').date()) + +try: + pass + db_manager.insert_df("DailyPeriodPrediction", df) +finally: + filename = os.path.expanduser('~/Documents/webscraping_2018/data_bild/daily_period/daily_period_') + timestamp = datetime.datetime.now().strftime('%Y%m%d%H') + filename += timestamp + ".pkl" + df.to_pickle(filename) diff --git a/city_location.py b/city_location.py new file mode 100644 index 0000000..5afbd39 --- /dev/null +++ b/city_location.py @@ -0,0 +1,28 @@ +import requests +import time +import constants + +def get_coordinates(city): + response = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address=' + city) + + resp_json_payload = response.json() + return resp_json_payload['results'][0]['geometry']['location'] + + + + +def map_cities(cities): + dic = {} + for city in cities: + time.sleep(5) + print(city) + coordinates = get_coordinates(city) + dic[city] = (coordinates['lat'], coordinates['lng']) + print() + time.sleep(10) + print(dic[city]) + return dic + +#cities=['BERLIN', 'HAMBURG', 'MUNICH', 'FRANKFURT', 'COLOGNE'] +cities = constants.CITIES +print(map_cities(cities)) diff --git a/constants.py b/constants.py new file mode 100644 index 0000000..40e89e8 --- /dev/null +++ 
b/constants.py
@@ -0,0 +1,18 @@
+# File with important common constants for the API scripts
+import api_info
+
+KEY = api_info.KEY
+BASE_URL = "http://api.wunderground.com/api/"+ KEY +"/hourly10day/q/"
+DAILY_BASE_URL = "http://api.wunderground.com/api/"+ KEY +"/forecast10day/q/"
+
+FILENAME = "hourly_forecast.json"
+FOLDERNAME = "/home/danielv/Documents/webscraping_2018/data/"
+CITIES = ["BERLIN", "HAMBURG", "MUNICH", "COLOGNE", "FRANKFURT"]
+
+#Coordinates
+coordinates = { 'BERLIN': (52.52000659999999, 13.404954),
+                'MUNICH': (48.1351253, 11.5819805),
+                'HAMBURG': (53.5510846, 9.9936819),
+                'FRANKFURT': (50.1109221, 8.6821267),
+                'COLOGNE': (50.937531, 6.9602786)
+              }
diff --git a/crontab_info.txt b/crontab_info.txt
new file mode 100644
index 0000000..101cc0b
--- /dev/null
+++ b/crontab_info.txt
@@ -0,0 +1,8 @@
+0 8,20 * * * /usr/bin/python3 /home/danielv/Documents/webscraping_2018/hourly_db.py >> /home/danielv/cron_weather.log 2>&1
+0 8,20 * * * /usr/bin/python3 /home/danielv/Documents/webscraping_2018/daily_db.py >> /home/danielv/cron_weather_daily.log 2>&1
+0 2,8,14,20 * * * /home/danielv/anaconda3/bin/python /home/danielv/Documents/webscraping_2018/bild_scraping.py >> /home/danielv/cron_bild.log 2>&1
+0 8,20 * * * /home/danielv/anaconda3/bin/python /home/danielv/Documents/webscraping_2018/Wetter_de_scraping.py >> /home/danielv/cron_wetter_de.log 2>&1
+
+0 8,20 * * * /usr/bin/python3 /home/danielv/Documents/webscraping_2018/Web_Scraping_wetter_de_full_day.py >> /home/danielv/cron_wetter_full.log 2>&1
+
+0 2,8,14,20 * * * /home/danielv/anaconda3/bin/python /home/danielv/Documents/webscraping_2018/Web_Scraping_wetter_de_day_periods.py >> /home/danielv/cron_wetter_daily.log 2>&1
diff --git a/daily_db.py b/daily_db.py
new file mode 100644
index 0000000..05521db
--- /dev/null
+++ b/daily_db.py
@@ -0,0 +1,107 @@
+import requests
+import time
+import datetime
+import json
+import constants
+import pandas as pd
+import pickle
+import db_manager
+
+def get_response(query):
+    """
+    Access the Wunderground API with a GET request.
+    """
+    try:
+        response = requests.get(constants.DAILY_BASE_URL + query + ".json")
+        return response.json() if response.ok else None
+    except Exception as e:
+        raise e
+
+def extract_parameters(daily_forecast, city, data):
+    """
+    Extract parameters from the response object and store them in the data dictionary.
+    """
+    date_ = daily_forecast.get('date')
+    date_predicted = datetime.datetime.fromtimestamp(int(date_.get('epoch'))).strftime('%Y%m%d%H%M')
+    temperature_max = daily_forecast.get('high').get('celsius')
+    temperature_min = daily_forecast.get('low').get('celsius')
+    wind_speed = daily_forecast.get('avewind').get('kph')
+    humidity = daily_forecast.get('avehumidity')
+    precipitation_per = daily_forecast.get('pop')
+    wind_direction = daily_forecast.get('avewind').get('dir')
+    condition = daily_forecast.get('conditions')
+    snowcm = daily_forecast.get('snow_allday').get('cm')
+    if snowcm: snow = snowcm * 10
+    else: snow = snowcm
+    UVI = None
+    precipitation_l = None
+    website = 'The Weather Channel'
+
+    data['website'].append(website)
+    data['city'].append(city)
+    data['date_of_acquisition'].append(datetime.datetime.now().strftime('%Y%m%d%H'))
+    data['date_for_which_weather_is_predicted'].append(date_predicted)
+    data['temperature_max'].append(temperature_max)
+    data['temperature_min'].append(temperature_min)
+    data['wind_speed'].append(wind_speed)
+    data['humidity'].append(humidity)
+    data['precipitation_per'].append(precipitation_per)
+    data['precipitation_l'].append(precipitation_l)
+ data['wind_direction'].append(wind_direction) + data['condition'].append(condition) + data['snow'].append(snow) + data['uvi'].append(UVI) + return data + +def gather_daily_city(city, data): + latitude, longitude= constants.coordinates.get(city) + location = str(latitude)+ "," + str(longitude) + response = get_response(location) + iterations = 100 + while(response == None and iterations > 0): + response = get_response(location) + time.sleep(10) + iterations -= 1 + if(response == None): + return data + + daily_forecasts = response.get("forecast").get("simpleforecast").get("forecastday") + + for daily_forecast in daily_forecasts: + data = extract_parameters(daily_forecast, city, data) + return data + +def gather_daily_information(): + data = { + 'website' : [], + 'city' : [], + 'date_of_acquisition' : [], + 'date_for_which_weather_is_predicted' : [], + 'temperature_max' : [], + 'temperature_min' : [], + 'wind_speed' : [], + 'humidity' : [], + 'precipitation_per' : [], + 'precipitation_l' : [], + 'wind_direction' : [], + 'condition' : [], + 'snow' : [], + 'uvi' : [], + } + for city in constants.coordinates.keys(): + data = gather_daily_city(city, data) + + df = pd.DataFrame(data) + df.date_of_acquisition = df.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H').date()) + df.date_for_which_weather_is_predicted = df.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H%M').date()) + return df + +df = gather_daily_information() +try: + if(df.size > 0): + db_manager.insert_df("DailyPrediction", df) +finally: + if(df.size > 0): + timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M') + filename = "/home/danielv/Documents/webscraping_2018/data_daily/" + timestamp + ".pkl" + df.to_pickle(filename) diff --git a/daily_structured.py b/daily_structured.py new file mode 100644 index 0000000..cc54a0c --- /dev/null +++ b/daily_structured.py @@ -0,0 +1,101 @@ +import requests +import time +import datetime +import json +import constants +import pandas as pd +import pickle + +def get_response(query): + """ + Access wunderground API to do a get request + """ + try: + response = requests.get(constants.DAILY_BASE_URL + query+ ".json") + return response.json() if response.ok else None + except Exception as e: + raise e + +def extract_parameters(daily_forecast, city, data): + """ + Extract parameters from request object and store it on data dataFrame + """ + date_ = daily_forecast.get('date') + date_predicted = datetime.datetime.fromtimestamp(int(date_.get('epoch'))).strftime('%Y%m%d%H%M') + temperature_max = daily_forecast.get('high').get('celsius') + temperature_min = daily_forecast.get('low').get('celsius') + wind_speed = daily_forecast.get('avewind').get('kph') + humidity = daily_forecast.get('avehumidity') + precipitation_per = daily_forecast.get('pop') + wind_direction = daily_forecast.get('avewind').get('dir') + condition = daily_forecast.get('conditions') + snowcm = daily_forecast.get('snow_allday').get('cm') + if snowcm: snow = snowcm * 10 + else: snow = snowcm + UVI = None + precipitation_l = None + website = 'The Weather Channel' + + data['website'].append(website) + data['city'].append(city) + data['date_of_acquisition'].append(datetime.datetime.now().strftime('%Y%m%d%H')) + data['date_for_which_weather_is_predicted'].append(date_predicted) + data['temperature_max'].append(temperature_max) + data['temperature_min'].append(temperature_min) + data['wind_speed'].append(wind_speed) + data['humidity'].append(humidity) + 
data['precipitation_per'].append(precipitation_per ) + data['precipitation_l'].append(precipitation_l) + data['wind_direction'].append(wind_direction) + data['condition'].append(condition) + data['snow'].append(snow) + data['uvi'].append(UVI) + return data + +def gather_daily_city(city, data): + latitude, longitude= constants.coordinates.get(city) + location = str(latitude)+ "," + str(longitude) + response = get_response(location) + iterations = 100 + while(response == None and iterations > 0): + response = get_response(location) + time.sleep(10) + iterations -= 1 + if(response == None): + return data + + daily_forecasts = response.get("forecast").get("simpleforecast").get("forecastday") + + for daily_forecast in daily_forecasts: + data = extract_parameters(daily_forecast, city, data) + return data + +def gather_daily_information(): + data = { + 'website' : [], + 'city' : [], + 'date_of_acquisition' : [], + 'date_for_which_weather_is_predicted' : [], + 'temperature_max' : [], + 'temperature_min' : [], + 'wind_speed' : [], + 'humidity' : [], + 'precipitation_per' : [], + 'precipitation_l' : [], + 'wind_direction' : [], + 'condition' : [], + 'snow' : [], + 'uvi' : [], + } + for city in constants.coordinates.keys(): + data = gather_daily_city(city, data) + + df = pd.DataFrame(data) + return df + +df = gather_daily_information() + +if(df.size > 0): + timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M') + filename = "/home/danielv/Documents/webscraping_2018/data_daily/" + timestamp + ".pkl" + df.to_pickle(filename) diff --git a/database.py b/database.py new file mode 100644 index 0000000..8a9a9bf --- /dev/null +++ b/database.py @@ -0,0 +1,224 @@ +import pony.orm as porm +#import database +import datetime +#import station_names +import getpass +import pandas as pd + +from pony.orm.core import ObjectNotFound, TransactionIntegrityError + + +conn_url = 'postgresql://localhost:5432' +db = porm.Database() + +class Station(db.Entity): + stations_id = porm.PrimaryKey(int, auto=False) + von_datum = porm.Optional(datetime.date) + bis_datum = porm.Optional(datetime.date) + stationshoehe = porm.Optional(int) + geobreite = porm.Optional(float) + geolaenge = porm.Optional(float) + stationsname = porm.Required(str) + bundesland = porm.Optional(str) + measurements = porm.Set('DailyMeasurement') + + @classmethod + def in_city(cls, city_name): + return cls.select(lambda s: city_name in s.stationsname) + + +class DailyMeasurement(db.Entity): + mess_datum = porm.Required(datetime.date) + stations_id = porm.Required(int) + station = porm.Optional(Station) + qn_3 = porm.Optional(int) # quality level of next columns + fx = porm.Optional(float) + fm = porm.Optional(float) + qn_4 = porm.Optional(int) + rsk = porm.Optional(float) + rskf = porm.Optional(float) + sdk = porm.Optional(float) + shk_tag = porm.Optional(float) + nm = porm.Optional(float) + vpm = porm.Optional(float) + pm = porm.Optional(float) + tmk = porm.Optional(float) + upm = porm.Optional(float) + txk = porm.Optional(float) + tnk = porm.Optional(float) + tgk = porm.Optional(float) + + porm.PrimaryKey(mess_datum, stations_id) + + #import math + #def before_insert(self): + # for x in self._columns_: + # if isinstance(getattr(self, x), float): + # if math.isnan((getattr(self, x))): + # setattr(self, x, None) + # self.station = Station[self.stations_id] + + #def after_insert(self): + # self.station = Station[self.stations_id] + + #def after_update(self): + # self.station = Station[self.stations_id] + +class DailyPrediction(db.Entity): + id = 
porm.PrimaryKey(int, auto=True) + website = porm.Required(str) + city = porm.Required(str) + date_of_acquisition = porm.Required(datetime.date) + date_for_which_weather_is_predicted = porm.Required(datetime.date) + temperature_max = porm.Required(float) + temperature_min = porm.Required(float) + wind_speed = porm.Optional(float, nullable=True) + humidity = porm.Optional(float, nullable=True) + precipitation_per = porm.Optional(float, nullable=True) + precipitation_l = porm.Optional(float, nullable=True) + wind_direction = porm.Optional(str, 3, nullable=True) + condition = porm.Optional(str, nullable=True) + snow = porm.Optional(float, nullable=True) + UVI = porm.Optional(int, unsigned=True) + + +class HourlyPrediction(db.Entity): + id = porm.PrimaryKey(int, auto=True) + website = porm.Required(str) + city = porm.Required(str) + date_of_acquisition = porm.Required(datetime.datetime) + date_for_which_weather_is_predicted = porm.Required(datetime.datetime) + temperature = porm.Required(float) + wind_speed = porm.Optional(float) + humidity = porm.Optional(float) + precipitation_per = porm.Optional(float) + precipitation_l = porm.Optional(float) + wind_direction = porm.Optional(str, 3) + condition = porm.Optional(str) + snow = porm.Optional(float) + UVI = porm.Optional(int, unsigned=True) + + +class DailyPeriodPrediction(db.Entity): + id = porm.PrimaryKey(int, auto=True) + website = porm.Required(str) + city = porm.Required(str) + date_of_acquisition = porm.Required(datetime.datetime) + date_for_which_weather_is_predicted = porm.Required(str) + temperature = porm.Required(float) + wind_speed = porm.Optional(float) + precipitation_per = porm.Optional(float) + precipitation_l = porm.Optional(float) + wind_direction = porm.Optional(str, 3) + condition = porm.Optional(str) + + +@porm.db_session +def set_station_trigger(db): + trigger_text = ''' + create or replace function set_station() + returns trigger as ' + begin + new.station := new.stations_id; + return new; + end; + ' language plpgsql; + drop trigger if exists set_station on dailymeasurement; + create trigger set_station + before insert + on dailymeasurement + for each row + execute procedure set_station(); + ''' + + db.execute(trigger_text) + + +def set_up_connection(db, db_name, user='', password=None, host='127.0.0.1', create_tables=False): + ''' + Sets up a connection with the database server. + Set create_tables to True if the tables don't exist. 
+ ''' + if password is None: + password = getpass.getpass(prompt='postgres user password: ') + db.bind(provider='postgres', user=user, password=password, host=host, database=db_name) + db.generate_mapping(create_tables = create_tables) + global conn_url + conn_url = 'postgresql://{}:{}@{}:5432/{}'.format(user, password, host, db_name) + if create_tables: + set_station_trigger(db) + + +@porm.db_session +def _insert_without_pandas(df, table_name): + table_obj = db.entities[table_name] + pk = table_obj._pk_columns_ + + if df.index.name is None: + df_q = df.set_index(pk) + else: + df_q = df.copy() + + for i in df_q.index: + try: + table_obj[i] + except ObjectNotFound: + try: + table_obj(**{**dict(zip(pk, i)), + **df_q.loc[i].to_dict()}) + except TypeError: + table_obj(**{**{pk : i}, + **df_q.loc[i].to_dict()}) + + +@porm.db_session +def _insert_with_pandas(df, table_name, auto_id=False, overwrite=False): + indices_to_keep = [] + rows_to_delete = [] + table_obj = db.entities[table_name] + + if df.index.name is None and not auto_id: + df_q = df.set_index(table_obj._pk_columns_) + else: + df_q = df.copy() + + try: + df_q.to_sql(table_name.lower(), conn_url, if_exists='append', index=not auto_id) + except: + for i in df_q.index: + try: + row = table_obj[i] + + if overwrite: + rows_to_delete.append(row) + indices_to_keep.append(i) + + except ObjectNotFound: + indices_to_keep.append(i) + + except: + print(i) + + if overwrite: + table_obj.select(lambda x: x in rows_to_delete).delete(bulk = True) + porm.commit() + + print('starting insert') + df_to_insert = df_q.loc[indices_to_keep] + df_to_insert.to_sql(table_name.lower(), conn_url, if_exists='append', index=not auto_id) + + +@porm.db_session +def insert_into_table(df, table_name, use_pandas=True, auto_id=False, overwrite=False): + if use_pandas: + _insert_with_pandas(df, table_name, auto_id, overwrite) + else: + _insert_without_pandas(df, table_name) + + +@porm.db_session +def query_to_dataframe(query): + try: + return pd.read_sql_query(query.get_sql(), conn_url) + except: + return pd.DataFrame([o.to_dict() for o in query]) \ No newline at end of file diff --git a/db_info.py b/db_info.py new file mode 100644 index 0000000..a83cb98 --- /dev/null +++ b/db_info.py @@ -0,0 +1,5 @@ +#Credentials to log into the database + +db_name = "db_webscraping" +db_user = "webscrapers" +db_password = "bCCnw3b" diff --git a/db_manager.py b/db_manager.py new file mode 100644 index 0000000..c803111 --- /dev/null +++ b/db_manager.py @@ -0,0 +1,8 @@ +import database as db +import db_info + +db.set_up_connection(db.db, db_info.db_name, user=db_info.db_user, password=db_info.db_password) +#TODO add docstring and exceptions +def insert_df(table_name, df): + db.insert_into_table(df, table_name, auto_id=True) + diff --git a/hourly_db.py b/hourly_db.py new file mode 100644 index 0000000..83ca30a --- /dev/null +++ b/hourly_db.py @@ -0,0 +1,118 @@ +import requests +import time +import datetime +import json +import constants +import pandas as pd +import pickle +import db_manager + +def get_response(query): + """ + Access wunderground API to do a get request + """ + try: + response = requests.get(constants.BASE_URL + query+ ".json") + return response.json() if response.ok else None + except Exception as e: + raise e + + +def collect_forecast_coords(coords, city): + """ + Stores the json object corresponding to the weather forecast of city in a file. 
+ Parameters: + coords: dictionary with the city names as keys, and tuple of coordinates as value + city: name of the city in a string format + """ + latitude, longitude= constants.coordinates.get(city) + location = str(latitude)+ "," + str(longitude) + response = get_response(location) + simple_forecast = response.get("hourly_forecast") + filename = str(time.time()) + "_" + city + "_" + constants.FILENAME + f = open(filename, 'w') + json.dump(simple_forecast, f) + f.close() + +def extract_parameters(hourly_forecast, city, data): + fcttime = hourly_forecast.get('FCTTIME') + year, month, day, hour = fcttime.get('year'), fcttime.get('mon_padded'), fcttime.get('mday_padded'), fcttime.get('hour_padded') + temperature = hourly_forecast.get('temp').get('metric') + wind_speed = hourly_forecast.get('wspd').get('metric') + humidity = hourly_forecast.get('humidity') + precipitation_per = hourly_forecast.get('qpf').get('metric') #convert + wind_direction = hourly_forecast.get('wdir').get('dir') + condition = hourly_forecast.get('condition') + snow = hourly_forecast.get('snow').get('metric') + UVI = hourly_forecast.get('uvi') + precipitation_l = None + website = 'The Weather Channel' + + data['website'].append(website) + data['city'].append(city) + data['date_of_acquisition'].append(datetime.datetime.now().strftime('%Y%m%d%H')) + data['date_for_which_weather_is_predicted'].append(year + month + day + hour) + data['temperature'].append(temperature) + data['wind_speed'].append(wind_speed) + data['humidity'].append(humidity) + data['precipitation_per'].append(precipitation_per ) + data['precipitation_l'].append(precipitation_l) + data['wind_direction'].append(wind_direction) + data['condition'].append(condition) + data['snow'].append(snow) + data['uvi'].append(UVI) + return data + #df = pd.DataFrame(data, index=[0]) + +def gather_hourly_city(city, data): + latitude, longitude= constants.coordinates.get(city) + location = str(latitude)+ "," + str(longitude) + response = get_response(location) + iterations = 100 + while(response == None and iterations > 0): + response = get_response(location) + iterations -= 1 + time.sleep(10) + if(response == None): + return data + + hourly_forecasts = response.get("hourly_forecast") + + for hourly_forecast in hourly_forecasts: + data = extract_parameters(hourly_forecast, city, data) + return data + +def gather_hourly_information(): + data = { + 'website' : [], + 'city' : [], + 'date_of_acquisition' : [], + 'date_for_which_weather_is_predicted' : [], + 'temperature' : [], + 'wind_speed' : [], + 'humidity' : [], + 'precipitation_per' : [], + 'precipitation_l' : [], + 'wind_direction' : [], + 'condition' : [], + 'snow' : [], + 'uvi' : [], + } + for city in constants.coordinates.keys(): + data = gather_hourly_city(city, data) + + df = pd.DataFrame(data) + df.date_for_which_weather_is_predicted = df.date_for_which_weather_is_predicted.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H')) + df.date_of_acquisition = df.date_of_acquisition.apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H')) + return df + +df = gather_hourly_information() + +try: + if(df.size > 0): + db_manager.insert_df("HourlyPrediction", df) +finally: + if(df.size > 0): + timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M') + filename = "/home/danielv/Documents/webscraping_2018/data_hourly/" + timestamp + ".pkl" + df.to_pickle(filename) diff --git a/hourly_structured.py b/hourly_structured.py new file mode 100644 index 0000000..e7f4ffa --- /dev/null +++ b/hourly_structured.py @@ -0,0 +1,111 
@@ +import requests +import time +import datetime +import json +import constants +import pandas as pd +import pickle + +def get_response(query): + """ + Access wunderground API to do a get request + """ + try: + response = requests.get(constants.BASE_URL + query+ ".json") + return response.json() if response.ok else None + except Exception as e: + raise e + + +def collect_forecast_coords(coords, city): + """ + Stores the json object corresponding to the weather forecast of city in a file. + Parameters: + coords: dictionary with the city names as keys, and tuple of coordinates as value + city: name of the city in a string format + """ + latitude, longitude= constants.coordinates.get(city) + location = str(latitude)+ "," + str(longitude) + response = get_response(location) + simple_forecast = response.get("hourly_forecast") + filename = str(time.time()) + "_" + city + "_" + constants.FILENAME + f = open(filename, 'w') + json.dump(simple_forecast, f) + f.close() + +def extract_parameters(hourly_forecast, city, data): + fcttime = hourly_forecast.get('FCTTIME') + year, month, day, hour = fcttime.get('year'), fcttime.get('mon_padded'), fcttime.get('mday_padded'), fcttime.get('hour_padded') + temperature = hourly_forecast.get('temp').get('metric') + wind_speed = hourly_forecast.get('wspd').get('metric') + humidity = hourly_forecast.get('humidity') + precipitation_per = hourly_forecast.get('qpf').get('metric') #convert + wind_direction = hourly_forecast.get('wdir').get('dir') + condition = hourly_forecast.get('condition') + snow = hourly_forecast.get('snow').get('metric') + UVI = hourly_forecast.get('uvi') + precipitation_l = None + website = 'The Weather Channel' + + data['website'].append(website) + data['city'].append(city) + data['date_of_acquisition'].append(datetime.datetime.now().strftime('%Y%m%d%H')) + data['date_for_which_weather_is_predicted'].append(year + month + day + hour) + data['temperature'].append(temperature) + data['wind_speed'].append(wind_speed) + data['humidity'].append(humidity) + data['precipitation_per'].append(precipitation_per ) + data['precipitation_l'].append(precipitation_l) + data['wind_direction'].append(wind_direction) + data['condition'].append(condition) + data['snow'].append(snow) + data['uvi'].append(UVI) + return data + #df = pd.DataFrame(data, index=[0]) + +def gather_hourly_city(city, data): + latitude, longitude= constants.coordinates.get(city) + location = str(latitude)+ "," + str(longitude) + response = get_response(location) + iterations = 100 + while(response == None and iterations > 0): + response = get_response(location) + iterations -= 1 + time.sleep(10) + if(response == None): + return data + + hourly_forecasts = response.get("hourly_forecast") + + for hourly_forecast in hourly_forecasts: + data = extract_parameters(hourly_forecast, city, data) + return data + +def gather_hourly_information(): + data = { + 'website' : [], + 'city' : [], + 'date_of_acquisition' : [], + 'date_for_which_weather_is_predicted' : [], + 'temperature' : [], + 'wind_speed' : [], + 'humidity' : [], + 'precipitation_per' : [], + 'precipitation_l' : [], + 'wind_direction' : [], + 'condition' : [], + 'snow' : [], + 'uvi' : [], + } + for city in constants.coordinates.keys(): + data = gather_hourly_city(city, data) + + df = pd.DataFrame(data) + return df + +df = gather_hourly_information() + +if(df.size > 0): + timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M') + filename = "/home/danielv/Documents/webscraping_2018/data_hourly/" + timestamp + ".pkl" + 
df.to_pickle(filename)
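
For reference, a minimal usage sketch of the persistence pattern shared by the scripts above: build a DataFrame in the expected column layout, try to insert it through `db_manager`, and always keep a pickle copy as a fallback. The row contents and the pickle path below are placeholders; the sketch assumes `database.py`, `db_manager.py` and a valid `db_info.py` are importable from the working directory and that the Postgres tables already exist.

import datetime
import pandas as pd
import db_manager  # connects to Postgres on import via database.set_up_connection()

# One placeholder row in the column layout used for the DailyPrediction table.
df = pd.DataFrame([{
    'website': 'example.org',
    'city': 'Berlin',
    'date_of_acquisition': datetime.date.today(),
    'date_for_which_weather_is_predicted': datetime.date.today(),
    'temperature_max': 21.0,
    'temperature_min': 12.0,
    'wind_speed': None, 'humidity': None, 'precipitation_per': None,
    'precipitation_l': None, 'wind_direction': None, 'condition': None,
    'snow': None, 'uvi': None,
}])

try:
    db_manager.insert_df("DailyPrediction", df)   # inserts via pandas to_sql with auto_id=True
finally:
    df.to_pickle("daily_prediction_backup.pkl")   # keep a local copy even if the insert fails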