This repository has been archived by the owner on Sep 30, 2022. It is now read-only.

Commit 15c4ab8: Merge branch 'master' into presentation
Patryk Gronkiewicz authored Jul 20, 2021
2 parents a04a2ff + 75e1dd6
Showing 7 changed files with 725 additions and 167 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
# Corona analysis

-This repository is made to gather all data about spreading of SARS-CoV-2 in
+This repository exists to gather all data about spreading of SARS-CoV-2 in
Poland.

## Requirements
corona_analysis/scrapers/koronawirusunas.py (132 changes: 76 additions & 56 deletions)
@@ -6,24 +6,26 @@
import numpy as np

URL = "https://www.koronawirusunas.pl/{}"
PATTERN = re.compile(r"var\sdataSource_przyrost")
SUB_SITES = ("",
"wojewodztwo-slaskie",
"wojewodztwo-mazowieckie",
"wojewodztwo-dolnoslaskie",
"wojewodztwo-wielkopolskie",
"wojewodztwo-lodzkie",
"wojewodztwo-malopolskie",
"wojewodztwo-opolskie",
"wojewodztwo-kujawsko-pomorskie",
"wojewodztwo-pomorskie",
"wojewodztwo-zachodniopomorskie",
"wojewodztwo-podlaskie",
"wojewodztwo-lubelskie",
"wojewodztwo-podkarpackie",
"wojewodztwo-swietokrzyskie",
"wojewodztwo-warminsko-mazurskie",
"wojewodztwo-lubuskie")
PATTERN = re.compile(r"var dataSource_przyrost|var dataSource_koronawirus")
SUB_SITES = (
"",
"wojewodztwo-slaskie",
"wojewodztwo-mazowieckie",
"wojewodztwo-dolnoslaskie",
"wojewodztwo-wielkopolskie",
"wojewodztwo-lodzkie",
"wojewodztwo-malopolskie",
"wojewodztwo-opolskie",
"wojewodztwo-kujawsko-pomorskie",
"wojewodztwo-pomorskie",
"wojewodztwo-zachodniopomorskie",
"wojewodztwo-podlaskie",
"wojewodztwo-lubelskie",
"wojewodztwo-podkarpackie",
"wojewodztwo-swietokrzyskie",
"wojewodztwo-warminsko-mazurskie",
"wojewodztwo-lubuskie",
)


def scrape(url=URL) -> dict:
@@ -35,11 +37,10 @@ def scrape(url=URL) -> dict:
soup = BeautifulSoup(web.read(), "lxml")

# Get first script tag that contains PATTERN
-script = soup.find('script', text=PATTERN)
+script = soup.find("script", text=re.compile(PATTERN))

# Group all vars into ('var_name', '[data]') pairs
-jsdata = re.findall(
-    r'var\s*(.*?)\s*=(\s*\[[\s\S]*?\]);', script.string)
+jsdata = re.findall(r"var\s*(.*?)\s*=(\s*\[[\s\S]*?]);", script.string)

return {t[0]: pd.DataFrame(demjson.decode(t[1])) for t in jsdata}
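For readers skimming the diff: scrape() grabs the inline <script> that declares the chart data, then decodes each JavaScript array literal with demjson. A minimal, self-contained sketch of the same technique, run against a made-up HTML snippet rather than the live site:

```python
import re

import demjson
import pandas as pd
from bs4 import BeautifulSoup

# Made-up inline script, shaped like the ones on koronawirusunas.pl
html = """<script>
var dataSource_przyrost = [{"country": "2020-03-04", "zar": 1},
                           {"country": "2020-03-05", "zar": 5}];
</script>"""

soup = BeautifulSoup(html, "lxml")
script = soup.find("script", text=re.compile(r"var dataSource_przyrost"))

# Same regex as the diff: capture each variable name and its array literal
jsdata = re.findall(r"var\s*(.*?)\s*=(\s*\[[\s\S]*?]);", script.string)
frames = {name: pd.DataFrame(demjson.decode(raw)) for name, raw in jsdata}
print(frames["dataSource_przyrost"])
```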

@@ -80,8 +81,9 @@ def clean_regions(data: dict):
data[key].fillna(axis=1, inplace=True, value=0)
data[key]["wojewodztwo"] = np.nan
data[key].fillna(axis=1, inplace=True, value=key)
data[key]["date"] = (data[key].index - data[
key].index.min()) / np.timedelta64(1, 'D')
data[key]["date"] = (data[key].index - data[key].index.min()) / np.timedelta64(
1, "D"
)
df_regions = df_regions.append(data[key])

return df_regions
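The reflowed date line above hides a neat trick: subtracting the index minimum from a DatetimeIndex yields timedeltas, and dividing by np.timedelta64(1, "D") converts them to float days since the first observation. A small worked example:

```python
import numpy as np
import pandas as pd

idx = pd.to_datetime(["2020-03-04", "2020-03-05", "2020-03-07"])
df = pd.DataFrame({"zar": [1, 2, 5]}, index=idx)

# Subtracting the earliest timestamp gives a TimedeltaIndex;
# dividing by one day's timedelta turns it into plain floats.
df["date"] = (df.index - df.index.min()) / np.timedelta64(1, "D")
print(df["date"].tolist())  # [0.0, 1.0, 3.0]
```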
@@ -96,8 +98,8 @@ def clean_country(data: pd.DataFrame):
data : pd.DataFrame
DataFrame with country-level data
"""
-data['kwar_z'].fillna(0, inplace=True)
-data.fillna(method='ffill', inplace=True)
+data["kwar_z"].fillna(0, inplace=True)
+data.fillna(method="ffill", inplace=True)
data.fillna(0, inplace=True)

return data
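The fill order in clean_country() matters: kwar_z (ended quarantines, assuming the site's abbreviation) is zero-filled first, the remaining gaps then carry the last reported value forward, and anything before a column's first report becomes zero. In miniature:

```python
import pandas as pd

df = pd.DataFrame({"hosp": [None, 150.0, None, 180.0],
                   "kwar_z": [None, 2.0, None, 5.0]})
df["kwar_z"].fillna(0, inplace=True)     # missing releases count as zero
df.fillna(method="ffill", inplace=True)  # carry last known values forward
df.fillna(0, inplace=True)               # leading gaps become zero
print(df)  # hosp: 0, 150, 150, 180; kwar_z: 0, 2, 0, 5
```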
@@ -112,46 +114,64 @@ def get_data():
data = [scrape(url=URL.format(sub)) for sub in SUB_SITES]

# Clean data for whole country
testy = clean(data[0]["dataSource_testy"],
cols=data[0]["dataSource_testy"].columns,
new_index_name="dzien")

przyrost = clean(data[0]["dataSource_przyrost"],
cols=["country", "zar", "chor", "zgo", "wyl"],
new_index_name="country")

mobilnosc = clean(data[0]["dataSource_mobilnosc"],
cols=["dzien", "pieszo", "pojazdem"],
new_index_name="dzien")

hospitalizacja = clean(data[0]["dataSource_hospitalizacja"],
cols=["country", "hosp",
"kwar", "kwar_z", "nadzor"],
new_index_name="country")
+testy = clean(
+    data[0]["dataSource_testy"],
+    cols=data[0]["dataSource_testy"].columns,
+    new_index_name="dzien",
+)
+
+przyrost = clean(
+    data[0]["dataSource_przyrost"],
+    cols=["country", "zar", "chor", "zgo", "wyl"],
+    new_index_name="country",
+)
+
+mobilnosc = clean(
+    data[0]["dataSource_mobilnosc"],
+    cols=["dzien", "chorzy_prev", "pieszo", "pojazdem"],
+    new_index_name="dzien",
+)
+
+hospitalizacja = clean(
+    data[0]["dataSource_hospitalizacja"],
+    cols=["country", "kwar", "kwar_z", "nadzor"],
+    new_index_name="country",
+)

# Clean each region
-regions = {SUB_SITES[i][12:]: clean(data[i]["dataSource_przyrost"],
-                                    cols=["country", "zar", "chor", "zgo", "wyl"],
-                                    new_index_name="country")
-           for i in range(1, len(data))}
+regions = {
+    SUB_SITES[i][12:]: clean(
+        data[i]["dataSource_koronawirus"],
+        cols=["dzien", "woj_zar", "woj_chor", "woj_zgo", "woj_wyl"],
+        new_index_name="dzien",
+    )
+    for i in range(1, len(data))
+}

df_regions = clean_regions(regions)

# Merge and return DataFrames
-df_poland = pd.merge(testy, przyrost, how='outer', left_index=True,
-                     right_index=True)
-df_poland = pd.merge(df_poland, mobilnosc, how='outer', left_index=True,
-                     right_index=True)
-df_poland = pd.merge(df_poland, hospitalizacja, how='outer',
-                     left_index=True, right_index=True)
+df_poland = pd.merge(
+    testy, przyrost, how="outer", left_index=True, right_index=True
+)
+df_poland = pd.merge(
+    df_poland, mobilnosc, how="outer", left_index=True, right_index=True
+)
+df_poland = pd.merge(
+    df_poland,
+    hospitalizacja,
+    how="outer",
+    left_index=True,
+    right_index=True,
+)

df_poland = clean_country(df_poland)

return {"koronawirusunas_poland": df_poland,
"koronawirusunas_regions": df_regions}
return {
"koronawirusunas_poland": df_poland,
"koronawirusunas_regions": df_regions,
}


-if __name__ == '__main__':
+if __name__ == "__main__":
print(get_data())
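The clean() helper that get_data() leans on lives in a collapsed hunk, so its body is not visible on this page. Judging only from the call sites, it takes a raw frame, a list of column labels, and the name of the column to promote to a DatetimeIndex. A plausible sketch, not the repository's actual implementation:

```python
import pandas as pd

def clean(df: pd.DataFrame, cols, new_index_name: str) -> pd.DataFrame:
    """Hypothetical stand-in for the collapsed helper."""
    df = df.copy()
    df.columns = cols                                 # relabel the columns
    df[new_index_name] = pd.to_datetime(df[new_index_name])
    return df.set_index(new_index_name)               # date-indexed frame
```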
corona_analysis/scrapers/rmf24.py (59 changes: 39 additions & 20 deletions)
@@ -4,6 +4,7 @@

import re
from urllib import request
+from typing import Dict, Any

import pandas as pd
from bs4 import BeautifulSoup
@@ -14,11 +15,10 @@
URL = "https://www.rmf.fm/inc/outer/korona-wykres/wykres.html"


-def scrape(data_url: str = URL) -> dict:
+def scrape(data_url: str = URL) -> Dict[str, Any]:
"""
Scrapes data from rmf's chart
...
Attributes
----------
@@ -34,11 +34,12 @@ def scrape(data_url: str = URL) -> dict:
data = str(soup.body.find("script"))

# find lists of data
-(sick, deaths, recovers) = re.findall(r"\[\[.*\]\]", data)
+(sick, deaths, recoveries, vaccinations) = re.findall(r"\[\[.*\]\]", data)

sick = sick.split("],[")
deaths = deaths.split("],[")
-recovers = recovers.split("],[")
+recoveries = recoveries.split("],[")
+vaccinations = vaccinations.split("],[")

sick = [
    remove_suffix(remove_prefix(remove_prefix(i, "[["), "Date.UTC"), "]]")
@@ -48,28 +49,36 @@ def scrape(data_url: str = URL) -> dict:
    remove_suffix(remove_prefix(remove_prefix(i, "[["), "Date.UTC"), "]]")
    for i in deaths
]
-recovers = [
+recoveries = [
    remove_suffix(remove_prefix(remove_prefix(i, "[["), "Date.UTC"), "]]")
-    for i in recovers
+    for i in recoveries
]
+vaccinations = [
+    remove_suffix(remove_prefix(remove_prefix(i, "[["), "Date.UTC"), "]]")
+    for i in vaccinations
+]

sick = [i.split("),") for i in sick]
deaths = [i.split("),") for i in deaths]
-recovers = [i.split("),") for i in recovers]
+recoveries = [i.split("),") for i in recoveries]
+vaccinations = [i.split("),") for i in vaccinations]

sick = [["-".join(i[0][1:].split(",")), i[1]] for i in sick]
deaths = [["-".join(i[0][1:].split(",")), i[1]] for i in deaths]
recovers = [["-".join(i[0][1:].split(",")), i[1]] for i in recovers]
vaccinations = [["-".join(i[0][1:].split(",")), i[1]] for i in vaccinations]

return {"sick": sick, "deaths": deaths, "recovers": recovers}
return {
"sick": sick,
"deaths": deaths,
"recovers": recoveries,
"vaccinations": vaccinations,
}
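remove_prefix and remove_suffix are imported in a hunk collapsed above. The names match the usual backports of Python 3.9's str.removeprefix and str.removesuffix, so they presumably behave like this sketch:

```python
# Assumed behaviour only; the repository's own definitions are not shown here.
def remove_prefix(text: str, prefix: str) -> str:
    return text[len(prefix):] if text.startswith(prefix) else text


def remove_suffix(text: str, suffix: str) -> str:
    return text[:-len(suffix)] if suffix and text.endswith(suffix) else text
```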


-def get_data(sick: list, deaths: list, recovers: list) -> dict:
+def get_data(url: str = URL) -> Dict[str, pd.DataFrame]:
"""
Returns dataframe of deaths, recovered and sick people
...
Attributes
----------
sick : list
@@ -78,18 +87,28 @@ def get_data(sick: list, deaths: list, recovers: list) -> dict:
list of lists with date and amount of dead people
recovers : list
list of lists with date and amount of recovered people
+Returns
+-------
+dict
+    "rmf24" with dataframe
"""
-sick = pd.DataFrame(sick)
-sick.columns = ("date", "sick")
-
-deaths = pd.DataFrame(deaths)
-deaths.columns = ("date", "deaths")
-
-recovers = pd.DataFrame(recovers)
-recovers.columns = ("date", "recovers")
-
-tmp = pd.merge(sick, deaths, how="outer", on="date")
-tmp = pd.merge(tmp, recovers, how="outer", on="date")
+data = scrape(url)
+
+sickDF = pd.DataFrame(data["sick"])
+sickDF.columns = ("date", "sick")
+
+deathsDF = pd.DataFrame(data["deaths"])
+deathsDF.columns = ("date", "deaths")
+
+recoveriesDF = pd.DataFrame(data["recoveries"])
+recoveriesDF.columns = ("date", "recoveries")
+
+vaccinationsDF = pd.DataFrame(data["vaccinations"])
+vaccinationsDF.columns = ("date", "vaccinations")
+
+tmp = pd.merge(sickDF, deathsDF, how="outer", on="date")
+tmp = pd.merge(tmp, recoveriesDF, how="outer", on="date")
+tmp = pd.merge(tmp, vaccinationsDF, how="outer", on="date")

return {"rmf24": tmp}