forked from CoronaWhy/task-geo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request CoronaWhy#36 from oussamanaji/oussamanaji-frcovidata
France COVID19 Granular Data Extraction
- Loading branch information
Showing
5 changed files
with
299 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from task_geo.data_sources.covid.fr_covidata.fr_covidata import fr_covidata | ||
|
||
__all__ = ['fr_covidata'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import argparse | ||
|
||
from fr_covidata import fr_covidata | ||
|
||
|
||
def get_argparser():
    """Build the command-line parser for the France COVID-19 data source.

    Returns:
        argparse.ArgumentParser: parser exposing a single mandatory option,
        ``-o`` / ``--output``, naming the file the dataset is written to.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-o',
        '--output',
        help='Destination file to store the processed dataset.',
        required=True,
    )
    return cli
|
||
|
||
def main():
    """Command-line entry point.

    Parses the command-line arguments, builds the France COVID-19 dataset
    and writes it as CSV (with a header row, without the index) to the
    path given by ``--output``.
    """
    options = get_argparser().parse_args()
    frame = fr_covidata()
    frame.to_csv(options.output, index=False, header=True)


# Only run the export when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# COVID-19 Granular Data - France | ||
|
||
## General information | ||
|
||
- **Description**: COVID-19 daily confirmed cases/hospitalized/recovering/recovered/deaths data in different subregions of France | ||
- **Credits**: @opencovid19-fr | ||
- **Source**: https://github.com/opencovid19-fr/data/ | ||
|
||
## Column Details | ||
|
||
**subregion_code** | ||
- Description: French Maille department code | ||
- Type: str | ||
|
||
**subregion_name** | ||
- Description: French department name | ||
- Type: str | ||
|
||
**country** | ||
- Description: Country Name - France | ||
- Type: str | ||
|
||
**date** | ||
- Description: yyyy/mm/dd Date | ||
- Type: datetime.date | ||
|
||
**confirmed** | ||
- Description: Cumulative number of confirmed covid-19 cases at the given location until the given time | ||
- Type: float | ||
|
||
**hospitalized** | ||
- Description: Number of hospitalized people due to covid-19 at the given location at the given time | ||
- Type: float | ||
|
||
**recovering** | ||
- Description: Number of people recovering from covid-19 at the given location at the given time (taken from the source column 'reanimation', i.e. patients in intensive care) | ||
- Type: float | ||
|
||
**recovered** | ||
- Description: Cumulative number of people healed from covid-19 at the given location until the given time | ||
- Type: float | ||
|
||
**deaths** | ||
- Description: Cumulative number of people who passed away from covid-19 at the given location until the given time | ||
- Type: float | ||
|
||
**source_name** | ||
- Description: Name of the source of information relevant to the given location at the given time | ||
- Type: str | ||
|
||
**source_url** | ||
- Description: Uniform Resource Locator link of the source of information relevant to the given location at the given time | ||
- Type: str | ||
|
||
**source_archive** | ||
- Description: Uniform Resource Locator link of the archive of the source of information relevant to the given location at the given time | ||
- Type: str | ||
|
||
**source_type** | ||
- Description: Type of the source of information: National Health Ministry/Regional health agencies | ||
- Type: str | ||
|
||
## Transformations applied | ||
|
||
- Deleting the rows non-related to the departments inside Metropolitan France ('region', 'monde', 'pays', 'collectivite-outremer', 'DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976') | ||
- Dropping the columns 'depistes' and 'granularite' | ||
- Deleting the duplicate rows containing the same numerical data | ||
- Switching the 'date' column format from str to datetime.date | ||
- Switching the format of the 'source_nom', 'source_url', 'source_archive' and 'source_type' columns to str | ||
- Merging all the rows that have the same 'date' and 'subregion_name' column value, by taking the maximum of every numerical column value and aggregating both sources columns' info | ||
- Renaming/Translating the column titles from French to English, and rearranging them |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
{ | ||
"title": "COVID-19 Granular Data - France", | ||
"description": "Sourcing of COVID-19 cases granular data in different subregions of France, aggregated by @opencovid19-fr, https://github.com/opencovid19-fr/", | ||
"licenses": [{"name": "copyright-authors"}], | ||
"fields": [ | ||
{ | ||
"name": "subregion_code", | ||
"description": "Geographical location - maille code", | ||
"type": "str" | ||
}, | ||
{ | ||
"name": "subregion_name", | ||
"description": "Geographical location - department", | ||
"type": "str" | ||
}, | ||
{ | ||
"name": "country", | ||
"description": "Geographical location - country", | ||
"type": "str" | ||
}, | ||
{ | ||
"name": "date", | ||
"description": "Date", | ||
"type": "datetime.date" | ||
}, | ||
{ | ||
"name": "confirmed", | ||
"description": "Cumulative number of confirmed covid-19 cases at the given location until the given time", | ||
"type": "float" | ||
}, | ||
{ | ||
"name": "hospitalized", | ||
"description": "Number of hospitalized people due to covid-19 at the given location at the given time", | ||
"type": "float" | ||
}, | ||
{ | ||
"name": "recovering", | ||
"description": "Number of people recovering from covid-19 at the given location at the given time", | ||
"type": "float" | ||
}, | ||
{ | ||
"name": "recovered", | ||
"description": "Cumulative number of people healed from covid-19 at the given location until the given time", | ||
"type": "float" | ||
}, | ||
{ | ||
"name": "deaths", | ||
"description": "Cumulative number of people who passed away from covid-19 at the given location until the given time", | ||
"type": "float" | ||
}, | ||
{ | ||
"name": "source_name", | ||
"description": "Name of the source of information relevant to the given location at the given time", | ||
"type": "str" | ||
}, | ||
{ | ||
"name": "source_url", | ||
"description": "Uniform Resource Locator link of the source of information relevant to the given location at the given time", | ||
"type": "str" | ||
}, | ||
{ | ||
"name": "source_archive", | ||
"description": "Uniform Resource Locator link of the archive of the source of information relevant to the given location at the given time", | ||
"type": "str" | ||
}, | ||
{ | ||
"name": "source_type", | ||
"description": "Type of the source of information: National Health Ministry/Regional health agencies", | ||
"type": "str" | ||
} | ||
], | ||
"keywords": [ | ||
"France", | ||
"COVID-19", | ||
"granular data" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
""" | ||
fr_covidata.py | ||
Functions: | ||
- fr_covidata_connector: Extracts data from CSV URL | ||
- fr_covidata_formatter: Cleans CSV data | ||
- fr_covidata: Combines the two previous functions | ||
Data Credits: | ||
OpenCOVID19-fr | ||
https://www.data.gouv.fr/en/datasets/chiffres-cles-concernant-lepidemie-de-covid19-en-france/ | ||
https://github.com/opencovid19-fr/data | ||
""" | ||
|
||
import io | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import requests | ||
|
||
# Location of the raw "chiffres-cles" CSV in the opencovid19-fr GitHub
# repository; this is the file downloaded by fr_covidata_connector.
url = (
    'https://raw.githubusercontent.com'
    '/opencovid19-fr/data/master'
    '/dist/chiffres-cles.csv'
)
|
||
|
||
def fr_covidata():
    """Return the cleaned French COVID-19 dataset.

    Downloads the raw data with ``fr_covidata_connector`` and passes it
    through ``fr_covidata_formatter``.

    Arguments:
        None

    Returns:
        pandas.DataFrame
    """
    return fr_covidata_formatter(fr_covidata_connector())
|
||
|
||
def fr_covidata_connector():
    """Extract data from OpenCOVID19-fr's Github repository.

    Description:
        - Downloads the CSV file located at the module-level ``url``
        - Decodes the payload as UTF-8 and parses it with pandas

    Returns:
        dataset (DataFrame with CSV Data)

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of feeding an HTML/text error page to the CSV parser.
    response.raise_for_status()

    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))
|
||
|
||
def fr_covidata_formatter(dataset):
    """Formatter for FR COVID-19 Data.

    Arguments:
        dataset(pandas.DataFrame): Data as returned by fr_covidata_connector.
            The input frame is not modified.

    Description:
        - Drop the rows whose granularity is not a department of
          Metropolitan France (regions, world, country, overseas
          collectivities and the overseas departments DEP-971..DEP-976)
        - Drop the 'depistes' and 'granularite' columns
        - Drop rows that repeat the same numerical data for the same date
          and subregion
        - Convert 'date' to datetime.date and the source columns to str
        - Merge the rows sharing the same date and subregion: numerical
          columns keep their maximum, source columns are joined with ','
        - Rename/Translate the column titles, and add a country column
          (France)

    Returns:
        frcovidata(pandas.DataFrame): one row per (date, subregion) with the
        columns subregion_code, subregion_name, country, date, confirmed,
        hospitalized, recovering, recovered, deaths, source_name,
        source_url, source_archive, source_type.
    """
    excluded_granularities = [
        'region', 'monde', 'pays', 'collectivite-outremer']
    excluded_codes = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976']
    is_metropolitan_department = (
        ~dataset['granularite'].isin(excluded_granularities)
        & ~dataset['maille_code'].isin(excluded_codes)
    )
    # Copy so the column assignments below act on an independent frame
    # rather than on a slice of the caller's data.
    dataset = dataset[is_metropolitan_department].copy()

    # 'depistes' is only discarded, so its absence is tolerated.
    dataset = dataset.drop(
        columns=['depistes', 'granularite'], errors='ignore')

    # NOTE(review): keep=False removes *every* row of a duplicated group,
    # not just the extra copies, so a (date, subregion) reported identically
    # by two sources disappears entirely. Behaviour kept as-is; confirm
    # whether keep='first' was intended.
    dataset = dataset.drop_duplicates(
        subset=['date', 'maille_code', 'cas_confirmes', 'deces',
                'reanimation', 'hospitalises', 'gueris'],
        keep=False)
    dataset['date'] = pd.to_datetime(dataset['date'].astype(str)).dt.date
    dataset = dataset.reset_index(drop=True)

    # Source columns must be str so that they can be joined when merging.
    source_columns = ['source_nom', 'source_url',
                      'source_archive', 'source_type']
    dataset[source_columns] = dataset[source_columns].astype(str)

    numeric_columns = [
        'cas_confirmes', 'cas_ehpad', 'cas_confirmes_ehpad',
        'cas_possibles_ehpad', 'deces', 'deces_ehpad', 'reanimation',
        'hospitalises', 'gueris']
    # The string 'max' selects the groupby reduction, which skips NaN
    # (passing np.max here is deprecated in recent pandas). The EHPAD
    # columns do not reach the output, so they are aggregated only when
    # present.
    aggregations = {
        column: 'max'
        for column in numeric_columns if column in dataset.columns
    }
    aggregations.update({column: ','.join for column in source_columns})
    dataset = (
        dataset
        .groupby(['date', 'maille_code', 'maille_nom'])
        .agg(aggregations)
        .reset_index()
    )

    # Rename/Translate the column titles.
    # NOTE(review): 'reanimation' presumably counts patients in intensive
    # care, so the name 'recovering' may be misleading - confirm against the
    # upstream data dictionary before relying on it.
    dataset = dataset.rename(columns={
        'maille_code': 'subregion_code',
        'maille_nom': 'subregion_name',
        'cas_confirmes': 'confirmed',
        'deces': 'deaths',
        'reanimation': 'recovering',
        'hospitalises': 'hospitalized',
        'gueris': 'recovered',
        'source_nom': 'source_name',
    })
    dataset['country'] = 'France'

    # A list of labels (double brackets) selects several columns; a bare
    # tuple would be looked up as one single column key and raise KeyError.
    frcovidata = dataset[[
        'subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
        'hospitalized', 'recovering', 'recovered',
        'deaths', 'source_name', 'source_url', 'source_archive',
        'source_type']]

    return frcovidata