From fe9bbfac77952175e34f23fbb17de244175132d7 Mon Sep 17 00:00:00 2001
From: oussamanaji
Date: Fri, 3 Apr 2020 23:33:15 +0300
Subject: [PATCH 01/12] France COVID19 Granular Data Extraction

---
 .../covid/fr_covidata/__init__.py             |   3 +
 .../covid/fr_covidata/__main__.py             |  24 +++
 .../covid/fr_covidata/fr_covidata.py          | 185 ++++++++++++++++++
 3 files changed, 212 insertions(+)
 create mode 100644 task_geo/data_sources/covid/fr_covidata/__init__.py
 create mode 100644 task_geo/data_sources/covid/fr_covidata/__main__.py
 create mode 100644 task_geo/data_sources/covid/fr_covidata/fr_covidata.py

diff --git a/task_geo/data_sources/covid/fr_covidata/__init__.py b/task_geo/data_sources/covid/fr_covidata/__init__.py
new file mode 100644
index 0000000..7c7bae7
--- /dev/null
+++ b/task_geo/data_sources/covid/fr_covidata/__init__.py
@@ -0,0 +1,3 @@
+from task_geo.data_sources.covid.fr_covidata.fr_covidata import fr_covidata
+
+__all__ = ['fr_covidata']
\ No newline at end of file
diff --git a/task_geo/data_sources/covid/fr_covidata/__main__.py b/task_geo/data_sources/covid/fr_covidata/__main__.py
new file mode 100644
index 0000000..101d3b8
--- /dev/null
+++ b/task_geo/data_sources/covid/fr_covidata/__main__.py
@@ -0,0 +1,24 @@
+import argparse
+
+from task_geo.data_sources.covid.fr_covidata.fr_covidata import fr_covidata
+
+
+def get_argparser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '-o', '--output', required=True,
+        help='Destination file to store the processed dataset.')
+    return parser
+
+
+def main():
+    parser = get_argparser()
+    args = parser.parse_args()
+
+    dataset = fr_covidata()
+    dataset.to_csv(args.output, index=False, header=True)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
new file mode 100644
index 0000000..a40d2fe
--- /dev/null
+++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
@@ -0,0 +1,185 @@
+"""
+fr_covidata.py
+
+Functions:
+    - fr_covidata_connector: Extracts data from CSV URL
+    - fr_covidata_formatter: Cleans CSV data
+    - fr_covidata: Combines the two previous functions
+
+Data Credits:
+    OpenCOVID19-fr
+    https://www.data.gouv.fr/en/datasets/chiffres-cles-concernant-lepidemie-de-covid19-en-france/
+    https://github.com/opencovid19-fr/data
+"""
+
+import requests
+import io
+
+import pandas as pd
+
+url = 'https://raw.githubusercontent.com/opencovid19-fr/data/master/dist/chiffres-cles.csv'
+
+
+def fr_covidata():
+    """Data Source for the French COVID-19 Data.
+    Arguments:
+        None
+    Returns:
+        pandas.DataFrame
+    """
+    df = fr_covidata_connector()
+    return fr_covidata_formatter(df)
+
+
+def fr_covidata_connector():
+    """Extract data from OpenCOVID19-fr's Github repository.
+    Description:
+        - Downloads the CSV export from the OpenCOVID19-fr repository
+        - Decodes the raw bytes as UTF-8 before parsing
+    Returns:
+        dataset (DataFrame with CSV Data)
+    """
+
+    urlData = requests.get(url).content
+
+    dataset = pd.read_csv(io.StringIO(urlData.decode('utf-8')))
+    return dataset
+
+
+def fr_covidata_formatter(dataset):
+    """Formatter for FR COVID-19 Data.
+    Arguments:
+        dataset(pandas.DataFrame): Data as returned by fr_covidata_connector.
+ Description: + - Drop unnecessary rows with irrelevant regions' info and only keep info + related to subregions in Metropolitan France, as well as repetitive data + - Check the dataset for instances where there are more than one source + of data in the same subregion for the same date, then complement all the + sources information, and take the highest value in case there are + different values for the same column, while aggregating the sources info + - Rename/Translate the column titles, and add a country column (France) + Returns: + frcovidata(pandas.DataFrame) + """ + + dataset = dataset[dataset.granularite != 'region'] + dataset = dataset[dataset.granularite != 'monde'] + dataset = dataset[dataset.granularite != 'pays'] + dataset = dataset[dataset.granularite != 'collectivite-outremer'] + dataset = dataset[dataset.maille_code != 'DEP-971'] + dataset = dataset[dataset.maille_code != 'DEP-972'] + dataset = dataset[dataset.maille_code != 'DEP-973'] + dataset = dataset[dataset.maille_code != 'DEP-974'] + dataset = dataset[dataset.maille_code != 'DEP-976'] + dataset = dataset.drop(['depistes'], axis=1) + dataset = dataset.drop(['granularite'], axis=1) + dataset = dataset.drop_duplicates(subset=['date', 'maille_code', 'cas_confirmes', 'deces', 'reanimation', 'hospitalises', 'gueris'], keep=False) + + #Reset indices: + dataset = dataset.reset_index() + dataset = dataset.drop(['index'], axis=1) + + #Turn source columns' values type to string: + dataset['source_nom'] = dataset['source_nom'].astype(str) + dataset['source_url'] = dataset['source_url'].astype(str) + dataset['source_archive'] = dataset['source_archive'].astype(str) + dataset['source_type'] = dataset['source_type'].astype(str) + + + for i in range(len(dataset)-1): + if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'cas_confirmes'] != 'inv' and i != len(dataset): + #Combine Source names, url, archive and type for repetitive subregions at the same date: + dataset.loc[i,'source_nom'] = dataset.loc[i,'source_nom'] + dataset.loc[i+1,'source_nom'] + dataset.loc[i,'source_url'] = dataset.loc[i,'source_url'] + dataset.loc[i+1,'source_url'] + dataset.loc[i,'source_archive'] = dataset.loc[i,'source_archive'] + dataset.loc[i+1,'source_archive'] + dataset.loc[i,'source_type'] = dataset.loc[i,'source_type'] + dataset.loc[i+1,'source_type'] + if pd.isnull(dataset.loc[i,'cas_confirmes']) == True and pd.isnull(dataset.loc[i+1,'cas_confirmes']) == False: + dataset.loc[i,'cas_confirmes'] = dataset.loc[i+1,'cas_confirmes'] + dataset.loc[i+1,'cas_confirmes'] = 'inv' + elif pd.isnull(dataset.loc[i,'cas_confirmes']) == False and pd.isnull(dataset.loc[i+1,'cas_confirmes']) == True: + dataset.loc[i+1,'cas_confirmes'] = 'inv' + elif pd.isnull(dataset.loc[i,'cas_confirmes']) == True and pd.isnull(dataset.loc[i+1,'cas_confirmes']) == True: + dataset.loc[i+1,'cas_confirmes'] = 'inv' + elif dataset.loc[i,'cas_confirmes'] == dataset.loc[i+1,'cas_confirmes']: + dataset.loc[i+1,'cas_confirmes'] = 'inv' + elif dataset.loc[i,'cas_confirmes'] > dataset.loc[i+1,'cas_confirmes']: + dataset.loc[i+1,'cas_confirmes'] = 'inv' + elif dataset.loc[i,'cas_confirmes'] < dataset.loc[i+1,'cas_confirmes']: + dataset.loc[i,'cas_confirmes'] = dataset.loc[i+1,'cas_confirmes'] + dataset.loc[i+1,'cas_confirmes'] = 'inv' + + if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'deces'] != 'inv': + if 
pd.isnull(dataset.loc[i,'deces']) == True and pd.isnull(dataset.loc[i+1,'deces']) == False: + dataset.loc[i,'deces'] = dataset.loc[i+1,'deces'] + dataset.loc[i+1,'deces'] = 'inv' + elif pd.isnull(dataset.loc[i,'deces']) == False and pd.isnull(dataset.loc[i+1,'deces']) == True: + dataset.loc[i+1,'deces'] = 'inv' + elif pd.isnull(dataset.loc[i,'deces']) == True and pd.isnull(dataset.loc[i+1,'deces']) == True: + dataset.loc[i+1,'deces'] = 'inv' + elif dataset.loc[i,'deces'] == dataset.loc[i+1,'deces']: + dataset.loc[i+1,'deces'] = 'inv' + elif dataset.loc[i,'deces'] > dataset.loc[i+1,'deces']: + dataset.loc[i+1,'deces'] = 'inv' + elif dataset.loc[i,'deces'] < dataset.loc[i+1,'deces']: + dataset.loc[i,'deces'] = dataset.loc[i+1,'deces'] + dataset.loc[i+1,'deces'] = 'inv' + + if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'reanimation'] != 'inv': + if pd.isnull(dataset.loc[i,'reanimation']) == True and pd.isnull(dataset.loc[i+1,'reanimation']) == False: + dataset.loc[i,'reanimation'] = dataset.loc[i+1,'reanimation'] + dataset.loc[i+1,'reanimation'] = 'inv' + elif pd.isnull(dataset.loc[i,'reanimation']) == False and pd.isnull(dataset.loc[i+1,'reanimation']) == True: + dataset.loc[i+1,'reanimation'] = 'inv' + elif pd.isnull(dataset.loc[i,'reanimation']) == True and pd.isnull(dataset.loc[i+1,'reanimation']) == True: + dataset.loc[i+1,'reanimation'] = 'inv' + elif dataset.loc[i,'reanimation'] == dataset.loc[i+1,'reanimation']: + dataset.loc[i+1,'reanimation'] = 'inv' + elif dataset.loc[i,'reanimation'] > dataset.loc[i+1,'reanimation']: + dataset.loc[i+1,'reanimation'] = 'inv' + elif dataset.loc[i,'reanimation'] < dataset.loc[i+1,'reanimation']: + dataset.loc[i,'reanimation'] = dataset.loc[i+1,'reanimation'] + dataset.loc[i+1,'reanimation'] = 'inv' + + if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'hospitalises'] != 'inv': + if pd.isnull(dataset.loc[i,'hospitalises']) == True and pd.isnull(dataset.loc[i+1,'hospitalises']) == False: + dataset.loc[i,'hospitalises'] = dataset.loc[i+1,'hospitalises'] + dataset.loc[i+1,'hospitalises'] = 'inv' + elif pd.isnull(dataset.loc[i,'hospitalises']) == False and pd.isnull(dataset.loc[i+1,'hospitalises']) == True: + dataset.loc[i+1,'hospitalises'] = 'inv' + elif pd.isnull(dataset.loc[i,'hospitalises']) == True and pd.isnull(dataset.loc[i+1,'hospitalises']) == True: + dataset.loc[i+1,'hospitalises'] = 'inv' + elif dataset.loc[i,'hospitalises'] == dataset.loc[i+1,'hospitalises']: + dataset.loc[i+1,'hospitalises'] = 'inv' + elif dataset.loc[i,'hospitalises'] > dataset.loc[i+1,'hospitalises']: + dataset.loc[i+1,'hospitalises'] = 'inv' + elif dataset.loc[i,'hospitalises'] < dataset.loc[i+1,'hospitalises']: + dataset.loc[i,'hospitalises'] = dataset.loc[i+1,'hospitalises'] + dataset.loc[i+1,'hospitalises'] = 'inv' + + if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'gueris'] != 'inv': + if pd.isnull(dataset.loc[i,'gueris']) == True and pd.isnull(dataset.loc[i+1,'gueris']) == False: + dataset.loc[i,'gueris'] = dataset.loc[i+1,'gueris'] + dataset.loc[i+1,'gueris'] = 'inv' + elif pd.isnull(dataset.loc[i,'gueris']) == False and pd.isnull(dataset.loc[i+1,'gueris']) == True: + dataset.loc[i+1,'gueris'] = 'inv' + elif pd.isnull(dataset.loc[i,'gueris']) == True and pd.isnull(dataset.loc[i+1,'gueris']) == True: + 
dataset.loc[i+1,'gueris'] = 'inv' + elif dataset.loc[i,'gueris'] == dataset.loc[i+1,'gueris']: + dataset.loc[i+1,'gueris'] = 'inv' + elif dataset.loc[i,'gueris'] > dataset.loc[i+1,'gueris']: + dataset.loc[i+1,'gueris'] = 'inv' + elif dataset.loc[i,'gueris'] < dataset.loc[i+1,'gueris']: + dataset.loc[i,'gueris'] = dataset.loc[i+1,'gueris'] + dataset.loc[i+1,'gueris'] = 'inv' + + #Delete the redundant resulting rows and reset the indices: + dataset = dataset[dataset.cas_confirmes != 'inv'] + dataset = dataset.reset_index() + dataset = dataset.drop(['index'], axis=1) + + #Rename/Translate the column titles: + dataset = dataset.rename(columns={"maille_code":"subregion_code", "maille_nom":"subregion_name", "cas_confirmes": "confirmed", "deces": "deaths", "reanimation": "recovering", "hospitalises": "hospitalized", "gueris": "recovered", "source_nom": "source_name"}) + dataset['country'] = 'France' + frcovidata = dataset[['subregion_code', 'subregion_name', 'country', 'date', 'confirmed', 'hospitalized', 'recovering', 'recovered', 'deaths', 'source_name', 'source_url', 'source_archive', 'source_type']] + + return frcovidata \ No newline at end of file From 533df325a001b6b6d52bc7b07110b0283d467f67 Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Sun, 5 Apr 2020 16:22:38 +0300 Subject: [PATCH 02/12] Update __init__.py --- task_geo/data_sources/covid/fr_covidata/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_geo/data_sources/covid/fr_covidata/__init__.py b/task_geo/data_sources/covid/fr_covidata/__init__.py index 7c7bae7..80d4d99 100644 --- a/task_geo/data_sources/covid/fr_covidata/__init__.py +++ b/task_geo/data_sources/covid/fr_covidata/__init__.py @@ -1,3 +1,3 @@ from task_geo.data_sources.covid.fr_covidata.fr_covidata import fr_covidata -__all__ = ['fr_covidata'] \ No newline at end of file +__all__ = ['fr_covidata'] From 459b44a941431b6eb0f67a8a1961b247bcf13cc0 Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Sun, 5 Apr 2020 16:24:27 +0300 Subject: [PATCH 03/12] Update __main__.py --- task_geo/data_sources/covid/fr_covidata/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_geo/data_sources/covid/fr_covidata/__main__.py b/task_geo/data_sources/covid/fr_covidata/__main__.py index 101d3b8..116d7f6 100644 --- a/task_geo/data_sources/covid/fr_covidata/__main__.py +++ b/task_geo/data_sources/covid/fr_covidata/__main__.py @@ -21,4 +21,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() From f4fc5bc39a962d491ecb11e8fa08fbd45d2df889 Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Sun, 5 Apr 2020 16:26:59 +0300 Subject: [PATCH 04/12] Update fr_covidata.py --- .../covid/fr_covidata/fr_covidata.py | 276 +++++++++++------- 1 file changed, 174 insertions(+), 102 deletions(-) diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py index a40d2fe..aaa0c0b 100644 --- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py +++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py @@ -12,12 +12,15 @@ https://github.com/opencovid19-fr/data """ -import requests import io import pandas as pd +import requests -url = 'https://raw.githubusercontent.com/opencovid19-fr/data/master/dist/chiffres-cles.csv' +url = ( + 'https://raw.githubusercontent.com/opencovid19-fr/' + 'data/master/dist/chiffres-cles.csv' + ) def fr_covidata(): @@ -39,7 +42,7 @@ def fr_covidata_connector(): Returns: dataset (DataFrame with CSV Data) """ - + urlData = 
requests.get(url).content dataset = pd.read_csv(io.StringIO(urlData.decode('utf-8'))) @@ -51,12 +54,14 @@ def fr_covidata_formatter(dataset): Arguments: dataset(pandas.DataFrame): Data as returned by fr_covidata_connector. Description: - - Drop unnecessary rows with irrelevant regions' info and only keep info - related to subregions in Metropolitan France, as well as repetitive data + - Drop unnecessary rows with irrelevant regions' info and only keep + info related to subregions in Metropolitan France, as well as + repetitive data - Check the dataset for instances where there are more than one source - of data in the same subregion for the same date, then complement all the - sources information, and take the highest value in case there are - different values for the same column, while aggregating the sources info + of data in the same subregion for the same date, then complement all + the sources information, and take the highest value in case there are + different values for the same column, while aggregating the sources + info - Rename/Translate the column titles, and add a country column (France) Returns: frcovidata(pandas.DataFrame) @@ -73,113 +78,180 @@ def fr_covidata_formatter(dataset): dataset = dataset[dataset.maille_code != 'DEP-976'] dataset = dataset.drop(['depistes'], axis=1) dataset = dataset.drop(['granularite'], axis=1) - dataset = dataset.drop_duplicates(subset=['date', 'maille_code', 'cas_confirmes', 'deces', 'reanimation', 'hospitalises', 'gueris'], keep=False) + dataset = dataset.drop_duplicates( + subset=['date', 'maille_code', 'cas_confirmes', 'deces', + 'reanimation', + 'hospitalises', 'gueris'], keep=False) - #Reset indices: + # Reset indices: dataset = dataset.reset_index() dataset = dataset.drop(['index'], axis=1) - #Turn source columns' values type to string: + # Turn source columns' values type to string: dataset['source_nom'] = dataset['source_nom'].astype(str) dataset['source_url'] = dataset['source_url'].astype(str) dataset['source_archive'] = dataset['source_archive'].astype(str) dataset['source_type'] = dataset['source_type'].astype(str) + for i in range(len(dataset) - 1): + if dataset.loc[i, 'maille_code'] == dataset.loc[ + i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ + i + 1, 'date'] and dataset.loc[ + i, 'cas_confirmes'] != 'inv' and i != len(dataset): + # Combine Source names, url, archive and type for repetitive + # subregions at the same date: + dataset.loc[i, 'source_nom'] = dataset.loc[i, 'source_nom'] + \ + dataset.loc[i + 1, 'source_nom'] + dataset.loc[i, 'source_url'] = dataset.loc[i, 'source_url'] + \ + dataset.loc[i + 1, 'source_url'] + dataset.loc[i, 'source_archive'] = dataset.loc[ + i, 'source_archive'] + \ + dataset.loc[ + i + 1, 'source_archive'] + dataset.loc[i, 'source_type'] = dataset.loc[i, 'source_type'] + \ + dataset.loc[i + 1, 'source_type'] + if pd.isnull( + dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( + dataset.loc[i + 1, 'cas_confirmes']) is False: + dataset.loc[i, 'cas_confirmes'] = dataset.loc[ + i + 1, 'cas_confirmes'] + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + elif pd.isnull( + dataset.loc[i, 'cas_confirmes']) is False and pd.isnull( + dataset.loc[i + 1, 'cas_confirmes']) is True: + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + elif pd.isnull( + dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( + dataset.loc[i + 1, 'cas_confirmes']) is True: + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + elif dataset.loc[i, 'cas_confirmes'] == dataset.loc[ + i + 1, 'cas_confirmes']: + dataset.loc[i 
+ 1, 'cas_confirmes'] = 'inv' + elif dataset.loc[i, 'cas_confirmes'] > dataset.loc[ + i + 1, 'cas_confirmes']: + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + elif dataset.loc[i, 'cas_confirmes'] < dataset.loc[ + i + 1, 'cas_confirmes']: + dataset.loc[i, 'cas_confirmes'] = dataset.loc[ + i + 1, 'cas_confirmes'] + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + + if dataset.loc[i, 'maille_code'] == dataset.loc[ + i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ + i + 1, 'date'] and dataset.loc[i, 'deces'] != 'inv': + if pd.isnull(dataset.loc[i, 'deces']) is True and pd.isnull( + dataset.loc[i + 1, 'deces']) is False: + dataset.loc[i, 'deces'] = dataset.loc[i + 1, 'deces'] + dataset.loc[i + 1, 'deces'] = 'inv' + elif pd.isnull(dataset.loc[i, 'deces']) is False and pd.isnull( + dataset.loc[i + 1, 'deces']) is True: + dataset.loc[i + 1, 'deces'] = 'inv' + elif pd.isnull(dataset.loc[i, 'deces']) is True and pd.isnull( + dataset.loc[i + 1, 'deces']) is True: + dataset.loc[i + 1, 'deces'] = 'inv' + elif dataset.loc[i, 'deces'] == dataset.loc[i + 1, 'deces']: + dataset.loc[i + 1, 'deces'] = 'inv' + elif dataset.loc[i, 'deces'] > dataset.loc[i + 1, 'deces']: + dataset.loc[i + 1, 'deces'] = 'inv' + elif dataset.loc[i, 'deces'] < dataset.loc[i + 1, 'deces']: + dataset.loc[i, 'deces'] = dataset.loc[i + 1, 'deces'] + dataset.loc[i + 1, 'deces'] = 'inv' + + if dataset.loc[i, 'maille_code'] == dataset.loc[ + i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ + i + 1, 'date'] and dataset.loc[i, 'reanimation'] != 'inv': + if pd.isnull(dataset.loc[i, 'reanimation']) is True and pd.isnull( + dataset.loc[i + 1, 'reanimation']) is False: + dataset.loc[i, 'reanimation'] = dataset.loc[ + i + 1, 'reanimation'] + dataset.loc[i + 1, 'reanimation'] = 'inv' + elif pd.isnull( + dataset.loc[i, 'reanimation']) is False and pd.isnull( + dataset.loc[i + 1, 'reanimation']) is True: + dataset.loc[i + 1, 'reanimation'] = 'inv' + elif pd.isnull( + dataset.loc[i, 'reanimation']) is True and pd.isnull( + dataset.loc[i + 1, 'reanimation']) is True: + dataset.loc[i + 1, 'reanimation'] = 'inv' + elif dataset.loc[i, 'reanimation'] == dataset.loc[ + i + 1, 'reanimation']: + dataset.loc[i + 1, 'reanimation'] = 'inv' + elif dataset.loc[i, 'reanimation'] > dataset.loc[ + i + 1, 'reanimation']: + dataset.loc[i + 1, 'reanimation'] = 'inv' + elif dataset.loc[i, 'reanimation'] < dataset.loc[ + i + 1, 'reanimation']: + dataset.loc[i, 'reanimation'] = dataset.loc[ + i + 1, 'reanimation'] + dataset.loc[i + 1, 'reanimation'] = 'inv' + + if dataset.loc[i, 'maille_code'] == dataset.loc[ + i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ + i + 1, 'date'] and dataset.loc[i, 'hospitalises'] != 'inv': + if pd.isnull( + dataset.loc[i, 'hospitalises']) is True and pd.isnull( + dataset.loc[i + 1, 'hospitalises']) is False: + dataset.loc[i, 'hospitalises'] = dataset.loc[ + i + 1, 'hospitalises'] + dataset.loc[i + 1, 'hospitalises'] = 'inv' + elif pd.isnull( + dataset.loc[i, 'hospitalises']) is False and pd.isnull( + dataset.loc[i + 1, 'hospitalises']) is True: + dataset.loc[i + 1, 'hospitalises'] = 'inv' + elif pd.isnull( + dataset.loc[i, 'hospitalises']) is True and pd.isnull( + dataset.loc[i + 1, 'hospitalises']) is True: + dataset.loc[i + 1, 'hospitalises'] = 'inv' + elif dataset.loc[i, 'hospitalises'] == dataset.loc[ + i + 1, 'hospitalises']: + dataset.loc[i + 1, 'hospitalises'] = 'inv' + elif dataset.loc[i, 'hospitalises'] > dataset.loc[ + i + 1, 'hospitalises']: + dataset.loc[i + 1, 'hospitalises'] = 
'inv' + elif dataset.loc[i, 'hospitalises'] < dataset.loc[ + i + 1, 'hospitalises']: + dataset.loc[i, 'hospitalises'] = dataset.loc[ + i + 1, 'hospitalises'] + dataset.loc[i + 1, 'hospitalises'] = 'inv' + + if dataset.loc[i, 'maille_code'] == dataset.loc[ + i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ + i + 1, 'date'] and dataset.loc[i, 'gueris'] != 'inv': + if pd.isnull(dataset.loc[i, 'gueris']) is True and pd.isnull( + dataset.loc[i + 1, 'gueris']) is False: + dataset.loc[i, 'gueris'] = dataset.loc[i + 1, 'gueris'] + dataset.loc[i + 1, 'gueris'] = 'inv' + elif pd.isnull(dataset.loc[i, 'gueris']) is False and pd.isnull( + dataset.loc[i + 1, 'gueris']) is True: + dataset.loc[i + 1, 'gueris'] = 'inv' + elif pd.isnull(dataset.loc[i, 'gueris']) is True and pd.isnull( + dataset.loc[i + 1, 'gueris']) is True: + dataset.loc[i + 1, 'gueris'] = 'inv' + elif dataset.loc[i, 'gueris'] == dataset.loc[i + 1, 'gueris']: + dataset.loc[i + 1, 'gueris'] = 'inv' + elif dataset.loc[i, 'gueris'] > dataset.loc[i + 1, 'gueris']: + dataset.loc[i + 1, 'gueris'] = 'inv' + elif dataset.loc[i, 'gueris'] < dataset.loc[i + 1, 'gueris']: + dataset.loc[i, 'gueris'] = dataset.loc[i + 1, 'gueris'] + dataset.loc[i + 1, 'gueris'] = 'inv' - for i in range(len(dataset)-1): - if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'cas_confirmes'] != 'inv' and i != len(dataset): - #Combine Source names, url, archive and type for repetitive subregions at the same date: - dataset.loc[i,'source_nom'] = dataset.loc[i,'source_nom'] + dataset.loc[i+1,'source_nom'] - dataset.loc[i,'source_url'] = dataset.loc[i,'source_url'] + dataset.loc[i+1,'source_url'] - dataset.loc[i,'source_archive'] = dataset.loc[i,'source_archive'] + dataset.loc[i+1,'source_archive'] - dataset.loc[i,'source_type'] = dataset.loc[i,'source_type'] + dataset.loc[i+1,'source_type'] - if pd.isnull(dataset.loc[i,'cas_confirmes']) == True and pd.isnull(dataset.loc[i+1,'cas_confirmes']) == False: - dataset.loc[i,'cas_confirmes'] = dataset.loc[i+1,'cas_confirmes'] - dataset.loc[i+1,'cas_confirmes'] = 'inv' - elif pd.isnull(dataset.loc[i,'cas_confirmes']) == False and pd.isnull(dataset.loc[i+1,'cas_confirmes']) == True: - dataset.loc[i+1,'cas_confirmes'] = 'inv' - elif pd.isnull(dataset.loc[i,'cas_confirmes']) == True and pd.isnull(dataset.loc[i+1,'cas_confirmes']) == True: - dataset.loc[i+1,'cas_confirmes'] = 'inv' - elif dataset.loc[i,'cas_confirmes'] == dataset.loc[i+1,'cas_confirmes']: - dataset.loc[i+1,'cas_confirmes'] = 'inv' - elif dataset.loc[i,'cas_confirmes'] > dataset.loc[i+1,'cas_confirmes']: - dataset.loc[i+1,'cas_confirmes'] = 'inv' - elif dataset.loc[i,'cas_confirmes'] < dataset.loc[i+1,'cas_confirmes']: - dataset.loc[i,'cas_confirmes'] = dataset.loc[i+1,'cas_confirmes'] - dataset.loc[i+1,'cas_confirmes'] = 'inv' - - if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'deces'] != 'inv': - if pd.isnull(dataset.loc[i,'deces']) == True and pd.isnull(dataset.loc[i+1,'deces']) == False: - dataset.loc[i,'deces'] = dataset.loc[i+1,'deces'] - dataset.loc[i+1,'deces'] = 'inv' - elif pd.isnull(dataset.loc[i,'deces']) == False and pd.isnull(dataset.loc[i+1,'deces']) == True: - dataset.loc[i+1,'deces'] = 'inv' - elif pd.isnull(dataset.loc[i,'deces']) == True and pd.isnull(dataset.loc[i+1,'deces']) == True: - dataset.loc[i+1,'deces'] = 'inv' - elif dataset.loc[i,'deces'] == 
dataset.loc[i+1,'deces']: - dataset.loc[i+1,'deces'] = 'inv' - elif dataset.loc[i,'deces'] > dataset.loc[i+1,'deces']: - dataset.loc[i+1,'deces'] = 'inv' - elif dataset.loc[i,'deces'] < dataset.loc[i+1,'deces']: - dataset.loc[i,'deces'] = dataset.loc[i+1,'deces'] - dataset.loc[i+1,'deces'] = 'inv' - - if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'reanimation'] != 'inv': - if pd.isnull(dataset.loc[i,'reanimation']) == True and pd.isnull(dataset.loc[i+1,'reanimation']) == False: - dataset.loc[i,'reanimation'] = dataset.loc[i+1,'reanimation'] - dataset.loc[i+1,'reanimation'] = 'inv' - elif pd.isnull(dataset.loc[i,'reanimation']) == False and pd.isnull(dataset.loc[i+1,'reanimation']) == True: - dataset.loc[i+1,'reanimation'] = 'inv' - elif pd.isnull(dataset.loc[i,'reanimation']) == True and pd.isnull(dataset.loc[i+1,'reanimation']) == True: - dataset.loc[i+1,'reanimation'] = 'inv' - elif dataset.loc[i,'reanimation'] == dataset.loc[i+1,'reanimation']: - dataset.loc[i+1,'reanimation'] = 'inv' - elif dataset.loc[i,'reanimation'] > dataset.loc[i+1,'reanimation']: - dataset.loc[i+1,'reanimation'] = 'inv' - elif dataset.loc[i,'reanimation'] < dataset.loc[i+1,'reanimation']: - dataset.loc[i,'reanimation'] = dataset.loc[i+1,'reanimation'] - dataset.loc[i+1,'reanimation'] = 'inv' - - if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'hospitalises'] != 'inv': - if pd.isnull(dataset.loc[i,'hospitalises']) == True and pd.isnull(dataset.loc[i+1,'hospitalises']) == False: - dataset.loc[i,'hospitalises'] = dataset.loc[i+1,'hospitalises'] - dataset.loc[i+1,'hospitalises'] = 'inv' - elif pd.isnull(dataset.loc[i,'hospitalises']) == False and pd.isnull(dataset.loc[i+1,'hospitalises']) == True: - dataset.loc[i+1,'hospitalises'] = 'inv' - elif pd.isnull(dataset.loc[i,'hospitalises']) == True and pd.isnull(dataset.loc[i+1,'hospitalises']) == True: - dataset.loc[i+1,'hospitalises'] = 'inv' - elif dataset.loc[i,'hospitalises'] == dataset.loc[i+1,'hospitalises']: - dataset.loc[i+1,'hospitalises'] = 'inv' - elif dataset.loc[i,'hospitalises'] > dataset.loc[i+1,'hospitalises']: - dataset.loc[i+1,'hospitalises'] = 'inv' - elif dataset.loc[i,'hospitalises'] < dataset.loc[i+1,'hospitalises']: - dataset.loc[i,'hospitalises'] = dataset.loc[i+1,'hospitalises'] - dataset.loc[i+1,'hospitalises'] = 'inv' - - if dataset.loc[i,'maille_code'] == dataset.loc[i+1,'maille_code'] and dataset.loc[i,'date'] == dataset.loc[i+1,'date'] and dataset.loc[i,'gueris'] != 'inv': - if pd.isnull(dataset.loc[i,'gueris']) == True and pd.isnull(dataset.loc[i+1,'gueris']) == False: - dataset.loc[i,'gueris'] = dataset.loc[i+1,'gueris'] - dataset.loc[i+1,'gueris'] = 'inv' - elif pd.isnull(dataset.loc[i,'gueris']) == False and pd.isnull(dataset.loc[i+1,'gueris']) == True: - dataset.loc[i+1,'gueris'] = 'inv' - elif pd.isnull(dataset.loc[i,'gueris']) == True and pd.isnull(dataset.loc[i+1,'gueris']) == True: - dataset.loc[i+1,'gueris'] = 'inv' - elif dataset.loc[i,'gueris'] == dataset.loc[i+1,'gueris']: - dataset.loc[i+1,'gueris'] = 'inv' - elif dataset.loc[i,'gueris'] > dataset.loc[i+1,'gueris']: - dataset.loc[i+1,'gueris'] = 'inv' - elif dataset.loc[i,'gueris'] < dataset.loc[i+1,'gueris']: - dataset.loc[i,'gueris'] = dataset.loc[i+1,'gueris'] - dataset.loc[i+1,'gueris'] = 'inv' - - #Delete the redundant resulting rows and reset the indices: + # Delete the redundant resulting rows 
and reset the indices: dataset = dataset[dataset.cas_confirmes != 'inv'] dataset = dataset.reset_index() dataset = dataset.drop(['index'], axis=1) - #Rename/Translate the column titles: - dataset = dataset.rename(columns={"maille_code":"subregion_code", "maille_nom":"subregion_name", "cas_confirmes": "confirmed", "deces": "deaths", "reanimation": "recovering", "hospitalises": "hospitalized", "gueris": "recovered", "source_nom": "source_name"}) + # Rename/Translate the column titles: + dataset = dataset.rename( + columns={"maille_code": "subregion_code", + "maille_nom": "subregion_name", "cas_confirmes": "confirmed", + "deces": "deaths", "reanimation": "recovering", + "hospitalises": "hospitalized", "gueris": "recovered", + "source_nom": "source_name"}) dataset['country'] = 'France' - frcovidata = dataset[['subregion_code', 'subregion_name', 'country', 'date', 'confirmed', 'hospitalized', 'recovering', 'recovered', 'deaths', 'source_name', 'source_url', 'source_archive', 'source_type']] + frcovidata = dataset[ + ['subregion_code', 'subregion_name', 'country', 'date', 'confirmed', + 'hospitalized', 'recovering', 'recovered', + 'deaths', 'source_name', 'source_url', 'source_archive', + 'source_type']] - return frcovidata \ No newline at end of file + return frcovidata From 64af1ab5929c66982a33b2e3adc84f7ad445a3b4 Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Mon, 6 Apr 2020 19:17:49 +0300 Subject: [PATCH 05/12] Update fr_covidata.py --- .../covid/fr_covidata/fr_covidata.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py index aaa0c0b..6e71a00 100644 --- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py +++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py @@ -20,7 +20,7 @@ url = ( 'https://raw.githubusercontent.com/opencovid19-fr/' 'data/master/dist/chiffres-cles.csv' - ) +) def fr_covidata(): @@ -101,37 +101,37 @@ def fr_covidata_formatter(dataset): # Combine Source names, url, archive and type for repetitive # subregions at the same date: dataset.loc[i, 'source_nom'] = dataset.loc[i, 'source_nom'] + \ - dataset.loc[i + 1, 'source_nom'] + dataset.loc[i + 1, 'source_nom'] dataset.loc[i, 'source_url'] = dataset.loc[i, 'source_url'] + \ dataset.loc[i + 1, 'source_url'] dataset.loc[i, 'source_archive'] = dataset.loc[ - i, 'source_archive'] + \ + i, 'source_archive'] + \ dataset.loc[ - i + 1, 'source_archive'] + i + 1, 'source_archive'] dataset.loc[i, 'source_type'] = dataset.loc[i, 'source_type'] + \ dataset.loc[i + 1, 'source_type'] if pd.isnull( - dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( + dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( dataset.loc[i + 1, 'cas_confirmes']) is False: - dataset.loc[i, 'cas_confirmes'] = dataset.loc[ - i + 1, 'cas_confirmes'] - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + dataset.loc[i, 'cas_confirmes'] = dataset.loc[ + i + 1, 'cas_confirmes'] + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' elif pd.isnull( - dataset.loc[i, 'cas_confirmes']) is False and pd.isnull( + dataset.loc[i, 'cas_confirmes']) is False and pd.isnull( dataset.loc[i + 1, 'cas_confirmes']) is True: - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' elif pd.isnull( - dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( + dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( dataset.loc[i + 1, 'cas_confirmes']) is True: - dataset.loc[i + 1, 'cas_confirmes'] = 
'inv' + dataset.loc[i + 1, 'cas_confirmes'] = 'inv' elif dataset.loc[i, 'cas_confirmes'] == dataset.loc[ - i + 1, 'cas_confirmes']: + i + 1, 'cas_confirmes']: dataset.loc[i + 1, 'cas_confirmes'] = 'inv' elif dataset.loc[i, 'cas_confirmes'] > dataset.loc[ - i + 1, 'cas_confirmes']: + i + 1, 'cas_confirmes']: dataset.loc[i + 1, 'cas_confirmes'] = 'inv' elif dataset.loc[i, 'cas_confirmes'] < dataset.loc[ - i + 1, 'cas_confirmes']: + i + 1, 'cas_confirmes']: dataset.loc[i, 'cas_confirmes'] = dataset.loc[ i + 1, 'cas_confirmes'] dataset.loc[i + 1, 'cas_confirmes'] = 'inv' @@ -174,13 +174,13 @@ def fr_covidata_formatter(dataset): dataset.loc[i + 1, 'reanimation']) is True: dataset.loc[i + 1, 'reanimation'] = 'inv' elif dataset.loc[i, 'reanimation'] == dataset.loc[ - i + 1, 'reanimation']: + i + 1, 'reanimation']: dataset.loc[i + 1, 'reanimation'] = 'inv' elif dataset.loc[i, 'reanimation'] > dataset.loc[ - i + 1, 'reanimation']: + i + 1, 'reanimation']: dataset.loc[i + 1, 'reanimation'] = 'inv' elif dataset.loc[i, 'reanimation'] < dataset.loc[ - i + 1, 'reanimation']: + i + 1, 'reanimation']: dataset.loc[i, 'reanimation'] = dataset.loc[ i + 1, 'reanimation'] dataset.loc[i + 1, 'reanimation'] = 'inv' @@ -191,25 +191,25 @@ def fr_covidata_formatter(dataset): if pd.isnull( dataset.loc[i, 'hospitalises']) is True and pd.isnull( dataset.loc[i + 1, 'hospitalises']) is False: - dataset.loc[i, 'hospitalises'] = dataset.loc[ - i + 1, 'hospitalises'] - dataset.loc[i + 1, 'hospitalises'] = 'inv' + dataset.loc[i, 'hospitalises'] = dataset.loc[ + i + 1, 'hospitalises'] + dataset.loc[i + 1, 'hospitalises'] = 'inv' elif pd.isnull( dataset.loc[i, 'hospitalises']) is False and pd.isnull( dataset.loc[i + 1, 'hospitalises']) is True: dataset.loc[i + 1, 'hospitalises'] = 'inv' elif pd.isnull( - dataset.loc[i, 'hospitalises']) is True and pd.isnull( + dataset.loc[i, 'hospitalises']) is True and pd.isnull( dataset.loc[i + 1, 'hospitalises']) is True: - dataset.loc[i + 1, 'hospitalises'] = 'inv' + dataset.loc[i + 1, 'hospitalises'] = 'inv' elif dataset.loc[i, 'hospitalises'] == dataset.loc[ - i + 1, 'hospitalises']: + i + 1, 'hospitalises']: dataset.loc[i + 1, 'hospitalises'] = 'inv' elif dataset.loc[i, 'hospitalises'] > dataset.loc[ - i + 1, 'hospitalises']: + i + 1, 'hospitalises']: dataset.loc[i + 1, 'hospitalises'] = 'inv' elif dataset.loc[i, 'hospitalises'] < dataset.loc[ - i + 1, 'hospitalises']: + i + 1, 'hospitalises']: dataset.loc[i, 'hospitalises'] = dataset.loc[ i + 1, 'hospitalises'] dataset.loc[i + 1, 'hospitalises'] = 'inv' From 2120e7927523125d04aba6bceb71d5efce41dc3c Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Fri, 10 Apr 2020 15:45:36 +0300 Subject: [PATCH 06/12] Create datapackage.json --- .../covid/fr_covidata/datapackage.json | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 task_geo/data_sources/covid/fr_covidata/datapackage.json diff --git a/task_geo/data_sources/covid/fr_covidata/datapackage.json b/task_geo/data_sources/covid/fr_covidata/datapackage.json new file mode 100644 index 0000000..cc68e5b --- /dev/null +++ b/task_geo/data_sources/covid/fr_covidata/datapackage.json @@ -0,0 +1,77 @@ +{ + "title": "COVID-19 Granular Data - France", + "description": "Sourcing of COVID-19 cases granular data in different subregions of France, aggregated by @opencovid19-fr, https://github.com/opencovid19-fr/", + "licenses": [{"name": "copyright-authors"}], + "fields": [ + { + "name": "subregion_code", + "description": "Geographical location - maille code", + "type": "str" + }, + 
{ + "name": "subregion_name", + "description": "Geographical location - department", + "type": "str" + }, + { + "name": "country", + "description": "Geographical location - country", + "type": "str" + }, + { + "name": "date", + "description": "Date", + "type": "datetime.date" + }, + { + "name": "confirmed", + "description": "Cumulative number of confirmed covid-19 cases at the given location until the given time", + "type": "float" + }, + { + "name": "hospitalized", + "description": "Number of hospitalized people due to covid-19 at the given location at the given time", + "type": "float" + }, + { + "name": "recovering", + "description": "Number of people recovering from covid-19 at the given location at the given time", + "type": "float" + }, + { + "name": "recovered", + "description": "Cumulative number of people healed from covid-19 at the given location until the given time", + "type": "float" + }, + { + "name": "deaths", + "description": "Cumulative number of people who passed away from covid-19 at the given location until the given time", + "type": "float" + }, + { + "name": "source_name", + "description": "Name of the source of information relevant to the given location at the given time", + "type": "str" + }, + { + "name": "source_url", + "description": "Uniform Resource Locator link of the source of information relevant to the given location at the given time", + "type": "str" + }, + { + "name": "source_archive", + "description": "Uniform Resource Locator link of the archive of the source of information relevant to the given location at the given time", + "type": "str" + }, + { + "name": "source_type", + "description": "Type of the source of information: National Health Ministry/Regional health agencies", + "type": "str" + } + ], + "keywords": [ + "France", + "COVID-19", + "granular data" + ] +} From 1f840b3daf7d4e65a4239d859c31fc60b82516c4 Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Fri, 10 Apr 2020 16:48:58 +0300 Subject: [PATCH 07/12] Update fr_covidata.py --- .../covid/fr_covidata/fr_covidata.py | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py index 6e71a00..4ecbeb0 100644 --- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py +++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py @@ -67,31 +67,26 @@ def fr_covidata_formatter(dataset): frcovidata(pandas.DataFrame) """ - dataset = dataset[dataset.granularite != 'region'] - dataset = dataset[dataset.granularite != 'monde'] - dataset = dataset[dataset.granularite != 'pays'] - dataset = dataset[dataset.granularite != 'collectivite-outremer'] - dataset = dataset[dataset.maille_code != 'DEP-971'] - dataset = dataset[dataset.maille_code != 'DEP-972'] - dataset = dataset[dataset.maille_code != 'DEP-973'] - dataset = dataset[dataset.maille_code != 'DEP-974'] - dataset = dataset[dataset.maille_code != 'DEP-976'] - dataset = dataset.drop(['depistes'], axis=1) - dataset = dataset.drop(['granularite'], axis=1) + no_granularites = ['region', 'monde', 'pays', 'collectivite-outremer'] + no_maille_codes = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976'] + dataset = dataset[ + (~dataset.granularite.isin(no_granularites)) & + (~dataset.maille_code.isin(no_maille_codes)) + ] + dataset = dataset.drop(['depistes', 'granularite'], axis=1) dataset = dataset.drop_duplicates( subset=['date', 'maille_code', 'cas_confirmes', 'deces', 'reanimation', 'hospitalises', 'gueris'], 
keep=False) + dataset['date'] = pd.to_datetime(dataset['date'].astype(str)).dt.date # Reset indices: - dataset = dataset.reset_index() - dataset = dataset.drop(['index'], axis=1) + dataset = dataset.reset_index(drop=True) # Turn source columns' values type to string: - dataset['source_nom'] = dataset['source_nom'].astype(str) - dataset['source_url'] = dataset['source_url'].astype(str) - dataset['source_archive'] = dataset['source_archive'].astype(str) - dataset['source_type'] = dataset['source_type'].astype(str) + str_columns = ['source_nom', 'source_url', + 'source_archive', 'source_type'] + dataset[str_columns] = dataset[str_columns].astype(str) for i in range(len(dataset) - 1): if dataset.loc[i, 'maille_code'] == dataset.loc[ @@ -101,15 +96,15 @@ def fr_covidata_formatter(dataset): # Combine Source names, url, archive and type for repetitive # subregions at the same date: dataset.loc[i, 'source_nom'] = dataset.loc[i, 'source_nom'] + \ - dataset.loc[i + 1, 'source_nom'] + " " + dataset.loc[i + 1, 'source_nom'] dataset.loc[i, 'source_url'] = dataset.loc[i, 'source_url'] + \ - dataset.loc[i + 1, 'source_url'] + " " + dataset.loc[i + 1, 'source_url'] dataset.loc[i, 'source_archive'] = dataset.loc[ - i, 'source_archive'] + \ + i, 'source_archive'] + " " + \ dataset.loc[ i + 1, 'source_archive'] dataset.loc[i, 'source_type'] = dataset.loc[i, 'source_type'] + \ - dataset.loc[i + 1, 'source_type'] + " " + dataset.loc[i + 1, 'source_type'] if pd.isnull( dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( dataset.loc[i + 1, 'cas_confirmes']) is False: @@ -237,8 +232,7 @@ def fr_covidata_formatter(dataset): # Delete the redundant resulting rows and reset the indices: dataset = dataset[dataset.cas_confirmes != 'inv'] - dataset = dataset.reset_index() - dataset = dataset.drop(['index'], axis=1) + dataset = dataset.reset_index(drop=True) # Rename/Translate the column titles: dataset = dataset.rename( From 567332695bcdab799f94e958bb7e60b945a29472 Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Fri, 10 Apr 2020 17:14:28 +0300 Subject: [PATCH 08/12] Update fr_covidata.py --- task_geo/data_sources/covid/fr_covidata/fr_covidata.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py index 4ecbeb0..2b21c7d 100644 --- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py +++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py @@ -67,11 +67,10 @@ def fr_covidata_formatter(dataset): frcovidata(pandas.DataFrame) """ - no_granularites = ['region', 'monde', 'pays', 'collectivite-outremer'] - no_maille_codes = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976'] + no_gr = ['region', 'monde', 'pays', 'collectivite-outremer'] + no_mc = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976'] dataset = dataset[ - (~dataset.granularite.isin(no_granularites)) & - (~dataset.maille_code.isin(no_maille_codes)) + (~dataset.granularite.isin(no_gr)) & (~dataset.maille_code.isin(no_mc)) ] dataset = dataset.drop(['depistes', 'granularite'], axis=1) dataset = dataset.drop_duplicates( From bf409589ddcf9c329cd9c86538d9eedd031ac56c Mon Sep 17 00:00:00 2001 From: oussamanaji Date: Mon, 13 Apr 2020 19:39:55 +0300 Subject: [PATCH 09/12] Create audit.md --- .../data_sources/covid/fr_covidata/audit.md | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 task_geo/data_sources/covid/fr_covidata/audit.md diff --git a/task_geo/data_sources/covid/fr_covidata/audit.md 
b/task_geo/data_sources/covid/fr_covidata/audit.md
new file mode 100644
index 0000000..1b9d996
--- /dev/null
+++ b/task_geo/data_sources/covid/fr_covidata/audit.md
@@ -0,0 +1,71 @@
+# COVID-19 Granular Data - France
+
+## General information
+
+- **Description**: COVID-19 daily confirmed cases/hospitalized/recovering/recovered/deaths data in different subregions of France
+- **Credits**: @opencovid19-fr
+- **Source**: https://github.com/opencovid19-fr/data/
+
+## Column Details
+
+**subregion_code**
+- Description: French Maille department code
+- Type: str
+
+**subregion_name**
+- Description: French department name
+- Type: str
+
+**country**
+- Description: Country Name - France
+- Type: str
+
+**date**
+- Description: Date (yyyy-mm-dd)
+- Type: datetime.date
+
+**confirmed**
+- Description: Cumulative number of confirmed covid-19 cases at the given location until the given time
+- Type: float
+
+**hospitalized**
+- Description: Number of hospitalized people due to covid-19 at the given location at the given time
+- Type: float
+
+**recovering**
+- Description: Number of people recovering from covid-19 at the given location at the given time
+- Type: float
+
+**recovered**
+- Description: Cumulative number of people healed from covid-19 at the given location until the given time
+- Type: float
+
+**deaths**
+- Description: Cumulative number of people who passed away from covid-19 at the given location until the given time
+- Type: float
+
+**source_name**
+- Description: Name of the source of information relevant to the given location at the given time
+- Type: str
+
+**source_url**
+- Description: Uniform Resource Locator link of the source of information relevant to the given location at the given time
+- Type: str
+
+**source_archive**
+- Description: Uniform Resource Locator link of the archive of the source of information relevant to the given location at the given time
+- Type: str
+
+**source_type**
+- Description: Type of the source of information: National Health Ministry/Regional health agencies
+- Type: str
+
+## Transformations applied
+
+- Deleting the rows not related to departments inside Metropolitan France ('region', 'monde', 'pays', 'collectivite-outremer', 'DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976')
+- Dropping the columns 'depistes' and 'granularite'
+- Deleting the duplicate rows containing the same numerical data
+- Switching the 'date' column format from str to datetime.date
+- Switching the 'source_nom', 'source_url', 'source_archive', 'source_type' columns format to str
+- Merging all the rows that have the same 'date' and 'subregion_name' column value, by taking the maximum of every numerical column value and aggregating the source columns' info
+- Renaming/Translating the column titles from French to English, and rearranging them

From 673b25039d401d24d5b43ff6d4925a6e7930135f Mon Sep 17 00:00:00 2001
From: oussamanaji
Date: Wed, 15 Apr 2020 02:26:04 +0300
Subject: [PATCH 10/12] Update fr_covidata.py

---
 .../covid/fr_covidata/fr_covidata.py          | 167 +++---------------
 1 file changed, 20 insertions(+), 147 deletions(-)

diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
index 2b21c7d..41644c7 100644
--- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
+++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
@@ -87,151 +87,24 @@ def fr_covidata_formatter(dataset):
                    'source_archive', 'source_type']
     dataset[str_columns] = dataset[str_columns].astype(str)
 
-    for i in 
range(len(dataset) - 1): - if dataset.loc[i, 'maille_code'] == dataset.loc[ - i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ - i + 1, 'date'] and dataset.loc[ - i, 'cas_confirmes'] != 'inv' and i != len(dataset): - # Combine Source names, url, archive and type for repetitive - # subregions at the same date: - dataset.loc[i, 'source_nom'] = dataset.loc[i, 'source_nom'] + \ - " " + dataset.loc[i + 1, 'source_nom'] - dataset.loc[i, 'source_url'] = dataset.loc[i, 'source_url'] + \ - " " + dataset.loc[i + 1, 'source_url'] - dataset.loc[i, 'source_archive'] = dataset.loc[ - i, 'source_archive'] + " " + \ - dataset.loc[ - i + 1, 'source_archive'] - dataset.loc[i, 'source_type'] = dataset.loc[i, 'source_type'] + \ - " " + dataset.loc[i + 1, 'source_type'] - if pd.isnull( - dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( - dataset.loc[i + 1, 'cas_confirmes']) is False: - dataset.loc[i, 'cas_confirmes'] = dataset.loc[ - i + 1, 'cas_confirmes'] - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' - elif pd.isnull( - dataset.loc[i, 'cas_confirmes']) is False and pd.isnull( - dataset.loc[i + 1, 'cas_confirmes']) is True: - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' - elif pd.isnull( - dataset.loc[i, 'cas_confirmes']) is True and pd.isnull( - dataset.loc[i + 1, 'cas_confirmes']) is True: - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' - elif dataset.loc[i, 'cas_confirmes'] == dataset.loc[ - i + 1, 'cas_confirmes']: - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' - elif dataset.loc[i, 'cas_confirmes'] > dataset.loc[ - i + 1, 'cas_confirmes']: - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' - elif dataset.loc[i, 'cas_confirmes'] < dataset.loc[ - i + 1, 'cas_confirmes']: - dataset.loc[i, 'cas_confirmes'] = dataset.loc[ - i + 1, 'cas_confirmes'] - dataset.loc[i + 1, 'cas_confirmes'] = 'inv' - - if dataset.loc[i, 'maille_code'] == dataset.loc[ - i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ - i + 1, 'date'] and dataset.loc[i, 'deces'] != 'inv': - if pd.isnull(dataset.loc[i, 'deces']) is True and pd.isnull( - dataset.loc[i + 1, 'deces']) is False: - dataset.loc[i, 'deces'] = dataset.loc[i + 1, 'deces'] - dataset.loc[i + 1, 'deces'] = 'inv' - elif pd.isnull(dataset.loc[i, 'deces']) is False and pd.isnull( - dataset.loc[i + 1, 'deces']) is True: - dataset.loc[i + 1, 'deces'] = 'inv' - elif pd.isnull(dataset.loc[i, 'deces']) is True and pd.isnull( - dataset.loc[i + 1, 'deces']) is True: - dataset.loc[i + 1, 'deces'] = 'inv' - elif dataset.loc[i, 'deces'] == dataset.loc[i + 1, 'deces']: - dataset.loc[i + 1, 'deces'] = 'inv' - elif dataset.loc[i, 'deces'] > dataset.loc[i + 1, 'deces']: - dataset.loc[i + 1, 'deces'] = 'inv' - elif dataset.loc[i, 'deces'] < dataset.loc[i + 1, 'deces']: - dataset.loc[i, 'deces'] = dataset.loc[i + 1, 'deces'] - dataset.loc[i + 1, 'deces'] = 'inv' - - if dataset.loc[i, 'maille_code'] == dataset.loc[ - i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ - i + 1, 'date'] and dataset.loc[i, 'reanimation'] != 'inv': - if pd.isnull(dataset.loc[i, 'reanimation']) is True and pd.isnull( - dataset.loc[i + 1, 'reanimation']) is False: - dataset.loc[i, 'reanimation'] = dataset.loc[ - i + 1, 'reanimation'] - dataset.loc[i + 1, 'reanimation'] = 'inv' - elif pd.isnull( - dataset.loc[i, 'reanimation']) is False and pd.isnull( - dataset.loc[i + 1, 'reanimation']) is True: - dataset.loc[i + 1, 'reanimation'] = 'inv' - elif pd.isnull( - dataset.loc[i, 'reanimation']) is True and pd.isnull( - dataset.loc[i + 1, 'reanimation']) is True: - dataset.loc[i + 1, 
'reanimation'] = 'inv' - elif dataset.loc[i, 'reanimation'] == dataset.loc[ - i + 1, 'reanimation']: - dataset.loc[i + 1, 'reanimation'] = 'inv' - elif dataset.loc[i, 'reanimation'] > dataset.loc[ - i + 1, 'reanimation']: - dataset.loc[i + 1, 'reanimation'] = 'inv' - elif dataset.loc[i, 'reanimation'] < dataset.loc[ - i + 1, 'reanimation']: - dataset.loc[i, 'reanimation'] = dataset.loc[ - i + 1, 'reanimation'] - dataset.loc[i + 1, 'reanimation'] = 'inv' - - if dataset.loc[i, 'maille_code'] == dataset.loc[ - i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ - i + 1, 'date'] and dataset.loc[i, 'hospitalises'] != 'inv': - if pd.isnull( - dataset.loc[i, 'hospitalises']) is True and pd.isnull( - dataset.loc[i + 1, 'hospitalises']) is False: - dataset.loc[i, 'hospitalises'] = dataset.loc[ - i + 1, 'hospitalises'] - dataset.loc[i + 1, 'hospitalises'] = 'inv' - elif pd.isnull( - dataset.loc[i, 'hospitalises']) is False and pd.isnull( - dataset.loc[i + 1, 'hospitalises']) is True: - dataset.loc[i + 1, 'hospitalises'] = 'inv' - elif pd.isnull( - dataset.loc[i, 'hospitalises']) is True and pd.isnull( - dataset.loc[i + 1, 'hospitalises']) is True: - dataset.loc[i + 1, 'hospitalises'] = 'inv' - elif dataset.loc[i, 'hospitalises'] == dataset.loc[ - i + 1, 'hospitalises']: - dataset.loc[i + 1, 'hospitalises'] = 'inv' - elif dataset.loc[i, 'hospitalises'] > dataset.loc[ - i + 1, 'hospitalises']: - dataset.loc[i + 1, 'hospitalises'] = 'inv' - elif dataset.loc[i, 'hospitalises'] < dataset.loc[ - i + 1, 'hospitalises']: - dataset.loc[i, 'hospitalises'] = dataset.loc[ - i + 1, 'hospitalises'] - dataset.loc[i + 1, 'hospitalises'] = 'inv' - - if dataset.loc[i, 'maille_code'] == dataset.loc[ - i + 1, 'maille_code'] and dataset.loc[i, 'date'] == dataset.loc[ - i + 1, 'date'] and dataset.loc[i, 'gueris'] != 'inv': - if pd.isnull(dataset.loc[i, 'gueris']) is True and pd.isnull( - dataset.loc[i + 1, 'gueris']) is False: - dataset.loc[i, 'gueris'] = dataset.loc[i + 1, 'gueris'] - dataset.loc[i + 1, 'gueris'] = 'inv' - elif pd.isnull(dataset.loc[i, 'gueris']) is False and pd.isnull( - dataset.loc[i + 1, 'gueris']) is True: - dataset.loc[i + 1, 'gueris'] = 'inv' - elif pd.isnull(dataset.loc[i, 'gueris']) is True and pd.isnull( - dataset.loc[i + 1, 'gueris']) is True: - dataset.loc[i + 1, 'gueris'] = 'inv' - elif dataset.loc[i, 'gueris'] == dataset.loc[i + 1, 'gueris']: - dataset.loc[i + 1, 'gueris'] = 'inv' - elif dataset.loc[i, 'gueris'] > dataset.loc[i + 1, 'gueris']: - dataset.loc[i + 1, 'gueris'] = 'inv' - elif dataset.loc[i, 'gueris'] < dataset.loc[i + 1, 'gueris']: - dataset.loc[i, 'gueris'] = dataset.loc[i + 1, 'gueris'] - dataset.loc[i + 1, 'gueris'] = 'inv' - - # Delete the redundant resulting rows and reset the indices: - dataset = dataset[dataset.cas_confirmes != 'inv'] - dataset = dataset.reset_index(drop=True) + aggre = { + 'cas_confirmes': np.max, + 'cas_ehpad': np.max, + 'cas_confirmes_ehpad': np.max, + 'cas_possibles_ehpad': np.max, + 'deces': np.max, + 'deces_ehpad': np.max, + 'reanimation': np.max, + 'hospitalises': np.max, + 'gueris': np.max, + 'source_nom': ','.join, + 'source_url': ','.join, + 'source_archive': ','.join, + 'source_type': ','.join + } + dataset = dataset.groupby(['date', + 'maille_code', + 'maille_nom']).aggregate(aggre).reset_index() # Rename/Translate the column titles: dataset = dataset.rename( @@ -242,9 +115,9 @@ def fr_covidata_formatter(dataset): "source_nom": "source_name"}) dataset['country'] = 'France' frcovidata = dataset[ - ['subregion_code', 
'subregion_name', 'country', 'date', 'confirmed',
-         'hospitalized', 'recovering', 'recovered',
-         'deaths', 'source_name', 'source_url', 'source_archive',
-         'source_type']]
+        'subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
+        'hospitalized', 'recovering', 'recovered',
+        'deaths', 'source_name', 'source_url', 'source_archive',
+        'source_type']
 
     return frcovidata

From d902d02b38f2787289406350c42c7104e4be961d Mon Sep 17 00:00:00 2001
From: oussamanaji
Date: Wed, 15 Apr 2020 02:33:08 +0300
Subject: [PATCH 11/12] Update fr_covidata.py

---
 task_geo/data_sources/covid/fr_covidata/fr_covidata.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
index 41644c7..c872cab 100644
--- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
+++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
@@ -15,6 +15,7 @@
 import io
 
 import pandas as pd
+import numpy as np
 import requests
 
 url = (
@@ -115,9 +116,9 @@ def fr_covidata_formatter(dataset):
         "source_nom": "source_name"})
     dataset['country'] = 'France'
     frcovidata = dataset[
-        'subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
-        'hospitalized', 'recovering', 'recovered',
-        'deaths', 'source_name', 'source_url', 'source_archive',
-        'source_type']
+        ['subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
+         'hospitalized', 'recovering', 'recovered',
+         'deaths', 'source_name', 'source_url', 'source_archive',
+         'source_type']]
 
     return frcovidata

From 848de12f598494aecd792a98fa3186fa886cdebb Mon Sep 17 00:00:00 2001
From: oussamanaji
Date: Wed, 15 Apr 2020 03:06:29 +0300
Subject: [PATCH 12/12] Update fr_covidata.py

---
 task_geo/data_sources/covid/fr_covidata/fr_covidata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
index c872cab..b2656b4 100644
--- a/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
+++ b/task_geo/data_sources/covid/fr_covidata/fr_covidata.py
@@ -14,8 +14,8 @@
 
 import io
 
-import pandas as pd
 import numpy as np
+import pandas as pd
 import requests
 
 url = (
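
A minimal usage sketch for the finished data source (assuming the repository
root is on PYTHONPATH; the output filename is illustrative, and the expected
column list mirrors datapackage.json):

    # Fetch and format the OpenCOVID19-fr dataset.
    from task_geo.data_sources.covid.fr_covidata import fr_covidata

    data = fr_covidata()

    # The formatter is expected to return the translated columns in a fixed
    # order, with one row per (subregion_code, date) pair after the groupby
    # merge.
    expected = ['subregion_code', 'subregion_name', 'country', 'date',
                'confirmed', 'hospitalized', 'recovering', 'recovered',
                'deaths', 'source_name', 'source_url', 'source_archive',
                'source_type']
    assert list(data.columns) == expected
    assert (data['country'] == 'France').all()

    data.to_csv('fr_covidata.csv', index=False)

The same export is available from the command line through the module entry
point, e.g. python -m task_geo.data_sources.covid.fr_covidata -o fr_covidata.csv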