Commit
Merge pull request CoronaWhy#36 from oussamanaji/oussamanaji-frcovidata
France COVID19 Granular Data Extraction
ManuelAlvarezC authored Apr 15, 2020
2 parents 8665c4f + 848de12 commit 3c2ffa7
Showing 5 changed files with 299 additions and 0 deletions.
3 changes: 3 additions & 0 deletions task_geo/data_sources/covid/fr_covidata/__init__.py
@@ -0,0 +1,3 @@
from task_geo.data_sources.covid.fr_covidata.fr_covidata import fr_covidata

__all__ = ['fr_covidata']
24 changes: 24 additions & 0 deletions task_geo/data_sources/covid/fr_covidata/__main__.py
@@ -0,0 +1,24 @@
import argparse

from task_geo.data_sources.covid.fr_covidata import fr_covidata


def get_argparser():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-o', '--output', required=True,
        help='Destination file to store the processed dataset.')
    return parser


def main():
    parser = get_argparser()
    args = parser.parse_args()

    dataset = fr_covidata()
    dataset.to_csv(args.output, index=False, header=True)


if __name__ == '__main__':
    main()
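As a sanity check, the parser defined in `__main__.py` can be exercised without touching the command line by passing an explicit argument list to `parse_args`. This is a minimal sketch; the parser is rebuilt here so the snippet stands alone, and the output filename is illustrative:

```python
import argparse


def get_argparser():
    # Mirrors the parser defined in __main__.py above.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-o', '--output', required=True,
        help='Destination file to store the processed dataset.')
    return parser


# Passing an explicit argv list avoids reading sys.argv, which is
# convenient when testing CLI entry points.
args = get_argparser().parse_args(['-o', 'france_covid.csv'])
print(args.output)  # → france_covid.csv
```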
71 changes: 71 additions & 0 deletions task_geo/data_sources/covid/fr_covidata/audit.md
@@ -0,0 +1,71 @@
# COVID-19 Granular Data - France

## General information

- **Description**: Daily COVID-19 confirmed, hospitalized, recovering, recovered and deaths counts for the subregions of France
- **Credits**: @opencovid19-fr
- **Source**: https://github.com/opencovid19-fr/data/

## Column Details

**subregion_code**
- Description: French Maille department code
- Type: str

**subregion_name**
- Description: French department name
- Type: str

**country**
- Description: Country Name - France
- Type: str

**date**
- Description: Observation date (yyyy-mm-dd)
- Type: datetime.date

**confirmed**
- Description: Cumulative number of confirmed covid-19 cases at the given location until the given time
- Type: float

**hospitalized**
- Description: Number of hospitalized people due to covid-19 at the given location at the given time
- Type: float

**recovering**
- Description: Number of people recovering from covid-19 at the given location at the given time
- Type: float

**recovered**
- Description: Cumulative number of people healed from covid-19 at the given location until the given time
- Type: float

**deaths**
- Description: Cumulative number of people who passed away from covid-19 at the given location until the given time
- Type: float

**source_name**
- Description: Name of the source of information relevant to the given location at the given time
- Type: str

**source_url**
- Description: URL of the source of information relevant to the given location at the given time
- Type: str

**source_archive**
- Description: URL of the archive of the source of information relevant to the given location at the given time
- Type: str

**source_type**
- Description: Type of the source of information: National Health Ministry/Regional health agencies
- Type: str

## Transformations applied

- Deleting the rows non-related to the departments inside Metropolitan France ('region', 'monde', 'pays', 'collectivite-outremer', 'DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976')
- Dropping the columns 'depistes' and 'granularite'
- Deleting the duplicate rows containing the same numerical data
- Switching the 'date' column format from str to datetime.date
- Switching the 'source_nom', 'source_url', 'source_archive' and 'source_type' columns format to str
- Merging all the rows that have the same 'date' and 'subregion_name' column value, by taking the maximum of every numerical column value and aggregating both sources columns' info
- Renaming/Translating the column titles from French to English, and rearranging them
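The merge step above (maximum of every numerical column, aggregation of the source columns) can be sketched with pandas on toy data. The values and source names here are made up for illustration; only the column names `date`, `subregion_name`, `confirmed` and `source_name` follow the documented schema:

```python
import pandas as pd

# Two reports for the same department and date, from different sources.
toy = pd.DataFrame({
    'date': ['2020-04-01', '2020-04-01'],
    'subregion_name': ['Paris', 'Paris'],
    'confirmed': [100.0, 120.0],
    'source_name': ['agency-a', 'agency-b'],
})

merged = toy.groupby(['date', 'subregion_name']).aggregate({
    'confirmed': 'max',        # keep the highest numerical value
    'source_name': ','.join,   # aggregate the sources' info
}).reset_index()

print(merged.loc[0, 'confirmed'])    # 120.0
print(merged.loc[0, 'source_name'])  # agency-a,agency-b
```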
77 changes: 77 additions & 0 deletions task_geo/data_sources/covid/fr_covidata/datapackage.json
@@ -0,0 +1,77 @@
{
"title": "COVID-19 Granular Data - France",
"description": "Sourcing of COVID-19 cases granular data in different subregions of France, aggregated by @opencovid19-fr, https://github.com/opencovid19-fr/",
"licenses": [{"name": "copyright-authors"}],
"fields": [
{
"name": "subregion_code",
"description": "Geographical location - maille code",
"type": "str"
},
{
"name": "subregion_name",
"description": "Geographical location - department",
"type": "str"
},
{
"name": "country",
"description": "Geographical location - country",
"type": "str"
},
{
"name": "date",
"description": "Date",
"type": "datetime.date"
},
{
"name": "confirmed",
"description": "Cumulative number of confirmed covid-19 cases at the given location until the given time",
"type": "float"
},
{
"name": "hospitalized",
"description": "Number of hospitalized people due to covid-19 at the given location at the given time",
"type": "float"
},
{
"name": "recovering",
"description": "Number of people recovering from covid-19 at the given location at the given time",
"type": "float"
},
{
"name": "recovered",
"description": "Cumulative number of people healed from covid-19 at the given location until the given time",
"type": "float"
},
{
"name": "deaths",
"description": "Cumulative number of people who passed away from covid-19 at the given location until the given time",
"type": "float"
},
{
"name": "source_name",
"description": "Name of the source of information relevant to the given location at the given time",
"type": "str"
},
{
"name": "source_url",
"description": "Uniform Resource Locator link of the source of information relevant to the given location at the given time",
"type": "str"
},
{
"name": "source_archive",
"description": "Uniform Resource Locator link of the archive of the source of information relevant to the given location at the given time",
"type": "str"
},
{
"name": "source_type",
"description": "Type of the source of information: National Health Ministry/Regional health agencies",
"type": "str"
}
],
"keywords": [
"France",
"COVID-19",
"granular data"
]
}
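One way a consumer might use such a datapackage is to read its `fields` block into a name-to-type mapping, e.g. to validate a DataFrame's columns. This is a hedged sketch, not part of the PR; the JSON below is a trimmed stand-in for the full `datapackage.json` above:

```python
import json

# A trimmed stand-in for the datapackage.json fields block.
package = json.loads("""
{
  "fields": [
    {"name": "subregion_code", "type": "str"},
    {"name": "confirmed", "type": "float"}
  ]
}
""")

# Map each field name to its declared type.
schema = {field['name']: field['type'] for field in package['fields']}
print(schema)  # {'subregion_code': 'str', 'confirmed': 'float'}
```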
124 changes: 124 additions & 0 deletions task_geo/data_sources/covid/fr_covidata/fr_covidata.py
@@ -0,0 +1,124 @@
"""
fr_covidata.py
Functions:
- fr_covidata_connector: Extracts data from CSV URL
- fr_covidata_formatter: Cleans CSV data
- fr_covidata: Combines the two previous functions
Data Credits:
OpenCOVID19-fr
https://www.data.gouv.fr/en/datasets/chiffres-cles-concernant-lepidemie-de-covid19-en-france/
https://github.com/opencovid19-fr/data
"""

import io

import numpy as np
import pandas as pd
import requests

url = (
'https://raw.githubusercontent.com/opencovid19-fr/'
'data/master/dist/chiffres-cles.csv'
)


def fr_covidata():
    """Data Source for the French COVID-19 Data.

    Arguments:
        None

    Returns:
        pandas.DataFrame
    """
    df = fr_covidata_connector()
    return fr_covidata_formatter(df)


def fr_covidata_connector():
    """Extract data from OpenCOVID19-fr's GitHub repository.

    Description:
        - Downloads the URL's data as UTF-8 encoded CSV

    Returns:
        dataset (pandas.DataFrame built from the CSV data)
    """
    url_data = requests.get(url).content
    dataset = pd.read_csv(io.StringIO(url_data.decode('utf-8')))
    return dataset


def fr_covidata_formatter(dataset):
    """Formatter for FR COVID-19 Data.

    Arguments:
        dataset(pandas.DataFrame): Data as returned by fr_covidata_connector.

    Description:
        - Drop unnecessary rows with irrelevant regions' info, keeping only
          info related to subregions in Metropolitan France, and drop
          repetitive data
        - Where more than one source reports data for the same subregion and
          date, complement the sources' information: take the highest value
          when the sources disagree on a numerical column, and concatenate
          the sources' info
        - Rename/Translate the column titles, and add a country column
          (France)

    Returns:
        frcovidata(pandas.DataFrame)
    """
    no_gr = ['region', 'monde', 'pays', 'collectivite-outremer']
    no_mc = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976']
    dataset = dataset[
        (~dataset.granularite.isin(no_gr)) & (~dataset.maille_code.isin(no_mc))
    ]
    dataset = dataset.drop(['depistes', 'granularite'], axis=1)
    dataset = dataset.drop_duplicates(
        subset=['date', 'maille_code', 'cas_confirmes', 'deces',
                'reanimation', 'hospitalises', 'gueris'],
        keep=False)
    dataset['date'] = pd.to_datetime(dataset['date'].astype(str)).dt.date

    # Reset indices:
    dataset = dataset.reset_index(drop=True)

    # Turn source columns' values type to string:
    str_columns = ['source_nom', 'source_url', 'source_archive', 'source_type']
    dataset[str_columns] = dataset[str_columns].astype(str)

    aggre = {
        'cas_confirmes': np.max,
        'cas_ehpad': np.max,
        'cas_confirmes_ehpad': np.max,
        'cas_possibles_ehpad': np.max,
        'deces': np.max,
        'deces_ehpad': np.max,
        'reanimation': np.max,
        'hospitalises': np.max,
        'gueris': np.max,
        'source_nom': ','.join,
        'source_url': ','.join,
        'source_archive': ','.join,
        'source_type': ','.join
    }
    dataset = dataset.groupby(
        ['date', 'maille_code', 'maille_nom']).aggregate(aggre).reset_index()

    # Rename/Translate the column titles:
    dataset = dataset.rename(
        columns={'maille_code': 'subregion_code',
                 'maille_nom': 'subregion_name',
                 'cas_confirmes': 'confirmed',
                 'deces': 'deaths',
                 'reanimation': 'recovering',
                 'hospitalises': 'hospitalized',
                 'gueris': 'recovered',
                 'source_nom': 'source_name'})
    dataset['country'] = 'France'

    # Select columns with a list (double brackets); single brackets with a
    # bare tuple of names would raise a KeyError.
    frcovidata = dataset[[
        'subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
        'hospitalized', 'recovering', 'recovered', 'deaths', 'source_name',
        'source_url', 'source_archive', 'source_type']]

    return frcovidata
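The bytes-to-DataFrame step in the connector can be reproduced offline with an in-memory CSV, which also shows why the explicit UTF-8 decode matters for accented department names. This is a self-contained sketch with made-up data, not a test of the live source:

```python
import io

import pandas as pd

# Simulate the raw bytes that requests.get(url).content would return.
raw = 'maille_nom,cas_confirmes\nBouches-du-Rhône,42\n'.encode('utf-8')

# Decode explicitly, then parse from an in-memory text buffer,
# exactly as fr_covidata_connector does with the downloaded bytes.
dataset = pd.read_csv(io.StringIO(raw.decode('utf-8')))
print(dataset.loc[0, 'maille_nom'])  # Bouches-du-Rhône
```

`pd.read_csv` could also be pointed at the URL directly; going through `requests` and an explicit decode makes the encoding assumption visible.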
