Merge branch 'fix_new_cases_begin_date_in_delay_cases'

# Conflicts: # covid19_inference/model/utility.py
Priesemann-Group · Jun 17, 2020 · 7ab7f54 · 7ab7f54
2 parents 307696a + ae1cd70
commit 7ab7f54
Show file tree

Hide file tree

Showing 35 changed files with 4,517 additions and 421 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include README.md
diff --git a/README.md b/README.md
@@ -6,19 +6,26 @@
 
 This is a Bayesian python toolbox for inference and forecast of the spread of the Coronavirus.
 
-The latest stable version is [v0.1.7](https://github.com/Priesemann-Group/covid19_inference/tree/v0.1.7).
+- [**Documentation**](https://covid19-inference.readthedocs.io/en/latest/index.html)
+- [**Getting started**](https://covid19-inference.readthedocs.io/en/latest/doc/gettingstarted.html)
+- [**Examples**](https://covid19-inference.readthedocs.io/en/latest/doc/examples.html)
+- [**Contributing**](https://covid19-inference.readthedocs.io/en/latest/doc/contributing.html)
+- [**Source code**](https://github.com/Priesemann-Group/covid19_inference)
 
-Check out our [documentation](https://covid19-inference.readthedocs.io/en/latest/doc/gettingstarted.html).
 
-An example notebook for one bundesland is [here](scripts/example_one_bundesland.ipynb), and for an hierarchical analysis of the bundeslaender [here](scripts/example_bundeslaender.ipynb) (could still have some problems).
+The latest stable version is [v0.1.7](https://github.com/Priesemann-Group/covid19_inference/tree/v0.1.7)!
+
 
 The research article [is available on arXiv](https://arxiv.org/abs/2004.01105) (**updated on April 13**).
 The code used to produce the figures is available in the other repository [here](https://github.com/Priesemann-Group/covid19_inference_forecast)
 
 
 **We are looking for support** to help us with analyzing other countries and to extend to an hierarchical regional model. We have received additional funding to do so and are recruiting PostDocs, PhD candidates and research assistants:
+
 https://www.ds.mpg.de/3568943/job_full_offer_14729553
+
 https://www.ds.mpg.de/3568926/job_full_offer_14729572
+
 https://www.ds.mpg.de/3568909/job_full_offer_14729591
 
 ### Please take notice of our [disclaimer](DISCLAIMER.md).

diff --git a/covid19_inference/__init__.py b/covid19_inference/__init__.py
@@ -5,7 +5,6 @@
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)-8s [%(name)s] %(message)s")
 log = logging.getLogger(__name__)
-from . import plotting
 
 # from .data_retrieval import GOOGLE
 from . import data_retrieval

diff --git a/covid19_inference/data_retrieval/_Financial_Times.py b/covid19_inference/data_retrieval/_Financial_Times.py
@@ -0,0 +1,224 @@
+import datetime
+import pandas as pd
+import logging
+
+# Import base class
+from .retrieval import Retrieval, _data_dir_fallback
+
+log = logging.getLogger(__name__)
+
+
+class FINANCIAL_TIMES(Retrieval):
+    """
+    This class can be used to retrieve the excess mortality data from the Financial Times
+    `github repository <https://github.com/Financial-Times/coronavirus-excess-mortality-data>`_.
+
+    Example
+    -------
+    .. code-block::
+
+        ft = cov19.data_retrieval.FINANCIAL_TIMES()
+        ft.download_all_available_data()
+
+        #Access the data by
+        ft.data
+        #or
+        ft.get(filter) #see below
+    """
+
+    def __init__(self, auto_download=False):
+        """
+        On init of this class the base Retrieval Class __init__ is called, with financial
+        times specific arguments.
+
+        Parameters
+        ----------
+        auto_download : bool, optional
+            Whether or not to automatically call the download_all_available_data() method.
+            One should explicitly call this method for more configuration options
+            (default: false)
+        """
+
+        # ------------------------------------------------------------------------------ #
+        #  Init Retrieval Base Class
+        # ------------------------------------------------------------------------------ #
+        """
+        A name mainly used for the Local Filename
+        """
+        name = "Financial_times"
+
+        """
+        The url to the main dataset as csv, if none is supplied the fallback routines get used
+        """
+        url_csv = "https://raw.githubusercontent.com/Financial-Times/coronavirus-excess-mortality-data/master/data/ft_excess_deaths.csv"
+
+        """
+        Kwargs for pandas read csv
+        """
+        kwargs = {}  # Suppress warning
+
+        """
+        If the local file is older than the update_interval it gets updated once the
+        download all function is called. Can be different values depending on the parent class
+        """
+        update_interval = datetime.timedelta(days=1)
+
+        # Init the retrieval base class
+        Retrieval.__init__(
+            self,
+            name,
+            url_csv,
+            [_data_dir_fallback + "/" + name + "_fallback.csv.gz"],
+            update_interval,
+            **kwargs,
+        )
+
+        if auto_download:
+            self.download_all_available_data()
+
+    def download_all_available_data(self, force_local=False, force_download=False):
+        """
+        Attempts to download from the main url (self.url_csv) which was given on initialization.
+        If this fails download from the fallbacks. It can also be specified to use the local files
+        or to force the download. The download methods get inherited from the base retrieval class.
+
+        Parameters
+        ----------
+        force_local : bool, optional
+            If True forces to load the local files.
+        force_download : bool, optional
+            If True forces the download of new files
+        """
+        if force_local and force_download:
+            raise ValueError("force_local and force_download cant both be True!!")
+
+        # ------------------------------------------------------------------------------ #
+        # 1 Download or get local file
+        # ------------------------------------------------------------------------------ #
+        retrieved_local = False
+        if self._timestamp_local_old(force_local) or force_download:
+            self._download_helper(**self.kwargs)
+        else:
+            retrieved_local = self._local_helper()
+
+        # ------------------------------------------------------------------------------ #
+        # 2 Save local
+        # ------------------------------------------------------------------------------ #
+        self._save_to_local() if not retrieved_local else None
+
+        # ------------------------------------------------------------------------------ #
+        # 3 Convert to useable format
+        # ------------------------------------------------------------------------------ #
+        self._to_iso()
+
+    def _to_iso(self):
+        """
+        Converts the data to a usable format, i.e. converts the date strings to
+        datetime objects, renames some columns and sets the date as index.
+
+        This is most of the time the first place one has to look at if something breaks!
+
+        self.data -> self.data converted
+        """
+        try:
+            df = self.data
+            # datetime columns
+            df["date"] = pd.to_datetime(df["date"])
+            df = df.rename(columns={"region": "state"})  # For consistency
+            df = df.set_index("date")
+            self.data = df
+            return True
+        except Exception as e:
+            log.warning(f"There was an error formating the data! {e}")
+            raise e
+        return False
+
+    def get(
+        self,
+        value="excess_deaths",
+        country: str = "Germany",
+        state: str = None,
+        data_begin: datetime.datetime = None,
+        data_end: datetime.datetime = None,
+    ):
+        """
+        Retrieves specific data from the dataset, can be filtered by date, country and state.
+
+        Parameters
+        ----------
+        value : str, optional
+            Which data to return, possible values are
+            - "deaths",
+            - "expected_deaths",
+            - "excess_deaths",
+            - "excess_deaths_pct"
+            (default: "excess_deaths")
+        country : str, optional
+        state : str, optional
+            Possible countries and states can be retrieved by the `get_possible_countries_states()` method.
+        data_begin : datetime.datetime, optional
+            First day that should be filtered
+        data_end : datetime.datetime, optional
+            Last day that should be filtered
+        """
+
+        # ------------------------------------------------------------------------------ #
+        # Default Parameters
+        # ------------------------------------------------------------------------------ #
+        possible_values = [
+            "deaths",
+            "expected_deaths",
+            "excess_deaths_pct",
+            "excess_deaths",
+        ]
+        assert (
+            value in possible_values
+        ), f"Value '{value}' not possible! Use one from {possible_values}"
+
+        if state is None:
+            state = country  # somehow they publish the data like that ¯\_(ツ)_/¯
+
+        possible_countries_states = self.get_possible_countries_states()
+        assert [
+            country,
+            state,
+        ] in possible_countries_states, f"Country, state combination '[{country},{state}]' not possible! Check possible combinations by get_possible_countries_states()!"
+
+        if data_begin is None:
+            data_begin = self.__get_first_date()
+        if data_end is None:
+            data_end = self.__get_last_date()
+
+        # ------------------------------------------------------------------------------ #
+        # Filter the data
+        # ------------------------------------------------------------------------------ #
+
+        # Filter by country first
+        df = self.data[self.data["country"] == country]
+
+        # Filter by state next
+        df = df[df["state"] == state]
+
+        # Filter by value
+        df = df[value]
+
+        # Filter by date
+        df = df[data_begin:data_end]
+
+        return df
+
+    def get_possible_countries_states(self):
+        """
+        Can be used to obtain all different possible countries with their corresponding possible states and regions.
+
+        Returns
+        -------
+        : numpy.ndarray
+        """
+        return self.data[["country", "state"]].drop_duplicates().to_numpy()
+
+    def __get_first_date(self):
+        return self.data.index.min()
+
+    def __get_last_date(self):
+        return self.data.index.max()
diff --git a/covid19_inference/data_retrieval/_RKI.py b/covid19_inference/data_retrieval/_RKI.py
@@ -245,6 +245,7 @@ def get_total(
         data_begin: datetime.datetime = None,
         data_end: datetime.datetime = None,
         date_type: str = "date",
+        age_group=None,
     ):
         """
         Gets all total confirmed cases for a region as dataframe with date index. Can be filtered with multiple arguments.
@@ -267,7 +268,8 @@ def get_total(
             last date, if no value is provided it will use the most recent possible date
         date_type : str, optional
             type of date to use: reported date 'date' (Meldedatum in the original dataset), or symptom date 'date_ref' (Refdatum in the original dataset)
-
+        age_group : str, optional
+            Chosen age group. To get the possible combinations use `possible_age_groups()`.
         Returns
         -------
         :pandas.DataFrame
@@ -301,7 +303,9 @@ def get_total(
         # ------------------------------------------------------------------------------ #
         # Retrieve data and filter it
         # ------------------------------------------------------------------------------ #
-        df = self.filter(data_begin, data_end, value, date_type, level, filter_value)
+        df = self.filter(
+            data_begin, data_end, value, date_type, level, filter_value, age_group
+        )
         return df
 
     def get_new(
@@ -312,6 +316,7 @@ def get_new(
         data_begin: datetime.datetime = None,
         data_end: datetime.datetime = None,
         date_type: str = "date",
+        age_group=None,
     ):
         """
         Retrieves all new cases from the Robert Koch Institute dataset as a DataFrame with datetime index.
@@ -334,7 +339,8 @@ def get_new(
             if none is given could yield errors
         data_end : datetime.datetime, optional
             last date for the returned data, if no value is given the most recent date in the dataset is used
-
+        age_group : str, optional
+            Chosen age group. To get the possible combinations use `possible_age_groups()`.
         Returns
         -------
         : pandas.DataFrame
@@ -385,6 +391,7 @@ def get_new(
             date_type,
             level,
             filter_value,
+            age_group,
         )
         # Get difference to the days beforehand
         df = (
@@ -400,6 +407,7 @@ def filter(
         date_type="date",
         level=None,
         value=None,
+        age_group=None,
     ):
         """
         Filters the obtained dataset for a given time period and returns an array ONLY containing only the desired variable.
@@ -423,10 +431,11 @@ def filter(
                 "None"       : return data from all Germany (default)
                 "Bundesland" : a state
                 "Landkreis"  : a region
-        value : None, optional
+        value : str, optional
             string of the state/region
             e.g. "Sachsen"
-
+        age_group : str, optional
+            Chosen age group. To get the possible combinations use `possible_age_groups()`.
         Returns
         -------
         : pd.DataFrame
@@ -458,8 +467,19 @@ def filter(
                 "Invalid data_begin, data_end: has to be datetime.datetime object"
             )
 
-        # Keeps only the relevant data
-        df = self.data
+        if age_group is None:
+            df = self.data
+        elif age_group in self.possible_age_groups():
+            df = self.data.loc[self.data["Altersgruppe"].isin([age_group])]
+        else:
+            raise ValueError(
+                f"Age group not possible use one of {self.possible_age_groups()}"
+            )
+
+        # If one uses Refdatum, only use data if IstErkrankungsbeginn == 1
+
+        if date_type == "date_ref":
+            df = df.loc[df["IstErkrankungsbeginn"] == 1]
 
         if level is not None:
             df = df[df[level] == value][[date_type, variable]]
@@ -529,3 +549,9 @@ def filter_all_bundesland(
         df2.index = pd.to_datetime(df2.index)
         # Returns cumsum of variable
         return df2[begin_date:end_date].cumsum()
+
+    def possible_age_groups(self):
+        """
+        Returns the valid age groups in the dataset.
+        """
+        return self.data["Altersgruppe"].unique()
diff --git a/covid19_inference/data_retrieval/__init__.py b/covid19_inference/data_retrieval/__init__.py
@@ -6,4 +6,5 @@
 from ._RKI import *
 from ._RKI_situation_reports import *
 from ._OWD import *
+from ._Financial_Times import *
 from .retrieval import set_data_dir, get_data_dir, backup_instances
diff --git a/covid19_inference/model/__init__.py b/covid19_inference/model/__init__.py
@@ -1,7 +1,7 @@
 from .model import Cov19Model
 from .compartmental_models import SIR, SEIR, uncorrelated_prior_I
 from .delay import delay_cases
-from .spreading_rate import lambda_t_with_sigmoids
+from .spreading_rate import lambda_t_with_sigmoids, lambda_t_with_linear_interp
 from .likelihood import student_t_likelihood
 from .week_modulation import week_modulation