diff --git a/covid_xprize/examples/predictors/lstm/tests/figures_lstm.py b/covid_xprize/examples/predictors/lstm/tests/figures_lstm.py
new file mode 100644
index 0000000..9546483
--- /dev/null
+++ b/covid_xprize/examples/predictors/lstm/tests/figures_lstm.py
@@ -0,0 +1,790 @@
+"""
+Copyright 2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
+
+This script creates figures for each stage of the LSTM pipeline. The figures are saved in the directory figures/.
+"""
+import argparse
+import time
+import sys
+import os
+import pandas as pd
+from pathlib import Path
+from datetime import datetime, timedelta
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import tqdm
+
+from covid_xprize.oxford_data.oxford_data import load_ips_file
+from covid_xprize.oxford_data.npi_static import MAX_NPIS_VECTOR
+from covid_xprize.examples.predictors.lstm.xprize_predictor import XPrizePredictor
+from covid_xprize.validation.scenario_generator import ID_COLS, NPI_COLUMNS
+from covid_xprize import oxford_data
+
+
+def _most_affected_countries(df: pd.DataFrame, nb_geos: int, min_historical_days: int) -> list[str]:
+    """
+    Returns the list of most affected countries, in terms of confirmed deaths.
+    :param df: the data frame containing the historical data
+    :param nb_geos: the number of geos to return
+    :param min_historical_days: the minimum days of historical data the countries must have
+    :return: a list of country names of size nb_geos if there were enough, and otherwise a list of all the
+    country names that have at least min_historical_days data points.
+    """
+    # Don't include the region-level data, just the country-level summaries.
+    gdf = df[df.RegionName.isna()]
+    gdf = gdf.groupby('CountryName', group_keys=False)['ConfirmedDeaths'].agg(['max', 'count']).sort_values(
+        by='max', ascending=False)
+    filtered_gdf = gdf[gdf["count"] > min_historical_days]
+    geos = list(filtered_gdf.head(nb_geos).index)
+    return geos
+
+
+def _write_ips_file_for_country(df: pd.DataFrame, country: str, start_date: datetime, end_date: datetime, file: str):
+    gdf = df[
+        (df.CountryName == country) &
+        (df.RegionName.isna()) &  # Don't include the region-level data, just the country-level summaries.
+        (df.Date >= start_date) & (df.Date <= end_date)
+    ]
+    ips_df = gdf[ID_COLS + NPI_COLUMNS].copy()
+    ips_df['RegionName'] = ips_df['RegionName'].fillna("")
+    for npi_col in NPI_COLUMNS:
+        ips_df.update(ips_df.groupby(['CountryName', 'RegionName'], group_keys=False)[npi_col].ffill().fillna(0))
+    ips_df.to_csv(file, index=False)
+
+
+def test_predictions_path(country, start_date_str) -> Path:
+    directory = Path(__file__).parent / 'data' / 'predictions'; directory.mkdir(exist_ok=True)
+    name_parts = []
+    name_parts.append(f'country{country.replace(" ", "_")}')
+    name_parts.append(start_date_str)
+    return directory / ('_'.join(name_parts) + '.csv')
+
+
+###############################################################################
+# 06
+###############################################################################
+def test_train_predict_pipeline():
+    """We want to train a model on a very small piece of data and confirm that
+    the model is able to regurgitate the training data. If it is able to do so,
+    then we know that in theory, the model can learn time-series patterns seen
+    in the data. This test is to establish basic pattern-recognition capabilities
+    of the model.
+    """
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_root_dir = current_dir / 'data'; data_root_dir.mkdir(exist_ok=True)
+    data_dir = data_root_dir / '06_train_predict_pipeline'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Prepare some metadata.
+    country = "Aruba"  # Start with a small geo.
+    ips_path = data_dir / "ips_file.csv"
+    figure_name = "06_train_predict_pipeline"
+
+    # Initialize an untrained model.
+    df = oxford_data.load_original_oxford_data()
+    path_to_oxford_data = data_dir / "OxCGRT_artificial.csv"
+    df.to_csv(path_to_oxford_data, index=False)
+    model = XPrizePredictor(None, path_to_oxford_data)
+
+    # Prepare artificial training data by modifying the model's dataframe in-place.
+    # reset_index so the positional row_i below lines up with the label-based .loc assignments.
+    model.df = model.df[model.df.GeoID == country].copy().reset_index(drop=True)
+    df = model.df
+    df[NPI_COLUMNS] = 0
+    # The pattern is:
+    #   1. 14 days of NPIs==1.0, and case ratio equal to 1
+    #   2. 28 days of NPIs==1.0, and case ratio equal to 0.9
+    #   3. 14 days of NPIs==0.0, and case ratio equal to 1
+    #   4. 28 days of NPIs==0.0, and case ratio equal to 1 / 0.9
+    pattern_period = 14 + 28 + 14 + 28
+    for row_i in range(len(df)):
+        pattern_phase = row_i % pattern_period
+        if pattern_phase < 14:
+            df.loc[row_i, "PredictionRatio"] = 1.0
+            df.loc[row_i, NPI_COLUMNS] = 1.0
+        elif pattern_phase < 14 + 28:
+            df.loc[row_i, "PredictionRatio"] = 0.9
+            df.loc[row_i, NPI_COLUMNS] = 1.0
+        elif pattern_phase < 14 + 28 + 14:
+            df.loc[row_i, "PredictionRatio"] = 1.0
+            df.loc[row_i, NPI_COLUMNS] = 0.0
+        else:
+            df.loc[row_i, "PredictionRatio"] = 1 / 0.9
+            df.loc[row_i, NPI_COLUMNS] = 0.0
+
+    # Get the start date from the data
+    start_date = df.iloc[pattern_period + 14].Date
+    n_prediction_days = 31
+    end_date = start_date + timedelta(days=n_prediction_days-1)
+
+    # Train the model on our artificial training data.
+    model.train()
+
+    # Create the prediction vectors for each scenario.
+    prediction_ratios_by_npi_const: dict[float, np.ndarray] = {}
+    initial_context_by_npi_const: dict[float, np.ndarray] = {}
+    npis_vals = [0.0, 1.0]
+    for npis_val in tqdm.tqdm(npis_vals, desc="Predictions (scenarios)"):
+        # Get the initial context & action vectors.
+        context, action = model._initial_context_action_vectors([country], start_date)
+        context = context[country]  # shape (Days, 1)
+        action = action[country]    # shape (Days, NPIs)
+
+        # Edit the action & context vectors to match our scenario.
+        action[:] = npis_val
+        context[:] = 1.0
+
+        # Get the predictions with the passed NPIs
+        npis_sequence = np.full((n_prediction_days, action.shape[1]), npis_val)  # shape (PredictionDates, NPIs)
+        pred_ratios = model._roll_out_predictions(  # shape (PredictionDates,)
+            model.predictor,
+            context,
+            action,
+            npis_sequence)
+
+        # Add result to collection.
+        prediction_ratios_by_npi_const[npis_val] = pred_ratios
+        initial_context_by_npi_const[npis_val] = context.flatten()
+
+    # Compile the results into a figure.
+    n_rows = 3
+    n_cols = 2
+    fig = plt.figure(figsize=(5 * n_cols, 2 * n_rows))
+    do_legend = True
+
+    ax = plt.subplot(n_rows, n_cols, 1)
+    ax.axis("off")
+    ax.text(0.1, 0.5, "\n".join([
+        figure_name,
+        "PredictionRatios are the immediate output of the v1 LSTM model.",
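+        "Scenario: NPIs held constant at 0 or 1 for the whole window, with the initial context fixed at 1.0.",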
+    ]), in_layout=False)
+
+    plot_i = n_cols + 1
+    for npi_val in tqdm.tqdm(npis_vals, desc="Plots (Scenarios)"):
+        # predictions = oxford_data.process_submission(predictions, start_date, end_date)
+        prediction_ratios = prediction_ratios_by_npi_const[npi_val]
+        initial_context = initial_context_by_npi_const[npi_val]
+
+        # data = df[(df.Date >= start_date) & (df.Date <= end_date) & (df.GeoID == country)]
+        # data = data.copy()
+        # data["PredictedPredictionRatio"] = prediction_ratios
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        sns.set_palette(sns.husl_palette(8))
+        plot = sns.lineplot(prediction_ratios, legend=do_legend, label="Predicted", color="black", ax=ax)
+        # plot = sns.lineplot(data, x="Date", y="PredictedPredictionRatio",
+        #                     legend=do_legend, label="Predicted PredictionRatio", color="red", ax=ax)
+        # plot = sns.lineplot(data, x="Date", y="PredictionRatio",
+        #                     legend=do_legend, label="Ground Truth PredictionRatio", color="black", ax=ax)
+        # plot.get_figure().autofmt_xdate()
+        ax.axhline(1.0, color="red", linestyle='--')  # Draw a line at identity.
+        ax.set_ylabel("")
+        ax.set_ylim(0.8, 1.2)
+        ax.set_title(f"NPIs={npi_val}")
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        sns.set_palette(sns.husl_palette(8))
+        plot = sns.lineplot(initial_context, legend=do_legend, label="Initial Context", color="green", ax=ax)
+        ax.set_ylabel("")
+        ax.set_ylim(0.9, 1.2)
+        ax.set_title(f"NPIs={npi_val}")
+        if do_legend:
+            # Put the lower left of the bounding box at axes coordinates (0, 1.20).
+            sns.move_legend(ax, "lower left", bbox_to_anchor=(0, 1.20))
+            do_legend = False
+
+    plt.tight_layout()
+    figure_path = figures_dir / f"{figure_name}.png"
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 05
+###############################################################################
+
+def test_artificial_data(path_to_model_weights: str, start_date_str: str):
+    _test_artificial_data_helper(path_to_model_weights, start_date_str, context_const=0.95)
+    _test_artificial_data_helper(path_to_model_weights, start_date_str, context_const=1.05)
+
+
+def _test_artificial_data_helper(path_to_model_weights: str, start_date_str: str, context_const: float):
+    """We want to better understand the factors that the model takes into account.
+    So, we will create artificial data in which individual parameters are changed,
+    and generate predictions to evaluate the model's sensitivity to these parameters.
+    :param path_to_model_weights: The model used for the evaluation.
+    :param start_date_str: The start date of the prediction period.
+    """
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_root_dir = current_dir / 'data'; data_root_dir.mkdir(exist_ok=True)
+    data_dir = data_root_dir / '05_artificial_data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Prepare the original data.
+    df = oxford_data.load_original_oxford_data()
+    # df = df[df.Date < start_date]  # Truncate the data if we're using it for training.
+    path_to_oxford_data = data_dir / "OxCGRT_orig.csv"
+    df.to_csv(path_to_oxford_data, index=False)
+
+    # Load the given model weights.
+    model = XPrizePredictor(path_to_model_weights, path_to_oxford_data)
+
+    # Prepare some metadata.
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    n_prediction_days = 31
+    end_date = start_date + timedelta(days=n_prediction_days-1)
+    country = "France"  # Start with a single country.
+    ips_path = data_dir / "ips_file.csv"
+    figure_name = f"05_artificial_data_{start_date_str}_context_{context_const}"
+
+    # Prepare a base IPs df which we will modify to create the artificial data.
+    _write_ips_file_for_country(
+        df,
+        country,
+        start_date,
+        end_date,
+        ips_path,
+    )
+    npis_df = load_ips_file(ips_path)
+    npis_sequence = np.array(npis_df[NPI_COLUMNS])  # shape (PredictionDates, NPIs)
+
+    # Create the artificial test scenarios.
+    test_scenarios: list[tuple[str, np.ndarray]] = []  # pairs (label, npis)
+    n_npis = len(MAX_NPIS_VECTOR)
+    test_scenarios.append(("all_zero", np.zeros((n_prediction_days, n_npis))))  # shape (PredictionDates, NPIs)
+    test_scenarios.append(("all_max", np.full((n_prediction_days, n_npis), MAX_NPIS_VECTOR)))  # shape (PredictionDates, NPIs)
+    for npi_i, npi_label in enumerate(tqdm.tqdm(NPI_COLUMNS, desc="Make scenarios")):
+        npis = npis_sequence.copy()  # shape (PredictionDates, NPIs)
+        npis[:, npi_i] = 0.0
+        test_scenarios.append((f"{npi_label}_zero", npis))
+        npis = npis_sequence.copy()  # shape (PredictionDates, NPIs)
+        npis[:, npi_i] = MAX_NPIS_VECTOR[npi_i]
+        test_scenarios.append((f"{npi_label}_max", npis))
+    labels = [scenario[0] for scenario in test_scenarios]
+    assert len(set(labels)) == len(labels)  # Make sure there are no duplicate labels.
+
+    # Create the prediction vectors for each scenario.
+    prediction_ratios_by_label: dict[str, np.ndarray] = {}
+    for scenario_label, scenario in tqdm.tqdm(test_scenarios, desc="Predictions (scenarios)"):
+        # Check the cache for already-made results.
+        pred_ratios_cache_path = data_dir / f"{scenario_label}_{start_date_str}_{context_const}.npy"
+        if pred_ratios_cache_path.exists():
+            prediction_ratios_by_label[scenario_label] = np.load(pred_ratios_cache_path)
+            continue
+
+        # Get the initial context & action vectors.
+        context, action = model._initial_context_action_vectors([country], start_date)
+        context = context[country]  # shape (Days, 1)
+        action = action[country]    # shape (Days, NPIs)
+
+        # Edit the action (historical NPIs) vector to match our scenario by repeating its first day backwards.
+        action[:, :] = scenario[0:1, :]  # broadcast shape (Days, NPIs) <- shape (1, NPIs)
+
+        # Edit the context vector to fill in our data.
+        context[:] = context_const
+
+        # Get the predictions with the passed NPIs
+        npis_sequence = scenario  # shape (PredictionDates, NPIs)
+        pred_ratios = model._roll_out_predictions(  # shape (PredictionDates,)
+            model.predictor,
+            context,
+            action,
+            npis_sequence)
+
+        # Add result to cache.
+        np.save(pred_ratios_cache_path, pred_ratios)
+
+        # Add result to collection.
+        prediction_ratios_by_label[scenario_label] = pred_ratios
+
+    # Compute some additional columns on the GT data including prediction ratios.
+    df = oxford_data.prepare_cases_dataframe()
+
+    # Compile the results into a figure.
+    n_scenarios = len(test_scenarios)
+    n_rows = n_scenarios // 2 + 2
+    n_cols = 2
+    fig = plt.figure(figsize=(5 * n_cols, 2 * n_rows))
+    do_legend = True
+
+    ax = plt.subplot(n_rows, n_cols, 1)
+    ax.axis("off")
+    ax.text(0.1, 0.5, "\n".join([
+        figure_name,
+        "PredictionRatios are the immediate output of the v1 LSTM model.",
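+        f"Scenario: initial context held constant at {context_const}; each panel pins one NPI to 0 or to its max value.",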
+    ]), in_layout=False)
+
+    plot_i = n_cols + 1
+    for label, scenario in tqdm.tqdm(test_scenarios, desc="Plots (Scenarios)"):
+        # predictions = oxford_data.process_submission(predictions, start_date, end_date)
+        prediction_ratios = prediction_ratios_by_label[label]
+
+        # data = df[(df.Date >= start_date) & (df.Date <= end_date) & (df.GeoID == country)]
+        # data = data.copy()
+        # data["PredictedPredictionRatio"] = prediction_ratios
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        sns.set_palette(sns.husl_palette(8))
+        plot = sns.lineplot(prediction_ratios, legend=do_legend, label="Predicted", color="black", ax=ax)
+        # plot = sns.lineplot(data, x="Date", y="PredictedPredictionRatio",
+        #                     legend=do_legend, label="Predicted PredictionRatio", color="red", ax=ax)
+        # plot = sns.lineplot(data, x="Date", y="PredictionRatio",
+        #                     legend=do_legend, label="Ground Truth PredictionRatio", color="black", ax=ax)
+        # plot.get_figure().autofmt_xdate()
+        ax.axhline(1.0, color="red", linestyle='--')  # Draw a line at identity.
+        ax.set_ylabel("")
+        ax.set_ylim(0.9, 1.2)
+        ax.set_title(label)
+        if do_legend:
+            # Put the lower left of the bounding box at axes coordinates (0, 1.20).
+            sns.move_legend(ax, "lower left", bbox_to_anchor=(0, 1.20))
+            do_legend = False
+
+    plt.tight_layout()
+    figure_path = figures_dir / f"{figure_name}.png"
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 04
+###############################################################################
+
+
+def _test_plot_prediction_ratios_helper(path_to_model_weights: str, start_date_str: str, on_training_data: bool = False):
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_root_dir = current_dir / 'data'; data_root_dir.mkdir(exist_ok=True)
+    data_dir = data_root_dir / '04_prediction_ratios'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Prepare the original data.
+    df = oxford_data.load_original_oxford_data()
+    # df = df[df.Date < start_date]  # Truncate the data if we're using it for training.
+    path_to_oxford_data = data_dir / "OxCGRT_orig.csv"
+    df.to_csv(path_to_oxford_data, index=False)
+
+    # Load the given model weights.
+    model = XPrizePredictor(path_to_model_weights, path_to_oxford_data)
+
+    # Prepare some metadata.
+    if on_training_data:  # The training evaluations should be done on earlier data.
+        pred_start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+        start_date: datetime = pred_start_date - timedelta(days=60)
+        start_date_str = start_date.strftime('%Y-%m-%d')
+    else:
+        start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = start_date + timedelta(days=30)
+    countries = _most_affected_countries(df, 30, 50)
+    ips_path = data_dir / "ips_file.csv"
+
+    tr_or_st_label = "tr" if on_training_data else "st"
+    figure_name = f"04_prediction_ratios_{start_date_str}_{tr_or_st_label}"
+
+    # Create the prediction vectors for each country.
+    prediction_ratios_by_country: dict[str, np.ndarray] = {}
+    for country_i, country in enumerate(tqdm.tqdm(sorted(countries), desc="Predictions (Countries)")):
+        # Check the cache for already-made results.
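+        # The cache key is (country, prediction start date); delete the cached .npy files
+        # under the data directory to force recomputation.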
+        pred_ratios_cache_path = data_dir / f"{country}_{start_date_str}.npy"
+        if pred_ratios_cache_path.exists():
+            prediction_ratios_by_country[country] = np.load(pred_ratios_cache_path)
+            continue
+
+        # Get the initial context & action vectors.
+        context, action = model._initial_context_action_vectors([country], start_date)
+        context = context[country]  # shape (Days, 1)
+        action = action[country]    # shape (Days, NPIs)
+
+        # Write the true intervention plan during this time period.
+        _write_ips_file_for_country(
+            df,
+            country,
+            start_date,
+            end_date,
+            ips_path,
+        )
+        npis_df = load_ips_file(ips_path)
+
+        # Prepare the DF and the NPIs for the country.
+        # cdf = df[(df.CountryName == country) & (df.RegionName.isna())]
+        # cdf = cdf[cdf.ConfirmedCases.notnull()]
+        cnpis_df = npis_df[npis_df.GeoID == country]  # TODO double check this isn't empty
+        npis_sequence = np.array(cnpis_df[NPI_COLUMNS])  # shape (PredictionDates, NPIs)
+        # Get the predictions with the passed NPIs
+        pred_ratios = model._roll_out_predictions(  # shape (PredictionDates,)
+            model.predictor,
+            context,
+            action,
+            npis_sequence)
+
+        # Add result to cache.
+        np.save(pred_ratios_cache_path, pred_ratios)
+
+        # Add result to collection.
+        prediction_ratios_by_country[country] = pred_ratios
+
+    # Compute some additional columns on the GT data including prediction ratios.
+    df = oxford_data.prepare_cases_dataframe()
+
+    # Compile the results into a figure.
+    n_countries = len(countries)
+    n_rows = int(np.floor(np.sqrt(n_countries))) + 2
+    n_cols = int(np.ceil(n_countries / n_rows))
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+    do_legend = True
+
+    ax = plt.subplot(n_rows, n_cols, 1)
+    ax.axis("off")
+    ax.text(0.1, 0.5, "\n".join([
+        figure_name,
+        "PredictionRatios are the immediate output of the v1 LSTM model."
+    ]), in_layout=False)
+
+    plot_i = n_cols + 1
+    for country in tqdm.tqdm(sorted(countries), desc="Plots (Countries)"):
+        # predictions = oxford_data.process_submission(predictions, start_date, end_date)
+        prediction_ratios = prediction_ratios_by_country[country]
+
+        data = df[(df.Date >= start_date) & (df.Date <= end_date) & (df.GeoID == country)]
+        data = data.copy()
+        data["PredictedPredictionRatio"] = prediction_ratios
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        sns.set_palette(sns.husl_palette(8))
+        # plot = sns.lineplot(prediction_ratios, legend=do_legend, label="Predicted", color="red", ax=ax)
+        plot = sns.lineplot(data, x="Date", y="PredictedPredictionRatio",
+                            legend=do_legend, label="Predicted PredictionRatio", color="red", ax=ax)
+        plot = sns.lineplot(data, x="Date", y="PredictionRatio",
+                            legend=do_legend, label="Ground Truth PredictionRatio", color="black", ax=ax)
+        plot.get_figure().autofmt_xdate()
+        ax.set_ylabel("")
+        ax.set_ylim(0.9, 1.2)
+        ax.set_title(country)
+        if do_legend:
+            # Put the lower left of the bounding box at axes coordinates (0, 1.20).
+            sns.move_legend(ax, "lower left", bbox_to_anchor=(0, 1.20))
+            do_legend = False
+
+    plt.tight_layout()
+    figure_path = figures_dir / f"{figure_name}.png"
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+def test_plot_prediction_ratios(path_to_model_weights: str, start_date_str: str):
+    """Plot the immediate output of the neural network against the true data."""
+    _test_plot_prediction_ratios_helper(path_to_model_weights, start_date_str, on_training_data=False)
+    _test_plot_prediction_ratios_helper(path_to_model_weights, start_date_str, on_training_data=True)
+
+
+###############################################################################
+# 03
+###############################################################################
+
+
+def test_plot_input_output_pipeline(path_to_model_weights: str, start_date_str: str):
+    """Plot the inputs to the neural network along with the raw prediction ratio outputs."""
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Prepare the original data.
+    df = oxford_data.load_original_oxford_data()
+    # df = df[df.Date < start_date]  # Truncate the data if we're using it for training.
+    path_to_oxford_data = data_dir / "OxCGRT_orig.csv"
+    df.to_csv(path_to_oxford_data, index=False)
+
+    # Load the given model weights.
+    model = XPrizePredictor(path_to_model_weights, path_to_oxford_data)
+
+    # Prepare some related parameters.
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = start_date + timedelta(days=30)
+    countries = _most_affected_countries(df, 30, 50)
+    ips_path = data_dir / "ips_file.csv"
+
+    # Compile the results into a figure.
+    n_countries = len(countries)
+    n_country_plots = int(np.floor(np.sqrt(n_countries)))
+    n_cols = int(np.ceil(n_countries / n_country_plots))
+    n_rows = 2 * n_country_plots + 2
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+    plot_i = n_cols + 1
+    do_legend = True
+
+    # Create the context vectors for each country and plot them.
+    for country_i, country in tqdm.tqdm(enumerate(sorted(countries)), desc="test_plot_input_output_pipeline (Countries)", total=len(countries)):
+        # Get the initial context & action vectors.
+        context, action = model._initial_context_action_vectors([country], start_date)
+        context = context[country]  # shape (Days, 1)
+        action = action[country]    # shape (Days, NPIs)
+
+        # Write the true intervention plan during this time period.
+        _write_ips_file_for_country(
+            df,
+            country,
+            start_date,
+            end_date,
+            ips_path,
+        )
+        npis_df = load_ips_file(ips_path)
+
+        # Prepare the DF and the NPIs for the country.
+        # cdf = df[(df.CountryName == country) & (df.RegionName.isna())]
+        # cdf = cdf[cdf.ConfirmedCases.notnull()]
+        cnpis_df = npis_df[npis_df.GeoID == country]  # TODO double check this isn't empty
+        npis_sequence = np.array(cnpis_df[NPI_COLUMNS])  # shape (PredictionDates, NPIs)
+        # Get the predictions with the passed NPIs
+        pred_ratios = model._roll_out_predictions(model.predictor,  # shape (PredictionDates,)
+                                                  context,
+                                                  action,
+                                                  npis_sequence)
+
+        # Plot the predictions.
+        row = 1 + 2 * (country_i // n_cols)  # Skip the first row, then 2 rows per country.
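+        # e.g. with n_cols=6 and country_i=7: row = 1 + 2*(7 // 6) = 3 and col = 7 % 6 = 1,
+        # giving subplot index 1 + 6*3 + 1 = 20.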
+        col = country_i % n_cols
+        plot_i = 1 + n_cols * row + col
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        plt.subplots_adjust(hspace=0)
+        sns.set_palette(sns.husl_palette(8))
+        plot = sns.lineplot(
+            x=cnpis_df.Date,
+            y=pred_ratios,
+            color="red", legend=do_legend, label="Prediction Ratios", ax=ax)
+        plot.get_figure().autofmt_xdate()
+        ax.set_ylabel("")
+        ax.set_title(country)
+        ax.axhline(y=1.0, color='blue', linestyle='--', alpha=0.8)  # Show the identity prediction ratio.
+        # Also show the 2% increase and decrease ratios. (Remark: these two are not actually inverse operations.)
+        r = 0.02
+        ax.axhline(y=1+r, color='blue', linestyle='--', alpha=0.4)
+        ax.axhline(y=1-r, color='blue', linestyle='--', alpha=0.4)
+
+        if do_legend:
+            # Put the lower left of the bounding box at axes coordinates (0, 1.20).
+            sns.move_legend(ax, "lower left", bbox_to_anchor=(0, 1.20))
+            do_legend = False
+
+        # Plot the NPIs.
+        # Normalize the NPIs by dividing by the max values for correct color scaling.
+        npis_max_v = np.array(MAX_NPIS_VECTOR)    # shape (NPIs)
+        npis_image = npis_sequence / npis_max_v   # shape (PredictionDates, NPIs)
+        row = 1 + 1 + 2 * (country_i // n_cols)   # Skip the first row, then 2 rows per country.
+        col = country_i % n_cols
+        plot_i = 1 + n_cols * row + col
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        plt.subplots_adjust(hspace=0)
+        ax.imshow(npis_image.T, cmap='inferno')
+
+    # Plot some debug information
+    plot_i = 1
+    ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+    debug = "\n".join([
+        f"test_plot_input_output_pipeline: Prediction Ratios output along with plus/minus {r:.2} marks (blue dashes).",
+        "Prediction Ratios are the raw output of the V1 LSTM-based neural network.",
+        "Below are the NPIs, divided by the max possible value in each NPI.",
+    ])
+    ax.axis("off")
+    ax.text(0, 1, debug)
+
+    figure_path = figures_dir / f"03_pred_ratios_{start_date_str}.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 02
+###############################################################################
+
+
+def test_plot_initial_context(path_to_model_weights: str, start_date_str: str):
+    """Plot the initial context vectors that are fed to the network."""
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Prepare the original data.
+    df = oxford_data.load_original_oxford_data()
+    # df = df[df.Date < start_date]  # Truncate the data if we're using it for training.
+    path_to_oxford_data = data_dir / "OxCGRT_orig.csv"
+    df.to_csv(path_to_oxford_data, index=False)
+
+    # Load the given model weights.
+    model = XPrizePredictor(path_to_model_weights, path_to_oxford_data)
+
+    # Prepare some related parameters.
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    countries = _most_affected_countries(df, 30, 50)
+
+    # Compile the results into a figure.
+    n_countries = len(countries)
+    n_rows = int(np.floor(np.sqrt(n_countries))) + 2
+    n_cols = int(np.ceil(n_countries / n_rows))
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+    plot_i = n_cols + 1
+    do_legend = True
+
+    # Create the context vectors for each country and plot them.
+    for country in tqdm.tqdm(sorted(countries), desc="test_plot_initial_context (Countries)"):
+        # Get the initial context & action vectors.
+        context, action = model._initial_context_action_vectors([country], start_date)
+        context = context[country]  # shape (Days, 1)
+        action = action[country]    # shape (Days, NPIs)
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        sns.set_palette(sns.husl_palette(8))
+        plot = sns.lineplot(context[:, 0], color="red", legend=do_legend, label="Context", ax=ax)
+        # plot = sns.lineplot(gt, x="Date", y="ActualDailyNewCases7DMA", legend=False, color="black", ax=ax)
+        plot.get_figure().autofmt_xdate()
+        ax.set_ylabel("")
+        ax.set_title(country)
+
+        if do_legend:
+            # Put the lower left of the bounding box at axes coordinates (0, 1.20).
+            sns.move_legend(ax, "lower left", bbox_to_anchor=(0, 1.20))
+            do_legend = False
+
+    figure_path = figures_dir / f"02_context_vectors_{start_date_str}.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 01
+###############################################################################
+
+
+def test_plot_predictions(path_to_model_weights: str, start_date_str: str):
+    print(f"test_plot_predictions:")
+    print(f"    path_to_model_weights: {path_to_model_weights}")
+    print(f"    start_date_str: {start_date_str}")
+
+    _test_plot_predictions_helper(path_to_model_weights, start_date_str, on_training_data=False)
+    _test_plot_predictions_helper(path_to_model_weights, start_date_str, on_training_data=True)
+
+
+def _test_plot_predictions_helper(path_to_model_weights: str, start_date_str: str, on_training_data=False):
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Prepare the original data.
+    df = oxford_data.load_original_oxford_data()
+    # df = df[df.Date < start_date]  # Truncate the data if we're using it for training.
+    path_to_oxford_data = data_dir / "OxCGRT_orig.csv"
+    df.to_csv(path_to_oxford_data, index=False)
+
+    # Load the given model weights.
+    model = XPrizePredictor(path_to_model_weights, path_to_oxford_data)
+
+    # Create the model predictions.
+    if on_training_data:  # The training evaluations should be done on earlier data.
+        pred_start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+        start_date: datetime = pred_start_date - timedelta(days=60)
+        start_date_str = start_date.strftime('%Y-%m-%d')
+    else:
+        start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = start_date + timedelta(days=30)
+    countries = _most_affected_countries(df, 30, 50)
+    ips_path = data_dir / "ips_file.csv"
+    for country in tqdm.tqdm(countries, desc="Predictions (Countries)"):
+        # Skip if already predicted.
+        predictions_path = test_predictions_path(country, start_date_str)
+        if predictions_path.exists():
+            continue
+
+        # Write the true intervention plan during this time period.
+        _write_ips_file_for_country(
+            df,
+            country,
+            start_date,
+            end_date,
+            ips_path,
+        )
+
+        # Evaluate the model predictions.
+        preds_df = model.predict(start_date, end_date, ips_path)
+        preds_df.to_csv(predictions_path, index=False)
+
+    ###########################################################################
+    # Compile the results into a figure.
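+    # e.g. 30 countries: n_rows = floor(sqrt(30)) + 2 = 7 and n_cols = ceil(30 / 7) = 5;
+    # the first row is reserved for the legend and description text.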
+    n_countries = len(countries)
+    n_rows = int(np.floor(np.sqrt(n_countries))) + 2
+    n_cols = int(np.ceil(n_countries / n_rows))
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+    plot_i = n_cols + 1
+    do_legend = True
+    for country in tqdm.tqdm(sorted(countries), desc="Plots (Countries)"):
+        predictions_path = test_predictions_path(country, start_date_str)
+        predictions = pd.read_csv(predictions_path, parse_dates=['Date'])
+        predictions = oxford_data.process_submission(predictions, start_date, end_date)
+
+        gt_start = start_date - timedelta(days=14)  # Show some additional context.
+        gt = df[(df.Date >= gt_start) & (df.Date <= end_date) & (df.CountryName == country) & (df.RegionName.isna())]
+        gt = oxford_data.process_submission(gt, gt_start, end_date)
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        sns.set_palette(sns.husl_palette(8))
+        plot = sns.lineplot(predictions, x="Date", y="PredictedDailyNewCases7DMA", legend=do_legend, label="Predicted", color="red", ax=ax)
+        plot = sns.lineplot(gt, x="Date", y="ActualDailyNewCases7DMA", legend=do_legend, label="Ground Truth", color="black", ax=ax)
+        plot.get_figure().autofmt_xdate()
+        ax.set_ylabel("")
+        ax.set_title(country)
+        if do_legend:
+            # Put the lower left of the bounding box at axes coordinates (0, 1.20).
+            sns.move_legend(ax, "lower left", bbox_to_anchor=(0, 1.20))
+            do_legend = False
+
+    tr_or_st_label = "tr" if on_training_data else "st"
+    figure_path = figures_dir / f"01_final_predictions_{start_date_str}_{tr_or_st_label}.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+def main(path_to_model_weights: str, start_date_str: str):
+    """
+    :param path_to_model_weights: Path to the XPrizePredictor model weights.
+    :param start_date_str: Date in the format %Y-%m-%d.
""" + print(f"Test pipeline beginning with the following arguments:") + print(f" path_to_model_weights: {path_to_model_weights}") + print(f" start_date_str: {start_date_str}") + test_plot_predictions(path_to_model_weights, start_date_str) + test_plot_initial_context(path_to_model_weights, start_date_str) + test_plot_input_output_pipeline(path_to_model_weights, start_date_str) + test_plot_prediction_ratios(path_to_model_weights, start_date_str) + test_artificial_data(path_to_model_weights, start_date_str) + test_train_predict_pipeline() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument("--path_to_model_weights", + help="path to LSTM weights", + default="/Users/964353/PandemicResilience/Cognizant/models/scenario2_lstm_trained_model_weights.h5") + + # Scenario 2: Argentina, 2021-03-1 - 2021-04-31 + # Scenario 4: Kenya, 2020-10-01 - 2020-11-30 + parser.add_argument("--start_date_str", + help="start date of predictions", + default="2021-03-01") + args = parser.parse_args() + main(**vars(args)) + + diff --git a/covid_xprize/examples/predictors/lstm/xprize_predictor.py b/covid_xprize/examples/predictors/lstm/xprize_predictor.py index fd433ee..9020a05 100644 --- a/covid_xprize/examples/predictors/lstm/xprize_predictor.py +++ b/covid_xprize/examples/predictors/lstm/xprize_predictor.py @@ -3,6 +3,7 @@ import os import urllib.request + # Suppress noisy Tensorflow debug logging os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -19,7 +20,8 @@ from keras.models import Model from covid_xprize.oxford_data import prepare_cases_dataframe, load_ips_file, create_country_samples, NPI_COLUMNS, \ - create_prediction_initial_context_and_action_vectors + create_prediction_initial_context_and_action_vectors, convert_ratio_to_new_cases +from covid_xprize import oxford_data CONTEXT_COLUMN = 'PredictionRatio' NB_LOOKBACK_DAYS = 21 @@ -67,6 +69,11 @@ def predict(self, start_date_str: str, end_date_str: str, path_to_ips_file: str) -> pd.DataFrame: + """Main predict function following a standard interface. + :param start_date_str: Prediction start date, in format '%Y-%m-%d'. + :param end_date_str: Prediction end date, in format '%Y-%m-%d'. + :param path_to_ips_file: Path to a CSV file containing the intervention plan for the prediction period. + """ start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d') end_date = pd.to_datetime(end_date_str, format='%Y-%m-%d') nb_days = (end_date - start_date).days + 1 @@ -84,33 +91,33 @@ def predict(self, geos = npis_df.GeoID.unique() # Prepare context vectors. - initial_context, initial_action = create_prediction_initial_context_and_action_vectors( - self.df, - geos, - CONTEXT_COLUMN, - start_date, - NB_LOOKBACK_DAYS, - ) + initial_context, initial_action = self._initial_context_action_vectors(geos, start_date) + + # Make predictions for each GeoID asked for. for g in geos: cdf = self.df[self.df.GeoID == g] if len(cdf) == 0: # we don't have historical data for this geo: return zeroes + geo_pred_start_date = start_date pred_new_cases = [0] * nb_days - geo_start_date = start_date else: + # Start predicting from start_date, unless there's a gap since last known date, + # in which case regenerate the entire timeline since the last known data. 
                last_known_date = cdf.Date.max()
-                # Start predicting from start_date, unless there's a gap since last known date
-                geo_start_date = min(last_known_date + np.timedelta64(1, 'D'), start_date)
-                npis_gdf = npis_df[(npis_df.Date >= geo_start_date) & (npis_df.Date <= end_date)]
-                pred_new_cases = self._get_new_cases_preds(cdf, g, npis_gdf, initial_context[g], initial_action[g])
+                geo_pred_start_date = min(last_known_date + np.timedelta64(1, 'D'), start_date)
 
-            # Append forecast data to results to return
+                # Make the predictions.
+                pred_new_cases = self._get_new_cases_preds(cdf, g, npis_df, initial_context[g], initial_action[g], geo_pred_start_date, end_date)
+
+            # Read some meta columns from the first row of the NPIs.
             country = npis_df[npis_df.GeoID == g].iloc[0].CountryName
-            region = npis_df[npis_df.GeoID == g].iloc[0].RegionName
+            region = npis_df[npis_df.GeoID == g].iloc[0].RegionName
+
+            # Append forecast data to results to return
             for i, pred in enumerate(pred_new_cases):
                 forecast["CountryName"].append(country)
                 forecast["RegionName"].append(region)
-                current_date = geo_start_date + pd.offsets.Day(i)
+                current_date = geo_pred_start_date + pd.offsets.Day(i)
                 forecast["Date"].append(current_date)
                 forecast["PredictedDailyNewCases"].append(pred)
@@ -118,23 +125,38 @@
         # Return only the requested predictions
         return forecast_df[(forecast_df.Date >= start_date) & (forecast_df.Date <= end_date)]
 
-    def _get_new_cases_preds(self, c_df, g, npis_df, initial_context_input, initial_action_input):
-        cdf = c_df[c_df.ConfirmedCases.notnull()]
-        # Predictions with passed npis
-        cnpis_df = npis_df[npis_df.GeoID == g]
-        npis_sequence = np.array(cnpis_df[NPI_COLUMNS])
+    def _initial_context_action_vectors(self, geos, start_date):
+        return create_prediction_initial_context_and_action_vectors(
+            self.df,
+            geos,
+            CONTEXT_COLUMN,
+            start_date,
+            NB_LOOKBACK_DAYS,
+        )
+
+    def _get_new_cases_preds(self, c_df, g, npis_df, initial_context_input, initial_action_input, start_date, end_date) -> np.ndarray:
+        """Run the neural network to compute the context column, and convert the context column to new cases.
+        :return: An array of NewCases as predicted by the network."""
+        # Extract an array of the NPI data.
+        cnpis_df = npis_df[(npis_df.Date >= start_date) & (npis_df.Date <= end_date) &  # Match NPIs in prediction window;
+                           (npis_df.GeoID == g)]                                        # Match NPIs for region.
+        npis_sequence = np.array(cnpis_df[NPI_COLUMNS])  # shape (PredictionDates, NPIs)
+
         # Get the predictions with the passed NPIs
         preds = self._roll_out_predictions(self.predictor,
                                            initial_context_input,
                                            initial_action_input,
                                            npis_sequence)
+
         # Gather info to convert to total cases
-        prev_confirmed_cases = np.array(cdf.ConfirmedCases)
-        prev_new_cases = np.array(cdf.NewCases)
-        initial_total_cases = prev_confirmed_cases[-1]
+        cdf = c_df[c_df.ConfirmedCases.notnull()]
+        prev_cdf = cdf[cdf.Date < start_date]
+        prev_new_cases = np.array(prev_cdf.NewCases)
+        initial_total_cases = np.array(prev_cdf.ConfirmedCases)[-1]
         pop_size = np.array(cdf.Population)[-1]  # Population size doesn't change over time
+
         # Compute predictor's forecast
-        pred_new_cases = self._convert_ratios_to_total_cases(
+        pred_new_cases = oxford_data.convert_prediction_ratios_to_new_cases(
             preds,
             WINDOW_SIZE,
             prev_new_cases,
@@ -143,25 +165,9 @@
 
         return pred_new_cases
 
-    @staticmethod
-    def _load_original_data(data_url):
-        latest_df = pd.read_csv(data_url,
-                                parse_dates=['Date'],
-                                encoding="ISO-8859-1",
-                                dtype={"RegionName": str,
-                                       "RegionCode": str},
-                                on_bad_lines='skip')
-        # GeoID is CountryName / RegionName
-        # np.where usage: if A then B else C
-        latest_df["GeoID"] = np.where(latest_df["RegionName"].isnull(),
-                                      latest_df["CountryName"],
-                                      latest_df["CountryName"] + ' / ' + latest_df["RegionName"])
-        return latest_df
-
-    # Function for performing roll outs into the future
-    @staticmethod
-    def _roll_out_predictions(predictor, initial_context_input, initial_action_input, future_action_sequence):
+    @staticmethod
+    def _roll_out_predictions(predictor, initial_context_input, initial_action_input, future_action_sequence) -> np.ndarray:
+        """Run the neural network autoregressively to compute a prediction of the context column."""
         nb_roll_out_days = future_action_sequence.shape[0]
         pred_output = np.zeros(nb_roll_out_days)
         context_input = np.expand_dims(np.copy(initial_context_input), axis=0)
@@ -177,43 +183,6 @@
             context_input[:, -1] = pred
         return pred_output
 
-    # Functions for converting predictions back to number of cases
-    @staticmethod
-    def _convert_ratio_to_new_cases(ratio,
-                                    window_size,
-                                    prev_new_cases_list,
-                                    prev_pct_infected):
-        return (ratio * (1 - prev_pct_infected) - 1) * \
-               (window_size * np.mean(prev_new_cases_list[-window_size:])) \
-               + prev_new_cases_list[-window_size]
-
-    def _convert_ratios_to_total_cases(self,
-                                       ratios,
-                                       window_size,
-                                       prev_new_cases,
-                                       initial_total_cases,
-                                       pop_size):
-        new_new_cases = []
-        prev_new_cases_list = list(prev_new_cases)
-        curr_total_cases = initial_total_cases
-        for ratio in ratios:
-            new_cases = self._convert_ratio_to_new_cases(ratio,
-                                                         window_size,
-                                                         prev_new_cases_list,
-                                                         curr_total_cases / pop_size)
-            # new_cases can't be negative!
-            new_cases = max(0, new_cases)
-            # Which means total cases can't go down
-            curr_total_cases += new_cases
-            # Update prev_new_cases_list for next iteration of the loop
-            prev_new_cases_list.append(new_cases)
-            new_new_cases.append(new_cases)
-        return new_new_cases
-
-    @staticmethod
-    def _smooth_case_list(case_list, window):
-        return pd.Series(case_list).rolling(window).mean().to_numpy()
-
     def train(self, num_trials=NUM_TRIALS, num_epochs=NUM_EPOCHS, return_all_trials=False):
         """Trains the weights of the predictor model on a prediction loss.
         :param num_trials: The number of LSTM models to train.
            The top performer is selected.
@@ -415,27 +384,28 @@ def _lstm_get_test_rollouts(self, model, df, top_geos, country_samples):
         country_preds = {}
         country_cases = {}
         for g in top_geos:
-            X_test_context = country_samples[g]['X_test_context']
-            X_test_action = country_samples[g]['X_test_action']
-            country_indep[g] = model.predict([X_test_context, X_test_action], verbose=0)
+            X_test_context = country_samples[g]['X_test_context']  # shape (TestDays, LookbackDays, Context)
+            X_test_action = country_samples[g]['X_test_action']    # shape (TestDays, LookbackDays, Actions)
+            country_indep[g] = model.predict([X_test_context, X_test_action], verbose=0)  # shape (TestDays, Context)
 
-            initial_context_input = country_samples[g]['X_test_context'][0]
-            initial_action_input = country_samples[g]['X_test_action'][0]
-            y_test = country_samples[g]['y_test']
+            initial_context_input = country_samples[g]['X_test_context'][0]  # shape (LookbackDays, Context)
+            initial_action_input = country_samples[g]['X_test_action'][0]    # shape (LookbackDays, Actions)
+            y_test = country_samples[g]['y_test']                            # shape (TestDays,)
 
             nb_test_days = y_test.shape[0]
             nb_actions = initial_action_input.shape[-1]
 
-            future_action_sequence = np.zeros((nb_test_days, nb_actions))
+            future_action_sequence = np.zeros((nb_test_days, nb_actions))  # shape (TestDays, Actions)
             future_action_sequence[:nb_test_days] = country_samples[g]['X_test_action'][:, -1, :]
             current_action = country_samples[g]['X_test_action'][:, -1, :][-1]
             future_action_sequence[14:] = current_action
             preds = self._lstm_roll_out_predictions(model,
                                                     initial_context_input,
                                                     initial_action_input,
-                                                    future_action_sequence)
-            country_preds[g] = preds
+                                                    future_action_sequence)  # preds shape (TestDays,)
+            country_preds[g] = preds  # shape (TestDays,)
 
+            # All the data prior to the start of the prediction window.
             prev_confirmed_cases = np.array(
                 df[df.GeoID == g].ConfirmedCases)[:-nb_test_days]
             prev_new_cases = np.array(
@@ -443,7 +413,7 @@
             initial_total_cases = prev_confirmed_cases[-1]
             pop_size = np.array(df[df.GeoID == g].Population)[0]
 
-            pred_new_cases = self._convert_ratios_to_total_cases(
+            pred_new_cases = oxford_data.convert_prediction_ratios_to_new_cases(
                 preds, WINDOW_SIZE, prev_new_cases, initial_total_cases, pop_size)
 
             country_cases[g] = pred_new_cases
diff --git a/covid_xprize/oxford_data/oxford_data.py b/covid_xprize/oxford_data/oxford_data.py
index 8c673c7..078e92b 100644
--- a/covid_xprize/oxford_data/oxford_data.py
+++ b/covid_xprize/oxford_data/oxford_data.py
@@ -111,6 +111,7 @@ def _add_geoid_column(df: pd.DataFrame) -> None:
     df["GeoID"] = np.where(
         df["RegionName"].isnull(), df["CountryName"],
         df["CountryName"] + ' / ' + df["RegionName"])
+    return df
 
 
 def load_ips_file(file_path: str) -> pd.DataFrame:
@@ -177,13 +178,17 @@
     return df.merge(pop_df[['GeoID', 'Population']], on=['GeoID'], how='left', suffixes=('', '_y'))
 
 
-def prepare_cases_dataframe(data_url: str) -> pd.DataFrame:
+def prepare_cases_dataframe(data_url: str = None) -> pd.DataFrame:
     """
     Loads the cases dataset from the given file, cleans it, and computes cases columns.
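+    Computed columns include NewCases, SmoothNewCases, SmoothNewCasesPer100K, CaseRatio,
+    ProportionInfected, and PredictionRatio.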
     :param data_url: the url containing the original data
     :return: a Pandas DataFrame with the historical data
     """
-    df = load_ips_file(data_url)
+    if data_url is None:
+        df = load_original_oxford_data()
+        _add_geoid_column(df)
+    else:
+        df = load_ips_file(data_url)
 
     # Additional context df (e.g Population for each country)
     df = add_population_column(df)
@@ -328,22 +333,6 @@
     return context_vectors, action_vectors
 
 
-def most_affected_countries(df, nb_countries, min_historical_days):
-    """
-    Returns the list of most affected countries, in terms of confirmed deaths.
-    :param df: the data frame containing the historical data
-    :param nb_countries: the number of countries to return
-    :param min_historical_days: the minimum days of historical data the countries must have
-    :return: a list of country names of size nb_countries if there were enough, and otherwise a list of all the
-    country names that have at least min_look_back_days data points.
-    """
-    # By default use most affected countries with enough history
-    gdf = df.groupby('GeoID')['ConfirmedDeaths'].agg(['max', 'count']).sort_values(by='max', ascending=False)
-    filtered_gdf = gdf[gdf["count"] > min_historical_days]
-    countries = list(filtered_gdf.head(nb_countries).index)
-    return countries
-
-
 def convert_smooth_cases_per_100K_to_new_cases(smooth_cases_per_100K,
                                                window_size,
                                                prev_new_cases_list,
                                                pop_size):
@@ -362,11 +351,181 @@
     return unsmooth_cases.clip(min=0.0)
 
 
-def convert_ratio_to_new_cases(ratio,
-                               window_size,
-                               prev_new_cases_list,
-                               prev_pct_infected):
+def convert_prediction_ratios_to_new_cases(ratios: np.ndarray,
+                                           window_size: int,
+                                           prev_new_cases: np.ndarray,
+                                           initial_total_cases: float,
+                                           pop_size: float) -> np.ndarray:
+    """Process a list of case ratios into a list of daily new cases.
+    :param ratios: Shape (PredictionDays,)
+        A column corresponding to the PredictionRatio.
+    :param window_size: The number of days in the smoothing window.
+    :param prev_new_cases: Shape (SmoothingWindow,). The array of NewCases leading up to the prediction window.
+    :param initial_total_cases: The value of ConfirmedCases indicating the total cases on the first day of the prediction window.
+    :param pop_size: The population of the region.
+    :return: NewCases, an array corresponding to the input ratios.
+    """
+    new_new_cases = []
+    prev_new_cases_list = list(prev_new_cases)
+    curr_total_cases = initial_total_cases
+    for ratio in ratios:
+        new_cases = convert_ratio_to_new_cases(ratio,
+                                               window_size,
+                                               prev_new_cases_list,
+                                               curr_total_cases / pop_size)
+        # new_cases can't be negative!
+        new_cases = max(0, new_cases)
+        # Which means total cases can't go down
+        curr_total_cases += new_cases
+        # Update prev_new_cases_list for next iteration of the loop
+        prev_new_cases_list.append(new_cases)
+        new_new_cases.append(new_cases)
+    # Return an array, as declared in the signature and docstring.
+    return np.array(new_new_cases)
+
+
+def convert_ratio_to_new_cases(ratio: float,
+                               window_size: int,
+                               prev_new_cases_list: list[float],
+                               prev_pct_infected: float) -> float:
+    """Convert the PredictionRatio column into the NewCases column.
+    :param ratio: The value of PredictionRatio on the given day.
+    :param window_size: The number of days in the smoothing window.
+    :param prev_new_cases_list: The NewCases column values prior to, but not including, the given day.
+    :param prev_pct_infected: The value of the ProportionInfected column on the given day.
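+    Combines the three inverse steps derived in the comments below into a single closed form:
+        NewCases[t] = (ratio * (1 - prev_pct_infected) - 1) * window_size * mean(NewCases[t-window_size:t])
+                      + NewCases[t - window_size]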
+    :return: The value of the NewCases column on the given day."""
+    # if len(prev_new_cases_list) < window_size:
+    #     raise TypeError("System needs more samples for conversion.")
+    #
+    # Derivation of the inverse transformation (the return expression below combines
+    # these three steps into one closed form):
+    #
+    # 1. PredictionRatio = CaseRatio / (1 - ProportionInfected)
+    #    => CaseRatio = (1 - ProportionInfected) * PredictionRatio
+    #    case_ratio = ratio * (1 - prev_pct_infected)
+    #
+    # 2. CaseRatio[1] = SmoothNewCases[1] / SmoothNewCases[0]
+    #    => SmoothNewCases[1] = CaseRatio[1] * SmoothNewCases[0]
+    #    smooth_new_cases = case_ratio * np.mean(prev_new_cases_list[-window_size:])
+    #
+    # 3. SmoothNewCases[7] = (1/7) * (NewCases[1] + ... + NewCases[7])
+    #    => NewCases[7] = 7 * SmoothNewCases[7] - (NewCases[1] + ... + NewCases[6])
+    #    new_cases = window_size * smooth_new_cases - np.sum(prev_new_cases_list[-(window_size-1):])
     return (ratio * (1 - prev_pct_infected) - 1) * \
            (window_size * np.mean(prev_new_cases_list[-window_size:])) \
            + prev_new_cases_list[-window_size]
+
+
+def _select_meta_cases_and_deaths_cols(df: pd.DataFrame) -> pd.DataFrame:
+    desired_columns = ["CountryName", "RegionName", "Date",
+                       "ConfirmedCases", "ConfirmedDeaths",
+                       "PredictedDailyNewCases", "PredictedDailyNewDeaths"]
+    take_columns = df.columns.intersection(desired_columns)
+    return df[take_columns].copy(deep=True)
+
+
+def _cut_to_range_and_add_diffs(
+        df: pd.DataFrame,
+        start_date: datetime,
+        end_date: datetime,
+        window_size: int) -> pd.DataFrame:
+    # Start window_size days earlier so the diffs (and downstream moving averages) are defined from start_date.
+    start_date_for_diff = start_date - pd.offsets.Day(window_size)
+    # Filter out the data set to include all the data needed to compute the diff
+    df = df[(df.Date >= start_date_for_diff) & (df.Date <= end_date)].copy(deep=True)
+    df.sort_values(by=["GeoID", "Date"], inplace=True)
+    # Compute the diff
+    for src_column, tgt_column in (
+        ("ConfirmedCases", "ActualDailyNewCases"),
+        ("ConfirmedDeaths", "ActualDailyNewDeaths"),
+    ):
+        if src_column not in df.columns:
+            continue
+        df[tgt_column] = df.groupby("GeoID", group_keys=False)[src_column].diff()
+    return df
+
+
+def _moving_average_inplace(df: pd.DataFrame, src_column: str, tgt_column: str, window_size: int):
+    """In-place operation to add a moving average column.
+    :param df: The Oxford dataframe.
+    :param src_column: The name of the column to use as the data source.
+    :param tgt_column: The name of the column to store the moving average.
+    :param window_size: The moving average window size."""
+    df[tgt_column] = df.groupby(
+        "GeoID", group_keys=False)[src_column].rolling(
+            window_size, center=False).mean().reset_index(0, drop=True)
+
+
+def _cut_to_range_and_add_diffs_and_smooth_diffs(
+        df: pd.DataFrame,
+        start_date: datetime,
+        end_date: datetime) -> pd.DataFrame:
+    """Compute moving average columns and add them to the dataframe.
+    Works with both cases & deaths and both actual and predicted cases."""
+    df = _cut_to_range_and_add_diffs(df, start_date, end_date, WINDOW_SIZE)
+    for src_column, tgt_column in (
+        ('ActualDailyNewCases', f'ActualDailyNewCases{WINDOW_SIZE}DMA'),
+        ('ActualDailyNewDeaths', f'ActualDailyNewDeaths{WINDOW_SIZE}DMA'),
+        ('PredictedDailyNewCases', f'PredictedDailyNewCases{WINDOW_SIZE}DMA'),
+        ('PredictedDailyNewDeaths', f'PredictedDailyNewDeaths{WINDOW_SIZE}DMA'),
+    ):
+        if src_column not in df.columns:
+            continue
+        _moving_average_inplace(df, src_column, tgt_column, WINDOW_SIZE)
+    return df
+
+
+def _sort_by_id_cols(df: pd.DataFrame) -> pd.DataFrame:
+    df.sort_values(by=ID_COLS, inplace=True, ignore_index=True)
+    return df
+
+
+def process_submission(dataset: pd.DataFrame, start_date: datetime, end_date: datetime) -> pd.DataFrame:
+    df = dataset
+    df = _select_meta_cases_and_deaths_cols(df)
+    df = _add_geoid_column(df)
+    df = _cut_to_range_and_add_diffs_and_smooth_diffs(df, start_date, end_date)
+    df = _sort_by_id_cols(df)
+    return df
+
+
+def most_affected_countries(df: pd.DataFrame, nb_geos: int, min_historical_days: int) -> list[str]:
+    """
+    Returns the list of most affected countries, in terms of confirmed deaths.
+    :param df: the data frame containing the historical data
+    :param nb_geos: the number of geos to return
+    :param min_historical_days: the minimum days of historical data the countries must have
+    :return: a list of country names of size nb_geos if there were enough, and otherwise a list of all the
+    country names that have at least min_historical_days data points.
+    """
+    # Don't include the region-level data, just the country-level summaries.
+    gdf = df[df.RegionName.isna()]
+    gdf = gdf.groupby('CountryName', group_keys=False)['ConfirmedDeaths'].agg(['max', 'count']).sort_values(
+        by='max', ascending=False)
+    filtered_gdf = gdf[gdf["count"] > min_historical_days]
+    geos = list(filtered_gdf.head(nb_geos).index)
+    return geos
+
+
+def most_affected_geos(df, nb_geos, min_historical_days):
+    """
+    Returns the list of most affected geos, in terms of confirmed deaths.
+    :param df: the data frame containing the historical data
+    :param nb_geos: the number of geos to return
+    :param min_historical_days: the minimum days of historical data the geos must have
+    :return: a list of GeoIDs of size nb_geos if there were enough, and otherwise a list of all the
+    GeoIDs that have at least min_historical_days data points.
+    """
+    # By default use most affected geos with enough history
+    gdf = df.groupby('GeoID')['ConfirmedDeaths'].agg(['max', 'count']).sort_values(by='max', ascending=False)
+    filtered_gdf = gdf[gdf["count"] > min_historical_days]
+    countries = list(filtered_gdf.head(nb_geos).index)
+    return countries
+
diff --git a/covid_xprize/oxford_data/scenarios.py b/covid_xprize/oxford_data/scenarios.py
index 8f6c047..5554cf5 100644
--- a/covid_xprize/oxford_data/scenarios.py
+++ b/covid_xprize/oxford_data/scenarios.py
@@ -1,4 +1,5 @@
 # Copyright 2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
 SIMULATED_END_OF_AVAILABLE_TRAINING_DATA_FOR_DEMO: str = "2020-11-30"
+SCENARIO_2_SIMULATED_END_OF_AVAILABLE_TRAINING_DATA_FOR_DEMO: str = "2021-11-30"
diff --git a/covid_xprize/oxford_data/tests/figures_oxford_data.py b/covid_xprize/oxford_data/tests/figures_oxford_data.py
new file mode 100644
index 0000000..ed6c540
--- /dev/null
+++ b/covid_xprize/oxford_data/tests/figures_oxford_data.py
@@ -0,0 +1,421 @@
+"""
+Copyright 2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
+
+Supplement the tests with visual figures to verify the system is working correctly."""
+
+import sys
+import os
+from pathlib import Path
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from covid_xprize import oxford_data
+
+
+###############################################################################
+# 01
+###############################################################################
+
+def test_plot_case_columns():
+    _test_plot_case_columns_helper("2020-11-01", "2020-11-30")
+    _test_plot_case_columns_helper("2021-11-01", "2021-11-30")
+
+
+def _test_plot_case_columns_helper(start_date, end_date):
+    """Plot the various case columns."""
+
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Test cases show the input / output relationships of our method.
+    test_cases: list[tuple] = []
+
+    # Create some test cases using real data.
+    df = oxford_data.prepare_cases_dataframe()
+    n_test_countries = 15
+    real_data_test_countries = oxford_data.most_affected_geos(df, n_test_countries, 30)
+    for test_country in real_data_test_countries:
+        cdf = df[(df['GeoID'] == test_country) & (df['Date'] >= start_date) & (df['Date'] <= end_date)]
+
+        # Collect the real and computed case columns for this geo and date range.
+        test_cases.append((
+            test_country, cdf
+        ))
+
+    # Initialize the figure.
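+    # Layout: one label column plus one column per plotted case column (7), one row per test case.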
+    n_cols = 8
+    n_rows = 1 + len(test_cases)
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+    ax = plt.subplot(n_rows, n_cols, 1)
+    ax.axis("off")
+    ax.text(0.1, 0.5, "\n".join([
+        "test_plot_case_columns",
+        "The real case columns and computed columns.",
+    ]), in_layout=False)
+
+    for row, (test_country, cdf) in enumerate(test_cases):
+        label_titles = row == 0
+        plot_i = ((row + 1) * n_cols) + 1
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.axis('off')
+        ax.text(0.0, 0.5, test_country)
+        for column_name, color in (
+            ("ConfirmedCases", "green"),
+            ("NewCases", "blue"),
+            ("SmoothNewCases", "black"),
+            ("SmoothNewCasesPer100K", "orange"),
+            ("CaseRatio", "black"),
+            ("ProportionInfected", "black"),
+            ("PredictionRatio", "purple"),
+        ):
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            plot = sns.lineplot(data=cdf, x="Date", y=column_name, ax=ax, color=color)
+            plot.get_figure().autofmt_xdate()
+            if label_titles:
+                ax.set_title(column_name)
+
+    figure_path = figures_dir / f"01_case_columns_{start_date}.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 02
+###############################################################################
+
+
+def running_mean_convolve(x, N):
+    return np.convolve(x, np.ones(N) / float(N), 'valid')
+
+
+def test_convert_prediction_ratios():
+    """Test the conversion routines going "backwards"."""
+
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Test cases show the input / output relationships of our method.
+    test_cases: list[tuple] = []
+
+    # Create some test cases using real data.
+    df = oxford_data.prepare_cases_dataframe()
+    n_test_countries = 7
+    real_data_test_countries = oxford_data.most_affected_geos(df, n_test_countries, 30)
+    PRIOR_DATA_START = "2021-10-01"
+    PREDICTIONS_START_DATE = "2021-11-01"
+    PREDICTIONS_END_DATE = "2021-11-30"
+    for test_country in real_data_test_countries:
+        cdf = df[(df['GeoID'] == test_country) & (df['Date'] >= PREDICTIONS_START_DATE) & (df['Date'] <= PREDICTIONS_END_DATE)]
+        cdf_prior = df[(df['GeoID'] == test_country) & (df['Date'] >= PRIOR_DATA_START) & (df['Date'] < PREDICTIONS_START_DATE)]
+
+        # Converting back from PredictionRatio should equal the original NewCases column.
+        pop_size = cdf['Population'].max()
+        curr_total_cases = cdf_prior['ConfirmedCases'].to_numpy()[-1]
+        ratios = cdf['PredictionRatio'].to_numpy()
+        prev_new_cases = cdf_prior['NewCases'].to_numpy()
+        new_cases_reconstructed = oxford_data.convert_prediction_ratios_to_new_cases(
+            ratios,
+            7,
+            prev_new_cases,
+            curr_total_cases,
+            pop_size,
+        )
+        original_new_cases = cdf['NewCases'].to_numpy()
+        original_smooth_new_cases = cdf['SmoothNewCases'].to_numpy()
+        test_cases.append((
+            test_country, ratios, new_cases_reconstructed, original_new_cases, original_smooth_new_cases,
+        ))
+
+    # Create a test case using fake data.
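+    # With a constant ratio the reconstruction should be roughly exponential:
+    # decay for ratios below 1.0 and growth for ratios above 1.0.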
+    # Sweep a few combinations of totals, population size, and prior daily cases.
+    for curr_total_cases in (100_000, 1_000_000):
+        for population in (10_000_000,):
+            for prev_new_cases_c in (2_000, 20_000):
+                label = f"curr_total_cases: {curr_total_cases:,}\npopulation: {population:,}\nprev_new_cases: {prev_new_cases_c:,}"
+                prev_new_cases = np.full((20,), prev_new_cases_c)
+
+                # Example 1: Exponential decay
+                ratios = np.full((31,), 0.95)
+                new_cases_reconstructed = oxford_data.convert_prediction_ratios_to_new_cases(
+                    ratios,
+                    7,
+                    prev_new_cases,
+                    curr_total_cases,
+                    population
+                )
+                test_cases.append((
+                    label, ratios, new_cases_reconstructed, None, None
+                ))
+
+                # Example 2: Exponential growth
+                ratios = np.full((31,), 1.05)
+                new_cases_reconstructed = oxford_data.convert_prediction_ratios_to_new_cases(
+                    ratios,
+                    7,
+                    prev_new_cases,
+                    curr_total_cases,
+                    population
+                )
+                test_cases.append((
+                    label, ratios, new_cases_reconstructed, None, None
+                ))
+
+    # Initialize the figure.
+    n_cols = 7
+    n_rows = 1 + len(test_cases)
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+
+    for row, (label, gt_prediction_ratio, computed_new_cases, gt_new_cases, gt_smooth_cases) in enumerate(test_cases):
+        label_titles = row == 0
+        plot_i = ((row + 1) * n_cols) + 1
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.axis('off')
+        ax.text(0.1, 0.5, label)
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.plot(gt_prediction_ratio, color="purple")
+        if label_titles:
+            plt.title("Actual PredictionRatio")
+        ax = ax_output = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.plot(computed_new_cases, color="blue")
+        if label_titles:
+            plt.title("Computed NewCases")
+        # Guarded: the fake-data rows carry no ground truth to compare against.
+        if gt_new_cases is not None:
+            ax = plt.subplot(n_rows, n_cols, plot_i, sharey=ax_output); plot_i += 1
+            ax.plot(gt_new_cases, color='blue')
+            if label_titles:
+                plt.title("Actual NewCases")
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            ax.plot(gt_new_cases - computed_new_cases, color='red')
+            if label_titles:
+                plt.title("Residuals (Actual - Computed)")
+        plot_i = ((row + 1) * n_cols) + n_cols - 1  # second-to-last column of the row
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        dma = running_mean_convolve(computed_new_cases, 7)
+        ax.plot(dma, color="black")
+        if label_titles:
+            plt.title("Computed SmoothNewCases")
+        if gt_smooth_cases is not None:
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            ax.plot(gt_smooth_cases[6:], color="black")
+            if label_titles:
+                plt.title("Actual SmoothNewCases")
+
+    figure_path = figures_dir / "02_convert_prediction_ratios.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 03
+###############################################################################
+
+
+def test_convert_smooth_new_cases_per_100k():
+    """Test converting SmoothNewCasesPer100K back to NewCases."""
+
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Test cases show the input / output relationships of our method.
+    test_cases: list[tuple] = []
+
+    # Create some test cases using real data.
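+    # SmoothNewCasesPer100K is the 7-day moving average of NewCases scaled to
+    # a population of 100,000.  Assuming that definition, each smoothed value
+    # s[t] satisfies s[t] = (100_000 / pop) * mean(NewCases[t-6 : t+1]), so the
+    # days can be recovered one at a time:
+    #     NewCases[t] = 7 * s[t] * pop / 100_000 - sum(NewCases[t-6 : t])
+    # (the exact formula inside convert_smooth_cases_per_100K_to_new_cases is
+    # an assumption here; the figure below only checks the round trip).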
+    df = oxford_data.prepare_cases_dataframe()
+    n_test_countries = 7
+    real_data_test_countries = oxford_data.most_affected_geos(df, n_test_countries, 30)
+    PRIOR_DATA_START = "2021-10-01"
+    PREDICTIONS_START_DATE = "2021-11-01"
+    PREDICTIONS_END_DATE = "2021-11-30"
+    for test_country in real_data_test_countries:
+        cdf = df[(df['GeoID'] == test_country) & (df['Date'] >= PREDICTIONS_START_DATE) & (df['Date'] <= PREDICTIONS_END_DATE)]
+        cdf_prior = df[(df['GeoID'] == test_country) & (df['Date'] >= PRIOR_DATA_START) & (df['Date'] < PREDICTIONS_START_DATE)]
+
+        # Converting back from SmoothNewCasesPer100K should equal the original column NewCases.
+        pop_size = cdf['Population'].max()
+        smooth_cases_per_100k = cdf['SmoothNewCasesPer100K'].to_numpy()
+        prev_new_cases = cdf_prior['NewCases'].to_numpy()
+        new_cases_reconstructed = oxford_data.convert_smooth_cases_per_100K_to_new_cases(
+            smooth_cases_per_100k,
+            7,
+            prev_new_cases,
+            pop_size,
+        )
+        original_new_cases = cdf['NewCases'].to_numpy()
+        original_smooth_new_cases = cdf['SmoothNewCases'].to_numpy()
+        test_cases.append((
+            test_country, smooth_cases_per_100k, new_cases_reconstructed, original_new_cases, original_smooth_new_cases,
+        ))
+
+    # Initialize the figure.
+    n_cols = 7
+    n_rows = 1 + len(test_cases)
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+
+    for row, (label, gt_smooth_cases_per_100k, computed_new_cases, gt_new_cases, gt_smooth_cases) in enumerate(test_cases):
+        label_titles = row == 0
+        plot_i = ((row + 1) * n_cols) + 1
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.axis('off')
+        ax.text(0.1, 0.5, label)
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.plot(gt_smooth_cases_per_100k, color="orange")
+        if label_titles:
+            plt.title("Actual SmoothNewCasesPer100K")
+        ax = ax_output = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.plot(computed_new_cases, color="blue")
+        if label_titles:
+            plt.title("Computed NewCases")
+        if gt_new_cases is not None:
+            ax = plt.subplot(n_rows, n_cols, plot_i, sharey=ax_output); plot_i += 1
+            ax.plot(gt_new_cases, color='blue')
+            if label_titles:
+                plt.title("Actual NewCases")
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            ax.plot(gt_new_cases - computed_new_cases, color='red')
+            if label_titles:
+                plt.title("Residuals (Actual - Computed)")
+        plot_i = ((row + 1) * n_cols) + n_cols - 1  # second-to-last column of the row
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        dma = running_mean_convolve(computed_new_cases, 7)
+        ax.plot(dma, color="black")
+        if label_titles:
+            plt.title("Computed SmoothNewCases")
+        if gt_smooth_cases is not None:
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            ax.plot(gt_smooth_cases[6:], color="black")
+            if label_titles:
+                plt.title("Actual SmoothNewCases")
+
+    figure_path = figures_dir / "03_convert_smooth_new_cases_per_100k.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
+
+
+###############################################################################
+# 04
+###############################################################################
+
+
+def test_backward_pass():
+    """Starting with conversion ratios, go step by step backwards."""
+
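+    # Backward chain exercised here (assumed from the columns prepared by
+    # prepare_cases_dataframe; the test itself only checks the round trip):
+    #   PredictionRatio -> CaseRatio, by undoing the 1 / (1 - ProportionInfected)
+    #   correction; CaseRatio -> SmoothNewCases, by a cumulative product of
+    #   ratios; SmoothNewCases -> NewCases, by inverting the 7-day mean one
+    #   day at a time.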
+    # Prepare some paths for output and working data.
+    current_dir = Path(__file__).parent
+    plots_dir = current_dir / 'plots'; plots_dir.mkdir(exist_ok=True)
+    data_dir = current_dir / 'data'; data_dir.mkdir(exist_ok=True)
+    figures_dir = current_dir / 'figures'; figures_dir.mkdir(exist_ok=True)
+
+    # Test cases show the input / output relationships of our method.
+    test_cases: list[tuple] = []
+
+    # Create some test cases using real data.
+    df = oxford_data.prepare_cases_dataframe()
+    n_test_countries = 7
+    real_data_test_countries = oxford_data.most_affected_geos(df, n_test_countries, 30)
+    PRIOR_DATA_START = "2021-10-01"
+    PREDICTIONS_START_DATE = "2021-11-01"
+    PREDICTIONS_END_DATE = "2021-11-30"
+    for test_country in real_data_test_countries:
+        cdf = df[(df['GeoID'] == test_country) & (df['Date'] >= PREDICTIONS_START_DATE) & (df['Date'] <= PREDICTIONS_END_DATE)]
+        cdf_prior = df[(df['GeoID'] == test_country) & (df['Date'] >= PRIOR_DATA_START) & (df['Date'] < PREDICTIONS_START_DATE)]
+
+        # Converting back from PredictionRatio should equal the original column NewCases.
+        pop_size = cdf['Population'].max()
+        curr_total_cases = cdf_prior['ConfirmedCases'].to_numpy()[-1]
+        ratios = cdf['PredictionRatio'].to_numpy()
+        prev_new_cases = cdf_prior['NewCases'].to_numpy()
+        new_cases_reconstructed = oxford_data.convert_prediction_ratios_to_new_cases(
+            ratios,
+            7,
+            prev_new_cases,
+            curr_total_cases,
+            pop_size,
+        )
+        original_new_cases = cdf['NewCases'].to_numpy()
+        original_smooth_new_cases = cdf['SmoothNewCases'].to_numpy()
+        test_cases.append((
+            test_country, ratios, new_cases_reconstructed, original_new_cases, original_smooth_new_cases,
+        ))
+
+    # Initialize the figure.
+    n_cols = 7
+    n_rows = 1 + len(test_cases)
+    fig = plt.figure(figsize=(3 * n_cols, 2 * n_rows))
+
+    for row, (label, gt_prediction_ratio, computed_new_cases, gt_new_cases, gt_smooth_cases) in enumerate(test_cases):
+        label_titles = row == 0
+        plot_i = ((row + 1) * n_cols) + 1
+
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.axis('off')
+        ax.text(0.1, 0.5, label)
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.plot(gt_prediction_ratio, color="purple")
+        if label_titles:
+            plt.title("Actual PredictionRatio")
+        ax = ax_output = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        ax.plot(computed_new_cases, color="blue")
+        if label_titles:
+            plt.title("Computed NewCases")
+        if gt_new_cases is not None:
+            ax = plt.subplot(n_rows, n_cols, plot_i, sharey=ax_output); plot_i += 1
+            ax.plot(gt_new_cases, color='blue')
+            if label_titles:
+                plt.title("Actual NewCases")
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            ax.plot(gt_new_cases - computed_new_cases, color='red')
+            if label_titles:
+                plt.title("Residuals (Actual - Computed)")
+        plot_i = ((row + 1) * n_cols) + n_cols - 1  # second-to-last column of the row
+        ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+        dma = running_mean_convolve(computed_new_cases, 7)
+        ax.plot(dma, color="black")
+        if label_titles:
+            plt.title("Computed SmoothNewCases")
+        if gt_smooth_cases is not None:
+            ax = plt.subplot(n_rows, n_cols, plot_i); plot_i += 1
+            ax.plot(gt_smooth_cases[6:], color="black")
+            if label_titles:
+                plt.title("Actual SmoothNewCases")
+
+    figure_path = figures_dir / "04_backward_pass.png"
+    plt.tight_layout()
+    plt.savefig(figure_path, dpi=100)
+    print(figure_path)
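+
+
+# A minimal, self-contained sketch of the inversion these figures exercise.
+# NOTE: `_invert_running_mean` is illustrative only and not part of
+# oxford_data; the library's convert_* helpers are assumed to perform an
+# equivalent computation after undoing their respective scalings.
+def _invert_running_mean(smooth: np.ndarray, window: int, prev_daily: np.ndarray) -> np.ndarray:
+    """Recover daily values from a trailing `window`-day mean, given at least
+    `window - 1` daily values preceding the first smoothed point."""
+    history = list(prev_daily[-(window - 1):])
+    daily = []
+    for s in smooth:
+        # s * window is the trailing window sum; subtract the known tail of
+        # window - 1 previous daily values to isolate today's value.
+        value = s * window - sum(history[-(window - 1):])
+        daily.append(value)
+        history.append(value)
+    return np.array(daily)
+
+
+# Round-trip check: recovers x[6:] from its 7-day trailing mean.
+#     x = np.arange(30, dtype=float)
+#     assert np.allclose(_invert_running_mean(running_mean_convolve(x, 7), 7, x[:6]), x[6:])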
+
+
+def main():
+    test_backward_pass()
+    test_convert_smooth_new_cases_per_100k()
+    test_convert_prediction_ratios()
+    test_plot_case_columns()
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/covid_xprize/oxford_data/tests/test_oxford_data.py b/covid_xprize/oxford_data/tests/test_oxford_data.py
index f9359f9..b4b0694 100644
--- a/covid_xprize/oxford_data/tests/test_oxford_data.py
+++ b/covid_xprize/oxford_data/tests/test_oxford_data.py
@@ -18,7 +18,7 @@ class TestPrepareData(unittest.TestCase):
     def setUpClass(cls):
         if DATA_PATH.exists():
             return
-        data = oxford_data.load_oxford_data_trimmed(end_date="2020-12-31")
+        data = oxford_data.load_oxford_data_trimmed(end_date="2021-12-31")
         data.to_csv(DATA_PATH, index=False)
 
     def test_load_oxford_data_trimmed(self):
@@ -73,12 +73,12 @@ def test_create_prediction_initial_context_and_action_vectors(self):
 
     def test_convert_to_new_cases(self):
         df = oxford_data.prepare_cases_dataframe(DATA_PATH)
-        PRIOR_DATA_START = "2020-10-01"
-        PREDICTIONS_START_DATE = "2020-11-01"
-        PREDICTIONS_END_DATE = "2020-11-30"
-        TEST_COUNTRY = 'France'
+        PRIOR_DATA_START = "2021-10-01"
+        PREDICTIONS_START_DATE = "2021-11-01"
+        PREDICTIONS_END_DATE = "2021-11-30"
+        TEST_COUNTRY = 'United Kingdom'
         cdf = df[(df['GeoID'] == TEST_COUNTRY) & (df['Date'] >= PREDICTIONS_START_DATE) & (df['Date'] <= PREDICTIONS_END_DATE)]
-        cdf_prior = df[(df['GeoID'] == TEST_COUNTRY) & (df['Date'] >= PRIOR_DATA_START) & (df['Date'] < PREDICTIONS_START_DATE)]
+        cdf_prior = df[(df['GeoID'] == TEST_COUNTRY) & (df['Date'] >= PRIOR_DATA_START) & (df['Date'] < PREDICTIONS_START_DATE)]
 
         # print(cdf.columns)
         # print(cdf)
@@ -95,6 +95,47 @@ def test_convert_to_new_cases(self):
         for i in range(original_new_cases.shape[0]):
             self.assertAlmostEqual(original_new_cases[i], new_cases_reconstructed[i])
 
+        # Converting back from PredictionRatio should equal the original column NewCases.
+        ratios = cdf['PredictionRatio'].to_numpy()
+        pop_size = cdf['Population'].max()
+        prev_new_cases = cdf_prior['NewCases'].to_numpy()
+        initial_total_cases = cdf_prior['ConfirmedCases'].iloc[-1]
+        new_cases_reconstructed = oxford_data.convert_prediction_ratios_to_new_cases(
+            ratios,
+            WINDOW_SIZE,
+            prev_new_cases,
+            initial_total_cases,
+            pop_size,
+        )
+        original_new_cases = cdf['NewCases'].to_numpy()
+        # The reconstruction compounds errors day over day, so allow a loose
+        # 10% relative tolerance rather than near-exact equality.
+        for i in range(original_new_cases.shape[0]):
+            self.assertAlmostEqual(original_new_cases[i], new_cases_reconstructed[i], delta=original_new_cases[i] * 1e-1)
+
+    def test_convert_ratio_to_new_cases(self):
+        """Test the `convert_ratio_to_new_cases` method."""
+        df = oxford_data.prepare_cases_dataframe(DATA_PATH)
+        PRIOR_DATA_START = "2021-10-01"
+        PREDICTIONS_START_DATE = "2021-11-01"
+        PREDICTIONS_END_DATE = "2021-11-30"
+        TEST_COUNTRY = 'United Kingdom'
+        cdf = df[(df['GeoID'] == TEST_COUNTRY) & (df['Date'] >= PREDICTIONS_START_DATE) & (df['Date'] <= PREDICTIONS_END_DATE)]
+        cdf_prior = df[(df['GeoID'] == TEST_COUNTRY) & (df['Date'] >= PRIOR_DATA_START) & (df['Date'] < PREDICTIONS_START_DATE)]
+
+        WINDOW_SIZE = 7
+
+        # Converting back a single PredictionRatio should equal the first value
+        # of the original column NewCases.
+        ratio = cdf['PredictionRatio'].iloc[0]
+        prev_new_cases = cdf_prior['NewCases'].to_numpy()
+        prev_pct_infected = cdf['ProportionInfected'].iloc[0]
+        new_cases_reconstructed = oxford_data.convert_ratio_to_new_cases(
+            ratio,
+            WINDOW_SIZE,
+            prev_new_cases,
+            prev_pct_infected,
+        )
+        original_new_cases = cdf['NewCases'].iloc[0]
+        self.assertAlmostEqual(new_cases_reconstructed, original_new_cases)
+
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()