From 822971140031717905d34190127dc50a13dad99f Mon Sep 17 00:00:00 2001
From: Samuel Brand <48288458+SamuelBrand1@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:19:18 +0000
Subject: [PATCH] Issue 541: Changelog for prior changes and fix the data
 ingestion step (#548)

* Update changelog.md

* Update make_model_priors.jl

* Make `define_epiprob` more modular

_make_epidata can also be reused elsewhere

* restructure analysis dataframe func

* fix make_prediction_dataframe_from_output

And add simple unit test with committed test data

* Revert "fix make_prediction_dataframe_from_output"

This reverts commit 7e6238bc275a79381b318204527748754932657d.

* Reapply "fix make_prediction_dataframe_from_output"

This reverts commit 14d29b42608132bc8603daf9f16926251677da45.

* script to generate prediction dataframes

* create_prediction_df refactor + fix

* update to analyses failures

* Update create_prediction_dataframe.jl
---
 .../scripts/create_analysis_dataframes.jl     | 41 -------------
 .../scripts/create_prediction_dataframe.jl    | 58 +++++++++++++++++++
 2 files changed, 58 insertions(+), 41 deletions(-)
 delete mode 100644 pipeline/scripts/create_analysis_dataframes.jl
 create mode 100644 pipeline/scripts/create_prediction_dataframe.jl

diff --git a/pipeline/scripts/create_analysis_dataframes.jl b/pipeline/scripts/create_analysis_dataframes.jl
deleted file mode 100644
index 4101e23fe..000000000
--- a/pipeline/scripts/create_analysis_dataframes.jl
+++ /dev/null
@@ -1,41 +0,0 @@
-using Pkg
-Pkg.activate(joinpath(@__DIR__(), ".."))
-
-using EpiAwarePipeline, EpiAware, AlgebraOfGraphics, JLD2, DrWatson, Plots, DataFramesMeta,
-      Statistics, Distributions, DrWatson
-
-## load some data and create a dataframe for the plot
-files = readdir(datadir("epiaware_observables")) |>
-        strs -> filter(s -> occursin("jld2", s), strs)
-
-## Define scenarios
-pipelines = [
-    SmoothOutbreakPipeline(), MeasuresOutbreakPipeline(),
-    SmoothEndemicPipeline(), RoughEndemicPipeline()]
-
-## Set up EpiData objects: Used in the prediction dataframe for infection generating
-## processes that don't use directly in simulation.
-gi_params = make_gi_params(pipelines[1])
-epi_datas = map(gi_params["gi_means"]) do μ
-    σ = gi_params["gi_stds"][1]
-    shape = (μ / σ)^2
-    scale = σ^2 / μ
-    Gamma(shape, scale)
-end .|> gen_dist -> EpiData(gen_distribution = gen_dist)
-
-## Calculate the prediction and scoring dataframes
-double_vcat = (dfs1, dfs2) -> (
-    vcat(dfs1[1], dfs2[1]), vcat(dfs1[2], dfs2[2])
-)
-
-dfs = mapreduce(double_vcat, xs) do filename
-    output = load(joinpath(datadir("epiaware_observables"), filename))
-    (
-        make_prediction_dataframe_from_output(filename, output, epi_datas, pipelines),
-        make_scoring_dataframe_from_output(filename, output, epi_datas, pipelines)
-    )
-end
-
-## Save the prediction and scoring dataframes
-CSV.write(plotsdir("analysis_df.csv"), dfs[1])
-CSV.write(plotsdir("scoring_df.csv"), dfs[2])
diff --git a/pipeline/scripts/create_prediction_dataframe.jl b/pipeline/scripts/create_prediction_dataframe.jl
new file mode 100644
index 000000000..9d02c6de1
--- /dev/null
+++ b/pipeline/scripts/create_prediction_dataframe.jl
@@ -0,0 +1,58 @@
+using EpiAwarePipeline, EpiAware, AlgebraOfGraphics, JLD2, DrWatson, DataFramesMeta,
+      Statistics, Distributions, DrWatson, CSV
+
+## Scenario names; each is also a sub-directory of `datadir("epiaware_observables")`
+scenarios = ["measures_outbreak", "smooth_outbreak", "smooth_endemic", "rough_endemic"]
+
+## True GI means; used to select files whose name contains `truth_gi_mean_<value>_`
+true_gi_means = [2.0, 10.0, 20.0]
+
+## Load the prediction dataframes; record the inference config of every output that fails
+failed_configs = Dict[]
+
+dfs = mapreduce(vcat, scenarios; init = DataFrame()) do scenario
+    scenario_dir = datadir("epiaware_observables", scenario)
+    mapreduce(vcat, true_gi_means; init = DataFrame()) do true_gi_mean
+        target_str = "truth_gi_mean_" * string(true_gi_mean) * "_"
+        files = filter(s -> occursin("jld2", s) && occursin(target_str, s),
+            readdir(scenario_dir))
+        # `init` lets a (scenario, GI mean) pair without matching files give an empty frame
+        mapreduce(vcat, files; init = DataFrame()) do filename
+            output = load(joinpath(scenario_dir, filename))
+            try
+                make_prediction_dataframe_from_output(output, true_gi_mean)
+            catch e
+                @warn "Error in $filename" exception = (e, catch_backtrace())
+                push!(failed_configs, output["inference_config"])
+                return DataFrame()
+            end
+        end
+    end
+end
+
+## Tabulate the failed configs: one row per entry of `failed_configs` (empty if none failed)
+failed_df = mapreduce(vcat, failed_configs; init = DataFrame()) do D
+    # Keep only the text after the last "." of the "igp" entry
+    igp = D["igp"] |> str -> split(str, ".")[end]
+    T1, T2 = split(D["tspan"], "_")
+    DataFrame(
+        infection_gen_proc = igp,
+        latent_model = D["latent_model"],
+        gi_mean = D["gi_mean"],
+        T1 = T1,
+        T2 = T2,
+        T_diff = parse(Int, T2) - parse(Int, T1),
+        runsuccess = D["priorpredictive"] .== "Pass"
+    )
+end
+
+## Count rows with `runsuccess == false` per (infection process, latent model).
+## With no failures `failed_df` has no columns to group on, so the grouping is skipped.
+grped_failed_df = isempty(failed_df) ? DataFrame() :
+                  failed_df |>
+                  df -> @groupby(df, :infection_gen_proc, :latent_model) |>
+                        gd -> @combine(gd, :n_fail=sum(1 .- :runsuccess))
+
+## Write predictions and failed configs to CSV. NOTE(review): `grped_failed_df` is not saved
+CSV.write(plotsdir("plotting_data/predictions.csv"), dfs)
+CSV.write(plotsdir("plotting_data/failed_preds.csv"), failed_df)