diff --git a/.Rbuildignore b/.Rbuildignore index 510725267..dc41e6223 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,4 +20,5 @@ ^doc$ ^Meta$ ^.lintr$ -^.venv$ \ No newline at end of file +^.venv$ +^inst/templates$ diff --git a/.github/workflows/doc-preview.yaml b/.github/workflows/doc-preview.yaml new file mode 100644 index 000000000..068184225 --- /dev/null +++ b/.github/workflows/doc-preview.yaml @@ -0,0 +1,65 @@ +on: + issue_comment: + types: [created] + +name: doc-preview.yaml + +permissions: read-all + +jobs: + preview: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/preview-docs') }} + + runs-on: ubuntu-latest + permissions: + # Needed to write a comment on the PR + pull-requests: write + # Needed to read the PR branch + contents: read + steps: + - uses: actions/checkout@v4 + with: + # Checkout the PR branch + ref: refs/pull/${{ github.event.issue.number }}/head + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. 
+ needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to Netlify + uses: nwtgck/actions-netlify@v3.0 + with: + # Standard config + github-token: ${{ secrets.GITHUB_TOKEN }} + deploy-message: "Deploy from GitHub Actions" + # 'docs/' is the default directory for pkgdown::build_site() + # we add 'dev' because _pkgdown.yml has 'development: mode: devel' + publish-dir: './docs/dev' + # Development deploys only + production-deploy: false + # Enable pull request comment (default) + enable-pull-request-comment: true + # Overwrite the pull request comment with updated link (default) + overwrites-pull-request-comment: true + # Don't deploy to GitHub + enable-github-deployment: false + # Don't update the status of the commit + enable-commit-status: false + # Don't comment on the commit + enable-commit-comment: false + env: + # Netlify credentials (currently from Dmitry's account) + NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} + NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} + timeout-minutes: 1 diff --git a/.github/workflows/pr-commands.yaml b/.github/workflows/pr-commands.yaml new file mode 100644 index 000000000..8bb12b9e2 --- /dev/null +++ b/.github/workflows/pr-commands.yaml @@ -0,0 +1,85 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + issue_comment: + types: [created] + +name: pr-commands.yaml + +permissions: read-all + +jobs: + document: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} + name: document + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/pr-fetch@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::roxygen2 + needs: pr-document + + - name: Document + run: roxygen2::roxygenise() + shell: Rscript {0} + + - name: commit + run: | + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add man/\* NAMESPACE + git commit -m 'Document' + + - uses: r-lib/actions/pr-push@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + style: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} + name: style + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/pr-fetch@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - uses: r-lib/actions/setup-r@v2 + + - name: Install dependencies + run: install.packages("styler") + shell: 
Rscript {0} + + - name: Style + run: styler::style_pkg() + shell: Rscript {0} + + - name: commit + run: | + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add \*.R + git commit -m 'Style' + + - uses: r-lib/actions/pr-push@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/styler.yml b/.github/workflows/styler.yml deleted file mode 100644 index 64c2390cd..000000000 --- a/.github/workflows/styler.yml +++ /dev/null @@ -1,81 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -on: - workflow_dispatch: - pull_request: - paths: - [ - "**.[rR]", - "**.[qrR]md", - "**.[rR]markdown", - "**.[rR]nw", - "**.[rR]profile", - ] - -name: Style - -jobs: - style: - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - name: Install dependencies - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::styler, any::roxygen2 - needs: styler - - - name: Enable styler cache - run: styler::cache_activate() - shell: Rscript {0} - - - name: Determine cache location - id: styler-location - run: | - cat( - "location=", - styler::cache_info(format = "tabular")$location, - "\n", - file = Sys.getenv("GITHUB_OUTPUT"), - append = TRUE, - sep = "" - ) - shell: Rscript {0} - - - name: Cache styler - uses: actions/cache@v3 - with: - path: ${{ steps.styler-location.outputs.location }} - key: ${{ runner.os }}-styler-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-styler- - ${{ runner.os }}- - - - name: Style - run: styler::style_pkg() - shell: Rscript {0} - - - name: Commit and push changes - run: | - if FILES_TO_COMMIT=($(git diff-index 
--name-only ${{ github.sha }} \ - | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$')) - then - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)" - git pull --ff-only - git push origin - else - echo "No changes to commit." - fi diff --git a/.gitignore b/.gitignore index dd821f77e..ae7df4125 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ inst/doc .Rprofile renv.lock renv/ +.Renviron diff --git a/DESCRIPTION b/DESCRIPTION index 5abc8b610..1d04a3b2a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.0 +Version: 0.2.0 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), - person("Dmitry", "Shemetov", email = "dshemeto@andrew.cmu.edu", role = "aut"), - person("David", "Weber", email = "davidweb@andrew.cmu.edu", role = "aut"), + person("Dmitry", "Shemetov", , "dshemeto@andrew.cmu.edu", role = "aut"), + person("David", "Weber", , "davidweb@andrew.cmu.edu", role = "aut"), person("Delphi Research Group", role = c("cph", "fnd")), person("Logan", "Brooks", role = "aut"), person("Rachel", "Lobay", role = "aut"), @@ -24,21 +24,22 @@ URL: https://github.com/cmu-delphi/epipredict/, https://cmu-delphi.github.io/epipredict BugReports: https://github.com/cmu-delphi/epipredict/issues/ Depends: - epiprocess (>= 0.9.0), + epidatasets, parsnip (>= 1.0.0), R (>= 3.5.0) Imports: checkmate, cli, - distributional, dplyr, + epiprocess (>= 0.11.2), generics, ggplot2, glue, - hardhat (>= 1.3.0), + hardhat (>= 1.4.1), lifecycle, + lubridate, magrittr, - recipes (>= 1.0.4), + recipes (>= 1.1.1), rlang (>= 1.1.0), stats, tibble, @@ -48,18 +49,18 @@ Imports: vctrs, workflows (>= 1.0.0) Suggests: - covidcast, data.table, epidatr (>= 1.0.0), fs, grf, + here, 
knitr, - lubridate, poissonreg, purrr, quantreg, ranger, RcppRoll, + readr, rmarkdown, smoothqr, testthat (>= 3.0.0), @@ -69,12 +70,12 @@ VignetteBuilder: knitr Remotes: cmu-delphi/delphidocs, + cmu-delphi/epidatasets, cmu-delphi/epidatr, cmu-delphi/epiprocess, dajmcdon/smoothqr -Config/testthat/edition: 3 Config/Needs/website: cmu-delphi/delphidocs +Config/testthat/edition: 3 Encoding: UTF-8 -LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 67f1b3003..8922a7e69 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -1,10 +1,8 @@ ## Setting up the development environment ```r -install.packages(c('devtools', 'pkgdown', 'styler', 'lintr')) # install dev dependencies -devtools::install_deps(dependencies = TRUE) # install package dependencies -devtools::document() # generate package meta data and man files -devtools::build() # build package +install.packages(c('devtools', 'pkgdown', 'styler', 'lintr', 'pak')) # install dev dependencies +pak::pkg_install(".") # install package and dependencies ``` ## Validating the package @@ -13,33 +11,74 @@ devtools::build() # build package styler::style_pkg() # format code lintr::lint_package() # lint code +devtools::check() # run R CMD check, which runs everything below +devtools::document() # generate package meta data and man files devtools::test() # test package -devtools::check() # check package for errors +devtools::build_vignettes() # build vignettes only +devtools::run_examples() # run doc examples +devtools::check(vignettes = FALSE) # check package without vignettes ``` ## Developing the documentation site -The [documentation site](https://cmu-delphi.github.io/epipredict/) is built off of the `main` branch. The `dev` version of the site is available at https://cmu-delphi.github.io/epipredict/dev. 
+Our CI builds two versions of the documentation: -The documentation site can be previewed locally by running in R +- https://cmu-delphi.github.io/epipredict/ from the `main` branch and +- https://cmu-delphi.github.io/epipredict/dev from the `dev` branch. -```r -pkgdown::build_site(preview=TRUE) -``` +Commands for developing the documentation site: -The `main` version is available at `file:////epidatr/epipredict/index.html` and `dev` at `file:////epipredict/docs/dev/index.html`. - -You can also build the docs manually and launch the site with python. From the terminal, this looks like - -```bash +```sh +# Basic build and preview +R -e 'pkgdown::clean_site()' R -e 'devtools::document()' -python -m http.server -d docs +R -e 'pkgdown::build_site()' ``` +Note that sometimes the caches from either `pkgdown` or `knitr` can cause +difficulties. To clear those, run `make`, with either `clean_knitr`, +`clean_site`, or `clean` (which does both). + +If you work without R Studio and want to iterate on documentation, you might +find `Rscript pkgdown-watch.R` useful. +helpful. For updating references, you will need to manually call `pkgdown::build_reference()`. + ## Versioning Please follow the guidelines in the [PR template document](.github/pull_request_template.md). -## Release process +## Planned CRAN release process + +Open a release issue and then copy and follow this checklist in the issue (modified from the checklist generated by `usethis::use_release_issue(version = "1.0.2")`): + +- [ ] `git pull` on `dev` branch. +- [ ] Make sure all changes are committed and pushed. +- [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_epipredict.html). +- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "false"))`. + - Aim for 10/10, no notes. +- [ ] If check works well enough, merge to main. Otherwise open a PR to fix up. +- [ ] [Polish NEWS](https://github.com/cmu-delphi/epipredict/blob/dev/NEWS.md). 
+ - Some [guidelines](https://style.tidyverse.org/news.html#news-release). +- [ ] `git checkout main` +- [ ] `git pull` +- [ ] `urlchecker::url_check()`. + - This may choke on the MIT license url, and that's ok. +- [ ] `devtools::build_readme()` +- [ ] `devtools::check_win_devel()` +- [ ] Have maintainer ("cre" in description) check email for problems. +- [ ] `revdepcheck::revdep_check(num_workers = 4)`. + - This may choke, it is very sensitive to the binary versions of packages on a given system. Either bypass or ask someone else to run it if you're concerned. +- [ ] Update `cran-comments.md` +- [ ] PR with any changes (and go through the list again) into `dev` and run through the list again. + +Submit to CRAN: + +- [ ] `devtools::submit_cran()`. +- [ ] Maintainer approves email. + +Wait for CRAN... -TBD +- [ ] If accepted :tada:, move to next steps. If rejected, fix and resubmit. +- [ ] Open and merge a PR containing any updates made to `main` back to `dev`. +- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push) will create a draft release based on the commit hash in CRAN-SUBMISSION and push a tag to the GitHub repo. +- [ ] Go to the repo, verify the release notes, and publish when ready. 
diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..9f5790aca --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +## +# epipredict docs build +# + +# knitr doesn't actually clean it's own cache properly; this just deletes any of +# the article knitr caches in vignettes or the base +clean_knitr: + rm -r *_cache; rm -r vignettes/*_cache +clean_site: + Rscript -e "pkgdown::clean_cache(); pkgdown::clean_site()" +# this combines +clean: clean_knitr clean_site + +# end diff --git a/NAMESPACE b/NAMESPACE index e815203eb..96cf927c5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,8 +2,6 @@ S3method(Add_model,epi_workflow) S3method(Add_model,workflow) -S3method(Math,dist_quantiles) -S3method(Ops,dist_quantiles) S3method(Remove_model,epi_workflow) S3method(Remove_model,workflow) S3method(Update_model,epi_workflow) @@ -17,8 +15,10 @@ S3method(apply_frosting,epi_workflow) S3method(augment,epi_workflow) S3method(autoplot,canned_epipred) S3method(autoplot,epi_workflow) -S3method(bake,check_enough_train_data) +S3method(bake,check_enough_data) S3method(bake,epi_recipe) +S3method(bake,step_adjust_latency) +S3method(bake,step_climate) S3method(bake,step_epi_ahead) S3method(bake,step_epi_lag) S3method(bake,step_epi_slide) @@ -28,7 +28,6 @@ S3method(bake,step_population_scaling) S3method(bake,step_training_window) S3method(detect_layer,frosting) S3method(detect_layer,workflow) -S3method(epi_recipe,default) S3method(epi_recipe,epi_df) S3method(epi_recipe,formula) S3method(extract_argument,epi_workflow) @@ -40,24 +39,22 @@ S3method(extract_frosting,default) S3method(extract_frosting,epi_workflow) S3method(extract_layers,frosting) S3method(extract_layers,workflow) -S3method(extrapolate_quantiles,dist_default) -S3method(extrapolate_quantiles,dist_quantiles) -S3method(extrapolate_quantiles,distribution) +S3method(extrapolate_quantiles,quantile_pred) S3method(fit,epi_workflow) S3method(flusight_hub_formatter,canned_epipred) S3method(flusight_hub_formatter,data.frame) 
S3method(forecast,epi_workflow) -S3method(format,dist_quantiles) -S3method(is.na,dist_quantiles) -S3method(is.na,distribution) S3method(key_colnames,epi_workflow) S3method(key_colnames,recipe) -S3method(mean,dist_quantiles) -S3method(median,dist_quantiles) +S3method(mean,quantile_pred) +S3method(plot,canned_epipred) +S3method(plot,epi_workflow) S3method(predict,epi_workflow) S3method(predict,flatline) -S3method(prep,check_enough_train_data) +S3method(prep,check_enough_data) S3method(prep,epi_recipe) +S3method(prep,step_adjust_latency) +S3method(prep,step_climate) S3method(prep,step_epi_ahead) S3method(prep,step_epi_lag) S3method(prep,step_epi_slide) @@ -70,7 +67,8 @@ S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) S3method(print,cdc_baseline_fcast) -S3method(print,check_enough_train_data) +S3method(print,check_enough_data) +S3method(print,climate_fcast) S3method(print,epi_recipe) S3method(print,epi_workflow) S3method(print,flat_fcast) @@ -82,11 +80,12 @@ S3method(print,layer_naomit) S3method(print,layer_point_from_distn) S3method(print,layer_population_scaling) S3method(print,layer_predict) -S3method(print,layer_predictive_distn) S3method(print,layer_quantile_distn) S3method(print,layer_residual_quantiles) S3method(print,layer_threshold) S3method(print,layer_unnest) +S3method(print,step_adjust_latency) +S3method(print,step_climate) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) S3method(print,step_epi_slide) @@ -95,7 +94,7 @@ S3method(print,step_lag_difference) S3method(print,step_naomit) S3method(print,step_population_scaling) S3method(print,step_training_window) -S3method(quantile,dist_quantiles) +S3method(quantile,quantile_pred) S3method(refresh_blueprint,default_epi_recipe_blueprint) S3method(residuals,flatline) S3method(run_mold,default_epi_recipe_blueprint) @@ -106,25 +105,22 @@ S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) 
S3method(slather,layer_predict) -S3method(slather,layer_predictive_distn) S3method(slather,layer_quantile_distn) S3method(slather,layer_residual_quantiles) S3method(slather,layer_threshold) S3method(slather,layer_unnest) S3method(snap,default) -S3method(snap,dist_default) -S3method(snap,dist_quantiles) -S3method(snap,distribution) -S3method(tidy,check_enough_train_data) +S3method(snap,quantile_pred) +S3method(tidy,check_enough_data) S3method(tidy,frosting) S3method(tidy,layer) S3method(update,layer) -S3method(vec_ptype_abbr,dist_quantiles) -S3method(vec_ptype_full,dist_quantiles) -S3method(weighted_interval_score,default) -S3method(weighted_interval_score,dist_default) -S3method(weighted_interval_score,dist_quantiles) -S3method(weighted_interval_score,distribution) +S3method(vec_arith,quantile_pred) +S3method(vec_arith.numeric,quantile_pred) +S3method(vec_arith.quantile_pred,numeric) +S3method(vec_math,quantile_pred) +S3method(vec_proxy_equal,quantile_pred) +S3method(weighted_interval_score,quantile_pred) export("%>%") export(Add_model) export(Remove_model) @@ -142,12 +138,16 @@ export(arx_class_epi_workflow) export(arx_classifier) export(arx_fcast_epi_workflow) export(arx_forecaster) +export(as_epi_df) +export(as_tibble) export(autoplot) export(bake) export(cdc_baseline_args_list) export(cdc_baseline_forecaster) -export(check_enough_train_data) +export(check_enough_data) export(clean_f_name) +export(climate_args_list) +export(climatological_forecaster) export(default_epi_recipe_blueprint) export(detect_layer) export(dist_quantiles) @@ -157,7 +157,9 @@ export(epi_workflow) export(extract_argument) export(extract_frosting) export(extract_layers) +export(extract_quantile_levels) export(extrapolate_quantiles) +export(filter) export(fit) export(flatline) export(flatline_args_list) @@ -169,6 +171,7 @@ export(get_test_data) export(is_epi_recipe) export(is_epi_workflow) export(is_layer) +export(key_colnames) export(layer) export(layer_add_forecast_date) 
export(layer_add_target_date) @@ -182,19 +185,29 @@ export(layer_quantile_distn) export(layer_residual_quantiles) export(layer_threshold) export(layer_unnest) +export(mutate) export(nested_quantiles) export(new_default_epi_recipe_blueprint) export(new_epi_recipe_blueprint) +export(pivot_longer) +export(pivot_quantiles) export(pivot_quantiles_longer) export(pivot_quantiles_wider) +export(pivot_wider) export(prep) +export(quantile_pred) export(quantile_reg) export(rand_id) export(remove_epi_recipe) export(remove_frosting) export(remove_model) +export(rename) +export(select) export(slather) export(smooth_quantile_reg) +export(snap) +export(step_adjust_latency) +export(step_climate) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) @@ -205,12 +218,13 @@ export(step_population_scaling) export(step_training_window) export(tibble) export(tidy) +export(unnest) export(update_epi_recipe) export(update_frosting) export(update_model) export(validate_layer) export(weighted_interval_score) -import(distributional) +import(epidatasets) import(epiprocess) import(parsnip) import(recipes) @@ -225,6 +239,7 @@ importFrom(checkmate,test_numeric) importFrom(checkmate,test_scalar) importFrom(cli,cli_abort) importFrom(cli,cli_warn) +importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,any_of) @@ -235,16 +250,28 @@ importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) +importFrom(dplyr,group_by_at) +importFrom(dplyr,inner_join) +importFrom(dplyr,join_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,n) +importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,rename) +importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) +importFrom(dplyr,tibble) +importFrom(dplyr,tribble) importFrom(dplyr,ungroup) +importFrom(epiprocess,as_epi_df) importFrom(epiprocess,epi_slide) 
importFrom(epiprocess,growth_rate) +importFrom(epiprocess,growth_rate_params) +importFrom(epiprocess,is_epi_df) +importFrom(epiprocess,key_colnames) importFrom(generics,augment) importFrom(generics,fit) importFrom(generics,forecast) @@ -255,10 +282,21 @@ importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_linerange) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) +importFrom(glue,glue) +importFrom(hardhat,extract_quantile_levels) +importFrom(hardhat,extract_recipe) +importFrom(hardhat,quantile_pred) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) +importFrom(lifecycle,deprecated) +importFrom(lubridate,"%m-%") +importFrom(lubridate,leap_year) +importFrom(lubridate,month) +importFrom(lubridate,yday) importFrom(magrittr,"%>%") +importFrom(magrittr,extract2) importFrom(recipes,bake) +importFrom(recipes,detect_step) importFrom(recipes,prep) importFrom(recipes,rand_id) importFrom(rlang,"!!!") @@ -266,7 +304,6 @@ importFrom(rlang,"!!") importFrom(rlang,"%@%") importFrom(rlang,"%||%") importFrom(rlang,":=") -importFrom(rlang,abort) importFrom(rlang,arg_match) importFrom(rlang,as_function) importFrom(rlang,caller_arg) @@ -276,30 +313,38 @@ importFrom(rlang,enquos) importFrom(rlang,expr) importFrom(rlang,global_env) importFrom(rlang,inject) +importFrom(rlang,is_empty) importFrom(rlang,is_logical) importFrom(rlang,is_null) importFrom(rlang,is_true) +importFrom(rlang,list2) importFrom(rlang,set_names) importFrom(rlang,sym) importFrom(stats,as.formula) -importFrom(stats,family) importFrom(stats,lm) -importFrom(stats,median) importFrom(stats,model.frame) +importFrom(stats,na.omit) importFrom(stats,poly) importFrom(stats,predict) -importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) +importFrom(tidyr,drop_na) +importFrom(tidyr,expand_grid) +importFrom(tidyr,fill) +importFrom(tidyr,pivot_longer) +importFrom(tidyr,pivot_wider) 
+importFrom(tidyr,unnest) +importFrom(tidyselect,all_of) +importFrom(utils,capture.output) importFrom(vctrs,as_list_of) -importFrom(vctrs,field) -importFrom(vctrs,new_rcrd) importFrom(vctrs,new_vctr) +importFrom(vctrs,vec_arith) +importFrom(vctrs,vec_arith.numeric) importFrom(vctrs,vec_cast) -importFrom(vctrs,vec_data) -importFrom(vctrs,vec_ptype_abbr) -importFrom(vctrs,vec_ptype_full) +importFrom(vctrs,vec_math) +importFrom(vctrs,vec_proxy_equal) importFrom(vctrs,vec_recycle_common) +importFrom(workflows,extract_preprocessor) diff --git a/NEWS.md b/NEWS.md index 8edddae92..814bfb67c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,54 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicate PR's. +# epipredict 0.2 + +## Breaking changes + +- Moved example datasets from being hosted in the package to being loaded + from the `epidatasets` package. The datasets can no longer be loaded with + `data()`, but can be accessed with + `data(, package = "epidatasets")`, `epidatasets::` + or, after loading the package, the name of the dataset alone (#382). +- `step_adjust_latency()` no longer allows empty column selection. +- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`). + `step_growth_rate()` has lost its `additional_gr_args_list` argument and now + has an `na_rm` argument. +- Moves `epiprocess` out of depends (#440). No internals have changed, but downstream + users may need to add `library(epiprocess)` to existing code. +- Removes dependence on the `distributional` package, replacing the quantiles + with `hardhat::quantile_pred()`. Some associated functions are deprecated with + `lifecycle` messages. +- Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it + enough to use as a check on either training or testing. +- Add check for enough data to predict in `arx_forecaster()` +- Adds the `.facet_filter` option in `epiprocess::autoplot()` (cmu-delphi/epiprocess#647). 
+ +## Improvements + +- Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data. +- Fix `layer_population_scaling` default `by` with `other_keys`. +- Make key column inference more consistent within the package and with current `epiprocess`. +- Fix `quantile_reg()` producing error when asked to output just median-level predictions. +- (temporary) ahead negative is allowed for `step_epi_ahead` until we have `step_epi_shift` +- Add `reference_date` as an argument to `epi_recipe()` +- Add `step_climate()` to create "climate" predictor in forecast workflows +- Add `climatological_forecaster()` to automatically create climate baselines +- Replace `dist_quantiles()` with `hardhat::quantile_pred()` +- Allow `quantile()` to threshold to an interval if desired (#434) +- `arx_forecaster()` detects if there's enough data to predict +- Add `observed_response` to `autoplot` so that forecasts can be plotted against the values they're predicting + +## Bug fixes + +- Shifting no columns results in no error for either `step_epi_ahead` and `step_epi_lag` +- Quantiles produced by `grf` were sometimes out of order. +- dist_quantiles can have all `NA` values without causing unrelated errors +- adjust default quantiles throughout so that they match. +- force `layer_residual_quantiles()` to always include `0.5`. +- Rename `recipes:::check_training_set()` to `recipes:::validate_training_data()`, as it changed in recipes 1.1.0. +- A new column name duplicating an existing column name results in an error instead of a random name. 
+ # epipredict 0.1 - simplify `layer_residual_quantiles()` to avoid timesuck in `utils::methods()` @@ -22,7 +70,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - training window step debugged - `min_train_window` argument removed from canned forecasters - add forecasters -- implement postprocessing +- implement post-processing - vignettes avaliable - arx_forecaster - pkgdown diff --git a/R/arx_classifier.R b/R/arx_classifier.R index ca6a3537b..57812233b 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -1,8 +1,106 @@ #' Direct autoregressive classifier with covariates #' -#' This is an autoregressive classification model for -#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It does "direct" forecasting, meaning -#' that it estimates a class at a particular target horizon. +#' +#' @description +#' This is an autoregressive classification model for continuous data. It does +#' "direct" forecasting, meaning that it estimates a class at a particular +#' target horizon. +#' +#' @details +#' The `arx_classifier()` is an autoregressive classification model for `epi_df` +#' data that is used to predict a discrete class for each case under +#' consideration. It is a direct forecaster in that it estimates the classes +#' at a specific horizon or ahead value. +#' +#' To get a sense of how the `arx_classifier()` works, let's consider a simple +#' example with minimal inputs. For this, we will use the built-in +#' `covid_case_death_rates` that contains confirmed COVID-19 cases and deaths +#' from JHU CSSE for all states over Dec 31, 2020 to Dec 31, 2021. From this, +#' we'll take a subset of data for five states over June 4, 2021 to December +#' 31, 2021. 
Our objective is to predict whether the case rates are increasing +#' when considering the 0, 7 and 14 day case rates: +#' +#' ```{r} +#' jhu <- covid_case_death_rates %>% +#' filter( +#' time_value >= "2021-06-04", +#' time_value <= "2021-12-31", +#' geo_value %in% c("ca", "fl", "tx", "ny", "nj") +#' ) +#' +#' out <- arx_classifier(jhu, outcome = "case_rate", predictors = "case_rate") +#' +#' out$predictions +#' ``` +#' +#' The key takeaway from the predictions is that there are two prediction +#' classes: `(-Inf, 0.25]` and `(0.25, Inf)`: the classes to predict must be +#' discrete. The discretization of the real-valued outcome is controlled by +#' the `breaks` argument, which defaults to `0.25`. Such breaks will be +#' automatically extended to cover the entire real line. For example, the +#' default break of `0.25` is silently extended to `breaks = c(-Inf, .25, +#' Inf)` and, therefore, results in two classes: `[-Inf, 0.25]` and `(0.25, +#' Inf)`. These two classes are used to discretize the outcome. The conversion +#' of the outcome to such classes is handled internally. So if discrete +#' classes already exist for the outcome in the `epi_df`, then we recommend to +#' code a classifier from scratch using the `epi_workflow` framework for more +#' control. +#' +#' The `trainer` is a `parsnip` model describing the type of estimation such +#' that `mode = "classification"` is enforced. The two typical trainers that +#' are used are `parsnip::logistic_reg()` for two classes or +#' `parsnip::multinom_reg()` for more than two classes. +#' +#' ```{r} +#' workflows::extract_spec_parsnip(out$epi_workflow) +#' ``` +#' +#' From the parsnip model specification, we can see that the trainer used is +#' logistic regression, which is expected for our binary outcome. More +#' complicated trainers like `parsnip::naive_Bayes()` or +#' `parsnip::rand_forest()` may also be used (however, we will stick to the +#' basics in this gentle introduction to the classifier). 
+#' +#' If you use the default trainer of logistic regression for binary +#' classification and you decide against using the default break of 0.25, then +#' you should only input one break so that there are two classification bins +#' to properly dichotomize the outcome. For example, let's set a break of 0.5 +#' instead of relying on the default of 0.25. We can do this by passing 0.5 to +#' the `breaks` argument in `arx_class_args_list()` as follows: +#' +#' ```{r} +#' out_break_0.5 <- arx_classifier( +#' jhu, +#' outcome = "case_rate", +#' predictors = "case_rate", +#' args_list = arx_class_args_list( +#' breaks = 0.5 +#' ) +#' ) +#' +#' out_break_0.5$predictions +#' ``` +#' Indeed, we can observe that the two `.pred_class` are now (-Inf, 0.5] and +#' (0.5, Inf). See `help(arx_class_args_list)` for other available +#' modifications. +#' +#' Additional arguments that may be supplied to `arx_class_args_list()` include +#' the expected `lags` and `ahead` arguments for an autoregressive-type model. +#' These have default values of 0, 7, and 14 days for the lags of the +#' predictors and 7 days ahead of the forecast date for predicting the +#' outcome. There is also `n_training` to indicate the upper bound for the +#' number of training rows per key. If you would like some practice with using +#' this, then remove the filtering command to obtain data within "2021-06-04" +#' and "2021-12-31" and instead set `n_training` to be the number of days +#' between these two dates, inclusive of the end points. The end results +#' should be the same. In addition to `n_training`, there are `forecast_date` +#' and `target_date` to specify the date that the forecast is created and +#' intended, respectively. We will not dwell on such arguments here as they +#' are not unique to this classifier or absolutely essential to understanding +#' how it operates. The remaining arguments will be discussed organically, as +#' they are needed to serve our purposes. 
For information on any remaining +#' arguments that are not discussed here, please see the function +#' documentation for a complete list and their definitions. #' #' @inheritParams arx_forecaster #' @param outcome A character (scalar) specifying the outcome (in the @@ -26,9 +124,9 @@ #' @seealso [arx_class_epi_workflow()], [arx_class_args_list()] #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% -#' filter(time_value >= as.Date("2021-11-01")) +#' tiny_geos <- c("as", "mp", "vi", "gu", "pr") +#' jhu <- covid_case_death_rates %>% +#' filter(time_value >= as.Date("2021-11-01"), !(geo_value %in% tiny_geos)) #' #' out <- arx_classifier(jhu, "death_rate", c("case_rate", "death_rate")) #' @@ -55,12 +153,20 @@ arx_classifier <- function( wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - preds <- forecast( - wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = args_list$forecast_date %||% max(epi_data$time_value) - ) %>% + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli_warn( + "The specified forecast date {args_list$forecast_date} doesn't match the + date from which the forecast is occurring {forecast_date}." + ) + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + preds <- forecast(wf) %>% as_tibble() %>% select(-time_value) @@ -94,10 +200,9 @@ arx_classifier <- function( #' #' @return An unfit `epi_workflow`. 
#' @export -#' @seealso [arx_classifier()] +#' @seealso [arx_classifier()] [arx_class_args_list()] #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' #' arx_class_epi_workflow(jhu, "death_rate", c("case_rate", "death_rate")) @@ -125,27 +230,40 @@ arx_class_epi_workflow <- function( if (!(is.null(trainer) || is_classification(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'classification'.") } + + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor # ------- predictors r <- epi_recipe(epi_data) %>% step_growth_rate( - dplyr::all_of(predictors), + all_of(predictors), role = "grp", horizon = args_list$horizon, method = args_list$method, - log_scale = args_list$log_scale, - additional_gr_args_list = args_list$additional_gr_args + log_scale = args_list$log_scale ) for (l in seq_along(lags)) { - p <- predictors[l] - p <- as.character(glue::glue_data(args_list, "gr_{horizon}_{method}_{p}")) - r <- step_epi_lag(r, !!p, lag = lags[[l]]) + pred_names <- predictors[l] + pred_names <- as.character(glue::glue_data( + args_list, "gr_{horizon}_{method}_{pred_names}" + )) + r <- step_epi_lag(r, !!pred_names, lag = lags[[l]]) } # ------- outcome if (args_list$outcome_transform == "lag_difference") { - o <- as.character( + pre_out_name <- as.character( 
glue::glue_data(args_list, "lag_diff_{horizon}_{outcome}") ) r <- r %>% @@ -156,7 +274,7 @@ arx_class_epi_workflow <- function( ) } if (args_list$outcome_transform == "growth_rate") { - o <- as.character( + pre_out_name <- as.character( glue::glue_data(args_list, "gr_{horizon}_{method}_{outcome}") ) if (!(outcome %in% predictors)) { @@ -166,23 +284,41 @@ arx_class_epi_workflow <- function( role = "pre-outcome", horizon = args_list$horizon, method = args_list$method, - log_scale = args_list$log_scale, - additional_gr_args_list = args_list$additional_gr_args + log_scale = args_list$log_scale ) } } - o2 <- rlang::sym(paste0("ahead_", args_list$ahead, "_", o)) + # regex that will match any amount of adjustment for the ahead + ahead_out_name_regex <- glue::glue("ahead_[0-9]*_{pre_out_name}") + method_adjust_latency <- args_list$adjust_latency + if (method_adjust_latency != "none") { + if (method_adjust_latency != "extend_ahead") { + cli_abort("only extend_ahead is currently supported", + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) + } + r <- r %>% step_adjust_latency(!!pre_out_name, + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } + r <- r %>% + step_epi_ahead(!!pre_out_name, ahead = args_list$ahead, role = "pre-outcome") r <- r %>% - step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") %>% - recipes::step_mutate( - outcome_class = cut(!!o2, breaks = args_list$breaks), + step_mutate( + across( + matches(ahead_out_name_regex), + ~ cut(.x, breaks = args_list$breaks), + .names = "outcome_class", + .unpack = TRUE + ), role = "outcome" ) %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) if (!is.null(args_list$check_enough_data_n)) { - r <- check_enough_train_data( + r <- check_enough_data( r, recipes::all_predictors(), recipes::all_outcomes(), @@ -192,10 +328,6 @@ arx_class_epi_workflow <- function( ) } - - forecast_date <- args_list$forecast_date %||% 
max(epi_data$time_value) - target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - # --- postprocessor f <- frosting() %>% layer_predict() # %>% layer_naomit() f <- layer_add_forecast_date(f, forecast_date = forecast_date) %>% @@ -213,7 +345,7 @@ arx_class_epi_workflow <- function( #' be created using growth rates (as the predictors are) or lagged #' differences. The second case is closer to the requirements for the #' [2022-23 CDC Flusight Hospitalization Experimental Target](https://github.com/cdcepi/Flusight-forecast-data/blob/745511c436923e1dc201dea0f4181f21a8217b52/data-experimental/README.md). -#' See the Classification Vignette for details of how to create a reasonable +#' See the [Classification chapter from the forecasting book](https://cmu-delphi.github.io/delphi-tooling-book/arx-classifier.html) for details of how to create a reasonable #' baseline for this case. Selecting `"growth_rate"` (the default) uses #' [epiprocess::growth_rate()] to create the outcome using some of the #' additional arguments below. Choosing `"lag_difference"` instead simply @@ -236,9 +368,6 @@ arx_class_epi_workflow <- function( #' @param method Character. Options available for growth rate calculation. #' @param log_scale Scalar logical. Whether to compute growth rates on the #' log scale. -#' @param additional_gr_args List. Optional arguments controlling growth rate -#' calculation. See [epiprocess::growth_rate()] and the related Vignette for -#' more details. #' @param check_enough_data_n Integer. A lower limit for the number of rows per #' epi_key that are required for training. If `NULL`, this check is ignored. #' @param check_enough_data_epi_keys Character vector. 
A character vector of @@ -260,13 +389,13 @@ arx_class_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, method = c("rel_change", "linear_reg"), log_scale = FALSE, - additional_gr_args = list(), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ...) { @@ -276,7 +405,8 @@ arx_class_args_list <- function( method <- rlang::arg_match(method) outcome_transform <- rlang::arg_match(outcome_transform) - arg_is_scalar(ahead, n_training, horizon, log_scale) + adjust_latency <- rlang::arg_match(adjust_latency) + arg_is_scalar(ahead, n_training, horizon, log_scale, adjust_latency, warn_latency) arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags, horizon) @@ -284,23 +414,16 @@ arx_class_args_list <- function( arg_is_lgl(log_scale) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) - if (!is.list(additional_gr_args)) { - cli_abort(c( - "`additional_gr_args` must be a {.cls list}.", - "!" = "This is a {.cls {class(additional_gr_args)}}.", - i = "See `?epiprocess::growth_rate` for available arguments." - )) - } arg_is_pos(check_enough_data_n, allow_null = TRUE) arg_is_chr(check_enough_data_epi_keys, allow_null = TRUE) if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." 
- )) + cli_warn( + "`forecast_date` {.val {forecast_date}} + + `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}.", + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) } } @@ -318,13 +441,12 @@ arx_class_args_list <- function( breaks, forecast_date, target_date, + adjust_latency, outcome_transform, max_lags, horizon, method, log_scale, - additional_gr_args, - nafill_buffer, check_enough_data_n, check_enough_data_epi_keys ), @@ -337,5 +459,3 @@ print.arx_class <- function(x, ...) { name <- "ARX Classifier" NextMethod(name = name, ...) } - -# this is a trivial change to induce a check diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 37c9aae86..7ef609668 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -1,43 +1,48 @@ #' Direct autoregressive forecaster with covariates #' #' This is an autoregressive forecasting model for -#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It does "direct" forecasting, meaning -#' that it estimates a model for a particular target horizon. +#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It does "direct" +#' forecasting, meaning that it estimates a model for a particular target +#' horizon of the `outcome` based on the lags of the `predictors`. See the [Get +#' started vignette](../articles/epipredict.html) for some worked examples and +#' [Custom epi_workflows vignette](../articles/custom_epiworkflows.html) for a +#' recreation using a custom `epi_workflow()`. #' #' #' @param epi_data An `epi_df` object -#' @param outcome A character (scalar) specifying the outcome (in the -#' `epi_df`). +#' @param outcome A character (scalar) specifying the outcome (in the `epi_df`). #' @param predictors A character vector giving column(s) of predictor variables. -#' This defaults to the `outcome`. However, if manually specified, only those variables -#' specifically mentioned will be used. (The `outcome` will not be added.) -#' By default, equals the outcome. 
If manually specified, does not add the -#' outcome variable, so make sure to specify it. -#' @param trainer A `{parsnip}` model describing the type of estimation. -#' For now, we enforce `mode = "regression"`. -#' @param args_list A list of customization arguments to determine -#' the type of forecasting model. See [arx_args_list()]. +#' This defaults to the `outcome`. However, if manually specified, only those +#' variables specifically mentioned will be used, and the `outcome` will not be +#' added. +#' @param trainer A `{parsnip}` model describing the type of estimation. For +#' now, we enforce `mode = "regression"`. +#' @param args_list A list of customization arguments to determine the type of +#' forecasting model. See [arx_args_list()]. #' -#' @return A list with (1) `predictions` an `epi_df` of predicted values -#' and (2) `epi_workflow`, a list that encapsulates the entire estimation -#' workflow +#' @return An `arx_fcast`, with the fields `predictions` and `epi_workflow`. +#' `predictions` is a `tibble` of predicted values while `epi_workflow()` is +#' the fit workflow used to make those predictions #' @export #' @seealso [arx_fcast_epi_workflow()], [arx_args_list()] #' #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' dplyr::filter(time_value >= as.Date("2021-12-01")) #' #' out <- arx_forecaster( -#' jhu, "death_rate", +#' jhu, +#' "death_rate", #' c("case_rate", "death_rate") #' ) #' -#' out <- arx_forecaster(jhu, "death_rate", +#' out <- arx_forecaster(jhu, +#' "death_rate", #' c("case_rate", "death_rate"), #' trainer = quantile_reg(), #' args_list = arx_args_list(quantile_levels = 1:9 / 10) #' ) +#' out arx_forecaster <- function( epi_data, outcome, @@ -51,12 +56,16 @@ arx_forecaster <- function( wf <- arx_fcast_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - preds <- forecast( - wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = 
args_list$forecast_date %||% max(epi_data$time_value) - ) %>% + # get the forecast date for the forecast function + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + + + preds <- forecast(wf) %>% as_tibble() %>% select(-time_value) @@ -88,11 +97,10 @@ arx_forecaster <- function( #' #' @return An unfitted `epi_workflow`. #' @export -#' @seealso [arx_forecaster()] +#' @seealso [arx_forecaster()], [arx_args_list()] #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-12-01")) #' #' arx_fcast_epi_workflow( @@ -119,49 +127,97 @@ arx_fcast_epi_workflow <- function( if (!(is.null(trainer) || is_regression(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'regression'.") } + # forecast_date is above all what they set; + # if they don't and they're not adjusting latency, it defaults to the max time_value + # if they're adjusting, it defaults to the as_of + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli_warn( + "The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is actually occurring {forecast_date_default}.", + class = "epipredict__arx_forecaster__forecast_date_defaulting" + ) + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + if (forecast_date + args_list$ahead != target_date) { + cli_abort("`forecast_date` {.val {forecast_date}} + `ahead` {.val {ahead}} must equal `target_date` {.val 
{target_date}}.", + class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date" + ) + } + lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor r <- epi_recipe(epi_data) + # adjust latency if the user asks + method_adjust_latency <- args_list$adjust_latency + if (!is.null(method_adjust_latency)) { + if (method_adjust_latency == "extend_ahead") { + r <- r %>% step_adjust_latency(all_outcomes(), + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } else if (method_adjust_latency == "extend_lags") { + r <- r %>% step_adjust_latency(all_predictors(), + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } + } for (l in seq_along(lags)) { p <- predictors[l] r <- step_epi_lag(r, !!p, lag = lags[[l]]) } r <- r %>% - step_epi_ahead(!!outcome, ahead = args_list$ahead) %>% + step_epi_ahead(!!outcome, ahead = args_list$ahead) + r <- r %>% step_epi_naomit() %>% - step_training_window(n_recent = args_list$n_training) + step_training_window(n_recent = args_list$n_training) %>% + check_enough_data(all_predictors(), min_observations = 1, skip = FALSE) if (!is.null(args_list$check_enough_data_n)) { - r <- check_enough_train_data( - r, + r <- r %>% check_enough_data( all_predictors(), - !!outcome, - n = args_list$check_enough_data_n, + all_outcomes(), + min_observations = args_list$check_enough_data_n, epi_keys = args_list$check_enough_data_epi_keys, drop_na = FALSE ) } - forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) - target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - # --- postprocessor f <- frosting() %>% layer_predict() # %>% layer_naomit() - if (inherits(trainer, "quantile_reg")) { + is_quantile_reg <- inherits(trainer, "quantile_reg") | + (inherits(trainer, "rand_forest") & trainer$engine == "grf_quantiles") + if (is_quantile_reg) { # add all quantile_level to the forecaster and update postprocessor - quantile_levels <- 
sort(compare_quantile_args( - args_list$quantile_levels, - rlang::eval_tidy(trainer$args$quantile_levels) - )) + if (inherits(trainer, "quantile_reg")) { + quantile_levels <- sort(compare_quantile_args( + args_list$quantile_levels, + rlang::eval_tidy(trainer$args$quantile_levels), + "qr" + )) + trainer$args$quantile_levels <- rlang::enquo(quantile_levels) + } else { + quantile_levels <- sort(compare_quantile_args( + args_list$quantile_levels, + rlang::eval_tidy(trainer$eng_args$quantiles) %||% + c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), + "grf" + )) + trainer$eng_args$quantiles <- rlang::enquo(quantile_levels) + } args_list$quantile_levels <- quantile_levels - trainer$args$quantile_levels <- enquo(quantile_levels) - f <- layer_quantile_distn(f, quantile_levels = quantile_levels) %>% + f <- f %>% + layer_quantile_distn(quantile_levels = quantile_levels) %>% layer_point_from_distn() } else { - f <- layer_residual_quantiles( - f, + f <- f %>% layer_residual_quantiles( quantile_levels = args_list$quantile_levels, symmetrize = args_list$symmetrize, by_key = args_list$quantile_by_key @@ -187,17 +243,36 @@ arx_fcast_epi_workflow <- function( #' @param n_training Integer. An upper limit for the number of rows per #' key that are used for training #' (in the time unit of the `epi_df`). -#' @param forecast_date Date. The date on which the forecast is created. -#' The default `NULL` will attempt to determine this automatically. -#' @param target_date Date. The date for which the forecast is intended. -#' The default `NULL` will attempt to determine this automatically. +#' @param forecast_date Date. The date from which the forecast is occurring. +#' The default `NULL` will determine this automatically from either +#' 1. the maximum time value for which there's data if there is no latency +#' adjustment (the default case), or +#' 2. the `as_of` date of `epi_data` if `adjust_latency` is +#' non-`NULL`. +#' @param target_date Date. The date that is being forecast. 
The default `NULL` +#' will determine this automatically as `forecast_date + ahead`. +#' @param adjust_latency Character. One of the `method`s of +#' [step_adjust_latency()], or `"none"` (in which case there is no adjustment). +#' If the `forecast_date` is after the last day of data, this determines how +#' to shift the model to account for this difference. The options are: +#' - `"none"` the default, assumes the `forecast_date` is the last day of data +#' - `"extend_ahead"`: increase the `ahead` by the latency so it's relative to +#' the last day of data. For example, if the last day of data was 3 days ago, +#' the ahead becomes `ahead+3`. +#' - `"extend_lags"`: increase the lags so they're relative to the actual +#' forecast date. For example, if the lags are `c(0, 7, 14)` and the last day of +#' data was 3 days ago, the lags become `c(3, 10, 17)`. +#' @param warn_latency By default, `step_adjust_latency` warns the user if the +#' latency is large. If this is `FALSE`, that warning is turned off. +#' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce +#' prediction intervals. These are created by computing the quantiles of +#' training residuals. A `NULL` value will result in point forecasts only. +#' @param symmetrize Logical. The default `TRUE` calculates symmetric prediction +#' intervals. This argument only applies when residual quantiles are used. It +#' is not applicable with `trainer = quantile_reg()`, for example. Typically, one +#' would only want non-symmetric quantiles when increasing trajectories are +#' quite different from decreasing ones, such as a strictly positive variable +#' near zero. +#' @param nonneg Logical. The default `TRUE` enforces nonnegative predictions +#' by hard-thresholding at 0. 
#' @param quantile_by_key Character vector. Groups residuals by listed keys @@ -206,15 +281,6 @@ arx_fcast_epi_workflow <- function( #' `character(0)` performs no grouping. This argument only applies when #' residual quantiles are used. It is not applicable with #' `trainer = quantile_reg()`, for example. -#' @param nafill_buffer At predict time, recent values of the training data -#' are used to create a forecast. However, these can be `NA` due to, e.g., -#' data latency issues. By default, any missing values will get filled with -#' less recent data. Setting this value to `NULL` will result in 1 extra -#' recent row (beyond those required for lag creation) to be used. Note that -#' we require at least `min(lags)` rows of recent data per `geo_value` to -#' create a prediction. For this reason, setting `nafill_buffer < min(lags)` -#' will be treated as _additional_ allowed recent data rather than the -#' total amount of recent data to examine. #' @param check_enough_data_n Integer. A lower limit for the number of rows per #' epi_key that are required for training. If `NULL`, this check is ignored. #' @param check_enough_data_epi_keys Character vector. A character vector of @@ -225,6 +291,7 @@ arx_fcast_epi_workflow <- function( #' #' @return A list containing updated parameter choices with class `arx_flist`. #' @export +#' @seealso [arx_forecaster()] #' #' @examples #' arx_args_list() @@ -236,11 +303,12 @@ arx_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, - quantile_levels = c(0.05, 0.95), + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ...) 
{ @@ -249,7 +317,8 @@ arx_args_list <- function( .lags <- lags if (is.list(lags)) lags <- unlist(lags) - arg_is_scalar(ahead, n_training, symmetrize, nonneg) + adjust_latency <- rlang::arg_match(adjust_latency) + arg_is_scalar(ahead, n_training, symmetrize, nonneg, adjust_latency, warn_latency) arg_is_chr(quantile_by_key, allow_empty = TRUE) arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) @@ -258,16 +327,14 @@ arx_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) arg_is_pos(check_enough_data_n, allow_null = TRUE) arg_is_chr(check_enough_data_epi_keys, allow_null = TRUE) if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli_warn(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - )) + cli_abort("`forecast_date` {.val {forecast_date}} + `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}.", + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) } } @@ -280,11 +347,12 @@ arx_args_list <- function( quantile_levels, forecast_date, target_date, + adjust_latency, + warn_latency, symmetrize, nonneg, max_lags, quantile_by_key, - nafill_buffer, check_enough_data_n, check_enough_data_epi_keys ), @@ -299,9 +367,13 @@ print.arx_fcast <- function(x, ...) { NextMethod(name = name, ...) 
} -compare_quantile_args <- function(alist, tlist) { +compare_quantile_args <- function(alist, tlist, train_method = c("qr", "grf")) { + train_method <- rlang::arg_match(train_method) default_alist <- eval(formals(arx_args_list)$quantile_levels) - default_tlist <- eval(formals(quantile_reg)$quantile_levels) + default_tlist <- switch(train_method, + "qr" = eval(formals(quantile_reg)$quantile_levels), + "grf" = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95) + ) if (setequal(alist, default_alist)) { if (setequal(tlist, default_tlist)) { return(sort(unique(union(alist, tlist)))) diff --git a/R/autoplot.R b/R/autoplot.R index 648c74e33..3295992c7 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -13,9 +13,11 @@ ggplot2::autoplot #' #' #' @inheritParams epiprocess::autoplot.epi_df -#' @param object An `epi_workflow` +#' @param object,x An `epi_workflow` #' @param predictions A data frame with predictions. If `NULL`, only the #' original data is shown. +#' @param observed_response An epi_df of the data to plot against. This is for the case +#' where you have the actual results to compare the forecast against. #' @param .levels A numeric vector of levels to plot for any prediction bands. #' More than 3 levels begins to be difficult to see. #' @param ... 
Ignored @@ -28,8 +30,7 @@ ggplot2::autoplot #' #' @name autoplot-epipred #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' #' r <- epi_recipe(jhu) %>% @@ -39,9 +40,7 @@ ggplot2::autoplot #' step_epi_naomit() #' #' f <- frosting() %>% -#' layer_residual_quantiles( -#' quantile_levels = c(.025, .1, .25, .75, .9, .975) -#' ) %>% +#' layer_residual_quantiles() %>% #' layer_threshold(starts_with(".pred")) %>% #' layer_add_target_date() #' @@ -51,7 +50,7 @@ ggplot2::autoplot #' #' latest <- jhu %>% filter(time_value >= max(time_value) - 14) #' preds <- predict(wf, latest) -#' autoplot(wf, preds, .max_facets = 4) +#' autoplot(wf, preds, .facet_filter = geo_value %in% c("ca", "ny", "de", "mt")) #' #' # ------- Show multiple horizons #' @@ -66,31 +65,34 @@ ggplot2::autoplot #' }) #' #' p <- do.call(rbind, p) -#' autoplot(wf, p, .max_facets = 4) +#' autoplot(wf, p, .facet_filter = geo_value %in% c("ca", "ny", "de", "mt")) #' #' # ------- Plotting canned forecaster output #' -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' flat <- flatline_forecaster(jhu, "death_rate") -#' autoplot(flat, .max_facets = 4) +#' autoplot(flat, .facet_filter = geo_value %in% c("ca", "ny", "de", "mt")) #' #' arx <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), #' args_list = arx_args_list(ahead = 14L) #' ) -#' autoplot(arx, .max_facets = 6) +#' autoplot(arx, .facet_filter = geo_value %in% c("ca", "ny", "de", "mt", "mo", "in")) NULL #' @export #' @rdname autoplot-epipred autoplot.epi_workflow <- function( - object, predictions = NULL, - .levels = c(.5, .8, .95), ..., + object, + predictions = NULL, + observed_response = NULL, + .levels = c(.5, .8, .9), ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", 
"geo_value", "all", "none"), .base_color = "dodgerblue4", .point_pred_color = "orange", - .max_facets = Inf) { + .facet_filter = NULL, + .max_facets = deprecated()) { rlang::check_dots_empty() arg_is_probabilities(.levels) rlang::arg_match(.color_by) @@ -111,32 +113,41 @@ autoplot.epi_workflow <- function( } keys <- c("geo_value", "time_value", "key") mold_roles <- names(mold$extras$roles) - edf <- bind_cols(mold$extras$roles[mold_roles %in% keys], y) - if (starts_with_impl("ahead_", names(y))) { + # extract the relevant column names for plotting + if (starts_with_impl("ahead_", names(y)) || starts_with_impl("lag_", names(y))) { old_name_y <- unlist(strsplit(names(y), "_")) - shift <- as.numeric(old_name_y[2]) new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_") - edf <- rename(edf, !!new_name_y := !!names(y)) - } else if (starts_with_impl("lag_", names(y))) { - old_name_y <- unlist(strsplit(names(y), "_")) - shift <- -as.numeric(old_name_y[2]) - new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_") - edf <- rename(edf, !!new_name_y := !!names(y)) + } else { + new_name_y <- names(y) } - - if (!is.null(shift)) { - edf <- mutate(edf, time_value = time_value + shift) + if (is.null(observed_response)) { + # the outcome has shifted, so we need to shift it forward (or back) + # by the corresponding amount + observed_response <- bind_cols(mold$extras$roles[mold_roles %in% keys], y) + if (starts_with_impl("ahead_", names(y))) { + shift <- as.numeric(old_name_y[2]) + } else if (starts_with_impl("lag_", names(y))) { + old_name_y <- unlist(strsplit(names(y), "_")) + shift <- -as.numeric(old_name_y[2]) + } else { + new_name_y <- names(y) + shift <- 0 + } + observed_response <- rename(observed_response, !!new_name_y := !!names(y)) + if (!is.null(shift)) { + observed_response <- mutate(observed_response, time_value = time_value + shift) + } + other_keys <- setdiff(key_colnames(object), c("geo_value", "time_value")) + observed_response <- as_epi_df(observed_response, + 
as_of = object$fit$meta$as_of, + other_keys = other_keys + ) } - extra_keys <- setdiff(key_colnames(object), c("geo_value", "time_value")) - if (length(extra_keys) == 0L) extra_keys <- NULL - edf <- as_epi_df(edf, - as_of = object$fit$meta$as_of, - other_keys = extra_keys %||% character() - ) if (is.null(predictions)) { return(autoplot( - edf, new_name_y, + observed_response, new_name_y, .color_by = .color_by, .facet_by = .facet_by, .base_color = .base_color, + .facet_filter = {{ .facet_filter }}, .max_facets = .max_facets )) } @@ -147,34 +158,36 @@ autoplot.epi_workflow <- function( } predictions <- rename(predictions, time_value = target_date) } - pred_cols_ok <- hardhat::check_column_names(predictions, key_colnames(edf)) + pred_cols_ok <- hardhat::check_column_names(predictions, key_colnames(observed_response)) if (!pred_cols_ok$ok) { cli_warn(c( "`predictions` is missing required variables: {.var {pred_cols_ok$missing_names}}.", i = "Plotting the original data." )) return(autoplot( - edf, !!new_name_y, + observed_response, !!new_name_y, .color_by = .color_by, .facet_by = .facet_by, .base_color = .base_color, + .facet_filter = {{ .facet_filter }}, .max_facets = .max_facets )) } # First we plot the history, always faceted by everything - bp <- autoplot(edf, !!new_name_y, + bp <- autoplot(observed_response, !!new_name_y, .color_by = "none", .facet_by = "all_keys", - .base_color = "black", .max_facets = .max_facets + .base_color = "black", .facet_filter = {{ .facet_filter }}, + .max_facets = .max_facets ) # Now, prepare matching facets in the predictions - ek <- epi_keys_only(edf) + ek <- epi_keys_only(observed_response) predictions <- predictions %>% mutate( - .facets = interaction(!!!rlang::syms(as.list(ek)), sep = "/"), + .facets = interaction(!!!rlang::syms(as.list(ek)), sep = " / "), ) - if (.max_facets < Inf) { - top_n <- levels(as.factor(bp$data$.facets))[seq_len(.max_facets)] - predictions <- filter(predictions, .facets %in% top_n) %>% + .facet_filter <- 
rlang::enquo(.facet_filter) + if (!rlang::quo_is_null(.facet_filter) && ".facets" %in% names(bp$data)) { + predictions <- filter(predictions, .facets %in% unique(bp$data$.facets)) %>% mutate(.facets = droplevels(.facets)) } @@ -184,7 +197,7 @@ autoplot.epi_workflow <- function( } if (".pred" %in% names(predictions)) { - ntarget_dates <- n_distinct(predictions$time_value) + ntarget_dates <- dplyr::n_distinct(predictions$time_value) if (ntarget_dates > 1L) { bp <- bp + geom_line( @@ -205,26 +218,41 @@ autoplot.epi_workflow <- function( #' @export #' @rdname autoplot-epipred autoplot.canned_epipred <- function( - object, ..., + object, observed_response = NULL, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "dodgerblue4", .point_pred_color = "orange", - .max_facets = Inf) { + .facet_filter = NULL, + .max_facets = deprecated()) { rlang::check_dots_empty() rlang::arg_match(.color_by) rlang::arg_match(.facet_by) ewf <- object$epi_workflow predictions <- object$predictions %>% - dplyr::rename(time_value = target_date) + rename(time_value = target_date) - autoplot(ewf, predictions, + autoplot(ewf, predictions, observed_response, ..., .color_by = .color_by, .facet_by = .facet_by, - .base_color = .base_color, .max_facets = .max_facets + .base_color = .base_color, .facet_filter = {{ .facet_filter }}, + .max_facets = .max_facets ) } +#' @export +#' @rdname autoplot-epipred +plot.epi_workflow <- function(x, ...) { + autoplot(x, ...) +} + +#' @export +#' @rdname autoplot-epipred +plot.canned_epipred <- function(x, ...) { + autoplot(x, ...) 
+} + + starts_with_impl <- function(x, vars) { n <- nchar(x) x == substr(vars, 1, n) @@ -232,24 +260,25 @@ starts_with_impl <- function(x, vars) { plot_bands <- function( base_plot, predictions, - levels = c(.5, .8, .95), + levels = c(.5, .8, .9), fill = "blue4", alpha = 0.6, linewidth = 0.05) { innames <- names(predictions) - n <- length(levels) - alpha <- alpha / (n - 1) - l <- (1 - levels) / 2 - l <- c(rev(l), 1 - l) + n_levels <- length(levels) + alpha <- alpha / (n_levels - 1) + # generate the corresponding level that is 1 - level + levels <- (1 - levels) / 2 + levels <- c(rev(levels), 1 - levels) ntarget_dates <- dplyr::n_distinct(predictions$time_value) predictions <- predictions %>% - mutate(.pred_distn = dist_quantiles(quantile(.pred_distn, l), l)) %>% + mutate(.pred_distn = quantile_pred(quantile(.pred_distn, levels), levels)) %>% pivot_quantiles_wider(.pred_distn) qnames <- setdiff(names(predictions), innames) - for (i in 1:n) { + for (i in 1:n_levels) { bottom <- qnames[i] top <- rev(qnames)[i] if (i == 1) { diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 0adc0536a..7d53862c2 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -63,6 +63,7 @@ print.alist <- function(x, ...) { } #' @export +#' @importFrom hardhat extract_recipe print.canned_epipred <- function(x, name, ...) { d <- cli::cli_div(theme = list(rule = list("line-type" = "double"))) cli::cli_rule("A basic forecaster of type {name}") @@ -76,11 +77,13 @@ print.canned_epipred <- function(x, name, ...) 
{ fn_meta <- function() { cli::cli_ul() cli::cli_li("Geography: {.field {x$metadata$training$geo_type}},") - if (!is.null(x$metadata$training$other_keys)) { - cli::cli_li("Other keys: {.field {x$metadata$training$other_keys}},") + other_keys <- x$metadata$training$other_keys + if (!is.null(other_keys) && length(other_keys) > 0L) { + cli::cli_li("Other keys: {.field {other_keys}},") } cli::cli_li("Time type: {.field {x$metadata$training$time_type}},") cli::cli_li("Using data up-to-date as of: {.field {format(x$metadata$training$as_of)}}.") + cli::cli_li("With the last data available on {.field {format(max(x$epi_workflow$original_data$time_value))}}") cli::cli_end() } fn_meta() @@ -103,10 +106,45 @@ print.canned_epipred <- function(x, name, ...) { "A total of {.val {nrow(x$predictions)}} prediction{?s}", " {?is/are} available for" )) + cli::cli_ul(c( "{.val {n_geos}} unique geographic region{?s},", "At forecast date{?s}: {.val {fds}},", - "For target date{?s}: {.val {tds}}." + "For target date{?s}: {.val {tds}}," )) + if ("pre" %in% names(x) && "actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) { + fit_recipe <- extract_recipe(x$epi_workflow) + if (detect_step(fit_recipe, "adjust_latency")) { + is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency")) + latency_step <- fit_recipe$steps[is_adj_latency][[1]] + # all steps after adjust_latency + later_steps <- fit_recipe$steps[-(1:which(is_adj_latency))] + if (latency_step$method == "extend_ahead") { + step_names <- "step_epi_ahead" + type_str <- "Aheads" + } else if (latency_step$method == "extend_lags") { + step_names <- "step_epi_lag" + type_str <- "Lags" + } else { + step_names <- "" + type_str <- "columns locf" + } + later_steps[[1]]$columns + valid_columns <- later_steps %>% + keep(function(x) inherits(x, step_names)) %>% + purrr::map("columns") %>% + reduce(c) + latency_per_base_col <- latency_step$latency_table %>% + filter(col_name %in% valid_columns) %>% + 
mutate(latency = abs(latency)) + if (latency_step$method != "locf" && nrow(latency_per_base_col) > 1) { + intro_text <- glue::glue("{type_str} adjusted per column: ") + } else if (latency_step$method != "locf") { + intro_text <- glue::glue("{type_str} adjusted for ") + } + latency_info <- paste0(intro_text, paste(apply(latency_per_base_col, 1, paste0, collapse = "="), collapse = ", ")) + cli::cli_ul(latency_info) + } + } cli::cli_text("") } diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index b2e7434e2..a97eece87 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -23,7 +23,8 @@ #' #' @examples #' library(dplyr) -#' weekly_deaths <- case_death_rate_subset %>% +#' library(epiprocess) +#' weekly_deaths <- covid_case_death_rates %>% #' select(geo_value, time_value, death_rate) %>% #' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% #' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% @@ -36,25 +37,24 @@ #' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths_7dsum") #' preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn) #' -#' if (require(ggplot2)) { -#' forecast_date <- unique(preds$forecast_date) -#' four_states <- c("ca", "pa", "wa", "ny") -#' preds %>% -#' filter(geo_value %in% four_states) %>% -#' ggplot(aes(target_date)) + -#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + -#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + -#' geom_line(aes(y = .pred), color = "orange") + -#' geom_line( -#' data = weekly_deaths %>% filter(geo_value %in% four_states), -#' aes(x = time_value, y = deaths_7dsum) -#' ) + -#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + -#' labs(x = "Date", y = "Weekly deaths") + -#' facet_wrap(~geo_value, scales = "free_y") + -#' theme_bw() + -#' geom_vline(xintercept = forecast_date) -#' } +#' library(ggplot2) +#' forecast_date <- unique(preds$forecast_date) +#' four_states <- c("ca", "pa", 
"wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = weekly_deaths %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = deaths_7dsum) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Weekly deaths") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) cdc_baseline_forecaster <- function( epi_data, outcome, @@ -78,10 +78,7 @@ cdc_baseline_forecaster <- function( # target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - latest <- get_test_data( - epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, - forecast_date - ) + latest <- get_test_data(epi_recipe(epi_data), epi_data) f <- frosting() %>% layer_predict() %>% @@ -169,7 +166,6 @@ cdc_baseline_args_list <- function( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value", - nafill_buffer = Inf, ...) { rlang::check_dots_empty() arg_is_scalar(n_training, nsims, data_frequency) @@ -183,7 +179,6 @@ cdc_baseline_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) structure( enlist( @@ -195,8 +190,7 @@ cdc_baseline_args_list <- function( nsims, symmetrize, nonneg, - quantile_by_key, - nafill_buffer + quantile_by_key ), class = c("cdc_baseline_fcast", "alist") ) diff --git a/R/check_enough_data.R b/R/check_enough_data.R new file mode 100644 index 000000000..e830d5e54 --- /dev/null +++ b/R/check_enough_data.R @@ -0,0 +1,193 @@ +#' Check the dataset contains enough data points. 
+#' +#' `check_enough_data` creates a *specification* of a recipe +#' operation that will check if variables contain enough data. +#' +#' @param recipe A recipe object. The check will be added to the +#' sequence of operations for this recipe. +#' @param ... One or more selector functions to choose variables for this check. +#' See [selections()] for more details. You will usually want to use +#' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here. +#' @param min_observations The minimum number of data points required for +#' training. If this is NULL, the total number of predictors will be used. +#' @param epi_keys A character vector of column names on which to group the data +#' and check threshold within each group. Useful if your forecaster trains +#' per group (for example, per geo_value). +#' @param drop_na A logical for whether to count NA values as valid rows. +#' @param role Not used by this check since no new variables are +#' created. +#' @param trained A logical for whether the selectors in `...` +#' have been resolved by [prep()]. +#' @param id A character string that is unique to this check to identify it. +#' @param skip A logical. If `TRUE`, only training data is checked, while if +#' `FALSE`, both training and predicting data is checked. Technically, this +#' answers the question "should the check be skipped when the recipe is baked +#' by [bake()]?" While all operations are baked when [prep()] is run, some +#' operations may not be able to be conducted on new data (e.g. processing the +#' outcome variable(s)). Care should be taken when using `skip = TRUE` as it +#' may affect the computations for subsequent operations. +#' @family checks +#' @export +#' @details This check will break the `prep` and/or bake function if any of the +#' checked columns have not enough non-NA values. If the check passes, nothing +#' is changed in the data. It is best used after every other step. 
+#' +#' For checking training data, it is best to set `...` to be +#' `all_predictors(), all_outcomes()`, while for checking prediction data, it +#' is best to set `...` to be `all_predictors()` only, with `n = 1`. +#' +#' # tidy() results +#' +#' When you [`tidy()`][tidy.recipe()] this check, a tibble with column +#' `terms` (the selectors or variables selected) is returned. +#' +check_enough_data <- + function(recipe, + ..., + min_observations = NULL, + epi_keys = NULL, + drop_na = TRUE, + role = NA, + trained = FALSE, + skip = TRUE, + id = rand_id("enough_data")) { + recipes::add_check( + recipe, + check_enough_data_new( + min_observations = min_observations, + epi_keys = epi_keys, + drop_na = drop_na, + terms = enquos(...), + role = role, + trained = trained, + columns = NULL, + skip = skip, + id = id + ) + ) + } + +check_enough_data_new <- + function(min_observations, epi_keys, drop_na, terms, + role, trained, columns, skip, id) { + recipes::check( + subclass = "enough_data", + prefix = "check_", + min_observations = min_observations, + epi_keys = epi_keys, + drop_na = drop_na, + terms = terms, + role = role, + trained = trained, + columns = columns, + skip = skip, + id = id + ) + } + +#' @export +prep.check_enough_data <- function(x, training, info = NULL, ...) { + col_names <- recipes::recipes_eval_select(x$terms, training, info) + if (is.null(x$min_observations)) { + x$min_observations <- length(col_names) + } + + check_enough_data_core(training, x, col_names, "train") + + check_enough_data_new( + min_observations = x$min_observations, + epi_keys = x$epi_keys, + drop_na = x$drop_na, + terms = x$terms, + role = x$role, + trained = TRUE, + columns = col_names, + skip = x$skip, + id = x$id + ) +} + +#' @export +bake.check_enough_data <- function(object, new_data, ...) 
{ + col_names <- object$columns + check_enough_data_core(new_data, object, col_names, "predict") + new_data +} + +#' @export +print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) { + title <- paste0("Check enough data (n = ", x$min_observations, ") for ") + recipes::print_step(x$columns, x$terms, x$trained, title, width) + invisible(x) +} + +#' @export +tidy.check_enough_data <- function(x, ...) { + if (recipes::is_trained(x)) { + res <- tibble(terms = unname(x$columns)) + } else { + res <- tibble(terms = recipes::sel2char(x$terms)) + } + res$id <- x$id + res$min_observations <- x$min_observations + res$epi_keys <- x$epi_keys + res$drop_na <- x$drop_na + res +} + +check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict) { + epi_df <- epi_df %>% + group_by(across(all_of(.env$step_obj$epi_keys))) + if (step_obj$drop_na) { + any_missing_data <- epi_df %>% + mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>% + # count the number of rows where they're all not na + summarise(sum(any_are_na == 0) < .env$step_obj$min_observations, .groups = "drop") + any_missing_data <- any_missing_data %>% + summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>% + any() + + # figuring out which individual columns (if any) are to blame for this dearth + # of data + cols_not_enough_data <- epi_df %>% + summarise( + across( + all_of(.env$col_names), + ~ sum(!is.na(.x)) < .env$step_obj$min_observations + ), + .groups = "drop" + ) %>% + # Aggregate across keys (if present) + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + # Select the names of the columns that are TRUE + names(.)[.] + + # Either all columns have enough data, in which case this message won't be + # sent later or none of the single columns have enough data, that means its + # the combination of all of them. 
+ if (length(cols_not_enough_data) == 0) { + cols_not_enough_data <- + glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}") + } + } else { + # if we're not dropping na values, just count + cols_not_enough_data <- epi_df %>% + summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_observations)) + any_missing_data <- cols_not_enough_data %>% + summarize(across(all_of(.env$col_names), all)) %>% + all() + cols_not_enough_data <- cols_not_enough_data %>% + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + # Select the names of the columns that are TRUE + names(.)[.] + } + + if (any_missing_data) { + cli_abort( + "The following columns don't have enough data to {train_or_predict}: {cols_not_enough_data}.", + class = "epipredict__not_enough_data" + ) + } +} diff --git a/R/check_enough_train_data.R b/R/check_enough_train_data.R deleted file mode 100644 index 1279a3712..000000000 --- a/R/check_enough_train_data.R +++ /dev/null @@ -1,145 +0,0 @@ -#' Check the dataset contains enough data points. -#' -#' `check_enough_train_data` creates a *specification* of a recipe -#' operation that will check if variables contain enough data. -#' -#' @param recipe A recipe object. The check will be added to the -#' sequence of operations for this recipe. -#' @param ... One or more selector functions to choose variables for this check. -#' See [selections()] for more details. You will usually want to use -#' [recipes::all_predictors()] here. -#' @param n The minimum number of data points required for training. If this is -#' NULL, the total number of predictors will be used. -#' @param epi_keys A character vector of column names on which to group the data -#' and check threshold within each group. Useful if your forecaster trains -#' per group (for example, per geo_value). -#' @param drop_na A logical for whether to count NA values as valid rows. 
-#' @param role Not used by this check since no new variables are -#' created. -#' @param trained A logical for whether the selectors in `...` -#' have been resolved by [prep()]. -#' @param columns An internal argument that tracks which columns are evaluated -#' for this check. Should not be used by the user. -#' @param id A character string that is unique to this check to identify it. -#' @param skip A logical. Should the check be skipped when the -#' recipe is baked by [bake()]? While all operations are baked -#' when [prep()] is run, some operations may not be able to be -#' conducted on new data (e.g. processing the outcome variable(s)). -#' Care should be taken when using `skip = TRUE` as it may affect -#' the computations for subsequent operations. -#' @family checks -#' @export -#' @details This check will break the `bake` function if any of the checked -#' columns have not enough non-NA values. If the check passes, nothing is -#' changed to the data. -#' -#' # tidy() results -#' -#' When you [`tidy()`][tidy.recipe()] this check, a tibble with column -#' `terms` (the selectors or variables selected) is returned. 
-#' -check_enough_train_data <- - function(recipe, - ..., - n = NULL, - epi_keys = NULL, - drop_na = TRUE, - role = NA, - trained = FALSE, - columns = NULL, - skip = TRUE, - id = rand_id("enough_train_data")) { - recipes::add_check( - recipe, - check_enough_train_data_new( - n = n, - epi_keys = epi_keys, - drop_na = drop_na, - terms = enquos(...), - role = role, - trained = trained, - columns = columns, - skip = skip, - id = id - ) - ) - } - -check_enough_train_data_new <- - function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) { - recipes::check( - subclass = "enough_train_data", - prefix = "check_", - n = n, - epi_keys = epi_keys, - drop_na = drop_na, - terms = terms, - role = role, - trained = trained, - columns = columns, - skip = skip, - id = id - ) - } - -#' @export -prep.check_enough_train_data <- function(x, training, info = NULL, ...) { - col_names <- recipes::recipes_eval_select(x$terms, training, info) - if (is.null(x$n)) { - x$n <- length(col_names) - } - - if (x$drop_na) { - training <- tidyr::drop_na(training) - } - cols_not_enough_data <- training %>% - group_by(across(all_of(.env$x$epi_keys))) %>% - summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$x$n), .groups = "drop") %>% - summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% - unlist() %>% - names(.)[.] - - if (length(cols_not_enough_data) > 0) { - cli_abort( - "The following columns don't have enough data to predict: {cols_not_enough_data}." - ) - } - - check_enough_train_data_new( - n = x$n, - epi_keys = x$epi_keys, - drop_na = x$drop_na, - terms = x$terms, - role = x$role, - trained = TRUE, - columns = col_names, - skip = x$skip, - id = x$id - ) -} - -#' @export -bake.check_enough_train_data <- function(object, new_data, ...) { - new_data -} - -#' @export -print.check_enough_train_data <- function(x, width = max(20, options()$width - 30), ...) 
{ - title <- paste0("Check enough data (n = ", x$n, ") for ") - recipes::print_step(x$columns, x$terms, x$trained, title, width) - invisible(x) -} - -#' @export -tidy.check_enough_train_data <- function(x, ...) { - if (recipes::is_trained(x)) { - res <- tibble(terms = unname(x$columns)) - } else { - res <- tibble(terms = recipes::sel2char(x$terms)) - } - res$id <- x$id - res$n <- x$n - res$epi_keys <- x$epi_keys - res$drop_na <- x$drop_na - res -} diff --git a/R/climatological_forecaster.R b/R/climatological_forecaster.R new file mode 100644 index 000000000..1748f2103 --- /dev/null +++ b/R/climatological_forecaster.R @@ -0,0 +1,287 @@ +#' Climatological forecaster +#' +#' This is another "baseline" type forecaster, but it is especially appropriate +#' for strongly seasonal diseases (e.g., influenza). The idea is to predict +#' the "typical season" by summarizing over all available history in the +#' `epi_data`. This is analogous to a "climate" forecast rather than a "weather" +#' forecast, essentially predicting "typical January" behavior by relying on a +#' long history of such periods rather than heavily using recent data. +#' +#' The point forecast is either the mean or median of the `outcome` in a small +#' window around the target period, computed over the entire available history, +#' separately for each key in the `epi_df` (`geo_value` and any additional keys). +#' The forecast quantiles are computed from the residuals for this point prediction. +#' By default, the residuals are ungrouped, meaning every key will have the same +#' shape distribution (though different centers). Note that if your data is not +#' or comparable scales across keys, this default is likely inappropriate. In that +#' case, you can choose by which keys quantiles are computed using +#' `climate_args_list(quantile_by_key = ...)`. 
+#' +#' @inheritParams flatline_forecaster +#' @param args_list A list of additional arguments as created by the +#' [climate_args_list()] constructor function. +#' +#' @return A data frame of point and interval) forecasts at a all horizons +#' for each unique combination of `key_vars`. +#' @export +#' @seealso [step_climate()] +#' +#' @examples +#' cases <- cases_deaths_subset +#' # set as_of to the last day in the data +#' # "case_rate_7d_av" is on the same scale for all geographies +#' attr(cases, "metadata")$as_of <- as.Date("2021-12-31") +#' fcast <- climatological_forecaster(cases, "case_rate_7d_av") +#' autoplot(fcast) +#' +#' # Compute quantiles separately by location, and a backcast +#' # "cases" is on different scales by geography, due to population size +#' # so, it is better to compute quantiles separately +#' backcast <- climatological_forecaster( +#' cases, "case_rate_7d_av", +#' climate_args_list( +#' quantile_by_key = "geo_value", +#' forecast_date = as.Date("2021-06-01") +#' ) +#' ) +#' autoplot(backcast) +#' +#' # compute the climate "daily" rather than "weekly" +#' # use a two week window (on both sides) +#' # "cases" is on different scales by geography, due to population size +#' daily_fcast <- climatological_forecaster( +#' cases, "cases", +#' climate_args_list( +#' quantile_by_key = "geo_value", +#' time_type = "day", +#' window_size = 14L, +#' forecast_horizon = 0:30 +#' ) +#' ) +#' autoplot(daily_fcast) + +#' ggplot2::coord_cartesian(xlim = c(as.Date("2021-10-01"), NA)) +climatological_forecaster <- function(epi_data, + outcome, + args_list = climate_args_list()) { + if (!is_epi_df(epi_data)) { + cli_abort( + "`epi_data` must be an {.cls epi_df}, not a {.cls {class(epi_data)}}." 
+ ) + } + edf_time_type <- attr(epi_data, "metadata")$time_type + if (edf_time_type == "custom") { + cli_abort("This forecaster only works with daily, weekly, or yearmonth data.") + } + if (!inherits(args_list, c("climate_fcast", "alist"))) { + cli_abort("`args_list` was not created using `climate_args_list()`.") + } + arg_is_chr_scalar(outcome) + hardhat::check_column_names(epi_data, c(outcome, args_list$quantile_by_key)) + forecast_date <- args_list$forecast_date %||% attr(epi_data, "metadata")$as_of + horizon <- args_list$forecast_horizon + window_size <- args_list$window_size + time_type <- args_list$time_type + # check that the prediction time type is more granular than epi_data's + # time_type + ttype_ord <- match(time_type, c("day", "epiweek", "week", "month")) + ttype_ord <- ttype_ord - as.integer(ttype_ord > 2) + edf_ttype_ord <- match(edf_time_type, c("day", "week", "yearmonth")) + if (ttype_ord < edf_ttype_ord) { + cli_abort(c("Climate forecasts for more granular time types are not possible + if the `epi_data` has a higher level of aggregation", + i = "Here, the data is in {.val {edf_time_type}}s while + `time_type` is {.val {time_type}}." 
+ )) + } + # process the time types + sym_outcome <- sym(outcome) + epi_data <- epi_data %>% + filter(!is.na(!!outcome)) %>% + select(all_of(c(key_colnames(epi_data), outcome))) + if (time_type %in% c("week", "epiweek")) { + ttype_dur <- lubridate::weeks + time_aggr <- ifelse(time_type == "week", epiweek_leap, isoweek_leap) + modulus <- 52L + } else if (time_type == "month") { + ttype_dur <- function(x) lubridate::period(month = x) + time_aggr <- lubridate::month + modulus <- 12L + } else if (time_type == "day") { + ttype_dur <- lubridate::days + time_aggr <- yday_leap + modulus <- 365L + } + center_fn <- switch(args_list$center_method, + mean = function(x, w) mean(x, na.rm = TRUE), + median = function(x, w) stats::median(x, na.rm = TRUE) + ) + keys <- key_colnames(epi_data, exclude = "time_value") + # Get the prediction geo and .idx for the target date(s) + predictions <- epi_data %>% + select(all_of(keys)) %>% + dplyr::distinct() %>% + mutate(forecast_date = forecast_date, .idx = time_aggr(forecast_date)) + predictions <- + map(horizon, ~ { + predictions %>% + mutate(.idx = .idx + .x, target_date = forecast_date + ttype_dur(.x)) + }) %>% + purrr::list_rbind() %>% + mutate( + .idx = .idx %% modulus, + .idx = dplyr::case_when(.idx == 0 ~ modulus, TRUE ~ .idx) + ) + # get the distinct .idx for the target date(s) + distinct_target_idx <- predictions$.idx %>% unique() + # get all of the idx's within the window of the target .idxs + entries <- map(distinct_target_idx, function(idx) within_window(idx, window_size, modulus)) %>% + do.call(c, .) %>% + unique() + # for the center, we need those within twice the window, since for each point + # we're subtracting out the center to generate the quantiles + entries_double_window <- map(entries, function(idx) within_window(idx, window_size, modulus)) %>% + do.call(c, .) 
%>% + unique() + + epi_data_target <- + epi_data %>% + mutate(.idx = time_aggr(time_value), .weights = 1) + # get the point predictions + climate_center <- + epi_data_target %>% + filter(.idx %in% entries_double_window) %>% + mutate(.idx = time_aggr(time_value), .weights = 1) %>% + select(.idx, .weights, all_of(c(outcome, keys))) %>% + dplyr::reframe( + roll_modular_multivec( + !!sym_outcome, .idx, .weights, center_fn, window_size, + modulus + ), + .by = all_of(keys) + ) %>% + rename(.pred = climate_pred) + # get the quantiles + Quantile <- function(x, w) { + if (args_list$symmetrize) x <- c(x, -x) + list(unname(quantile( + x, + probs = args_list$quantile_levels, na.rm = TRUE, type = 8 + ))) + } + # add on the centers and subtract them out before computing the quantiles + climate_quantiles <- + epi_data_target %>% + filter(.idx %in% entries) %>% + left_join(climate_center, by = c(".idx", keys)) %>% + mutate({{ outcome }} := !!sym_outcome - .pred) %>% + select(.idx, .weights, all_of(c(outcome, args_list$quantile_by_key))) %>% + dplyr::reframe( + roll_modular_multivec( + !!sym_outcome, .idx, .weights, Quantile, window_size, + modulus + ), + .by = all_of(args_list$quantile_by_key) + ) %>% + mutate(.pred_distn = hardhat::quantile_pred(do.call(rbind, climate_pred), args_list$quantile_levels)) %>% + select(-climate_pred) + # combine them together + climate_table <- climate_center %>% + inner_join(climate_quantiles, by = c(".idx", args_list$quantile_by_key)) %>% + mutate(.pred_distn = .pred_distn + .pred) + predictions <- predictions %>% + left_join(climate_table, by = c(".idx", keys)) %>% + select(-.idx) + if (args_list$nonneg) { + predictions <- predictions %>% mutate( + .pred = snap(.pred, 0, Inf), + .pred_distn = snap(.pred_distn, 0, Inf) + ) + } + + # fill in some extras for plotting methods, etc. 
+ ewf <- epi_workflow() + ewf$trained <- TRUE + ewf$original_data <- epi_data + ewf$pre <- list(mold = list( + outcomes = select(epi_data, !!sym_outcome), + extras = list(roles = list( + geo_value = select(epi_data, geo_value), + time_value = select(epi_data, time_value) + )) + )) + other_keys <- key_colnames(epi_data, exclude = c("time_value", "geo_value")) + if (length(other_keys) > 0) { + ewf$pre$mold$extras$roles$key <- epi_data %>% select(all_of(other_keys)) + } + + structure( + list( + predictions = predictions, + epi_workflow = ewf, + metadata = list( + training = attr(epi_data, "metadata"), + forecast_created = Sys.time() + ) + ), + class = c("climate_fcast", "canned_epipred") + ) +} + +#' Climatological forecaster argument constructor +#' +#' @inheritParams epi_recipe +#' @param forecast_horizon Vector of integers giving the number of time steps, +#' in units of the `time_type`, +#' from the `reference_date` for which predictions should be produced. +#' @inheritParams step_climate +#' @inheritParams flatline_args_list +#' +#' @return A list containing updated parameter choices with class `climate_alist`. +#' @export +#' @seealso [climatological_forecaster()], [step_climate()] +#' +#' @examples +#' climate_args_list() +#' climate_args_list( +#' forecast_horizon = 0:10, +#' quantile_levels = c(.01, .025, 1:19 / 20, .975, .99) +#' ) +#' +climate_args_list <- function( + forecast_date = NULL, + forecast_horizon = 0:4, + time_type = c("epiweek", "week", "month", "day"), + center_method = c("median", "mean"), + window_size = 3L, + quantile_levels = c(.05, .1, .25, .5, .75, .9, .95), + symmetrize = FALSE, + nonneg = TRUE, + quantile_by_key = character(0L), + ...) 
{ + rlang::check_dots_empty() + time_type <- arg_match(time_type) + center_method <- rlang::arg_match(center_method) + arg_is_scalar(window_size, symmetrize, nonneg) + arg_is_chr(quantile_by_key, allow_empty = TRUE) + arg_is_scalar(forecast_date, allow_null = TRUE) + arg_is_date(forecast_date, allow_null = TRUE) + arg_is_nonneg_int(window_size) + arg_is_int(forecast_horizon) + arg_is_lgl(symmetrize, nonneg) + arg_is_probabilities(quantile_levels) + quantile_levels <- sort(unique(c(0.5, quantile_levels))) + + structure( + enlist( + forecast_date, forecast_horizon, time_type, center_method, window_size, + quantile_levels, symmetrize, nonneg, quantile_by_key + ), + class = c("climate_fcast", "alist") + ) +} + +#' @export +print.climate_fcast <- function(x, ...) { + name <- "ARX Forecaster" + NextMethod(name = name, ...) +} diff --git a/R/compat-purrr.R b/R/compat-purrr.R index e06038e44..6ec7df02f 100644 --- a/R/compat-purrr.R +++ b/R/compat-purrr.R @@ -11,11 +11,11 @@ map_vec <- function(.x, .f, ...) { map_dfr <- function(.x, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map(.x, .f, ...) - dplyr::bind_rows(res, .id = .id) + bind_rows(res, .id = .id) } map2_dfr <- function(.x, .y, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map2(.x, .y, .f, ...) - dplyr::bind_rows(res, .id = .id) + bind_rows(res, .id = .id) } diff --git a/R/compat-recipes.R b/R/compat-recipes.R index 12d11049a..f90367497 100644 --- a/R/compat-recipes.R +++ b/R/compat-recipes.R @@ -18,7 +18,7 @@ inline_check <- function(x) { funs <- fun_calls(x) funs <- funs[!(funs %in% c("~", "+", "-"))] if (length(funs) > 0) { - rlang::abort(paste0( + cli_abort(paste0( "No in-line functions should be used here; ", "use steps to define baking actions." 
)) diff --git a/R/create-layer.R b/R/create-layer.R deleted file mode 100644 index 0268a906f..000000000 --- a/R/create-layer.R +++ /dev/null @@ -1,45 +0,0 @@ -#' Create a new layer -#' -#' This function creates the skeleton for a new `frosting` layer. When called -#' inside a package, it will create an R script in the `R/` directory, -#' fill in the name of the layer, and open the file. -#' -#' @inheritParams usethis::use_test -#' -#' @importFrom rlang %||% -#' @noRd -#' @keywords internal -#' @examples -#' \dontrun{ -#' -#' # Note: running this will write `layer_strawberry.R` to -#' # the `R/` directory of your current project -#' create_layer("strawberry") -#' } -#' -create_layer <- function(name = NULL, open = rlang::is_interactive()) { - name <- name %||% usethis:::get_active_r_file(path = "R") - if (substr(name, 1, 5) == "layer") { - nn <- substring(name, 6) - if (substr(nn, 1, 1) == "_") nn <- substring(nn, 2) - cli::cli_abort( - c('`name` should not begin with "layer" or "layer_".', - i = 'Did you mean to use `create_layer("{ nn }")`?' - ) - ) - } - layer_name <- name - name <- paste0("layer_", name) - name <- usethis:::slug(name, "R") - usethis:::check_file_name(name) - path <- fs::path("R", name) - if (!fs::file_exists(path)) { - usethis::use_template( - "layer.R", - save_as = path, - data = list(name = layer_name), open = FALSE, - package = "epipredict" - ) - } - usethis::edit_file(usethis::proj_path(path), open = open) -} diff --git a/R/data.R b/R/data.R deleted file mode 100644 index 71e5bdcd3..000000000 --- a/R/data.R +++ /dev/null @@ -1,87 +0,0 @@ -#' Subset of JHU daily state cases and deaths -#' -#' This data source of confirmed COVID-19 cases and deaths -#' is based on reports made available by the Center for -#' Systems Science and Engineering at Johns Hopkins University. -#' This example data ranges from Dec 31, 2020 to Dec 31, 2021, -#' and includes all states. 
-#' -#' @format A tibble with 20,496 rows and 4 variables: -#' \describe{ -#' \item{geo_value}{the geographic value associated with each row -#' of measurements.} -#' \item{time_value}{the time value associated with each row of measurements.} -#' \item{case_rate}{7-day average signal of number of new -#' confirmed COVID-19 cases per 100,000 population, daily} -#' \item{death_rate}{7-day average signal of number of new confirmed -#' deaths due to COVID-19 per 100,000 population, daily} -#' } -#' @source This object contains a modified part of the -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -#' This data set is licensed under the terms of the -#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -#' by the Johns Hopkins University on behalf of its Center for Systems Science -#' in Engineering. Copyright Johns Hopkins University 2020. -#' -#' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -#' These signals are taken directly from the JHU CSSE -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -#' without changes. The 7-day average signals are computed by Delphi by -#' calculating moving averages of the preceding 7 days, so the signal for -#' June 7 is the average of the underlying data for June 1 through 7, -#' inclusive. -"case_death_rate_subset" - -#' State population data -#' -#' Data set on state populations, from the 2019 US Census. -#' -#' @format Data frame with 57 rows (including one for the United States as a -#' whole, plus the District of Columbia, Puerto Rico Commonwealth, -#' American Samoa, Guam, the U.S. 
Virgin Islands, and the Northern Mariana, -#' Islands). -#' -#' \describe{ -#' \item{fips}{FIPS code} -#' \item{name}{Full name of the state or territory} -#' \item{pop}{Estimate of the location's resident population in -#' 2019.} -#' \item{abbr}{Postal abbreviation for the location} -#' } -#' -#' @source United States Census Bureau, at -#' \url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf}, -#' \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html}, -#' and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html} -"state_census" - -#' Subset of Statistics Canada median employment income for postsecondary graduates -#' -#' @format An [epiprocess::epi_df][epiprocess::as_epi_df] with 10193 rows and 8 variables: -#' \describe{ -#' \item{geo_value}{The province in Canada associated with each -#' row of measurements.} -#' \item{time_value}{The time value, a year integer in YYYY format} -#' \item{edu_qual}{The education qualification} -#' \item{fos}{The field of study} -#' \item{age_group}{The age group; either 15 to 34 or 35 to 64} -#' \item{num_graduates}{The number of graduates for the given row of characteristics} -#' \item{med_income_2y}{The median employment income two years after graduation} -#' \item{med_income_5y}{The median employment income five years after graduation} -#' } -#' @source This object contains modified data from the following Statistics Canada -#' data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501}{ -#' Characteristics and median employment income of longitudinal cohorts of postsecondary -#' graduates two and five years after graduation, by educational qualification and -#' field of study (primary groupings) -#' } -#' -#' Modifications: -#' * Only provincial-level geo_values are kept -#' * Only age group, field of study, and educational qualification are kept as -#' covariates. 
For the remaining covariates, we keep aggregated values and -#' drop the level-specific rows. -#' * No modifications were made to the time range of the data -"grad_employ_subset" diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R deleted file mode 100644 index dd97ec809..000000000 --- a/R/dist_quantiles.R +++ /dev/null @@ -1,295 +0,0 @@ -#' @importFrom vctrs field vec_cast new_rcrd -new_quantiles <- function(values = double(1), quantile_levels = double(1)) { - arg_is_probabilities(quantile_levels) - - vec_cast(values, double()) - vec_cast(quantile_levels, double()) - values <- unname(values) - if (length(values) == 0L) { - return(new_rcrd( - list( - values = rep(NA_real_, length(quantile_levels)), - quantile_levels = quantile_levels - ), - class = c("dist_quantiles", "dist_default") - )) - } - stopifnot(length(values) == length(quantile_levels)) - - stopifnot(!vctrs::vec_duplicate_any(quantile_levels)) - if (is.unsorted(quantile_levels)) { - o <- vctrs::vec_order(quantile_levels) - values <- values[o] - quantile_levels <- quantile_levels[o] - } - if (is.unsorted(values, na.rm = TRUE)) { - cli::cli_abort("`values[order(quantile_levels)]` produces unsorted quantiles.") - } - - new_rcrd(list(values = values, quantile_levels = quantile_levels), - class = c("dist_quantiles", "dist_default") - ) -} - - - -#' @importFrom vctrs vec_ptype_abbr vec_ptype_full -#' @export -vec_ptype_abbr.dist_quantiles <- function(x, ...) "dist_qntls" -#' @export -vec_ptype_full.dist_quantiles <- function(x, ...) "dist_quantiles" - -#' @export -format.dist_quantiles <- function(x, digits = 2, ...) { - m <- suppressWarnings(median(x)) - paste0("quantiles(", round(m, digits), ")[", vctrs::vec_size(x), "]") -} - - -#' A distribution parameterized by a set of quantiles -#' -#' @param values A vector (or list of vectors) of values. -#' @param quantile_levels A vector (or list of vectors) of probabilities -#' corresponding to `values`. 
-#' -#' When creating multiple sets of `values`/`quantile_levels` resulting in -#' different distributions, the sizes must match. See the examples below. -#' -#' @return A vector of class `"distribution"`. -#' -#' @export -#' -#' @examples -#' dist_quantiles(1:4, 1:4 / 5) -#' dist_quantiles(list(1:3, 1:4), list(1:3 / 4, 1:4 / 5)) -#' dstn <- dist_quantiles(list(1:4, 8:11), c(.2, .4, .6, .8)) -#' dstn -#' -#' quantile(dstn, p = c(.1, .25, .5, .9)) -#' median(dstn) -#' -#' # it's a bit annoying to inspect the data -#' distributional::parameters(dstn[1]) -#' nested_quantiles(dstn[1])[[1]] -#' -#' @importFrom vctrs as_list_of vec_recycle_common new_vctr -dist_quantiles <- function(values, quantile_levels) { - if (!is.list(quantile_levels)) { - assert_numeric(quantile_levels, lower = 0, upper = 1, any.missing = FALSE, min.len = 1L) - quantile_levels <- list(quantile_levels) - } - if (!is.list(values)) { - if (length(values) == 0L) values <- NA_real_ - values <- list(values) - } - - values <- as_list_of(values, .ptype = double()) - quantile_levels <- as_list_of(quantile_levels, .ptype = double()) - args <- vec_recycle_common(values = values, quantile_levels = quantile_levels) - - qntls <- as_list_of( - map2(args$values, args$quantile_levels, new_quantiles), - .ptype = new_quantiles(NA_real_, 0.5) - ) - new_vctr(qntls, class = "distribution") -} - -validate_dist_quantiles <- function(values, quantile_levels) { - map(quantile_levels, arg_is_probabilities) - common_length <- vctrs::vec_size_common( # aborts internally - values = values, - quantile_levels = quantile_levels - ) - length_diff <- vctrs::list_sizes(values) != vctrs::list_sizes(quantile_levels) - if (any(length_diff)) { - cli::cli_abort(c( - "`values` and `quantile_levels` must have common length.", - i = "Mismatches found at position(s): {.val {which(length_diff)}}." 
- )) - } - level_duplication <- map_lgl(quantile_levels, vctrs::vec_duplicate_any) - if (any(level_duplication)) { - cli::cli_abort(c( - "`quantile_levels` must not be duplicated.", - i = "Duplicates found at position(s): {.val {which(level_duplication)}}." - )) - } -} - - -is_dist_quantiles <- function(x) { - is_distribution(x) & all(stats::family(x) == "quantiles") -} - - - -#' @export -#' @importFrom stats median qnorm family -median.dist_quantiles <- function(x, na.rm = FALSE, ..., middle = c("cubic", "linear")) { - quantile_levels <- field(x, "quantile_levels") - values <- field(x, "values") - if (0.5 %in% quantile_levels) { - return(values[match(0.5, quantile_levels)]) - } - if (length(quantile_levels) < 2 || min(quantile_levels) > 0.5 || max(quantile_levels) < 0.5) { - return(NA) - } - if (length(quantile_levels) < 3 || min(quantile_levels) > .25 || max(quantile_levels) < .75) { - return(stats::approx(quantile_levels, values, xout = 0.5)$y) - } - quantile(x, 0.5, ..., middle = middle) -} - -# placeholder to avoid errors, but not ideal -#' @export -mean.dist_quantiles <- function(x, na.rm = FALSE, ..., middle = c("cubic", "linear")) { - median(x, ..., middle = middle) -} - -#' @export -#' @importFrom stats quantile -#' @import distributional -quantile.dist_quantiles <- function(x, p, ..., middle = c("cubic", "linear")) { - arg_is_probabilities(p) - p <- sort(p) - middle <- match.arg(middle) - quantile_extrapolate(x, p, middle) -} - - -quantile_extrapolate <- function(x, tau_out, middle) { - tau <- field(x, "quantile_levels") - qvals <- field(x, "values") - nas <- is.na(qvals) - qvals_out <- rep(NA, length(tau_out)) - qvals <- qvals[!nas] - tau <- tau[!nas] - - # short circuit if we aren't actually extrapolating - # matches to ~15 decimals - if (all(tau_out %in% tau)) { - return(qvals[match(tau_out, tau)]) - } - if (length(tau) < 2) { - cli::cli_abort( - "Quantile extrapolation is not possible with fewer than 2 quantiles." 
- ) - return(qvals_out) - } - - indl <- tau_out < min(tau) - indr <- tau_out > max(tau) - indm <- !indl & !indr - - if (middle == "cubic") { - method <- "cubic" - result <- tryCatch( - { - Q <- stats::splinefun(tau, qvals, method = "hyman") - quartiles <- Q(c(.25, .5, .75)) - }, - error = function(e) { - return(NA) - } - ) - } - if (middle == "linear" || any(is.na(result))) { - method <- "linear" - quartiles <- stats::approx(tau, qvals, c(.25, .5, .75))$y - } - if (any(indm)) { - qvals_out[indm] <- switch(method, - linear = stats::approx(tau, qvals, tau_out[indm])$y, - cubic = Q(tau_out[indm]) - ) - } - if (any(indl) || any(indr)) { - qv <- data.frame( - q = c(tau, tau_out[indm]), - v = c(qvals, qvals_out[indm]) - ) %>% - dplyr::distinct(q, .keep_all = TRUE) %>% - dplyr::arrange(q) - } - if (any(indl)) { - qvals_out[indl] <- tail_extrapolate(tau_out[indl], utils::head(qv, 2)) - } - if (any(indr)) { - qvals_out[indr] <- tail_extrapolate(tau_out[indr], utils::tail(qv, 2)) - } - qvals_out -} - -logit <- function(p) { - p <- pmax(pmin(p, 1), 0) - log(p) - log(1 - p) -} - -# extrapolates linearly on the logistic scale using -# the two points nearest the tail -tail_extrapolate <- function(tau_out, qv) { - if (nrow(qv) == 1L) { - return(rep(qv$v[1], length(tau_out))) - } - x <- logit(qv$q) - x0 <- logit(tau_out) - y <- qv$v - m <- diff(y) / diff(x) - m * (x0 - x[1]) + y[1] -} - - -#' @method Math dist_quantiles -#' @export -Math.dist_quantiles <- function(x, ...) { - quantile_levels <- field(x, "quantile_levels") - values <- field(x, "values") - values <- vctrs::vec_math(.Generic, values, ...) 
- new_quantiles(values = values, quantile_levels = quantile_levels) -} - -#' @method Ops dist_quantiles -#' @export -Ops.dist_quantiles <- function(e1, e2) { - is_quantiles <- c( - inherits(e1, "dist_quantiles"), - inherits(e2, "dist_quantiles") - ) - is_dist <- c(inherits(e1, "dist_default"), inherits(e2, "dist_default")) - tau1 <- tau2 <- NULL - if (is_quantiles[1]) { - q1 <- field(e1, "values") - tau1 <- field(e1, "quantile_levels") - } - if (is_quantiles[2]) { - q2 <- field(e2, "values") - tau2 <- field(e2, "quantile_levels") - } - tau <- union(tau1, tau2) - if (all(is_dist)) { - cli::cli_abort( - "You can't perform arithmetic between two distributions like this." - ) - } else { - if (is_quantiles[1]) { - q2 <- e2 - } else { - q1 <- e1 - } - } - q <- vctrs::vec_arith(.Generic, q1, q2) - new_quantiles(values = q, quantile_levels = tau) -} - -#' @method is.na distribution -#' @export -is.na.distribution <- function(x) { - sapply(vec_data(x), is.na) -} - -#' @method is.na dist_quantiles -#' @export -is.na.dist_quantiles <- function(x) { - q <- field(x, "values") - all(is.na(q)) -} diff --git a/R/epi_recipe.R b/R/epi_recipe.R index f8216c2af..02a463ed9 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -13,14 +13,6 @@ epi_recipe <- function(x, ...) { } -#' @rdname epi_recipe -#' @export -epi_recipe.default <- function(x, ...) { - cli_abort(paste( - "`x` must be an {.cls epi_df} or a {.cls formula},", - "not a {.cls {class(x)[[1]]}}." - )) -} #' @rdname epi_recipe #' @inheritParams recipes::recipe @@ -28,6 +20,19 @@ epi_recipe.default <- function(x, ...) { #' describes a single role that the variable will take. This value could be #' anything but common roles are `"outcome"`, `"predictor"`, #' `"time_value"`, and `"geo_value"` +#' @param reference_date Either a date of the same class as the `time_value` +#' column in the `epi_df` or `NULL`. If a date, it gives the date to which all +#' operations are relative. 
Typically, in real-time tasks this is the date that +#' the model is created (and presumably trained). In forecasting, this is +#' often the same as the most recent date of +#' data availability, but when data is "latent" (reported after the date to +#' which it corresponds), or if performing a nowcast, the `reference_date` may +#' be later than this. Setting `reference_date` +#' to a value BEFORE the most recent data is not a true "forecast", +#' because future data is being used to create the model, but this may be +#' reasonable in model building, nowcasting (predicting finalized values from +#' preliminary data), or if producing a backcast. If `NULL`, it will be set +#' to the `as_of` date of the `epi_df`. #' @param ... Further arguments passed to or from other methods (not currently #' used). #' @param formula A model formula. No in-line functions should be used here @@ -35,124 +40,122 @@ epi_recipe.default <- function(x, ...) { #' transformations should be enacted using `step` functions in this package. #' Dots are allowed as are simple multivariate outcome terms (i.e. no need for #' `cbind`; see Examples). -#' @param x,data A data frame, tibble, or epi_df of the *template* data set -#' (see below). This is always coerced to the first row to avoid memory issues +#' @param x,data An epi_df of the *template* data set (see below). 
#' @inherit recipes::recipe return #' #' @export #' @examples -#' library(dplyr) -#' library(recipes) -#' jhu <- case_death_rate_subset %>% -#' filter(time_value > "2021-08-01") %>% -#' arrange(geo_value, time_value) +#' jhu <- covid_case_death_rates %>% +#' filter(time_value > "2021-08-01") #' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% -#' step_naomit(all_predictors()) %>% -#' # below, `skip` means we don't do this at predict time -#' step_naomit(all_outcomes(), skip = TRUE) +#' step_epi_naomit() #' #' r -epi_recipe.epi_df <- - function(x, formula = NULL, ..., vars = NULL, roles = NULL) { - attr(x, "decay_to_tibble") <- FALSE - if (!is.null(formula)) { - if (!is.null(vars)) { - rlang::abort( - paste0( - "This `vars` specification will be ignored ", - "when a formula is used" - ) - ) - } - if (!is.null(roles)) { - rlang::abort( - paste0( - "This `roles` specification will be ignored ", - "when a formula is used" - ) - ) - } - - obj <- epi_recipe.formula(formula, x, ...) - return(obj) - } - if (is.null(vars)) vars <- colnames(x) - if (any(table(vars) > 1)) { - rlang::abort("`vars` should have unique members") +epi_recipe.epi_df <- function(x, + reference_date = NULL, + formula = NULL, + ..., + vars = NULL, + roles = NULL) { + attr(x, "decay_to_tibble") <- FALSE + if (!is.null(formula)) { + if (!is.null(vars)) { + cli_abort(paste0( + "This `vars` specification will be ignored ", + "when a formula is used" + )) } - if (any(!(vars %in% colnames(x)))) { - rlang::abort("1 or more elements of `vars` are not in the data") + if (!is.null(roles)) { + cli_abort( + paste0( + "This `roles` specification will be ignored ", + "when a formula is used" + ) + ) } - keys <- key_colnames(x) # we know x is an epi_df + obj <- epi_recipe.formula(formula, x, ...) 
+ return(obj) + } + if (is.null(vars)) vars <- colnames(x) + if (any(table(vars) > 1)) { + cli_abort("`vars` should have unique members") + } + if (any(!(vars %in% colnames(x)))) { + cli_abort("1 or more elements of `vars` are not in the data") + } - var_info <- tibble(variable = vars) - key_roles <- c("geo_value", rep("key", length(keys) - 2), "time_value") + keys <- key_colnames(x) # we know x is an epi_df - ## Check and add roles when available - if (!is.null(roles)) { - if (length(roles) != length(vars)) { - rlang::abort(c( - "The number of roles should be the same as the number of ", - "variables." - )) - } - var_info$role <- roles - } else { - var_info <- var_info %>% dplyr::filter(!(variable %in% keys)) - var_info$role <- "raw" - } - ## Now we add the keys when necessary - var_info <- dplyr::union( - var_info, - tibble::tibble(variable = keys, role = key_roles) - ) + var_info <- tibble(variable = vars) + key_roles <- c("geo_value", rep("key", length(keys) - 2), "time_value") - ## Add types - var_info <- dplyr::full_join(recipes:::get_types(x), var_info, by = "variable") - var_info$source <- "original" - - ## arrange to easy order - var_info <- var_info %>% - dplyr::arrange(factor( - role, - levels = union( - c("predictor", "outcome", "time_value", "geo_value", "key"), - unique(role) - ) # anything else + ## Check and add roles when available + if (!is.null(roles)) { + if (length(roles) != length(vars)) { + cli_abort(paste0( + "The number of roles should be the same as the number of ", + "variables." 
)) - - ## Return final object of class `recipe` - out <- list( - var_info = var_info, - term_info = var_info, - steps = NULL, - template = x[1, ], - max_time_value = max(x$time_value), - levels = NULL, - retained = NA - ) - class(out) <- c("epi_recipe", "recipe") - out + } + var_info$role <- roles + } else { + var_info <- var_info %>% filter(!(variable %in% keys)) + var_info$role <- "raw" } + ## Now we add the keys when necessary + var_info <- dplyr::union( + var_info, + tibble::tibble(variable = keys, role = key_roles) + ) + + ## Add types + var_info <- full_join(recipes:::get_types(x), var_info, by = "variable") + var_info$source <- "original" + + ## arrange to easy order + var_info <- var_info %>% + arrange(factor( + role, + levels = union( + c("predictor", "outcome", "time_value", "geo_value", "key"), + unique(role) + ) # anything else + )) + + ## Return final object of class `recipe` + max_time_value <- max(x$time_value) + reference_date <- reference_date %||% attr(x, "metadata")$as_of + out <- list( + var_info = var_info, + term_info = var_info, + steps = NULL, + template = x[1, ], + max_time_value = max_time_value, + reference_date = reference_date, + levels = NULL, + retained = NA + ) + class(out) <- c("epi_recipe", "recipe") + out +} #' @rdname epi_recipe -#' @importFrom rlang abort #' @export -epi_recipe.formula <- function(formula, data, ...) { +epi_recipe.formula <- function(formula, data, reference_date = NULL, ...) { # we ensure that there's only 1 row in the template data <- data[1, ] # check for minus: if (!epiprocess::is_epi_df(data)) { - cli_abort(paste( - "`epi_recipe()` has been called with a non-{.cls epi_df} object.", - "Use `recipe()` instead." - )) + cli_abort( + "`epi_recipe()` has been called with a non-{.cls epi_df} object. + Use `recipe()` instead." 
+ ) } attr(data, "decay_to_tibble") <- FALSE @@ -229,9 +232,9 @@ is_epi_recipe <- function(x) { -#' Add an `epi_recipe` to a workflow +#' Add/remove/update the `epi_recipe` of an `epi_workflow` #' -#' @seealso [workflows::add_recipe()] +#' @description #' - `add_recipe()` specifies the terms of the model and any preprocessing that #' is required through the usage of a recipe. #' @@ -241,9 +244,9 @@ is_epi_recipe <- function(x) { #' recipe with the new one. #' #' @details -#' `add_epi_recipe` has the same behaviour as -#' [workflows::add_recipe()] but sets a different -#' default blueprint to automatically handle [epiprocess::epi_df][epiprocess::as_epi_df] data. +#' `add_epi_recipe()` has the same behaviour as [workflows::add_recipe()] but +#' sets a different default blueprint to automatically handle +#' `epiprocess::epi_df()` data. #' #' @param x A `workflow` or `epi_workflow` #' @@ -262,20 +265,16 @@ is_epi_recipe <- function(x) { #' `x`, updated with a new recipe preprocessor. #' #' @export +#' @seealso [workflows::add_recipe()] #' @examples -#' library(dplyr) -#' library(recipes) -#' -#' jhu <- case_death_rate_subset %>% -#' filter(time_value > "2021-08-01") %>% -#' arrange(geo_value, time_value) +#' jhu <- covid_case_death_rates %>% +#' filter(time_value > "2021-08-01") #' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% -#' step_naomit(all_predictors()) %>% -#' step_naomit(all_outcomes(), skip = TRUE) +#' step_epi_naomit() #' #' workflow <- epi_workflow() %>% #' add_epi_recipe(r) @@ -347,10 +346,9 @@ update_epi_recipe <- function(x, recipe, ..., blueprint = default_epi_recipe_blu #' #' @export #' @examples -#' library(dplyr) #' library(workflows) #' -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' 
step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -395,7 +393,7 @@ adjust_epi_recipe.epi_workflow <- function(x, which_step, ..., blueprint = defau #' @export adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default_epi_recipe_blueprint()) { if (!(is.numeric(which_step) || is.character(which_step))) { - cli::cli_abort( + cli_abort( c("`which_step` must be a number or a character.", i = "`which_step` has class {.cls {class(which_step)[1]}}." ) @@ -408,7 +406,7 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default if (!starts_with_step) which_step <- paste0("step_", which_step) if (!(which_step %in% step_names)) { - cli::cli_abort(c( + cli_abort(c( "`which_step` does not appear in the available `epi_recipe` step names. ", i = "The step names are {.val {step_names}}." )) @@ -417,7 +415,7 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default if (length(which_step_idx) == 1) { x$steps[[which_step_idx]] <- update(x$steps[[which_step_idx]], ...) } else { - cli::cli_abort(c( + cli_abort(c( "`which_step` is not unique. Matches steps: {.val {which_step_idx}}.", i = "Please use the step number instead for precise alterations." )) @@ -432,7 +430,7 @@ prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) { if (is.null(training)) { - cli::cli_warn(c( + cli_warn(c( "!" = "No training data was supplied to {.fn prep}.", "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", "!" = "store the full template data in the object.", @@ -440,9 +438,9 @@ prep.epi_recipe <- function( "!" = "to avoid addtional warning messages." 
)) } - training <- recipes:::check_training_set(training, x, fresh) + training <- recipes:::validate_training_data(training, x, fresh) training <- epi_check_training_set(training, x) - training <- dplyr::relocate(training, dplyr::all_of(key_colnames(training))) + training <- relocate(training, all_of(key_colnames(training))) tr_data <- recipes:::train_info(training) keys <- key_colnames(x) @@ -457,7 +455,7 @@ prep.epi_recipe <- function( } skippers <- map_lgl(x$steps, recipes:::is_skipable) if (any(skippers) & !retain) { - cli::cli_warn(c( + cli_warn(paste( "Since some operations have `skip = TRUE`, using ", "`retain = TRUE` will allow those steps results to ", "be accessible." @@ -465,9 +463,9 @@ prep.epi_recipe <- function( } if (fresh) x$term_info <- x$var_info - running_info <- x$term_info %>% dplyr::mutate(number = 0, skip = FALSE) - for (i in seq(along.with = x$steps)) { - needs_tuning <- map_lgl(x$steps[[i]], recipes:::is_tune) + running_info <- x$term_info %>% mutate(number = 0, skip = FALSE) + for (ii in seq(along.with = x$steps)) { + needs_tuning <- map_lgl(x$steps[[ii]], recipes:::is_tune) if (any(needs_tuning)) { arg <- names(needs_tuning)[needs_tuning] arg <- paste0("'", arg, "'", collapse = ", ") @@ -475,22 +473,22 @@ prep.epi_recipe <- function( "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ", arg, ". Do you want to use a tuning function such as `tune_grid()`?" 
) - rlang::abort(msg) + cli_abort(msg) } - note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) - if (!x$steps[[i]]$trained | fresh) { + note <- paste("oper", ii, gsub("_", " ", class(x$steps[[ii]])[1])) + if (!x$steps[[ii]]$trained || fresh) { if (verbose) { cat(note, "[training]", "\n") } before_nms <- names(training) before_template <- training[1, ] - x$steps[[i]] <- prep(x$steps[[i]], + x$steps[[ii]] <- prep(x$steps[[ii]], training = training, info = x$term_info ) - training <- bake(x$steps[[i]], new_data = training) + training <- bake(x$steps[[ii]], new_data = training) if (!tibble::is_tibble(training)) { - cli::cli_abort("`bake()` methods should always return {.cls tibble}.") + cli_abort("`bake()` methods should always return {.cls tibble}.") } if (!is_epi_df(training)) { # tidymodels killed our class @@ -502,20 +500,20 @@ prep.epi_recipe <- function( other_keys = metadata$other_keys %||% character() ) } - training <- dplyr::relocate(training, all_of(key_colnames(training))) + training <- relocate(training, all_of(key_colnames(training))) x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) - if (!is.na(x$steps[[i]]$role)) { + if (!is.na(x$steps[[ii]]$role)) { new_vars <- setdiff(x$term_info$variable, running_info$variable) pos_new_var <- x$term_info$variable %in% new_vars pos_new_and_na_role <- pos_new_var & is.na(x$term_info$role) pos_new_and_na_source <- pos_new_var & is.na(x$term_info$source) - x$term_info$role[pos_new_and_na_role] <- x$steps[[i]]$role + x$term_info$role[pos_new_and_na_role] <- x$steps[[ii]]$role x$term_info$source[pos_new_and_na_source] <- "derived" } recipes:::changelog(log_changes, before_nms, names(training), x$steps[[i]]) running_info <- rbind( running_info, - dplyr::mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) + mutate(x$term_info, number = ii, skip = x$steps[[ii]]$skip) ) } else { if (verbose) cat(note, "[pre-trained]\n") @@ -547,9 +545,9 @@ prep.epi_recipe <- function( x$orig_lvls <- 
orig_lvls x$retained <- retain x$last_term_info <- running_info %>% - dplyr::group_by(variable) %>% - dplyr::arrange(dplyr::desc(number)) %>% - dplyr::summarise( + group_by(variable) %>% + arrange(dplyr::desc(number)) %>% + summarise( type = list(dplyr::first(type)), role = list(unique(unlist(role))), source = dplyr::first(source), @@ -571,6 +569,7 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { } composition <- "tibble" } + new_data <- NextMethod("bake") if (!is.null(meta)) { # Baking should have dropped epi_df-ness and metadata. Re-infer some diff --git a/R/epi_shift.R b/R/epi_shift.R index eb534f1ea..877f7866c 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -2,43 +2,76 @@ #' #' This is a lower-level function. As such it performs no error checking. #' -#' @param x Data frame. Variables to shift -#' @param shifts List. Each list element is a vector of shifts. -#' Negative values produce leads. The list should have the same -#' length as the number of columns in `x`. -#' @param time_value Vector. Same length as `x` giving time stamps. -#' @param keys Data frame, vector, or `NULL`. Additional grouping vars. -#' @param out_name Chr. The output list will use this as a prefix. +#' @param x Data frame. +#' @param shift_val a single integer. Negative values produce leads. +#' @param newname the name for the newly shifted column +#' @param key_cols vector, or `NULL`. Additional grouping vars. 
#' #' @keywords internal #' #' @return a list of tibbles -epi_shift <- function(x, shifts, time_value, keys = NULL, out_name = "x") { - if (!is.data.frame(x)) x <- data.frame(x) - if (is.null(keys)) keys <- rep("empty", nrow(x)) - p_in <- ncol(x) - out_list <- tibble(i = 1:p_in, shift = shifts) %>% - tidyr::unchop(shift) %>% # what is chop - mutate(name = paste0(out_name, 1:nrow(.))) %>% - # One list element for each shifted feature - pmap(function(i, shift, name) { - tibble(keys, - time_value = time_value + shift, # Shift back - !!name := x[[i]] - ) - }) - if (is.data.frame(keys)) { - common_names <- c(names(keys), "time_value") - } else { - common_names <- c("keys", "time_value") - } - - reduce(out_list, dplyr::full_join, by = common_names) -} - epi_shift_single <- function(x, col, shift_val, newname, key_cols) { x %>% select(all_of(c(key_cols, col))) %>% mutate(time_value = time_value + shift_val) %>% rename(!!newname := {{ col }}) } + +#' lags move columns forward to bring the past up to today, while aheads drag +#' the future back to today +#' @keywords internal +get_sign <- function(object) { + if (!is.null(object$prefix)) { + if (object$prefix == "lag_") { + return(1) + } else { + return(-1) + } + } else if (object$method == "extend_lags") { + return(1) + } else { + return(-1) + } +} + +#' backend for both `bake.step_epi_ahead` and `bake.step_epi_lag`, performs the +#' checks missing in `epi_shift_single` +#' @keywords internal +#' @importFrom tidyr expand_grid +#' @importFrom dplyr join_by +add_shifted_columns <- function(new_data, object) { + grid <- object$shift_grid + + if (nrow(object$shift_grid) == 0) { + # we're not shifting any rows, so this is a no-op + return(new_data) + } + ## ensure no name clashes + new_data_names <- colnames(new_data) + intersection <- new_data_names %in% grid$newname + if (any(intersection)) { + cli_abort(c( + "Name collision occured in {.cls {class(object)[1]}}", + "The following variable name{?s} already exist{?s/}: {.val 
{new_data_names[intersection]}}." + )) + } + ok <- object$keys + shifted <- reduce( + pmap(grid, epi_shift_single, x = new_data, key_cols = ok), + full_join, + by = ok + ) + processed <- new_data %>% + full_join(shifted, by = ok) %>% + group_by(across(all_of(kill_time_value(ok)))) %>% + arrange(time_value) + if (inherits(new_data, "epi_df")) { + processed <- processed %>% + ungroup() %>% + as_epi_df( + as_of = attributes(new_data)$metadata$as_of, + other_keys = attributes(new_data)$metadata$other_keys + ) + } + return(processed) +} diff --git a/R/epi_workflow.R b/R/epi_workflow.R index af4555303..b17bd697b 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -1,26 +1,27 @@ #' Create an epi_workflow #' #' This is a container object that unifies preprocessing, fitting, prediction, -#' and postprocessing for predictive modeling on epidemiological data. It extends -#' the functionality of a [workflows::workflow()] to handle the typical panel -#' data structures found in this field. This extension is handled completely -#' internally, and should be invisible to the user. For all intents and purposes, -#' this operates exactly like a [workflows::workflow()]. For more details -#' and numerous examples, see there. +#' and post-processing for predictive modeling on epidemiological data. It +#' extends the functionality of a [workflows::workflow()] to handle the typical +#' panel data structures found in this field. This extension is handled +#' completely internally, and should be invisible to the user. For all intents +#' and purposes, this operates exactly like a [workflows::workflow()]. For some +#' `{epipredict}` specific examples, see the [custom epiworkflows +#' vignette](../articles/custom_epiworkflows.html). #' #' @inheritParams workflows::workflow #' @param postprocessor An optional postprocessor to add to the workflow. #' Currently only `frosting` is allowed using, `add_frosting()`. #' #' @return A new `epi_workflow` object. 
-#' @seealso workflows::workflow +#' @seealso [workflows::workflow()] #' @importFrom rlang is_null #' @importFrom stats predict #' @importFrom generics fit #' @importFrom generics augment #' @export #' @examples -#' jhu <- case_death_rate_subset +#' jhu <- covid_case_death_rates #' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -62,9 +63,9 @@ is_epi_workflow <- function(x) { #' Fit an `epi_workflow` object #' #' @description -#' This is the `fit()` method for an `epi_workflow` object that +#' This is the `fit()` method for an `epi_workflow()` object that #' estimates parameters for a given model from a set of data. -#' Fitting an `epi_workflow` involves two main steps, which are +#' Fitting an `epi_workflow()` involves two main steps, which are #' preprocessing the data and fitting the underlying parsnip model. #' #' @inheritParams workflows::fit.workflow @@ -79,12 +80,12 @@ is_epi_workflow <- function(x) { #' @return The `epi_workflow` object, updated with a fit parsnip #' model in the `object$fit$fit` slot. #' -#' @seealso workflows::fit-workflow +#' @seealso [workflows::fit-workflow()] #' #' @name fit-epi_workflow #' @export #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -111,20 +112,25 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' Predict from an epi_workflow #' #' @description -#' This is the `predict()` method for a fit epi_workflow object. The nice thing -#' about predicting from an epi_workflow is that it will: -#' +#' This is the `predict()` method for a fit epi_workflow object. The 3 steps that this implements are: #' - Preprocess `new_data` using the preprocessing method specified when the #' workflow was created and fit. 
This is accomplished using #' [hardhat::forge()], which will apply any formula preprocessing or call #' [recipes::bake()] if a recipe was supplied. #' -#' - Call [parsnip::predict.model_fit()] for you using the underlying fit +#' - Preprocessing `new_data` using the preprocessing method specified when the +#' epi_workflow was created and fit. This is accomplished using +#' `hardhat::bake()` if a recipe was supplied (passing through +#' [hardhat::forge()], which is used for non-recipe preprocessors). Note that +#' this is a slightly different `bake` operation than the one occurring during +#' the fit. Any `step` that has `skip = TRUE` isn't applied during prediction; +#' for example in `step_epi_naomit()`, `all_outcomes()` isn't `NA` omitted, +#' since doing so would drop the exact `time_values` we are trying to predict. +#' +#' - Calling `parsnip::predict.model_fit()` for you using the underlying fit #' parsnip model. #' -#' - Ensure that the returned object is an [epiprocess::epi_df][epiprocess::as_epi_df] where -#' possible. Specifically, the output will have `time_value` and -#' `geo_value` columns as well as the prediction. +#' - `slather()` any frosting that has been included in the `epi_workflow`. #' #' @param object An epi_workflow that has been fit by #' [workflows::fit.workflow()] @@ -136,13 +142,13 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' #' @return #' A data frame of model predictions, with as many rows as `new_data` has. -#' If `new_data` is an `epi_df` or a data frame with `time_value` or +#' If `new_data` is an `epiprocess::epi_df` or a data frame with `time_value` or #' `geo_value` columns, then the result will have those as well. 
#' #' @name predict-epi_workflow #' @export #' @examples -#' jhu <- case_death_rate_subset +#' jhu <- covid_case_death_rates #' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -157,7 +163,7 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' preds predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), ...) { if (!workflows::is_trained_workflow(object)) { - cli::cli_abort(c( + cli_abort(c( "Can't predict on an untrained epi_workflow.", i = "Do you need to call `fit()`?" )) @@ -167,6 +173,7 @@ predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), . components$forged <- hardhat::forge(new_data, blueprint = components$mold$blueprint ) + components$keys <- grab_forged_keys(components$forged, object, new_data) components <- apply_frosting(object, components, new_data, type = type, opts = opts, ...) components$predictions @@ -176,6 +183,11 @@ predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), . #' Augment data with predictions #' +#' `augment()`, unlike `forecast()`, has the goal of modifying the training +#' data, rather than just producing new forecasts. It does a prediction on +#' `new_data`, which will produce a prediction for most `time_values`, and then +#' adds `.pred` as a column to `new_data` and returns the resulting join. +#' #' @param x A trained epi_workflow #' @param new_data A epi_df of predictors #' @param ... Arguments passed on to the predict method. @@ -188,8 +200,8 @@ augment.epi_workflow <- function(x, new_data, ...) { join_by <- key_colnames(predictions) } else { cli_abort(c( - "Cannot determine how to join new_data with the predictions.", - "Try converting new_data to an epi_df with `as_epi_df(new_data)`." + "Cannot determine how to join `new_data` with the `predictions`.", + "Try converting `new_data` to an {.cls epi_df} with `as_epi_df(new_data)`." 
)) } complete_overlap <- intersect(names(new_data), join_by) @@ -227,27 +239,31 @@ print.epi_workflow <- function(x, ...) { } -#' Produce a forecast from an epi workflow +#' Produce a forecast from an epi workflow and its training data +#' +#' `forecast.epi_workflow` predicts by restricting the training data to the +#' latest available data, and predicting on that. It binds together +#' `get_test_data()` and `predict()`. #' #' @param object An epi workflow. #' @param ... Not used. -#' @param fill_locf Logical. Should we use locf to fill in missing data? -#' @param n_recent Integer or NULL. If filling missing data with locf = TRUE, -#' how far back are we willing to tolerate missing data? Larger values allow -#' more filling. The default NULL will determine this from the the recipe. For -#' example, suppose n_recent = 3, then if the 3 most recent observations in any -#' geo_value are all NA’s, we won’t be able to fill anything, and an error -#' message will be thrown. (See details.) -#' @param forecast_date By default, this is set to the maximum time_value in x. -#' But if there is data latency such that recent NA's should be filled, this may -#' be after the last available time_value. #' #' @return A forecast tibble. #' #' @export -forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) { - rlang::check_dots_empty() - +#' @examples +#' jhu <- covid_case_death_rates %>% +#' filter(time_value > "2021-08-01") +#' +#' r <- epi_recipe(jhu) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_naomit() +#' +#' epi_workflow(r, parsnip::linear_reg()) %>% +#' fit(jhu) %>% +#' forecast() forecast.epi_workflow <- function(object, ...) 
{ if (!object$trained) { cli_abort(c( "You cannot `forecast()` a {.cls workflow} that has not been trained.", @@ -268,10 +284,7 @@ forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NUL test_data <- get_test_data( hardhat::extract_preprocessor(object), - object$original_data, - fill_locf = fill_locf, - n_recent = n_recent %||% Inf, - forecast_date = forecast_date %||% frosting_fd %||% max(object$original_data$time_value) + object$original_data ) predict(object, new_data = test_data) diff --git a/R/epipredict-package.R b/R/epipredict-package.R index ad0f95295..fae8c3c65 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -1,5 +1,5 @@ ## usethis namespace: start -#' @import epiprocess parsnip +#' @import epiprocess parsnip epidatasets #' @importFrom checkmate assert_class assert_numeric #' @importFrom checkmate test_character test_date test_function #' @importFrom checkmate test_integerish test_logical @@ -7,11 +7,14 @@ #' @importFrom cli cli_abort cli_warn #' @importFrom dplyr arrange across all_of any_of bind_cols bind_rows group_by #' @importFrom dplyr full_join relocate summarise everything +#' @importFrom dplyr inner_join #' @importFrom dplyr summarize filter mutate select left_join rename ungroup +#' @importFrom epiprocess growth_rate growth_rate_params is_epi_df +#' @importFrom lifecycle deprecated +#' @importFrom magrittr extract2 #' @importFrom rlang := !! %||% as_function global_env set_names !!! 
caller_arg #' @importFrom rlang is_logical is_true inject enquo enquos expr sym arg_match #' @importFrom stats poly predict lm residuals quantile #' @importFrom tibble as_tibble -na_chr <- NA_character_ ## usethis namespace: end NULL diff --git a/R/extract.R b/R/extract.R index e227b59b1..2e06567e2 100644 --- a/R/extract.R +++ b/R/extract.R @@ -13,7 +13,7 @@ #' @examples #' f <- frosting() %>% #' layer_predict() %>% -#' layer_residual_quantiles(quantile_levels = c(0.0275, 0.975), symmetrize = FALSE) %>% +#' layer_residual_quantiles(symmetrize = FALSE) %>% #' layer_naomit(.pred) #' #' extract_argument(f, "layer_residual_quantiles", "symmetrize") diff --git a/R/extrapolate_quantiles.R b/R/extrapolate_quantiles.R index 3362e339e..3736a2772 100644 --- a/R/extrapolate_quantiles.R +++ b/R/extrapolate_quantiles.R @@ -1,65 +1,68 @@ -#' Summarize a distribution with a set of quantiles +#' Extrapolate the quantiles to new quantile levels #' -#' @param x a `distribution` vector +#' This both interpolates between quantile levels already defined in `x` and +#' extrapolates quantiles outside their bounds. The interpolation method is +#' determined by the `quantile` argument `middle`, which can be either `"cubic"` +#' for a (Hyman) cubic spline interpolation, or `"linear"` for simple linear +#' interpolation. +#' +#' There is only one extrapolation method for values greater than the largest +#' available quantile level or smaller than the smallest available quantile +#' level. It assumes a roughly exponential tail, whose decay rate and offset is +#' derived from the slope of the two most extreme quantile levels on a logistic +#' scale. See the internal function `tail_extrapolate()` for the exact +#' implementation. +#' +#' This function takes a `quantile_pred` vector and returns the same +#' type of object, expanded to include +#' *additional* quantiles computed at `probs`. 
If you want behaviour more +#' similar to [stats::quantile()], then `quantile(x,...)` may be more +#' appropriate. +#' +#' @param x A vector of class `quantile_pred`. #' @param probs a vector of probabilities at which to calculate quantiles #' @param replace_na logical. If `x` contains `NA`'s, these are imputed if -#' possible (if `TRUE`) or retained (if `FALSE`). This only effects -#' elements of class `dist_quantiles`. +#' possible (if `TRUE`) or retained (if `FALSE`). #' @param ... additional arguments passed on to the `quantile` method #' -#' @return a `distribution` vector containing `dist_quantiles`. Any elements -#' of `x` which were originally `dist_quantiles` will now have a superset +#' @return a `quantile_pred` vector. Each element +#' of `x` will now have a superset #' of the original `quantile_values` (the union of those and `probs`). #' @export #' #' @examples -#' library(distributional) -#' dstn <- dist_normal(c(10, 2), c(5, 10)) -#' extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) +#' dstn <- quantile_pred(rbind(1:4, 8:11), c(.2, .4, .6, .8)) +#' # extra quantiles are appended +#' as_tibble(extrapolate_quantiles(dstn, probs = c(0.25, 0.5, 0.75))) #' -#' dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) -#' # because this distribution is already quantiles, any extra quantiles are -#' # appended -#' extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) -#' -#' dstn <- c( -#' dist_normal(c(10, 2), c(5, 10)), -#' dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) -#' ) -#' extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) +#' extrapolate_quantiles(dstn, probs = c(0.0001, 0.25, 0.5, 0.75, 0.99999)) extrapolate_quantiles <- function(x, probs, replace_na = TRUE, ...) { UseMethod("extrapolate_quantiles") } #' @export -#' @importFrom vctrs vec_data -extrapolate_quantiles.distribution <- function(x, probs, replace_na = TRUE, ...) 
{ - rlang::check_dots_empty() +extrapolate_quantiles.quantile_pred <- function(x, probs, replace_na = TRUE, ...) { arg_is_lgl_scalar(replace_na) arg_is_probabilities(probs) if (is.unsorted(probs)) probs <- sort(probs) - dstn <- lapply(vec_data(x), extrapolate_quantiles, probs = probs, replace_na = replace_na) - new_vctr(dstn, vars = NULL, class = "distribution") -} + orig_probs <- x %@% "quantile_levels" + orig_values <- as.matrix(x) -#' @export -extrapolate_quantiles.dist_default <- function(x, probs, replace_na = TRUE, ...) { - values <- quantile(x, probs, ...) - new_quantiles(values = values, quantile_levels = probs) -} - -#' @export -extrapolate_quantiles.dist_quantiles <- function(x, probs, replace_na = TRUE, ...) { - orig_probs <- field(x, "quantile_levels") - orig_values <- field(x, "values") - new_probs <- c(orig_probs, probs) - dups <- duplicated(new_probs) if (!replace_na || !anyNA(orig_values)) { - new_values <- c(orig_values, quantile(x, probs, ...)) + all_values <- cbind(orig_values, quantile(x, probs, ...)) } else { - nas <- is.na(orig_values) - orig_values[nas] <- quantile(x, orig_probs[nas], ...) - new_values <- c(orig_values, quantile(x, probs, ...)) + newx <- quantile(x, orig_probs, ...) %>% + hardhat::quantile_pred(orig_probs) + all_values <- cbind(as.matrix(newx), quantile(newx, probs, ...)) } - new_quantiles(new_values[!dups], new_probs[!dups]) + all_probs <- c(orig_probs, probs) + dups <- duplicated(all_probs) + all_values <- all_values[, !dups, drop = FALSE] + all_probs <- all_probs[!dups] + o <- order(all_probs) + + hardhat::quantile_pred( + all_values[, o, drop = FALSE], + quantile_levels = all_probs[o] + ) } diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 55808b803..617d703e7 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -1,22 +1,45 @@ #' Predict the future with today's value #' -#' This is a simple forecasting model for -#' [epiprocess::epi_df][epiprocess::as_epi_df] data. 
It uses the most recent -#' observation as the -#' forecast for any future date, and produces intervals based on the quantiles -#' of the residuals of such a "flatline" forecast over all available training -#' data. +#' @description This is a simple forecasting model for +#' [epiprocess::epi_df][epiprocess::as_epi_df] data. It uses the most recent +#' observation as the forecast for any future date, and produces intervals +#' based on the quantiles of the residuals of such a "flatline" forecast over +#' all available training data. #' #' By default, the predictive intervals are computed separately for each -#' combination of key values (`geo_value` + any additional keys) in the -#' `epi_data` argument. +#' combination of key values (`geo_value` + any additional keys) in the +#' `epi_data` argument. #' #' This forecaster is very similar to that used by the -#' [COVID19ForecastHub](https://covid19forecasthub.org) +#' [COVID19ForecastHub](https://covid19forecasthub.org) +#' +#' @details +#' Here is (roughly) the code for the `flatline_forecaster()` applied to the +#' `case_rate` for `epidatasets::covid_case_death_rates`. +#' +#' ```{r} +#' jhu <- covid_case_death_rates %>% +#' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +#' r <- epi_recipe(covid_case_death_rates) %>% +#' step_epi_ahead(case_rate, ahead = 7, skip = TRUE) %>% +#' recipes::update_role(case_rate, new_role = "predictor") %>% +#' recipes::add_role(all_of(key_colnames(jhu)), new_role = "predictor") +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_residual_quantiles() %>% +#' layer_add_forecast_date() %>% +#' layer_add_target_date() %>% +#' layer_threshold(starts_with(".pred")) +#' +#' eng <- linear_reg() %>% set_engine("flatline") +#' wf <- epi_workflow(r, eng, f) %>% fit(jhu) +#' preds <- forecast(wf) +#' ``` #' #' @param epi_data An [epiprocess::epi_df][epiprocess::as_epi_df] #' @param outcome A scalar character for the column name we wish to predict. 
-#' @param args_list A list of dditional arguments as created by the +#' @param args_list A list of additional arguments as created by the #' [flatline_args_list()] constructor function. #' #' @return A data frame of point (and optionally interval) forecasts at a single @@ -24,7 +47,7 @@ #' @export #' #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' dplyr::filter(time_value >= as.Date("2021-12-01")) #' #' out <- flatline_forecaster(jhu, "death_rate") @@ -34,7 +57,7 @@ flatline_forecaster <- function( args_list = flatline_args_list()) { validate_forecaster_inputs(epi_data, outcome, "time_value") if (!inherits(args_list, c("flat_fcast", "alist"))) { - cli_abort("`args_list` was not created using `flatline_args_list().") + cli_abort("`args_list` was not created using `flatline_args_list()`.") } keys <- key_colnames(epi_data) ek <- kill_time_value(keys) @@ -63,14 +86,9 @@ flatline_forecaster <- function( eng <- linear_reg(engine = "flatline") - wf <- epi_workflow(r, eng, f) - wf <- fit(wf, epi_data) - preds <- suppressWarnings(forecast( - wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = forecast_date - )) %>% + wf <- epi_workflow(r, eng, f) %>% + fit(epi_data) + preds <- suppressWarnings(forecast(wf)) %>% as_tibble() %>% select(-time_value) @@ -113,11 +131,10 @@ flatline_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, - quantile_levels = c(0.05, 0.95), + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, ...) 
{ rlang::check_dots_empty() arg_is_scalar(ahead, n_training) @@ -129,11 +146,10 @@ flatline_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn(c( + cli_warn(c( "`forecast_date` + `ahead` must equal `target_date`.", i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." )) @@ -149,8 +165,7 @@ flatline_args_list <- function( quantile_levels, symmetrize, nonneg, - quantile_by_key, - nafill_buffer + quantile_by_key ), class = c("flat_fcast", "alist") ) diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R index 3e0eb1aaa..3deba254e 100644 --- a/R/flusight_hub_formatter.R +++ b/R/flusight_hub_formatter.R @@ -1,7 +1,6 @@ location_to_abbr <- function(location) { dictionary <- state_census %>% - dplyr::mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr @@ -12,7 +11,6 @@ location_to_abbr <- function(location) { abbr_to_location <- function(abbr) { dictionary <- state_census %>% - dplyr::mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr @@ -57,7 +55,8 @@ abbr_to_location <- function(abbr) { #' #' @examples #' library(dplyr) -#' weekly_deaths <- case_death_rate_subset %>% +#' library(epiprocess) +#' weekly_deaths <- covid_case_death_rates %>% #' filter( #' time_value >= as.Date("2021-09-01"), #' geo_value %in% c("ca", "ny", "dc", "ga", "vt") @@ -105,12 +104,11 @@ flusight_hub_formatter.data.frame <- function( object <- object %>% # combine the predictions and the distribution - mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% - tidyr::unnest(.pred_distn) %>% + 
pivot_quantiles_longer(.pred_distn) %>% # now we create the correct column names rename( - value = values, - output_type_id = quantile_levels, + value = .pred_distn_value, + output_type_id = .pred_distn_quantile_level, reference_date = forecast_date ) %>% # convert to fips codes, and add any constant cols passed in ... diff --git a/R/frosting.R b/R/frosting.R index 8474edbdf..dca80392b 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -1,4 +1,4 @@ -#' Add frosting to a workflow +#' Add/remove/update the `frosting` of an `epi_workflow` #' #' @param x A workflow #' @param frosting A frosting object created using `frosting()`. @@ -8,8 +8,7 @@ #' @export #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -89,7 +88,7 @@ validate_has_postprocessor <- function(x, ..., call = caller_env()) { "The workflow must have a frosting postprocessor.", i = "Provide one with `add_frosting()`." ) - rlang::abort(message, call = call) + cli_abort(message, call = call) } invisible(x) } @@ -127,8 +126,7 @@ update_frosting <- function(x, frosting, ...) { #' #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -248,10 +246,10 @@ new_frosting <- function() { } -#' Create frosting for postprocessing predictions +#' Create frosting for post-processing predictions #' -#' This generates a postprocessing container (much like `recipes::recipe()`) -#' to hold steps for postprocessing predictions. +#' This generates a post-processing container (much like `recipes::recipe()`) +#' to hold steps for post-processing predictions. 
#' #' The arguments are currently placeholders and must be NULL #' @@ -262,13 +260,12 @@ new_frosting <- function() { #' @export #' #' @examples -#' library(dplyr) -#' # Toy example to show that frosting can be created and added for postprocessing +#' # Toy example to show that frosting can be created and added for post-processing #' f <- frosting() #' wf <- epi_workflow() %>% add_frosting(f) #' #' # A more realistic example -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -288,7 +285,7 @@ new_frosting <- function() { #' p frosting <- function(layers = NULL, requirements = NULL) { if (!is_null(layers) || !is_null(requirements)) { - cli::cli_abort( + cli_abort( "Currently, no arguments to `frosting()` are allowed to be non-null." ) } @@ -325,9 +322,9 @@ extract_frosting.epi_workflow <- function(x, ...) { } } -#' Apply postprocessing to a fitted workflow +#' Apply post-processing to a fitted workflow #' -#' This function is intended for internal use. It implements postprocessing +#' This function is intended for internal use. It implements post-processing #' inside of the `predict()` method for a fitted workflow. #' #' @param workflow An object of class workflow @@ -345,7 +342,7 @@ apply_frosting <- function(workflow, ...) { apply_frosting.default <- function(workflow, components, ...) { if (has_postprocessor(workflow)) { cli_abort(c( - "Postprocessing is only available for epi_workflows currently.", + "Post-processing is only available for epi_workflows currently.", i = "Can you use `epi_workflow()` instead of `workflow()`?" )) } @@ -356,7 +353,6 @@ apply_frosting.default <- function(workflow, components, ...) 
{ #' @rdname apply_frosting #' @importFrom rlang is_null -#' @importFrom rlang abort #' @param type,opts forwarded (along with `...`) to [`predict.model_fit()`] and #' [`slather()`] for supported layers #' @export @@ -368,21 +364,21 @@ apply_frosting.epi_workflow <- components$predictions <- predict( the_fit, components$forged$predictors, ... ) - components$predictions <- dplyr::bind_cols( + components$predictions <- bind_cols( components$keys, components$predictions ) return(components) } if (!has_postprocessor_frosting(workflow)) { - cli_warn(c( + cli_warn(paste( "Only postprocessors of class {.cls frosting} are allowed.", "Returning unpostprocessed predictions." )) components$predictions <- predict( the_fit, components$forged$predictors, type, opts, ... ) - components$predictions <- dplyr::bind_cols( + components$predictions <- bind_cols( components$keys, components$predictions ) return(components) diff --git a/R/get_test_data.R b/R/get_test_data.R index 694e73b06..174308242 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -1,61 +1,36 @@ #' Get test data for prediction based on longest lag period #' -#' Based on the longest lag period in the recipe, -#' `get_test_data()` creates an [epi_df][epiprocess::as_epi_df] -#' with columns `geo_value`, `time_value` -#' and other variables in the original dataset, -#' which will be used to create features necessary to produce forecasts. +#' If `predict()` is given the full training dataset, it will produce a +#' prediction for every `time_value` which has enough data. For most cases, this +#' generates predictions for `time_values` where the `outcome` has already been +#' observed. `get_test_data()` is designed to restrict the given dataset to the +#' minimum amount needed to produce a forecast on the `forecast_date` for future +#' data, rather than a prediction on past `time_value`s. Primarily this is +#' based on the longest lag period in the recipe. 
#' #' The minimum required (recent) data to produce a forecast is equal to #' the maximum lag requested (on any predictor) plus the longest horizon #' used if growth rate calculations are requested by the recipe. This is #' calculated internally. #' -#' It also optionally fills missing values -#' using the last-observation-carried-forward (LOCF) method. If this -#' is not possible (say because there would be only `NA`'s in some location), -#' it will produce an error suggesting alternative options to handle missing -#' values with more advanced techniques. -#' #' @param recipe A recipe object. #' @param x An epi_df. The typical usage is to #' pass the same data as that used for fitting the recipe. -#' @param fill_locf Logical. Should we use `locf` to fill in missing data? -#' @param n_recent Integer or NULL. If filling missing data with `locf = TRUE`, -#' how far back are we willing to tolerate missing data? Larger values allow -#' more filling. The default `NULL` will determine this from the -#' the `recipe`. For example, suppose `n_recent = 3`, then if the -#' 3 most recent observations in any `geo_value` are all `NA`’s, we won’t be -#' able to fill anything, and an error message will be thrown. (See details.) -#' @param forecast_date By default, this is set to the maximum -#' `time_value` in `x`. But if there is data latency such that recent `NA`'s -#' should be filled, this may be _after_ the last available `time_value`. #' #' @return An object of the same type as `x` with columns `geo_value`, `time_value`, any additional #' keys, as well other variables in the original dataset. 
#' @examples #' # create recipe -#' rec <- epi_recipe(case_death_rate_subset) %>% +#' rec <- epi_recipe(covid_case_death_rates) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) -#' get_test_data(recipe = rec, x = case_death_rate_subset) +#' get_test_data(recipe = rec, x = covid_case_death_rates) #' @importFrom rlang %@% +#' @importFrom stats na.omit #' @export -get_test_data <- function( - recipe, - x, - fill_locf = FALSE, - n_recent = NULL, - forecast_date = max(x$time_value)) { +get_test_data <- function(recipe, x) { if (!is_epi_df(x)) cli_abort("`x` must be an `epi_df`.") - arg_is_lgl(fill_locf) - arg_is_scalar(fill_locf) - arg_is_scalar(n_recent, allow_null = TRUE) - if (!is.null(n_recent) && is.finite(n_recent)) { - arg_is_pos_int(n_recent, allow_null = TRUE) - } - if (!is.null(n_recent)) n_recent <- abs(n_recent) # in case they passed -Inf check <- hardhat::check_column_names(x, colnames(recipe$template)) if (!check$ok) { @@ -64,103 +39,40 @@ get_test_data <- function( i = "The following required columns are missing: {check$missing_names}" )) } - if (class(forecast_date) != class(x$time_value)) { - cli_abort("`forecast_date` must be the same class as `x$time_value`.") - } - if (forecast_date < max(x$time_value)) { - cli_abort("`forecast_date` must be no earlier than `max(x$time_value)`") - } min_lags <- min(map_dbl(recipe$steps, ~ min(.x$lag %||% Inf)), Inf) max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0)), 0) max_horizon <- max(map_dbl(recipe$steps, ~ max(.x$horizon %||% 0)), 0) max_slide <- max(map_dbl(recipe$steps, ~ max(.x$before %||% 0)), 0) min_required <- max_lags + max_horizon + max_slide - if (is.null(n_recent)) n_recent <- min_required + 1 # one extra for filling - if (n_recent <= min_required) n_recent <- min_required + n_recent + keep <- max_lags + max_horizon # CHECK: Error out if insufficient training data # Probably needs a fix based on 
the time_type of the epi_df avail_recent <- diff(range(x$time_value)) - if (avail_recent < min_required) { + if (avail_recent < keep) { cli_abort(c( "You supplied insufficient recent data for this recipe. ", "!" = "You need at least {min_required} days of data,", "!" = "but `x` contains only {avail_recent}." )) } - + max_time_value <- x %>% + na.omit() %>% + pull(time_value) %>% + max() x <- arrange(x, time_value) groups <- epi_keys_only(recipe) # If we skip NA completion, we remove undesirably early time values # Happens globally, over all groups - keep <- max(n_recent, min_required + 1) - x <- filter(x, forecast_date - time_value <= keep) - - # Pad with explicit missing values up to and including the forecast_date - # x is grouped here - x <- pad_to_end(x, groups, forecast_date) %>% - group_by(across(all_of(groups))) + x <- filter(x, max_time_value - time_value <= keep) # If all(lags > 0), then we get rid of recent data if (min_lags > 0 && min_lags < Inf) { - x <- filter(x, forecast_date - time_value >= min_lags) - } - - # Now, fill forward missing data if requested - if (fill_locf) { - cannot_be_used <- x %>% - filter(forecast_date - time_value <= n_recent) %>% - mutate(fillers = forecast_date - time_value > min_required) %>% - summarize( - across( - -any_of(key_colnames(recipe)), - ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) - ), - .groups = "drop" - ) %>% - select(-fillers) %>% - summarise(across(-any_of(key_colnames(recipe)), ~ any(.x))) %>% - unlist() - if (any(cannot_be_used)) { - bad_vars <- names(cannot_be_used)[cannot_be_used] - if (recipes::is_trained(recipe)) { - cli_abort(c( - "The variables {.var {bad_vars}} have too many recent missing", - `!` = "values to be filled automatically. ", - i = "You should either choose `n_recent` larger than its current ", - i = "value {n_recent}, or perform NA imputation manually, perhaps with ", - i = "{.code recipes::step_impute_*()} or with {.code tidyr::fill()}." 
- )) - } - } - x <- tidyr::fill(x, !time_value) + x <- filter(x, max_time_value - time_value >= min_lags) } - filter(x, forecast_date - time_value <= min_required) %>% - ungroup() -} - -pad_to_end <- function(x, groups, end_date) { - itval <- guess_period(c(x$time_value, end_date), "time_value") - completed_time_values <- x %>% - group_by(across(all_of(groups))) %>% - summarise( - time_value = rlang::list2( - time_value = Seq(max(time_value) + itval, end_date, itval) - ) - ) %>% - unnest("time_value") %>% - mutate(time_value = vctrs::vec_cast(time_value, x$time_value)) - - bind_rows(x, completed_time_values) %>% - arrange(across(all_of(c("time_value", groups)))) -} - -Seq <- function(from, to, by) { - if (from > to) { - return(NULL) - } - seq(from = from, to = to, by = by) + filter(x, max_time_value - time_value <= keep) %>% + epiprocess::ungroup() } diff --git a/R/key_colnames.R b/R/key_colnames.R index b9ebde5dc..9e0d44dcf 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -1,20 +1,25 @@ #' @export -key_colnames.recipe <- function(x, ...) { +key_colnames.recipe <- function(x, ..., exclude = character()) { geo_key <- x$var_info$variable[x$var_info$role %in% "geo_value"] time_key <- x$var_info$variable[x$var_info$role %in% "time_value"] keys <- x$var_info$variable[x$var_info$role %in% "key"] - c(geo_key, keys, time_key) %||% character(0L) + full_key <- c(geo_key, keys, time_key) %||% character(0L) + full_key[!full_key %in% exclude] } #' @export -key_colnames.epi_workflow <- function(x, ...) 
{ +key_colnames.epi_workflow <- function(x, ..., exclude = character()) { # safer to look at the mold than the preprocessor mold <- hardhat::extract_mold(x) - molded_names <- names(mold$extras$roles) - geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value) - time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value) - keys <- names(mold$extras$roles[molded_names %in% "key"]$key) - c(geo_key, keys, time_key) %||% character(0L) + molded_roles <- mold$extras$roles + extras <- bind_cols(molded_roles$geo_value, molded_roles$key, molded_roles$time_value) + full_key <- names(extras) + if (length(full_key) == 0L) { + # No epikeytime role assignment; infer from all columns: + potential_keys <- c("geo_value", "time_value") + full_key <- potential_keys[potential_keys %in% names(bind_cols(molded_roles))] + } + full_key[!full_key %in% exclude] } kill_time_value <- function(v) { diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 3d5ea010b..72bc33703 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -1,11 +1,13 @@ -#' Postprocessing step to add the forecast date +#' Post-processing step to add the forecast date #' #' @param frosting a `frosting` postprocessor #' @param forecast_date The forecast date to add as a column to the `epi_df`. -#' For most cases, this should be specified in the form "yyyy-mm-dd". Note that -#' when the forecast date is left unspecified, it is set to the maximum time -#' value from the data used in pre-processing, fitting the model, and -#' postprocessing. +#' For most cases, this should be specified in the form "yyyy-mm-dd". Note +#' that when the forecast date is left unspecified, it is set to one of two +#' values. If there is a `step_adjust_latency` step present, it uses the +#' `forecast_date` as set in that function. Otherwise, it uses the maximum +#' `time_value` across the data used for pre-processing, fitting the model, +#' and post-processing. 
#' @param id a random id string #' #' @return an updated `frosting` postprocessor @@ -13,14 +15,13 @@ #' @details To use this function, either specify a forecast date or leave the #' forecast date unspecifed here. In the latter case, the forecast date will #' be set as the maximum time value from the data used in pre-processing, -#' fitting the model, and postprocessing. In any case, when the forecast date is +#' fitting the model, and post-processing. In any case, when the forecast date is #' less than the maximum `as_of` value (from the data used pre-processing, -#' model fitting, and postprocessing), an appropriate warning will be thrown. +#' model fitting, and post-processing), an appropriate warning will be thrown. #' #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -86,19 +87,16 @@ layer_add_forecast_date_new <- function(forecast_date, id) { } #' @export +#' @importFrom workflows extract_preprocessor slather.layer_add_forecast_date <- function(object, components, workflow, new_data, ...) 
{ rlang::check_dots_empty() - if (is.null(object$forecast_date)) { - max_time_value <- as.Date(max( - workflows::extract_preprocessor(workflow)$max_time_value, + forecast_date <- object$forecast_date %||% + get_forecast_date_in_layer( + extract_recipe(workflow), workflow$fit$meta$max_time_value, - max(new_data$time_value) - )) - forecast_date <- max_time_value - } else { - forecast_date <- object$forecast_date - } + new_data + ) expected_time_type <- attr( workflows::extract_preprocessor(workflow)$template, "metadata" diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 094ec8501..8c60dfbfc 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -1,27 +1,31 @@ -#' Postprocessing step to add the target date +#' Post-processing step to add the target date #' #' @param frosting a `frosting` postprocessor -#' @param target_date The target date to add as a column to the -#' `epi_df`. If there's a forecast date specified in a layer, then -#' it is the forecast date plus `ahead` (from `step_epi_ahead` in -#' the `epi_recipe`). Otherwise, it is the maximum `time_value` -#' (from the data used in pre-processing, fitting the model, and -#' postprocessing) plus `ahead`, where `ahead` has been specified in -#' preprocessing. The user may override these by specifying a -#' target date of their own (of the form "yyyy-mm-dd"). +#' @param target_date The target date to add as a column to the `epi_df`. If +#' there's a forecast date specified upstream (either in a +#' `step_adjust_latency` or in a `layer_forecast_date`), then it is the +#' forecast date plus `ahead` (from `step_epi_ahead` in the `epi_recipe`). +#' Otherwise, it is the maximum `time_value` (from the data used in +#' pre-processing, fitting the model, and post-processing) plus `ahead`, where +#' `ahead` has been specified in preprocessing. The user may override these by +#' specifying a target date of their own (of the form "yyyy-mm-dd"). 
#' @param id a random id string #' #' @return an updated `frosting` postprocessor #' -#' @details By default, this function assumes that a value for `ahead` -#' has been specified in a preprocessing step (most likely in -#' `step_epi_ahead`). Then, `ahead` is added to the maximum `time_value` -#' in the test data to get the target date. +#' @details By default, this function assumes that a value for `ahead` has been +#' specified in a preprocessing step (most likely in `step_epi_ahead`). Then, +#' `ahead` is added to the `forecast_date` in the test data to get the target +#' date. `forecast_date` itself can be set in 3 ways: +#' 1. The default `forecast_date` is simply the maximum `time_value` over every +#' dataset used (prep, training, and prediction). +#' 2. if `step_adjust_latency` is present, it will typically use the training +#' `epi_df`'s `as_of` +#' 3. `layer_add_forecast_date`, which inherits from 2 if not manually specifed #' #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -41,8 +45,14 @@ #' p <- forecast(wf1) #' p #' -#' # Use ahead + max time value from pre, fit, post -#' # which is the same if include `layer_add_forecast_date()` +#' # Use ahead + forecast_date from adjust_latency +#' # setting the `as_of` to something realistic +#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +#' r <- epi_recipe(jhu) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_adjust_latency(method = "extend_ahead") %>% +#' step_epi_naomit() #' f2 <- frosting() %>% #' layer_predict() %>% #' layer_add_target_date() %>% @@ -52,15 +62,26 @@ #' p2 <- forecast(wf2) #' p2 #' -#' # Specify own target date +#' # Use ahead + max time value from pre, fit, post +#' # which is the same if include 
`layer_add_forecast_date()` #' f3 <- frosting() %>% #' layer_predict() %>% -#' layer_add_target_date(target_date = "2022-01-08") %>% +#' layer_add_target_date() %>% #' layer_naomit(.pred) #' wf3 <- wf %>% add_frosting(f3) #' -#' p3 <- forecast(wf3) -#' p3 +#' p3 <- forecast(wf2) +#' p2 +#' +#' # Specify own target date +#' f4 <- frosting() %>% +#' layer_predict() %>% +#' layer_add_target_date(target_date = "2022-01-08") %>% +#' layer_naomit(.pred) +#' wf4 <- wf %>% add_frosting(f4) +#' +#' p4 <- forecast(wf4) +#' p4 layer_add_target_date <- function(frosting, target_date = NULL, id = rand_id("add_target_date")) { arg_is_chr_scalar(id) @@ -112,13 +133,13 @@ slather.layer_add_target_date <- function(object, components, workflow, ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") target_date <- forecast_date + ahead } else { - max_time_value <- as.Date(max( - workflows::extract_preprocessor(workflow)$max_time_value, + forecast_date <- get_forecast_date_in_layer( + extract_preprocessor(workflow), workflow$fit$meta$max_time_value, - max(new_data$time_value) - )) + new_data + ) ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") - target_date <- max_time_value + ahead + target_date <- forecast_date + ahead } object$target_date <- target_date diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 8d16ba32f..fffacf21a 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -51,19 +51,19 @@ #' in an additional `` named `.pred_distn_all` containing 2-column #' [tibble::tibble()]'s. For each #' desired combination of `key`'s, the tibble will contain one row per ahead -#' with the associated [dist_quantiles()]. +#' with the associated [quantile_pred()]. 
#' @export #' #' @examples -#' library(dplyr) -#' r <- epi_recipe(case_death_rate_subset) %>% +#' library(recipes) +#' r <- epi_recipe(covid_case_death_rates) %>% #' # data is "daily", so we fit this to 1 ahead, the result will contain #' # 1 day ahead residuals #' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% -#' recipes::update_role(death_rate, new_role = "predictor") %>% -#' recipes::add_role(time_value, geo_value, new_role = "predictor") +#' update_role(death_rate, new_role = "predictor") %>% +#' add_role(time_value, geo_value, new_role = "predictor") #' -#' forecast_date <- max(case_death_rate_subset$time_value) +#' forecast_date <- max(covid_case_death_rates$time_value) #' #' f <- frosting() %>% #' layer_predict() %>% @@ -71,7 +71,7 @@ #' #' eng <- linear_reg(engine = "flatline") #' -#' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset) +#' wf <- epi_workflow(r, eng, f) %>% fit(covid_case_death_rates) #' preds <- forecast(wf) %>% #' select(-time_value) %>% #' mutate(forecast_date = forecast_date) @@ -82,24 +82,23 @@ #' pivot_quantiles_wider(.pred_distn) %>% #' mutate(target_date = forecast_date + ahead) #' -#' if (require("ggplot2")) { -#' four_states <- c("ca", "pa", "wa", "ny") -#' preds %>% -#' filter(geo_value %in% four_states) %>% -#' ggplot(aes(target_date)) + -#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + -#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + -#' geom_line(aes(y = .pred), color = "orange") + -#' geom_line( -#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), -#' aes(x = time_value, y = death_rate) -#' ) + -#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + -#' labs(x = "Date", y = "Death rate") + -#' facet_wrap(~geo_value, scales = "free_y") + -#' theme_bw() + -#' geom_vline(xintercept = forecast_date) -#' } +#' library(ggplot2) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' 
ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = covid_case_death_rates %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Death rate") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) layer_cdc_flatline_quantiles <- function( frosting, ..., @@ -163,7 +162,7 @@ slather.layer_cdc_flatline_quantiles <- } the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { - cli::cli_warn(c( + cli_warn(c( "Predictions for this workflow were not produced by the {.cls flatline}", "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." )) @@ -176,7 +175,7 @@ slather.layer_cdc_flatline_quantiles <- if (length(object$by_key) > 0L) { cols_in_preds <- hardhat::check_column_names(p, object$by_key) if (!cols_in_preds$ok) { - cli::cli_warn(c( + cli_warn(paste( "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", "Ignoring these." )) @@ -184,7 +183,7 @@ slather.layer_cdc_flatline_quantiles <- if (inherits(the_fit, "_flatline")) { cols_in_resids <- hardhat::check_column_names(r, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(c( + cli_warn(paste( "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) @@ -201,7 +200,7 @@ slather.layer_cdc_flatline_quantiles <- ) cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(c( + cli_warn(paste( "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." 
)) @@ -228,7 +227,6 @@ slather.layer_cdc_flatline_quantiles <- ) %>% select(all_of(c(avail_grps, ".pred_distn_all"))) - # res <- check_pname(res, components$predictions, object) components$predictions <- left_join( components$predictions, res, @@ -266,11 +264,10 @@ propagate_samples <- function( } } res <- res[aheads] + res_quantiles <- map(res, quantile, probs = quantile_levels) list(tibble( ahead = aheads, - .pred_distn = map_vec( - res, ~ dist_quantiles(quantile(.x, quantile_levels), quantile_levels) - ) + .pred_distn = quantile_pred(do.call(rbind, res_quantiles), quantile_levels) )) } diff --git a/R/layer_naomit.R b/R/layer_naomit.R index 209a663b4..5071bcc5c 100644 --- a/R/layer_naomit.R +++ b/R/layer_naomit.R @@ -11,8 +11,7 @@ #' @return an updated `frosting` postprocessor #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_point_from_distn.R b/R/layer_point_from_distn.R index f14008748..07f524470 100644 --- a/R/layer_point_from_distn.R +++ b/R/layer_point_from_distn.R @@ -1,6 +1,6 @@ #' Converts distributional forecasts to point forecasts #' -#' This function adds a postprocessing layer to extract a point forecast from +#' This function adds a post-processing layer to extract a point forecast from #' a distributional forecast. NOTE: With default arguments, this will remove #' information, so one should usually call this AFTER `layer_quantile_distn()` #' or set the `name` argument to something specific. 
@@ -16,8 +16,7 @@ #' @export #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -79,10 +78,10 @@ layer_point_from_distn_new <- function(type, name, id) { slather.layer_point_from_distn <- function(object, components, workflow, new_data, ...) { dstn <- components$predictions$.pred - if (!inherits(dstn, "distribution")) { - rlang::warn( + if (!(inherits(dstn, "quantile_pred") | inherits(dstn, "distribution"))) { + cli_warn( c("`layer_point_from_distn` requires distributional predictions.", - i = "These are of class {class(dstn)}. Ignoring this layer." + i = "These are of class {.cls {class(dstn)}}. Ignoring this layer." ) ) return(components) @@ -94,7 +93,12 @@ slather.layer_point_from_distn <- components$predictions$.pred <- dstn } else { dstn <- tibble(dstn = dstn) - dstn <- check_pname(dstn, components$predictions, object) + dstn <- check_name( + dstn, + components$predictions, + object, + newname = object$name + ) components$predictions <- mutate(components$predictions, !!!dstn) } components diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 4755083ce..cc77bf03b 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -1,15 +1,15 @@ #' Convert per-capita predictions to raw scale #' -#' `layer_population_scaling` creates a specification of a frosting layer -#' that will "undo" per-capita scaling. Typical usage would -#' load a dataset that contains state-level population, and use it to convert -#' predictions made from a rate-scale model to raw scale by multiplying by -#' the population. -#' Although, it is worth noting that there is nothing special about "population". -#' The function can be used to scale by any variable. Population is the -#' standard use case in the epidemiology forecasting scenario. 
Any value -#' passed will *multiply* the selected variables while the `rate_rescaling` -#' argument is a common *divisor* of the selected variables. +#' `layer_population_scaling` creates a specification of a frosting layer that +#' will "undo" per-capita scaling done in `step_population_scaling()`. +#' Typical usage would set `df` to be a dataset that contains a list of +#' population for the `geo_value`s, and use it to convert predictions made from +#' a raw scale model to rate-scale by dividing by the population. +#' Although, it is worth noting that there is nothing special about +#' "population", and the function can be used to scale by any variable. +#' Population is the standard use case in the epidemiology forecasting scenario. +#' Any value passed will *multiply* the selected variables while the +#' `rate_rescaling` argument is a common *divisor* of the selected variables. #' #' @param frosting a `frosting` postprocessor. The layer will be added to the #' sequence of operations for this frosting. @@ -17,14 +17,20 @@ #' for this step. See [recipes::selections()] for more details. #' @param df a data frame that contains the population data to be used for #' inverting the existing scaling. -#' @param by A (possibly named) character vector of variables to join by. +#' @param by A (possibly named) character vector of variables to join `df` onto +#' the `epi_df` by. #' -#' If `NULL`, the default, the function will perform a natural join, using all -#' variables in common across the `epi_df` produced by the `predict()` call -#' and the user-provided dataset. -#' If columns in that `epi_df` and `df` have the same name (and aren't -#' included in `by`), `.df` is added to the one from the user-provided data -#' to disambiguate. +#' If `NULL`, the default, the function will try to infer a reasonable set of +#' columns. 
First, it will try to join by all variables in the test data with +#' roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in `df`; +#' these roles are automatically set if you are using an `epi_df`, or you can +#' use, e.g., `update_role`. If no such roles are set, it will try to perform a +#' natural join, using variables in common between the training/test data and +#' population data. +#' +#' If columns in the training/testing data and `df` have the same name (and +#' aren't included in `by`), a `.df` suffix is added to the one from the +#' user-provided data to disambiguate. #' #' To join by different variables on the `epi_df` and `df`, use a named vector. #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value` @@ -47,7 +53,6 @@ #' @return an updated `frosting` postprocessor #' @export #' @examples -#' library(dplyr) #' jhu <- cases_deaths_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% #' select(geo_value, time_value, cases) @@ -135,6 +140,26 @@ slather.layer_population_scaling <- ) rlang::check_dots_empty() + if (is.null(object$by)) { + # Assume `layer_predict` has calculated the prediction keys and other + # layers don't change the prediction key colnames: + prediction_key_colnames <- names(components$keys) + lhs_potential_keys <- prediction_key_colnames + rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col)) + object$by <- intersect(lhs_potential_keys, rhs_potential_keys) + suggested_min_keys <- kill_time_value(lhs_potential_keys) + if (!all(suggested_min_keys %in% object$by)) { + cli_warn(c( + "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, + but {?wasn't/weren't} found in the population `df`.", + "i" = "Defaulting to join by {object$by}", + ">" = "Double-check whether column names on the population `df` match those expected in your predictions", + ">" = "Consider using population data with breakdowns by 
{suggested_min_keys}", + ">" = "Manually specify `by =` to silence" + ), class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys") + } + } + object$by <- object$by %||% intersect( epi_keys_only(components$predictions), colnames(select(object$df, !object$df_pop_col)) @@ -152,10 +177,12 @@ slather.layer_population_scaling <- suffix <- ifelse(object$create_new, object$suffix, "") col_to_remove <- setdiff(colnames(object$df), colnames(components$predictions)) - components$predictions <- left_join( + components$predictions <- inner_join( components$predictions, object$df, by = object$by, + relationship = "many-to-one", + unmatched = c("error", "drop"), suffix = c("", ".df") ) %>% mutate(across( diff --git a/R/layer_predict.R b/R/layer_predict.R index 6ca17ac24..623ab3391 100644 --- a/R/layer_predict.R +++ b/R/layer_predict.R @@ -1,11 +1,11 @@ -#' Prediction layer for postprocessing +#' Prediction layer for post-processing #' #' Implements prediction on a fitted `epi_workflow`. One may want different #' types of prediction, and to potentially apply this after some amount of -#' postprocessing. This would typically be the first layer in a `frosting` +#' post-processing. This would typically be the first layer in a `frosting` #' postprocessor. 
#' -#' @seealso `parsnip::predict.model_fit()` +#' @seealso [parsnip::predict.model_fit()] #' #' @inheritParams parsnip::predict.model_fit #' @param frosting a frosting object @@ -16,8 +16,7 @@ #' @export #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_predictive_distn.R b/R/layer_predictive_distn.R index b28e0c765..8b6e170ab 100644 --- a/R/layer_predictive_distn.R +++ b/R/layer_predictive_distn.R @@ -1,11 +1,15 @@ #' Returns predictive distributions #' +#' `r lifecycle::badge("deprecated")` +#' #' This function calculates an _approximation_ to a parametric predictive #' distribution. Predictive distributions from linear models require #' `x* (X'X)^{-1} x*` -#' along with the degrees of freedom. This function approximates both. It -#' should be reasonably accurate for models fit using `lm` when the new point -#' `x*` isn't too far from the bulk of the data. +#' along with the degrees of freedom. This function approximates both. It should +#' be reasonably accurate for models fit using `lm` when the new point `x*` +#' isn't too far from the bulk of the data. Outside of that specific case, it is +#' recommended to use `layer_residual_quantiles()`, or if you are working with a +#' model that produces distributional predictions, use `layer_quantile_distn()`. #' #' @param frosting a `frosting` postprocessor #' @param ... Unused, include for consistency with other layers. 
@@ -16,96 +20,13 @@ #' #' @return an updated `frosting` postprocessor with additional columns of the #' residual quantiles added to the prediction - #' @export #' -#' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% -#' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' -#' r <- epi_recipe(jhu) %>% -#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% -#' step_epi_ahead(death_rate, ahead = 7) %>% -#' step_epi_naomit() -#' -#' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) -#' -#' f <- frosting() %>% -#' layer_predict() %>% -#' layer_predictive_distn() %>% -#' layer_naomit(.pred) -#' wf1 <- wf %>% add_frosting(f) -#' -#' p <- forecast(wf1) -#' p layer_predictive_distn <- function(frosting, ..., dist_type = c("gaussian", "student_t"), truncate = c(-Inf, Inf), name = ".pred_distn", id = rand_id("predictive_distn")) { - rlang::check_dots_empty() - arg_is_chr_scalar(name, id) - dist_type <- match.arg(dist_type) - stopifnot( - length(truncate) == 2L, is.numeric(truncate), truncate[1] < truncate[2] - ) - - add_layer( - frosting, - layer_predictive_distn_new( - dist_type = dist_type, - truncate = truncate, - name = name, - id = id - ) - ) -} - -layer_predictive_distn_new <- function(dist_type, truncate, name, id) { - layer("predictive_distn", - dist_type = dist_type, truncate = truncate, - name = name, id = id - ) -} - -#' @export -slather.layer_predictive_distn <- - function(object, components, workflow, new_data, ...) 
{ - the_fit <- workflows::extract_fit_parsnip(workflow) - rlang::check_dots_empty() - - m <- components$predictions$.pred - r <- grab_residuals(the_fit, components) - df <- the_fit$df.residual - n <- sum(!is.na(r)) - papprox <- ncol(components$mold$predictors) + 1 - if (is.null(df)) df <- n - papprox - mse <- sum(r^2, na.rm = TRUE) / df - s <- sqrt(mse * (1 + papprox / df)) # E[x (X'X)^1 x] if E[X'X] ~= (n-p) I - dstn <- switch(object$dist_type, - gaussian = distributional::dist_normal(m, s), - student_t = distributional::dist_student_t(df, m, s) - ) - truncate <- object$truncate - if (!all(is.infinite(truncate))) { - dstn <- distributional::dist_truncated(dstn, truncate[1], truncate[2]) - } - dstn <- tibble(dstn = dstn) - dstn <- check_pname(dstn, components$predictions, object) - components$predictions <- mutate(components$predictions, !!!dstn) - components - } - -#' @export -print.layer_predictive_distn <- function( - x, width = max(20, options()$width - 30), ...) { - title <- "Creating approximate predictive intervals" - td <- "" - td <- rlang::enquos(td) - print_layer(td, - title = title, width = width, conjunction = "type", - extra_text = x$dist_type - ) + lifecycle::deprecate_stop("0.1.11", "layer_predictive_distn()", "layer_residual_quantiles()") } diff --git a/R/layer_quantile_distn.R b/R/layer_quantile_distn.R index 5f87ded29..a0af380a2 100644 --- a/R/layer_quantile_distn.R +++ b/R/layer_quantile_distn.R @@ -1,6 +1,8 @@ #' Returns predictive quantiles #' #' This function calculates quantiles when the prediction was _distributional_. +#' If the model producing the forecast is not distributional, it is recommended +#' to use `layer_residual_quantiles()` instead. 
#' #' Currently, the only distributional modes/engines are #' * `quantile_reg()` @@ -22,8 +24,7 @@ #' @export #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -44,7 +45,7 @@ #' p layer_quantile_distn <- function(frosting, ..., - quantile_levels = c(.25, .75), + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), truncate = c(-Inf, Inf), name = ".pred_distn", id = rand_id("quantile_distn")) { @@ -79,15 +80,15 @@ layer_quantile_distn_new <- function(quantile_levels, truncate, name, id) { slather.layer_quantile_distn <- function(object, components, workflow, new_data, ...) { dstn <- components$predictions$.pred - if (!inherits(dstn, "distribution")) { - cli_abort(c( - "`layer_quantile_distn()` requires distributional predictions.", - "These are of class {.cls {class(dstn)}}." - )) + if (!inherits(dstn, "quantile_pred")) { + cli_abort( + "`layer_quantile_distn()` requires or quantile + predictions. These are of class {.cls {class(dstn)}}." + ) } rlang::check_dots_empty() - dstn <- dist_quantiles( + dstn <- quantile_pred( quantile(dstn, object$quantile_levels), object$quantile_levels ) @@ -97,7 +98,12 @@ slather.layer_quantile_distn <- dstn <- snap(dstn, truncate[1], truncate[2]) } dstn <- tibble(dstn = dstn) - dstn <- check_pname(dstn, components$predictions, object) + dstn <- check_name( + dstn, + components$predictions, + object, + newname = object$name + ) components$predictions <- mutate(components$predictions, !!!dstn) components } diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index eae151905..26ec9a184 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -1,10 +1,19 @@ #' Creates predictions based on residual quantiles #' +#' This function calculates predictive quantiles based on the empirical +#' quantiles of the model's residuals. 
If the model producing the forecast is +#' distributional, it is recommended to use `layer_residual_quantiles()` +#' instead, as those will be more accurate. +#' #' @param frosting a `frosting` postprocessor #' @param ... Unused, include for consistency with other layers. #' @param quantile_levels numeric vector of probabilities with values in (0,1) -#' referring to the desired quantile. -#' @param symmetrize logical. If `TRUE` then interval will be symmetric. +#' referring to the desired quantile. Note that 0.5 will always be included +#' even if left out by the user. +#' @param symmetrize logical. If `TRUE` then the interval will be symmetric. +#' Typically, one would only want non-symmetric quantiles when increasing +#' trajectories are quite different from decreasing ones, such as a strictly +#' postive variable near zero. #' @param by_key A character vector of keys to group the residuals by before #' calculating quantiles. The default, `c()` performs no grouping. #' @param name character. The name for the output column. 
@@ -14,8 +23,7 @@ #' residual quantiles added to the prediction #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -28,7 +36,7 @@ #' f <- frosting() %>% #' layer_predict() %>% #' layer_residual_quantiles( -#' quantile_levels = c(0.0275, 0.975), +#' quantile_levels = c(0.025, 0.975), #' symmetrize = FALSE #' ) %>% #' layer_naomit(.pred) @@ -48,7 +56,7 @@ #' p2 <- forecast(wf2) layer_residual_quantiles <- function( frosting, ..., - quantile_levels = c(0.05, 0.95), + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), symmetrize = TRUE, by_key = character(0L), name = ".pred_distn", @@ -59,6 +67,7 @@ layer_residual_quantiles <- function( arg_is_chr(by_key, allow_empty = TRUE) arg_is_probabilities(quantile_levels) arg_is_lgl(symmetrize) + quantile_levels <- sort(unique(c(0.5, quantile_levels))) add_layer( frosting, layer_residual_quantiles_new( @@ -90,7 +99,7 @@ slather.layer_residual_quantiles <- return(components) } - s <- ifelse(object$symmetrize, -1, NA) + symmetric <- ifelse(object$symmetrize, -1, NA) r <- grab_residuals(the_fit, components) ## Handle any grouping requests @@ -102,7 +111,7 @@ slather.layer_residual_quantiles <- common <- intersect(object$by_key, names(key_cols)) excess <- setdiff(object$by_key, names(key_cols)) if (length(excess) > 0L) { - cli::cli_warn(c( + cli_warn(paste( "Requested residual grouping key(s) {.val {excess}} are unavailable ", "in the original data. Grouping by the remainder: {.val {common}}." )) @@ -113,7 +122,7 @@ slather.layer_residual_quantiles <- if (length(common_in_r) == length(common)) { r <- left_join(key_cols, r, by = common_in_r) } else { - cli::cli_warn(c( + cli_warn(paste( "Some grouping keys are not in data.frame returned by the", "`residuals()` method. Groupings may not be correct." 
)) @@ -124,32 +133,33 @@ slather.layer_residual_quantiles <- } r <- r %>% - summarize( - dstn = list(quantile( - c(.resid, s * .resid), - probs = object$quantile_levels, na.rm = TRUE - )) - ) + summarize(dstn = quantile_pred(matrix(quantile( + c(.resid, symmetric * .resid), + probs = object$quantile_levels, na.rm = TRUE + ), nrow = 1), quantile_levels = object$quantile_levels)) # Check for NA - if (any(sapply(r$dstn, is.na))) { - cli::cli_abort(c( + if (anyNA(as.matrix(r$dstn))) { + cli_abort(c( "Residual quantiles could not be calculated due to missing residuals.", i = "This may be due to `n_train` < `ahead` in your {.cls epi_recipe}." )) } estimate <- components$predictions$.pred - res <- tibble( - .pred_distn = dist_quantiles(map2(estimate, r$dstn, "+"), object$quantile_levels) + res <- tibble(.pred_distn = r$dstn + estimate) + res <- check_name( + res, + components$predictions, + object, + newname = object$name ) - res <- check_pname(res, components$predictions, object) components$predictions <- mutate(components$predictions, !!!res) components } grab_residuals <- function(the_fit, components) { if (the_fit$spec$mode != "regression") { - cli::cli_abort("For meaningful residuals, the predictor should be a regression model.") + cli_abort("For meaningful residuals, the predictor should be a regression model.") } r <- stats::residuals(the_fit$fit) if (!is.null(r)) { # Got something from the method @@ -157,7 +167,7 @@ grab_residuals <- function(the_fit, components) { if (".resid" %in% names(r)) { # success return(r) } else { # failure - cli::cli_warn(c( + cli_warn(c( "The `residuals()` method for objects of class {.cls {cl}} results in", "a data frame without a column named `.resid`.", i = "Residual quantiles will be calculated directly from the", @@ -168,7 +178,7 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - cli::cli_warn(c( + cli_warn(paste( "The 
`residuals()` method for objects of class {.cls {cl}} results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", diff --git a/R/layer_threshold_preds.R b/R/layer_threshold_preds.R index 56f8059ab..1630ce629 100644 --- a/R/layer_threshold_preds.R +++ b/R/layer_threshold_preds.R @@ -1,8 +1,12 @@ #' Lower and upper thresholds for predicted values #' -#' This postprocessing step is used to set prediction values that are -#' smaller than the lower threshold or higher than the upper threshold equal -#' to the threshold values. +#' This post-processing step is used to set prediction values that are smaller +#' than the lower threshold or higher than the upper threshold equal to the +#' threshold values. + +#' @details +#' Making case count predictions strictly positive is a typical example usage. +#' It must be called after there is a column containing quantiles. This means at earliest it can be called after `layer_predict()` for distributional models, or after `layer_residual_quantiles()` for point prediction models. Typical best practice will use `starts_with(".pred")` as the variables to threshold. #' #' @param frosting a `frosting` postprocessor #' @param ... 
<[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted @@ -22,9 +26,9 @@ #' @return an updated `frosting` postprocessor #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value < "2021-03-08", geo_value %in% c("ak", "ca", "ar")) +#' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% @@ -33,7 +37,7 @@ #' #' f <- frosting() %>% #' layer_predict() %>% -#' layer_threshold(.pred, lower = 0.180, upper = 0.310) +#' layer_threshold(starts_with(".pred"), lower = 0.180, upper = 0.310) #' wf <- wf %>% add_frosting(f) #' p <- forecast(wf) #' p @@ -61,6 +65,13 @@ layer_threshold_new <- +#' restrict various objects to the interval \[lower, upper\] +#' @param x the object to restrict +#' @param lower numeric, the lower bound +#' @param upper numeric, the upper bound +#' @param ... unused +#' @export +#' @keywords internal snap <- function(x, lower, upper, ...) { UseMethod("snap") } @@ -73,25 +84,11 @@ snap.default <- function(x, lower, upper, ...) { } #' @export -snap.distribution <- function(x, lower, upper, ...) { - rlang::check_dots_empty() - arg_is_scalar(lower, upper) - dstn <- lapply(vec_data(x), snap, lower = lower, upper = upper) - distributional:::wrap_dist(dstn) -} - -#' @export -snap.dist_default <- function(x, lower, upper, ...) { - rlang::check_dots_empty() - x -} - -#' @export -snap.dist_quantiles <- function(x, lower, upper, ...) { - values <- field(x, "values") - quantile_levels <- field(x, "quantile_levels") - values <- snap(values, lower, upper) - new_quantiles(values = values, quantile_levels = quantile_levels) +snap.quantile_pred <- function(x, lower, upper, ...) 
{ + values <- as.matrix(x) + quantile_levels <- x %@% "quantile_levels" + values <- map(vctrs::vec_chop(values), ~ snap(.x, lower, upper)) + quantile_pred(do.call(rbind, values), quantile_levels = quantile_levels) } #' @export diff --git a/R/layer_unnest.R b/R/layer_unnest.R index a6fc9f0af..e097ad465 100644 --- a/R/layer_unnest.R +++ b/R/layer_unnest.R @@ -1,5 +1,12 @@ #' Unnest prediction list-cols #' +#' For any model that produces forecasts for multiple outcomes, such as multiple +#' aheads, the resulting prediction is a list of forecasts inside a column of +#' the prediction tibble, which is may not be desirable. This layer "lengthens" +#' the result, moving each outcome to a separate row, in the same manner as +#' `tidyr::unnest()` would. At the moment, the only such engine is +#' `smooth_quantile_reg()`. +#' #' @param frosting a `frosting` postprocessor #' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted #' expressions separated by commas. Variable names can be used as if they @@ -9,6 +16,35 @@ #' #' @return an updated `frosting` postprocessor #' @export +#' @examples +#' jhu <- covid_case_death_rates %>% +#' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +#' +#' aheads <- 1:7 +#' +#' r <- epi_recipe(jhu) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = aheads) %>% +#' step_epi_naomit() +#' +#' wf <- epi_workflow( +#' r, +#' smooth_quantile_reg( +#' quantile_levels = c(.05, .1, .25, .5, .75, .9, .95), +#' outcome_locations = aheads +#' ) +#' ) %>% +#' fit(jhu) +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_naomit() %>% +#' layer_unnest(.pred) +#' +#' wf1 <- wf %>% add_frosting(f) +#' +#' p <- forecast(wf1) +#' p layer_unnest <- function(frosting, ..., id = rand_id("unnest")) { arg_is_chr_scalar(id) diff --git a/R/layers.R b/R/layers.R index aa515a917..b35dceaf2 100644 --- a/R/layers.R +++ b/R/layers.R @@ -41,8 +41,7 @@ layer <- function(subclass, 
..., .prefix = "layer_") { #' in the layer, and the values are the new values to update the layer with. #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -148,9 +147,9 @@ validate_layer <- function(x, ..., arg = rlang::caller_arg(x), call = caller_env()) { rlang::check_dots_empty() if (!is_layer(x)) { - cli::cli_abort( + cli_abort( "{arg} must be a frosting layer, not a {.cls {class(x)[[1]]}}.", - .call = call + call = call ) } invisible(x) @@ -179,11 +178,14 @@ detect_layer.workflow <- function(x, name, ...) { #' Spread a layer of frosting on a fitted workflow #' -#' Slathering frosting means to implement a postprocessing layer. When -#' creating a new postprocessing layer, you must implement an S3 method -#' for this function +#' Slathering frosting means to implement a post-processing layer. It is the +#' post-processing equivalent of `bake` for a recipe. Given a layer, it applies +#' the actual transformation of that layer. When creating a new post-processing +#' layer, you must implement an S3 method for this function. Generally, you will +#' not need to call this function directly, as it will be used indirectly during +#' `predict`. #' -#' @param object a workflow with `frosting` postprocessing steps +#' @param object a workflow with `frosting` post-processing steps #' @param components a list of components containing model information. These #' will be updated and returned by the layer. These should be #' * `mold` - the output of calling `hardhat::mold()` on the workflow. This @@ -201,7 +203,8 @@ detect_layer.workflow <- function(x, name, ...) { #' #' @param ... additional arguments used by methods. Currently unused. #' -#' @return The `components` list. In the same format after applying any updates. 
+#' @return The `components` list, in the same format as before, after applying +#' any updates. #' @export slather <- function(object, components, workflow, new_data, ...) { UseMethod("slather") diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 253ea1ac7..78c855831 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -60,8 +60,7 @@ #' #' # -- a more complicated task #' -#' library(dplyr) -#' dat <- case_death_rate_subset %>% +#' dat <- covid_case_death_rates %>% #' filter(time_value > as.Date("2021-10-01")) #' rec <- epi_recipe(dat) %>% #' step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% @@ -141,7 +140,7 @@ make_grf_quantiles <- function() { data = c(x = "X", y = "Y"), func = c(pkg = "grf", fun = "quantile_forest"), defaults = list( - quantiles = c(0.1, 0.5, 0.9), + quantiles = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), num.threads = 1L, seed = rlang::expr(stats::runif(1, 0, .Machine$integer.max)) ) @@ -163,12 +162,12 @@ make_grf_quantiles <- function() { ) ) - # turn the predictions into a tibble with a dist_quantiles column + # turn the predictions into a tibble with a quantile_pred column process_qrf_preds <- function(x, object) { - quantile_levels <- parsnip::extract_fit_engine(object)$quantiles.orig + quantile_levels <- parsnip::extract_fit_engine(object)$quantiles.orig %>% sort() x <- x$predictions out <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) - out <- dist_quantiles(out, list(quantile_levels)) + out <- hardhat::quantile_pred(do.call(rbind, out), quantile_levels) return(dplyr::tibble(.pred = out)) } diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 2157aa470..bc2d322bf 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -12,7 +12,7 @@ #' @param engine Character string naming the fitting function. Currently, only #' "rq" and "grf" are supported. #' @param quantile_levels A scalar or vector of values in (0, 1) to determine which -#' quantiles to estimate (default is 0.5). 
+#' quantiles to estimate (default is the set 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95). #' @param method A fitting method used by [quantreg::rq()]. See the #' documentation for a list of options. #' @@ -27,7 +27,9 @@ #' rq_spec <- quantile_reg(quantile_levels = c(.2, .8)) %>% set_engine("rq") #' ff <- rq_spec %>% fit(y ~ ., data = tib) #' predict(ff, new_data = tib) -quantile_reg <- function(mode = "regression", engine = "rq", quantile_levels = 0.5, method = "br") { +quantile_reg <- function(mode = "regression", engine = "rq", + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), + method = "br") { # Check for correct mode if (mode != "regression") { cli_abort("`mode` must be 'regression'") @@ -37,7 +39,7 @@ quantile_reg <- function(mode = "regression", engine = "rq", quantile_levels = 0 if (any(quantile_levels > 1)) cli_abort("All `quantile_levels` must be less than 1.") if (any(quantile_levels < 0)) cli_abort("All `quantile_levels` must be greater than 0.") if (is.unsorted(quantile_levels)) { - cli::cli_warn("Sorting `quantile_levels` to increasing order.") + cli_warn("Sorting `quantile_levels` to increasing order.") quantile_levels <- sort(quantile_levels) } args <- list(quantile_levels = rlang::enquo(quantile_levels), method = rlang::enquo(method)) @@ -108,21 +110,11 @@ make_quantile_reg <- function() { process_rq_preds <- function(x, object) { object <- parsnip::extract_fit_engine(object) - type <- class(object)[1] - - # can't make a method because object is second - out <- switch(type, - rq = dist_quantiles(unname(as.list(x)), object$quantile_levels), # one quantile - rqs = { - x <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) - dist_quantiles(x, list(object$tau)) - }, - cli_abort(c( - "Prediction is not implemented for this `rq` type.", - i = "See {.fun quantreg::rq}." 
- )) - ) - return(dplyr::tibble(.pred = out)) + if (!is.matrix(x)) x <- as.matrix(x) + rownames(x) <- NULL + n_pred_quantiles <- ncol(x) + quantile_levels <- object$tau + tibble(.pred = hardhat::quantile_pred(x, quantile_levels)) } parsnip::set_pred( diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index 448ee0fa5..03d4e19af 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -5,12 +5,7 @@ #' the [tidymodels](https://www.tidymodels.org/) framework. Currently, the #' only supported engine is [smoothqr::smooth_qr()]. #' -#' @param mode A single character string for the type of model. -#' The only possible value for this model is "regression". -#' @param engine Character string naming the fitting function. Currently, only -#' "smooth_qr" is supported. -#' @param quantile_levels A scalar or vector of values in (0, 1) to determine which -#' quantiles to estimate (default is 0.5). +#' @inheritParams quantile_reg #' @param outcome_locations Defaults to the vector `1:ncol(y)` but if the #' responses are observed at a different spacing (or appear in a different #' order), that information should be used here. 
This @@ -36,25 +31,21 @@ #' y <- sin(x) + rnorm(length(x), sd = .1) #' fd <- x[length(x) - 20] #' XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) -#' XY <- tibble::as_tibble(XY) +#' XY <- as_tibble(XY) #' qr_spec <- smooth_quantile_reg(quantile_levels = c(.2, .5, .8), outcome_locations = 20:1) #' tt <- qr_spec %>% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) #' -#' library(tidyr) -#' library(dplyr) #' pl <- predict( #' object = tt, #' new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] #' ) #' pl <- pl %>% #' unnest(.pred) %>% -#' mutate(distn = nested_quantiles(distn)) %>% -#' unnest(distn) %>% +#' pivot_quantiles_wider(distn) %>% #' mutate( #' x = x[length(x) - 20] + ahead / 100 * 2 * pi, #' ahead = NULL -#' ) %>% -#' pivot_wider(names_from = quantile_levels, values_from = values) +#' ) #' plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") #' curve(sin(x), add = TRUE) #' abline(v = fd, lty = 2) @@ -64,11 +55,11 @@ #' #' library(ggplot2) #' ggplot(data.frame(x = x, y = y), aes(x)) + -#' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + +#' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "cornflowerblue") + #' geom_point(aes(y = y), colour = "grey") + # observed data #' geom_function(fun = sin, colour = "black") + # truth #' geom_vline(xintercept = fd, linetype = "dashed") + # end of training data -#' geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction +#' geom_line(data = pl, aes(y = `0.5`), colour = "orange") + # median prediction #' theme_bw() + #' coord_cartesian(xlim = c(0, NA)) + #' ylab("y") @@ -76,7 +67,7 @@ smooth_quantile_reg <- function( mode = "regression", engine = "smoothqr", outcome_locations = NULL, - quantile_levels = 0.5, + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), degree = 3L) { # Check for correct mode if (mode != "regression") cli_abort("`mode` must be 'regression'") @@ -178,7 +169,7 @@ make_smooth_quantile_reg <- function() { x <- 
lapply(unname(split( p, seq(nrow(p)) )), function(q) unname(sort(q, na.last = TRUE))) - dist_quantiles(x, list(object$tau)) + quantile_pred(do.call(rbind, x), object$tau) }) n_preds <- length(list_of_pred_distns[[1]]) nout <- length(list_of_pred_distns) diff --git a/R/model-methods.R b/R/model-methods.R index f3b374879..dd08d3efc 100644 --- a/R/model-methods.R +++ b/R/model-methods.R @@ -32,8 +32,7 @@ #' #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index f014961e6..3534b4eed 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -1,4 +1,10 @@ #' Turn a vector of quantile distributions into a list-col +#' +#' `r lifecycle::badge("deprecated")` +#' +#' This function is deprecated. The recommended alternative is +#' [hardhat::quantile_pred()] with [tibble::as_tibble()] + #' #' @param x a `distribution` containing `dist_quantiles` #' @@ -6,159 +12,107 @@ #' @export #' #' @examples -#' library(dplyr) -#' library(tidyr) -#' edf <- case_death_rate_subset[1:3, ] -#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) +#' pred_quantile <- quantile_pred(matrix(rnorm(20), 5), c(.2, .4, .6, .8)) +#' nested_quantiles(pred_quantile) +#' +#' pred_quantile %>% +#' as_tibble() %>% +#' tidyr::nest(.by = .row) %>% +#' dplyr::select(-.row) #' -#' edf_nested <- edf %>% mutate(q = nested_quantiles(q)) -#' edf_nested %>% unnest(q) nested_quantiles <- function(x) { - stopifnot(is_dist_quantiles(x)) - distributional:::dist_apply(x, .f = function(z) { - as_tibble(vec_data(z)) %>% - mutate(across(everything(), as.double)) %>% - vctrs::list_of() - }) + lifecycle::deprecate_warn("0.1.11", "nested_quantiles()", "hardhat::quantile_pred()") + if (inherits(x, "quantile_pred")) { + return(x %>% as_tibble() %>% tidyr::nest(.by = .row) 
%>%
+      dplyr::select(data))
+  }
+  cli_abort(
+    "`nested_quantiles()` is deprecated. See {.fn hardhat::quantile_pred}."
+  )
 }
 
-#' Pivot columns containing `dist_quantile` longer
+#' Pivot a column containing `quantile_pred` to explicit rows or columns
+#'
+#' Both functions expand a column of `quantile_pred`s into the separate
+#' quantiles. Since each consists of a set of names (quantiles) and values,
+#' these operate analogously with `pivot_wider` and `pivot_longer`.
 #'
-#' Selected columns that contain `dist_quantiles` will be "lengthened" with
-#' the quantile levels serving as 1 column and the values as another. If
-#' multiple columns are selected, these will be prefixed with the column name.
+#' `pivot_quantiles_wider` creates a new column for each `quantile_level`, with
+#' the values as the corresponding quantile values. When pivoting multiple
+#' columns, the original column name will be used as a prefix.
+#'
+#' Similarly, `pivot_quantiles_longer` assigns the selected columns
+#' `quantile_level`s in one column and the `value`s in another. If multiple
+#' columns are selected, these will be prefixed with the column name.
 #'
 #' @param .data A data frame, or a data frame extension such as a tibble or
 #' epi_df.
-#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted
+#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One unquoted
 #' expressions separated by commas. Variable names can be used as if they
-#' were positions in the data frame, so expressions like `x:y` can
-#' be used to select a range of variables.
-#' @param .ignore_length_check If multiple columns are selected, as long as
-#' each row has contains the same number of quantiles, the result will be
-#' reasonable. But if, for example, `var1[1]` has 5 quantiles while `var2[1]`
-#' has 7, then the only option would be to recycle everything, creating a
-#' _very_ long result. By default, this would throw an error.
But if this is -#' really the goal, then the error can be bypassed by setting this argument -#' to `TRUE`. The quantiles in the first selected column will vary the fastest. +#' were positions in the data frame. Note that only one variable +#' can be selected for this operation. #' #' @return An object of the same class as `.data`. #' @export +#' @name pivot_quantiles #' #' @examples -#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +#' d1 <- quantile_pred(rbind(1:3, 2:4), 1:3 / 4) +#' d2 <- quantile_pred(rbind(2:4, 3:5), 2:4 / 5) #' tib <- tibble(g = c("a", "b"), d1 = d1, d2 = d2) #' #' pivot_quantiles_longer(tib, "d1") #' pivot_quantiles_longer(tib, dplyr::ends_with("1")) -#' pivot_quantiles_longer(tib, d1, d2) -pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) { - cols <- validate_pivot_quantiles(.data, ...) - .data <- .data %>% mutate(across(all_of(cols), nested_quantiles)) - if (length(cols) > 1L) { - lengths_check <- .data %>% - dplyr::transmute(across(all_of(cols), ~ map_int(.x, vctrs::vec_size))) %>% - as.matrix() %>% - apply(1, function(x) dplyr::n_distinct(x) == 1L) %>% - all() - if (lengths_check) { - .data <- tidyr::unnest(.data, all_of(cols), names_sep = "_") - } else { - if (.ignore_length_check) { - for (col in cols) { - .data <- .data %>% tidyr::unnest(all_of(col), names_sep = "_") - } - } else { - cli::cli_abort(c( - "Some selected columns contain different numbers of quantiles.", - "The result would be a {.emph very} long {.cls tibble}.", - "To do this anyway, rerun with `.ignore_length_check = TRUE`." 
- )) - } - } - } else { - .data <- .data %>% tidyr::unnest(all_of(cols)) - } - .data +#' pivot_quantiles_longer(tib, d2) +#' +#' pivot_quantiles_wider(tib, "d1") +#' pivot_quantiles_wider(tib, dplyr::ends_with("2")) +#' pivot_quantiles_wider(tib, d2) +NULL + + +#' @rdname pivot_quantiles +#' @export +pivot_quantiles_longer <- function(.data, ...) { + col <- validate_pivot_quantiles(.data, ...) + .data$.row <- seq_len(vctrs::vec_size(.data)) + long_tib <- as_tibble(.data[[col]]) + .data <- select(.data, !all_of(col)) + names(long_tib)[1:2] <- c(glue::glue("{col}_value"), glue::glue("{col}_quantile_level")) + left_join(.data, long_tib, by = ".row") %>% + select(!.row) } -#' Pivot columns containing `dist_quantile` wider -#' -#' Any selected columns that contain `dist_quantiles` will be "widened" with -#' the "taus" (quantile) serving as names and the values in the data frame. -#' When pivoting multiple columns, the original column name will be used as -#' a prefix. -#' -#' @param .data A data frame, or a data frame extension such as a tibble or -#' epi_df. -#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted -#' expressions separated by commas. Variable names can be used as if they -#' were positions in the data frame, so expressions like `x:y` can -#' be used to select a range of variables. -#' -#' @return An object of the same class as `.data` +#' @rdname pivot_quantiles #' @export -#' -#' @examples -#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) -#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -#' -#' pivot_quantiles_wider(tib, c("d1", "d2")) -#' pivot_quantiles_wider(tib, dplyr::starts_with("d")) -#' pivot_quantiles_wider(tib, d2) pivot_quantiles_wider <- function(.data, ...) { - cols <- validate_pivot_quantiles(.data, ...) 
- .data <- .data %>% mutate(across(all_of(cols), nested_quantiles)) - checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) - if (!all(checks)) { - nms <- cols[!checks] - cli::cli_abort(c( - "Quantiles must be the same length and have the same set of taus.", - i = "Check failed for variables(s) {.var {nms}}." - )) - } - - # tidyr::pivot_wider can crash if there are duplicates, this generally won't - # happen in our context. To avoid, silently add an index column and remove it - # later - .hidden_index <- seq_len(nrow(.data)) - .data <- tibble::add_column(.data, .hidden_index = .hidden_index) - if (length(cols) > 1L) { - for (col in cols) { - .data <- .data %>% - tidyr::unnest(all_of(col)) %>% - tidyr::pivot_wider( - names_from = "quantile_levels", values_from = "values", - names_prefix = paste0(col, "_") - ) - } - } else { - .data <- .data %>% - tidyr::unnest(all_of(cols)) %>% - tidyr::pivot_wider(names_from = "quantile_levels", values_from = "values") - } - select(.data, -.hidden_index) + col <- validate_pivot_quantiles(.data, ...) + .data$.row <- seq_len(vctrs::vec_size(.data)) + wide_tib <- as_tibble(.data[[col]]) %>% + tidyr::pivot_wider(names_from = .quantile_levels, values_from = .pred_quantile) + .data <- select(.data, !all_of(col)) + left_join(.data, wide_tib, by = ".row") %>% + select(!.row) } pivot_quantiles <- function(.data, ...) { - msg <- c( - "{.fn pivot_quantiles} was deprecated in {.pkg epipredict} 0.0.6", - i = "Please use {.fn pivot_quantiles_wider} instead." - ) - lifecycle::deprecate_stop(msg) + lifecycle::deprecate_stop("0.0.6", "pivot_quantiles()", "pivot_quantiles_wider()") } -validate_pivot_quantiles <- function(.data, ...) 
{ +validate_pivot_quantiles <- function(.data, ..., call = caller_env()) { expr <- rlang::expr(c(...)) cols <- names(tidyselect::eval_select(expr, .data)) - dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) - if (!all(dqs)) { - nms <- cols[!dqs] - cli::cli_abort( - "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." + if (length(cols) > 1L) { + cli_abort( + "Only one column can be pivotted. Can not pivot all of: {.var {cols}}.", + call = call + ) + } + if (!inherits(.data[[cols]], "quantile_pred")) { + cli_abort( + "{.var {cols}} is not {.cls `quantile_pred`}. Cannot pivot it.", + call = call ) } cols diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R new file mode 100644 index 000000000..293fad902 --- /dev/null +++ b/R/quantile_pred-methods.R @@ -0,0 +1,313 @@ +#' A distribution parameterized by a set of quantiles +#' +#' `r lifecycle::badge("deprecated")` +#' +#' This function is deprecated. The recommended alternative is +#' [hardhat::quantile_pred()]. +#' +#' @param values A vector (or list of vectors) of values. +#' @param quantile_levels A vector (or list of vectors) of probabilities +#' corresponding to `values`. +#' +#' When creating multiple sets of `values`/`quantile_levels` resulting in +#' different distributions, the sizes must match. See the examples below. +#' +#' @return A vector of class `"distribution"`. 
+#' +#' @export +#' @keywords internal +#' +#' @importFrom vctrs as_list_of vec_recycle_common new_vctr +dist_quantiles <- function(values, quantile_levels) { + lifecycle::deprecate_warn("0.1.11", "dist_quantiles()", "hardhat::quantile_pred()") + if (is.list(values)) { + n <- length(values) + values <- unlist(values) + return(quantile_pred(matrix(values, nrow = n, byrow = TRUE), quantile_levels)) + } else if (is.matrix(values)) { + return(quantile_pred(values, quantile_levels)) + } else if (is.vector(values)) { + return(quantile_pred(matrix(values, nrow = 1), quantile_levels)) + } + cli_abort(c( + "`dist_quantiles()` is deprecated and the format of `values` could not", + `!` = "be automatically converted to work with the replacement.", + i = "See {.fn hardhat::quantile_pred}." + )) +} + +# placeholder to avoid errors, but not ideal +#' @importFrom hardhat quantile_pred +#' @export +mean.quantile_pred <- function(x, na.rm = FALSE, ...) { + median(x, ...) +} + +# These next 3 functions should probably be added via PR to {hardhat} +# Only the third is actually *needed* at the moment. +# The second doesn't work correctly (not sure why), but leaving here for the +# future. +# +# We only export the third. +# +# self-self method, should work only if attr(quantile_levels) are compatible +# #' @importFrom vctrs vec_ptype2 vec_cast +# #' @importFrom hardhat extract_quantile_levels +# #' @export +# #' @keywords internal +# vec_ptype2.quantile_pred.quantile_pred <- function( +# x, y, ..., x_arg = "", y_arg = "", call = caller_env() +# ) { +# if (all(extract_quantile_levels(y) %in% extract_quantile_levels(x))) { +# return(x) +# } +# if (all(extract_quantile_levels(x) %in% extract_quantile_levels(y))) { +# return(y) +# } +# vctrs::stop_incompatible_type( +# x, y, x_arg = x_arg, y_arg = y_arg, +# details = "`quantile_levels` must be compatible (a superset/subset relation)." 
+# ) +# } + +# currently doesn't work +# #' @export +# vec_cast.quantile_pred.quantile_pred <- function( +# x, to, ..., x_arg = caller_arg(x), to_arg = caller_arg(to), +# call = caller_env() +# ) { +# to_ql <- extract_quantile_levels(to) +# x_ql <- extract_quantile_levels(x) +# x_in_to <- x_ql %in% to_ql +# to_in_x <- to_ql %in% x_ql +# if (all(x_in_to)) { +# mat <- matrix(NA, ncol = length(to_ql)) +# mat[ , to_in_x] <- c(as.matrix(x)) +# } else if (all(to_in_x)) { +# mat <- as.matrix(x)[ , x_in_to, drop = FALSE] +# } else { +# vctrs::stop_incompatible_type( +# x, to, x_arg = x_arg, y_arg = to_arg, +# details = "`quantile_levels` must be compatible (a superset/subset relation)." +# ) +# } +# quantile_pred(mat, to_ql) +# } + + +# Convert the quantile_pred to a data frame (named with the .quantile_levels) +# This powers vec_proxy_equal (and hence ==, !=, is.na, etc) +# It also powers vec_proxy_compare, so, given matching colnames, these should +# work out of the box. +# +#' @importFrom vctrs vec_proxy_equal +#' @export +vec_proxy_equal.quantile_pred <- function(x, ...) { + as_tibble(x) %>% + tidyr::pivot_wider( + names_from = .quantile_levels, + values_from = .pred_quantile + ) %>% + dplyr::select(-.row) +} + + +# quantiles by treating quantile_pred like a distribution ----------------- + + +#' Quantiles from a distribution +#' +#' Given a [hardhat::quantile_pred] object, users may wish to compute additional +#' `quantile_levels` that are not part of the object. This function attempts +#' to estimate these quantities under some assumptions. Interior probabilities, +#' those contained within existing probabilities are interpolated in a manner +#' controled by the `middle` argument. Those outside existing probabilities +#' are extrapolated under the assumption that the tails of the distribution +#' decays exponentially. Optionally, one may constrain _all_ quantiles to be +#' within some support (say, `[0, Inf)`). +#' +#' @inheritParams stats::quantile +#' @param ... 
unused +#' @param lower Scalar. Optional lower bound. +#' @param upper Scalar. Optional upper bound. +#' @param middle Controls how extrapolation to "interior" probabilities is +#' performed. "cubic" attempts to use [stats::splinefun()] while "linear" +#' uses [stats::approx()]. The "linear" method is used as a fallback if +#' "cubic" should fail for some reason. +#' +#' @returns a matrix with one row for each entry in `x` and one column for each +#' value in `probs` +#' @seealso [extrapolate_quantiles()] +#' @export +#' @importFrom stats quantile +#' +#' @examples +#' qp <- quantile_pred(matrix(1:8, nrow = 2, byrow = TRUE), 1:4 / 5) +#' quantile(qp) +#' quantile(qp, lower = 0) +#' quantile(qp, probs = 0.5) +#' quantile(qp, probs = 1:9 / 10) +quantile.quantile_pred <- function(x, + probs = seq(0, 1, 0.25), + na.rm = FALSE, + lower = -Inf, + upper = Inf, + middle = c("cubic", "linear"), + ...) { + arg_is_probabilities(probs) + arg_is_scalar(lower, upper, na.rm) + arg_is_numeric(lower, upper) + arg_is_lgl(na.rm) + + if (lower > upper) { + cli_abort("`lower` must be less than `upper`.") + } + + if (is.unsorted(probs)) probs <- sort(probs) + middle <- rlang::arg_match(middle) + snap(quantile_internal(x, probs, middle), lower, upper) +} + + +quantile_internal <- function(x, tau_out, middle) { + tau <- x %@% "quantile_levels" + qvals <- as.matrix(x) + + # short circuit if we aren't actually extrapolating + # matches to ~15 decimals + if (all(tau_out %in% tau) && !anyNA(qvals)) { + return(qvals[, match(tau_out, tau), drop = FALSE]) + } + if (length(tau) < 2) { + cli_abort(paste( + "Quantile extrapolation is not possible when fewer than 2 quantiles", + "are available." 
+ )) + } + qvals_out <- map( + vctrs::vec_chop(qvals), + ~ extrapolate_quantiles_single(.x, tau, tau_out, middle) + ) + qvals_out <- do.call(rbind, qvals_out) # ensure a matrix of the proper dims + qvals_out +} + +extrapolate_quantiles_single <- function(qvals, tau, tau_out, middle) { + qvals_out <- rep(NA, length(tau_out)) + good <- !is.na(qvals) + if (!any(good)) { + return(qvals_out) + } + qvals <- qvals[good] + tau <- tau[good] + + # in case we only have one point, and it matches something we wanted + if (length(good) < 2) { + matched_one <- tau_out %in% tau + qvals_out[matched_one] <- qvals[matched_one] + return(qvals_out) + } + + indl <- tau_out < min(tau) + indr <- tau_out > max(tau) + indm <- !indl & !indr + + if (middle == "cubic") { + method <- "cubic" + result <- tryCatch( + { + Q <- stats::splinefun(tau, qvals, method = "hyman") + quartiles <- Q(c(.25, .5, .75)) + }, + error = function(e) { + return(NA) + } + ) + } + if (middle == "linear" || any(is.na(result))) { + method <- "linear" + quartiles <- stats::approx(tau, qvals, c(.25, .5, .75))$y + } + if (any(indm)) { + qvals_out[indm] <- switch(method, + linear = stats::approx(tau, qvals, tau_out[indm])$y, + cubic = Q(tau_out[indm]) + ) + } + if (any(indl) || any(indr)) { + qv <- data.frame( + q = c(tau, tau_out[indm]), + v = c(qvals, qvals_out[indm]) + ) %>% + dplyr::distinct(q, .keep_all = TRUE) %>% + arrange(q) + } + if (any(indl)) { + qvals_out[indl] <- tail_extrapolate(tau_out[indl], utils::head(qv, 2)) + } + if (any(indr)) { + qvals_out[indr] <- tail_extrapolate(tau_out[indr], utils::tail(qv, 2)) + } + qvals_out +} + +logit <- function(p) { + p <- pmax(pmin(p, 1), 0) + log(p) - log(1 - p) +} + +# extrapolates linearly on the logistic scale using +# the two points nearest the tail +tail_extrapolate <- function(tau_out, qv) { + if (nrow(qv) == 1L) { + return(rep(qv$v[1], length(tau_out))) + } + x <- logit(qv$q) + x0 <- logit(tau_out) + y <- qv$v + m <- diff(y) / diff(x) + m * (x0 - x[1]) + y[1] +} + 
+ +# mathematical operations on the values ----------------------------------- + + +#' @importFrom vctrs vec_math +#' @export +#' @method vec_math quantile_pred +vec_math.quantile_pred <- function(.fn, .x, ...) { + fn <- .fn + .fn <- getExportedValue("base", .fn) + if (fn %in% c("any", "all", "prod", "sum", "cumsum", "cummax", "cummin", "cumprod")) { + cli_abort("{.fn {fn}} is not a supported operation for {.cls quantile_pred}.") + } + quantile_levels <- .x %@% "quantile_levels" + .x <- as.matrix(.x) + quantile_pred(.fn(.x), quantile_levels) +} + +#' @importFrom vctrs vec_arith vec_arith.numeric +#' @export +#' @method vec_arith quantile_pred +vec_arith.quantile_pred <- function(op, x, y, ...) { + UseMethod("vec_arith.quantile_pred", y) +} + +#' @export +#' @method vec_arith.quantile_pred numeric +vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { + op_fn <- getExportedValue("base", op) + l <- vctrs::vec_recycle_common(x = x, y = y) + out <- op_fn(as.matrix(l$x), l$y) + quantile_pred(out, x %@% "quantile_levels") +} + +#' @export +#' @method vec_arith.numeric quantile_pred +vec_arith.numeric.quantile_pred <- function(op, x, y, ...) 
{ + op_fn <- getExportedValue("base", op) + l <- vctrs::vec_recycle_common(x = x, y = y) + out <- op_fn(l$x, as.matrix(l$y)) + quantile_pred(out, y %@% "quantile_levels") +} diff --git a/R/reexports-tidymodels.R b/R/reexports-tidymodels.R index 3b28ac5c5..00cd7e4fd 100644 --- a/R/reexports-tidymodels.R +++ b/R/reexports-tidymodels.R @@ -18,10 +18,20 @@ recipes::bake #' @export recipes::rand_id -#' @importFrom tibble tibble +#' @importFrom tibble tibble as_tibble #' @export tibble::tibble +#' @export +tibble::as_tibble + #' @importFrom generics tidy #' @export generics::tidy + +#' @importFrom hardhat quantile_pred extract_quantile_levels +#' @export +hardhat::quantile_pred + +#' @export +hardhat::extract_quantile_levels diff --git a/R/reexports.R b/R/reexports.R new file mode 100644 index 000000000..4166b6e0c --- /dev/null +++ b/R/reexports.R @@ -0,0 +1,74 @@ +#' @importFrom dplyr filter +#' @export +dplyr::filter + +#' @importFrom dplyr mutate +#' @export +dplyr::mutate + +#' @importFrom dplyr rename +#' @export +dplyr::rename + +#' @importFrom dplyr select +#' @export +dplyr::select + +#' @importFrom epiprocess as_epi_df +#' @export +epiprocess::as_epi_df + +#' @importFrom epiprocess key_colnames +#' @export +epiprocess::key_colnames + +#' @importFrom generics fit +#' @export +generics::fit + +#' @importFrom generics forecast +#' @export +generics::forecast + +#' @importFrom generics tidy +#' @export +generics::tidy + +#' @importFrom recipes prep +#' @export +recipes::prep + +#' @importFrom recipes bake +#' @export +recipes::bake + +#' @importFrom recipes rand_id +#' @export +recipes::rand_id + +#' @importFrom tibble as_tibble +#' @export +tibble::as_tibble + +#' @importFrom tibble tibble +#' @export +tibble::tibble + +#' @importFrom tidyr pivot_longer +#' @export +tidyr::pivot_longer + +#' @importFrom tidyr pivot_wider +#' @export +tidyr::pivot_wider + +#' @importFrom tidyr unnest +#' @export +tidyr::unnest + +#' @importFrom hardhat quantile_pred 
extract_quantile_levels +#' @export +hardhat::quantile_pred + +#' @export +hardhat::extract_quantile_levels diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R new file mode 100644 index 000000000..5b9db2995 --- /dev/null +++ b/R/step_adjust_latency.R @@ -0,0 +1,366 @@ +#' Adapt the model to latent data +#' +#' @description +#' In the standard case, the arx models assume that the last observation is also +#' the day from which the forecast is being made. But if the data has latency, +#' then you may wish to adjust the predictors (lags) and/or the outcome (ahead) +#' to compensate. +#' This is most useful in realtime and +#' pseudo-prospective forecasting for data where there is some delay between the +#' event occurring and the event being reported. +#' +#' @details +#' This step allows the user to create models on the most recent +#' data, automatically accounting for latency patterns. Instead of using the last observation +#' date, `step_adjust_latency` uses the `as_of` date of the `epi_df` as the +#' `forecast_date`, and adjusts the model so that there is data available. To +#' demonstrate some of the subtleties, let's consider a toy dataset: +#' ```{r toy_df} +#' toy_df <- tribble( +#' ~geo_value, ~time_value, ~a, ~b, +#' "ma", as.Date("2015-01-11"), 20, 6, +#' "ma", as.Date("2015-01-12"), 23, NA, +#' "ma", as.Date("2015-01-13"), 25, NA, +#' "ca", as.Date("2015-01-11"), 100, 5, +#' "ca", as.Date("2015-01-12"), 103, 10, +#' ) %>% +#' as_epi_df(as_of = as.Date("2015-01-14")) +#' ``` +#' If we're looking to predict the value on the 15th, forecasting from the 14th +#' (the `as_of` date above), there are two issues we will need to address: +#' 1. `"ca"` is latent by 2 days, whereas `"ma"` is latent by 1 +#' 2. if we want to use `b` as an exogenous variable, for `"ma"` it is latent by +#' 3 days instead of just 1. 
Regardless of `method`, `epi_keys_checked="geo_value"` guarantees that the
+#' difference between `"ma"` and `"ca"` is accounted for by making the latency
+#' adjustment at least 2.
+#' +#' As a side-note, lag/ahead can be somewhat ambiguous about direction. Here, +#' the values are brought forward in time, so that for a given row, column +#' `lag_3_a` represents the value 3 days before. +#' +#' ## `extend_ahead` +#' `extend_ahead` increases the ahead, turning a 3 day ahead forecast +#' into a 7 day one; this has the advantage of simplicity and is reflective of +#' the actual modelling task, but potentially leaves information unused if +#' different data sources have different latencies; it must use the latency of +#' the most latent data to insure there is data available. In the toy example: +#' ```{r toy_df} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' step_adjust_latency(has_role("raw"), method="extend_ahead") %>% +#' step_epi_lag(a, lag=0) %>% +#' step_epi_ahead(a, ahead=1) +#' +#' toy_recipe %>% +#' prep(toy_df) %>% +#' bake(toy_df) %>% +#' arrange(geo_value, time_value) +#' ``` +#' Even though we're doing a 1 day ahead forecast, because our worst latency +#' is 3 days from column `b`'s `"ma"` data, our outcome column is `ahead_4_a` +#' (so 4 days ahead). If we want to ignore any latency in column `b`, we need +#' to explicitly set the columns to consider while adjusting like this: +#' `step_adjust_latency(a, method="extend_ahead")`. +#' +#' # Programmatic details +#' `step_adjust_latency` uses the metadata, such as `time_type` and `as_of`, of +#' the `epi_df` used in the initial prep step, rather than baking or +#' prediction. This means reusing the same forecaster on new data is not +#' advised, though typically it is not advised in general. +#' +#' The latency adjustment only applies to columns created after this step, so +#' this step should go before both `step_epi_ahead` and `step_epi_lag`. 
This will work: +#' ```{r} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' # non-lag steps +#' step_adjust_latency(a, method = "extend_lags") %>% +#' step_epi_lag(a, lag=0) # other steps +#' ``` +#' while this will not: +#' ```{r} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' step_epi_lag(a, lag=0) %>% +#' step_adjust_latency(a, method = "extend_lags") +#' ``` +#' If you create columns that you then apply lags to (such as +#' `step_growth_rate()`), these should be created before +#' `step_adjust_latency`, so any subseqent latency can be addressed. +#' +#' @param method a character. Determines the method by which the +#' forecast handles latency. The options are: +#' - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last +#' observation results in a forecast `ahead` after the `forecast_date` date. +#' E.g. if there are 3 days of latency between the last observation and the +#' `forecast_date` date for a 4 day ahead forecast, the ahead used in practice +#' is actually 7. +#' - `"locf"`: carries forward the last observed value(s) up to the forecast +#' date. +#' - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that +#' the shortest lag at predict time is at the last observation. E.g. if the +#' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used +#' become `c(3,10,17)`. +#' @param epi_keys_checked a character vector. A list of keys to group by before +#' finding the `max_time_value` (the last day of data), defaulting to +#' `geo_value`. Different locations may have different latencies; to produce a +#' forecast at every location, we need to guarantee data at every location by +#' using the largest latency across every location; this means taking +#' `max_time_value` to be the minimum of the `max_time_value`s for each set of +#' key values (so the earliest date). If `NULL` or an empty character vector, +#' it will take the maximum across all values, irrespective of any keys. 
+#' +#' Note that this is a separate concern from different latencies across +#' different *data columns*, which is only handled by the choice of `method`. +#' @param keys_to_ignore a list of character vectors. Set this to avoid using +#' specific key values in the `epi_keys_checked` to set latency. For example, +#' say you have two locations `pr` and `gu` which have useful training data, +#' but have stopped providing up-to-date information, and so are no longer +#' part of the test set. Setting `keys_to_ignore = list(geo_value = c("pr", +#' "gu"))` will exclude them from the latency calculation. +#' @param fixed_latency either a positive integer, or a labeled positive integer +#' vector. Cannot be set at the same time as `fixed_forecast_date`. If +#' non-`NULL`, the amount to offset the ahead or lag by. If a single integer, +#' this is used for all columns; if a labeled vector, the labels must +#' correspond to the base column names (before lags/aheads). If `NULL`, the +#' latency is the distance between the `epi_df`'s `max_time_value` and the `forecast_date`. +#' @param fixed_forecast_date either a date of the same kind used in the +#' `epi_df`, or `NULL`. Exclusive with `fixed_latency`. If a date, it gives +#' the date from which the forecast is actually occurring. If `NULL`, the +#' `forecast_date` is determined either via the `fixed_latency`, or is set to +#' the `epi_df`'s `as_of` value if `fixed_latency` is also `NULL`. +#' @param check_latency_length bool, determines whether to warn if the latency +#' is unusually high. Turn off if you know your forecast is going to be far +#' into the future. 
+#' @template step-return +#' @inheritParams recipes::step_lag +#' +#' +#' @family row operation steps +#' @rdname step_adjust_latency +#' @export +#' @examples +#' rates <- covid_case_death_rates %>% +#' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +#' # setting the `as_of` to something realistic +#' attributes(rates)$metadata$as_of <- max(rates$time_value) + 3 +#' +#' r <- epi_recipe(rates) %>% +#' step_adjust_latency(recipes::has_role("raw"), method = "extend_ahead") %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) +#' r +#' +#' rates_fit <- epi_workflow() %>% +#' add_epi_recipe(r) %>% +#' add_model(linear_reg()) %>% +#' fit(data = rates) +#' rates_fit +#' +#' @importFrom recipes detect_step +#' @importFrom rlang enquos is_empty +#' @importFrom dplyr tribble n +step_adjust_latency <- + function(recipe, + ..., + method = c( + "extend_ahead", + "locf", + "extend_lags" + ), + epi_keys_checked = NULL, + keys_to_ignore = c(), + fixed_latency = NULL, + fixed_forecast_date = NULL, + check_latency_length = TRUE, + id = rand_id("adjust_latency")) { + step_adjust_latency_checks( + id, method, recipe, fixed_latency, fixed_forecast_date + ) + method <- rlang::arg_match(method) + if (is.null(epi_keys_checked)) { + epi_keys_checked <- kill_time_value(key_colnames(recipe$template)) + } + recipes::add_step( + recipe, + step_adjust_latency_new( + terms = enquos(...), + role = NA, + trained = FALSE, + fixed_forecast_date = fixed_forecast_date, + forecast_date = NULL, + latency = fixed_latency, + latency_table = NULL, + latency_sign = NULL, + metadata = NULL, + method = method, + epi_keys_checked = epi_keys_checked, + keys_to_ignore = keys_to_ignore, + check_latency_length = check_latency_length, + columns = NULL, + skip = FALSE, + id = id + ) + ) + } + +step_adjust_latency_new <- + function(terms, role, trained, fixed_forecast_date, forecast_date, latency, + latency_table, latency_sign, metadata, 
method, epi_keys_checked, + keys_to_ignore, check_latency_length, columns, skip, id) { + step( + subclass = "adjust_latency", + terms = terms, + role = role, + trained = trained, + fixed_forecast_date = fixed_forecast_date, + forecast_date = forecast_date, + latency = latency, + latency_table = latency_table, + latency_sign = latency_sign, + metadata = metadata, + method = method, + epi_keys_checked = epi_keys_checked, + keys_to_ignore = keys_to_ignore, + check_latency_length = check_latency_length, + columns = columns, + skip = skip, + id = id + ) + } + +# lags introduces max(lags) NA's after the max_time_value. +#' @export +#' @importFrom glue glue +#' @importFrom dplyr rowwise +prep.step_adjust_latency <- function(x, training, info = NULL, ...) { + latency <- x$latency + col_names <- recipes::recipes_eval_select(x$terms, training, info) + + forecast_date <- x$fixed_forecast_date %||% + get_forecast_date(training, info, x$epi_keys_checked, latency) + + latency_table <- get_latency_table( + training, col_names, forecast_date, latency, + get_sign(x), x$epi_keys_checked, x$keys_to_ignore, info, x$terms + ) + + step_adjust_latency_new( + terms = x$terms, + role = x$role, + trained = TRUE, + fixed_forecast_date = x$fixed_forecast_date, + forecast_date = forecast_date, + latency = x$latency, + latency_table = latency_table, + latency_sign = get_sign(x), + metadata = attributes(training)$metadata, + method = x$method, + epi_keys_checked = x$epi_keys_checked, + keys_to_ignore = x$keys_to_ignore, + check_latency_length = x$check_latency_length, + columns = col_names, + skip = x$skip, + id = x$id + ) +} + +#' @importFrom dplyr %>% pull group_by_at +#' @importFrom tidyr fill +#' @export +bake.step_adjust_latency <- function(object, new_data, ...) 
{ + if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) { + new_data <- as_epi_df(new_data, as_of = object$forecast_date, other_keys = object$metadata$other_keys %||% character()) + attributes(new_data)$metadata <- object$metadata + compare_bake_prep_latencies(object, new_data) + } + if (object$method == "locf") { + # locf doesn't need to mess with the metadata at all, it just forward-fills + # the requested columns + rel_keys <- setdiff(key_colnames(new_data), "time_value") + modified_columns <- object$columns %>% unname() + if (object$check_latency_length) { + check_interminable_latency( + new_data, object$latency_table, modified_columns, object$forecast_date + ) + } + + new_data <- new_data %>% + pad_to_end(rel_keys, object$forecast_date, modified_columns) %>% + # group_by_at(rel_keys) %>% + arrange(time_value) %>% + as_tibble() %>% + tidyr::fill(.direction = "down", any_of(modified_columns)) %>% + ungroup() + } else if (object$method == "extend_lags" || object$method == "extend_ahead") { + attributes(new_data)$metadata$latency_table <- object$latency_table + attributes(new_data)$metadata$latency_sign <- object$latency_sign + } + return(new_data) +} + + + +#' @export +print.step_adjust_latency <- + function(x, width = max(20, options$width - 35), ...) 
{ + if (!is.null(x$forecast_date)) { + conj <- "w/ forecast date" + extra_text <- x$forecast_date + } else if (!is.null(x$latency_table)) { + conj <- if (nrow(x$latency) == 1) { + "w/ latency" + } else { + "w/ latencies" + } + extra_text <- unique(x$latency_table$latency) + } else { + conj <- "latency" + extra_text <- "TBD at train time" + } + title <- trimws(paste("Adj.", x$method)) + print_epi_step(x$columns, x$terms, x$trained, title, + conjunction = conj, extra_text = extra_text + ) + } diff --git a/R/step_climate.R b/R/step_climate.R new file mode 100644 index 000000000..fa505298d --- /dev/null +++ b/R/step_climate.R @@ -0,0 +1,437 @@ +#' Calculate a climatological variable based on the history +#' +#' `step_climate()` creates a *specification* of a recipe step that will +#' generate one or more new columns of derived data. This step examines all +#' available seasons in the training data and calculates the a measure of center +#' for the "typical" season. Think of this like with the weather: to predict the +#' temperature in January in Pittsburgh, PA, I might look at all previous +#' January's on record, average their temperatures, and include that in my +#' model. So it is important to _align_ the forecast horizon with the climate. +#' This step will work best if added after `step_epi_ahead()`, but that is not +#' strictly required. See the details for more information. +#' +#' @details +#' Construction of a climate predictor can be helpful with strongly seasonal +#' data. But its utility is greatest when the estimated "climate" is aligned +#' to the forecast horizon. +#' For example, if today is December 1, and we want +#' to make a prediction for December 15, we want to know the climate for the +#' week of December 15 to use in our model. But we also want to align the rest +#' of our training data with the climate _2 weeks after_ those dates. 
+#' +#' To accomplish +#' this, if we have daily data, we could use `time_type = "week"` and +#' `forecast_ahead = 2`. The climate predictor would be created by taking +#' averages over each week (with a window of a few weeks before and after, as +#' determined by `window_size`), and then aligning these with the appropriate dates +#' in the training data so that each `time_value` will "see" the typical climate 2 +#' weeks in the future. +#' +#' Alternatively, in the same scenario, we could use `time_type = "day"` and +#' `forecast_ahead = 14`. The climate predictor would be created by taking +#' averages over a small window around each _day_, and then aligning these with +#' the appropriate dates in the training data so that each `time_value` will +#' "see" the climate 14 days in the future. +#' +#' The only differences between these options is the type of averaging performed +#' over the historical data. In the first case, days in the same week will get +#' the same value of the climate predictor (because we're looking at weekly +#' windows), while in the second case, every day in the data will have the +#' average climate for the _day_ that happens 14 days in the future. +#' +#' Autodetecting the forecast horizon can only be guaranteed to work correctly +#' when the time types are the same: for example using daily data for training +#' and daily climate calculations. However, using weekly data, predicting 4 +#' weeks ahead, and setting `time_type = "month"` is perfectly reasonable. It's +#' just that the climate is calculated over _months_ (January, February, March, +#' etc.) so how to properly align this when producing a forecast for the 5th week +#' in the year is challenging. For scenarios like these, it may be best to +#' approximately match the times with `forecast_ahead = 1`, for example. +#' +#' +#' @inheritParams step_growth_rate +#' @param forecast_ahead The forecast horizon. 
By default, this step will try to +#' detect whether a forecast horizon has already been specified with +#' [step_epi_ahead()]. Alternatively, one can specify an explicit +#' horizon with a scalar integer. Auto-detection is only possible +#' when the time type of the `epi_df` used to create the `epi_recipe` is the +#' same as the aggregation +#' `time_type` specified in this step (say, both daily or both weekly). If, +#' for example, daily data is used with monthly time aggregation, then +#' auto-detection is not possible (and may in fact lead to strange behaviour +#' even if `forecast_ahead` is specified with an integer). See details below. +#' @param time_type The duration over which time aggregation should be performed. +#' @param center_method The measure of center to be calculated over the time +#' window. +#' @param window_size Scalar integer. How many time units on each side should +#' be included. For example, if `window_size = 3` and `time_type = "day"`, +#' then on each day in the data, the center will be calculated using 3 days +#' before and three days after. So, in this case, it operates like a weekly +#' rolling average, centered at each day. +#' @param epi_keys Character vector or `NULL`. Any columns mentioned will be +#' grouped before performing any center calculation. So for example, given +#' state-level data, a national climate would be calculated if `NULL`, but +#' passing `epi_keys = "geo_value"` would calculate the climate separately +#' by state. +#' @param role What role should be assigned for any variables created by this +#' step? "predictor" is the most likely choice. 
+#' @template step-return +#' +#' +#' @export +#' @examples +#' # automatically detects the horizon +#' r <- epi_recipe(covid_case_death_rates) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_climate(death_rate, time_type = "day") +#' r +#' +#' r %>% +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) +#' +#' # same idea, but using weekly climate +#' r <- epi_recipe(covid_case_death_rates) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_climate(death_rate, +#' forecast_ahead = 1, time_type = "epiweek", +#' window_size = 1L +#' ) +#' r +#' +#' r %>% +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) +#' +#' # switching the order is possible if you specify `forecast_ahead` +#' r <- epi_recipe(covid_case_death_rates) %>% +#' step_climate(death_rate, forecast_ahead = 7, time_type = "day") %>% +#' step_epi_ahead(death_rate, ahead = 7) +#' r +#' +#' r %>% +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) +step_climate <- + function(recipe, + ..., + forecast_ahead = "detect", + role = "predictor", + time_type = c("detect", "epiweek", "week", "month", "day"), + center_method = c("median", "mean"), + window_size = 3L, + epi_keys = NULL, + prefix = "climate_", + skip = FALSE, + id = rand_id("climate")) { + if (!is_epi_recipe(recipe)) { + cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") + } + + ## Handle ahead autodetection, single outcome, time type + n_outcomes <- sum(recipe$var_info$role == "outcome") + time_type <- rlang::arg_match(time_type) + edf_time_type <- attr(recipe$template, "metadata")$time_type + if (time_type == "detect") time_type <- edf_time_type + if (edf_time_type == "custom") { + cli_abort("This step only works with daily, weekly, or yearmonth data.") + } + if (n_outcomes > 1L) { + cli_abort("Only one {.var outcome} role can be used with this step.") + } + if (is.character(forecast_ahead)) { + forecast_ahead <- rlang::arg_match(forecast_ahead) + if (detect_step(recipe, 
"epi_ahead")) { + outcomes <- extract_argument(recipe, "step_epi_ahead", "role") == "outcome" + forecast_ahead <- extract_argument(recipe, "step_epi_ahead", "ahead")[outcomes] + if (length(forecast_ahead) != 1L) { + cli_abort(c( + "To detect the `forecast_ahead` automatically, `step_epi_ahead()` + with role = 'outcome' must be specified.", + i = "Check your recipe, or specify this argument directly in `step_climate()`." + )) + } + ttype_ord <- match(time_type, c("day", "epiweek", "week", "month")) + ttype_ord <- ttype_ord - as.integer(ttype_ord > 2) + edf_ttype_ord <- match(edf_time_type, c("day", "week", "yearmonth")) + if (ttype_ord != edf_ttype_ord) { + cli_abort(c("Automatic detection of the `forecast_ahead` is only + supported if the original data and the time type for aggregation + are in the same units.", + i = "Here, the data is in {.val {edf_time_type}}s while + `time_type` is {.val {time_type}}.", + i = "This is resolved most easily by specifying `forecast_ahead`." + )) + } + } else { + forecast_ahead <- 0L + } + } + arg_is_int(forecast_ahead) + + # check other args + center_method <- rlang::arg_match(center_method) + arg_is_chr(role) + arg_is_chr(epi_keys, allow_null = TRUE) + arg_is_nonneg_int(window_size) + arg_is_scalar(window_size) + arg_is_chr_scalar(prefix, id) + arg_is_lgl_scalar(skip) + + time_aggr <- switch(time_type, + epiweek = epiweek_leap, + week = isoweek_leap, + month = lubridate::month, + day = yday_leap + ) + + recipes::add_step( + recipe, + step_climate_new( + terms = enquos(...), + role = role, + trained = FALSE, + forecast_ahead = forecast_ahead, + time_type = time_type, + time_aggr = time_aggr, + modulus = NULL, + center_method = center_method, + window_size = window_size, + epi_keys = epi_keys, + climate_table = NULL, + prefix = prefix, + columns = NULL, + skip = skip, + id = id, + case_weights = NULL + ) + ) + } + + +step_climate_new <- + function(terms, + role, + trained, + forecast_ahead, + time_type, + time_aggr, + modulus, + 
center_method, + window_size, + epi_keys, + climate_table, + prefix, + columns, + skip, + id, + case_weights) { + recipes::step( + subclass = "climate", + terms = terms, + role = role, + trained = trained, + forecast_ahead = forecast_ahead, + time_type = time_type, + time_aggr = time_aggr, + modulus = modulus, + center_method = center_method, + window_size = window_size, + epi_keys = epi_keys, + climate_table = climate_table, + prefix = prefix, + columns = columns, + skip = skip, + id = id, + case_weights = case_weights + ) + } + + + +#' @export +prep.step_climate <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + recipes::check_type(training[, col_names], types = c("double", "integer")) + wts <- recipes::get_case_weights(info, training) + wts_used <- !is.null(wts) + wts <- wts %||% rep(1, nrow(training)) + + modulus <- switch(x$time_type, + epiweek = 52L, # only sometimes true + week = 52L, + month = 12L, + day = 365L # only sometimes true + ) + + fn <- switch(x$center_method, + mean = function(x, w) stats::weighted.mean(x, w, na.rm = TRUE), + median = function(x, w) stats::median(x, na.rm = TRUE) + ) + # suppose it's week 52, and there is no week 53 this year; then + # as originally written for 1 week ahead this grabs from week 52+1 %% 53 + # which will be week 53, not week 1. 
+ ahead_period <- switch(x$time_type, + epiweek = lubridate::weeks(x$forecast_ahead), + week = lubridate::weeks(x$forecast_ahead), + month = months(x$forecast_ahead), + day = lubridate::days(x$forecast_ahead), + ) + climate_table <- + training %>% + mutate( + # subtracts a month w/o rollover (usual behavior on weeks/days) + .idx = time_value %m-% ahead_period, + .idx = x$time_aggr(.idx), + .weights = wts + ) %>% + select(.idx, .weights, all_of(c(col_names, x$epi_keys))) %>% + tidyr::pivot_longer(all_of(unname(col_names))) %>% + dplyr::reframe( + roll_modular_multivec(value, .idx, .weights, fn, x$window_size, modulus), + .by = c("name", x$epi_keys) + ) %>% + tidyr::pivot_wider( + names_from = "name", values_from = "climate_pred", names_prefix = x$prefix + ) + + step_climate_new( + terms = x$terms, + role = x$role, + trained = TRUE, + forecast_ahead = x$forecast_ahead, + time_type = x$time_type, + time_aggr = x$time_aggr, + modulus = modulus, + center_method = x$center_method, + window_size = x$window_size, + epi_keys = x$epi_keys, + climate_table = climate_table, + prefix = x$prefix, + columns = col_names, + skip = x$skip, + id = x$id, + case_weights = wts_used + ) +} + + +#' @export +bake.step_climate <- function(object, new_data, ...) { + climate_table <- object$climate_table + new_data %>% + mutate(.idx = object$time_aggr(time_value)) %>% + left_join(climate_table, by = c(".idx", object$epi_keys)) %>% + select(-.idx) +} + + +#' @export +print.step_climate <- function(x, width = max(20, options()$width - 30), ...) { + print_epi_step( + x$columns, x$terms, x$trained, + title = "Calculating climate_predictor for ", + conjunction = "by", + extra_text = paste(x$time_type, "using the", x$center_method) + ) + invisible(x) +} + +#' group col by .idx values and sum windows around each .idx value +#' @param idx_in the relevant periodic part of time value, e.g. 
the week number, +#' limited to the relevant range +#' @param col the list of values indexed by `idx_in` +#' @param weights how much to weigh each particular datapoint (also indexed by +#' `idx_in`) +#' @param aggr the aggregation function, probably Quantile, mean, or median +#' @param window_size the number of .idx entries before and after to include in +#' the aggregation +#' @param modulus the number of days/weeks/months in the year, not including any +#' leap days/weeks +#' @importFrom lubridate %m-% +#' @keywords internal +roll_modular_multivec <- function(col, idx_in, weights, aggr, window_size, modulus) { + # make a tibble where data gives the list of all datapoints with the + # corresponding .idx + tib <- tibble(col = col, weights = weights, .idx = idx_in) |> + arrange(.idx) |> + tidyr::nest(data = c(col, weights), .by = .idx) + # storage for the results, includes all possible time indexes + out <- tibble(.idx = c(1:modulus, 999), climate_pred = double(modulus + 1)) + for (tib_idx in tib$.idx) { + entries <- within_window(tib_idx, window_size, modulus) + out$climate_pred[out$.idx == tib_idx] <- with( + purrr::list_rbind(tib %>% filter(.idx %in% entries) %>% pull(data)), + aggr(col, weights) + ) + } + # filter to only the ones we actually computed + out %>% filter(.idx %in% idx_in) +} + +#' generate the idx values within `window_size` of `target_idx` given that our +#' time value is of the type matching modulus +#' @param target_idx the time index which we're drawing the window around +#' @param window_size the size of the window on one side of `target_idx` +#' @param modulus the number of days/weeks/months in the year, not including any leap days/weeks +#' @keywords internal +within_window <- function(target_idx, window_size, modulus) { + entries <- (target_idx - window_size):(target_idx + window_size) %% modulus + entries[entries == 0] <- modulus + # note that because we are 1-indexing, we're looking for indices that are 1 + # larger than the actual 
day/week in the year + if (modulus == 365) { + # we need to grab just the window around the leap day on the leap day + if (target_idx == 999) { + # there's an extra data point in front of the leap day + entries <- (59 - window_size):(59 + window_size - 1) %% modulus + entries[entries == 0] <- modulus + # adding in the leap day itself + entries <- c(entries, 999) + } else if ((59 %in% entries) || (60 %in% entries)) { + # if we're on the Feb/March boundary for daily data, we need to add in the + # leap day data + entries <- c(entries, 999) + } + } else if (modulus == 52) { + # we need to grab just the window around the leap week on the leap week + if (target_idx == 999) { + entries <- (53 - window_size):(53 + window_size - 1) %% 52 + entries[entries == 0] <- 52 + entries <- c(entries, 999) + } else if ((52 %in% entries) || (1 %in% entries)) { + # if we're on the year boundary for weekly data, we need to add in the + # leap week data (which is the extra week at the end) + entries <- c(entries, 999) + } + } + entries +} + + +#' a function that assigns Feb 29th to 999, and aligns all other dates the same +#' number in the year, regardless of whether it's a leap year +#' @keywords internal +#' @importFrom lubridate yday month leap_year +yday_leap <- function(time_value) { + dplyr::case_when( + !leap_year(time_value) ~ yday(time_value), + leap_day(time_value) ~ 999, + TRUE ~ yday(time_value) - as.numeric(month(time_value) > 2L) + ) +} +leap_day <- function(x) lubridate::month(x) == 2 & lubridate::day(x) == 29 +#' epiweek, but it assigns week 53 the value of 999 instead so it mirrors the assignments in yday_leap +#' @keywords internal +epiweek_leap <- function(time_value) { + week_values <- lubridate::epiweek(time_value) + week_values[week_values == 53] <- 999 + week_values +} +#' isoweek, but it assigns week 53 the value of 999 instead so it mirrors the assignments in yday_leap +#' @keywords internal +isoweek_leap <- function(time_value) { + week_values <- 
lubridate::isoweek(time_value) + week_values[week_values == 53] <- 999 + week_values +} diff --git a/R/step_epi_naomit.R b/R/step_epi_naomit.R index d81ba398d..0544bc5f9 100644 --- a/R/step_epi_naomit.R +++ b/R/step_epi_naomit.R @@ -2,13 +2,18 @@ #' #' @param recipe Recipe to be used for omission steps #' -#' @return Omits NA's from both predictors and outcomes at training time -#' to fit the model. Also only omits associated predictors and not -#' outcomes at prediction time due to lack of response and avoidance -#' of data loss. +#' @return Omits NA's from both predictors and outcomes at training time to fit +#' the model. Also only omits associated predictors and not outcomes at +#' prediction time due to lack of response and avoidance of data loss. Given a +#' `recipe`, this step is literally equivalent to +#' ```{r, eval=FALSE} +#' recipe %>% +#' recipes::step_naomit(all_predictors(), skip = FALSE) %>% +#' recipes::step_naomit(all_outcomes(), skip = TRUE) +#' ``` #' @export #' @examples -#' case_death_rate_subset %>% +#' covid_case_death_rates %>% #' epi_recipe() %>% #' step_epi_naomit() step_epi_naomit <- function(recipe) { diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 465d64e7f..376728752 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -1,12 +1,11 @@ #' Create a shifted predictor #' #' `step_epi_lag` and `step_epi_ahead` create a *specification* of a recipe step -#' that will add new columns of shifted data. The former will created a lag -#' column, while the latter will create a lead column. Shifted data will -#' by default include NA values where the shift was induced. -#' These can be properly removed with [step_epi_naomit()], or you may -#' specify an alternative filler value with the `default` -#' argument. +#' that will add new columns of shifted data. The `step_epi_lag` will create +#' a lagged `predictor` column, while `step_epi_ahead` will create a leading +#' `outcome` column. 
Shifted data will by default include NA values where the +#' shift was induced. These can be properly removed with [step_epi_naomit()], +#' or you may specify an alternative value with the `default` argument. #' #' #' @param recipe A recipe object. The step will be added to the @@ -30,8 +29,13 @@ #' @param id A unique identifier for the step #' @template step-return #' -#' @details The step assumes that the data are already _in the proper sequential -#' order_ for shifting. +#' @details +#' +#' Our `lag/ahead` functions respect the `geo_value` and `other_keys` of the +#' `epi_df`, and allow for discontiguous `time_value`s. Both of these features +#' are noticably lacking from `recipe::step_lag()`. +#' Our `lag/ahead` functions also appropriately adjust the amount of data to +#' avoid accidentally dropping recent predictors from the test data. #' #' The `prefix` and `id` arguments are unchangeable to ensure that the code runs #' properly and to avoid inconsistency with naming. For `step_epi_ahead`, they @@ -42,7 +46,7 @@ #' @rdname step_epi_shift #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r @@ -66,7 +70,10 @@ step_epi_lag <- )) } arg_is_nonneg_int(lag) - arg_is_chr_scalar(prefix, id) + arg_is_chr_scalar(prefix, id, role) + if (role == "outcome" && length(lag) > 1L) { + cli_abort("Only one {.val outcome} may be created with this step.") + } recipes::add_step( recipe, @@ -79,6 +86,8 @@ step_epi_lag <- default = default, keys = key_colnames(recipe), columns = NULL, + shift_grid = NULL, + latency_adjusted = FALSE, skip = skip, id = id ) @@ -109,8 +118,7 @@ step_epi_ahead <- i = "Did you perhaps pass an integer in `...` accidentally?" 
)) } - arg_is_nonneg_int(ahead) - arg_is_chr_scalar(prefix, id) + arg_is_chr_scalar(prefix, id, role) recipes::add_step( recipe, @@ -123,6 +131,8 @@ step_epi_ahead <- default = default, keys = key_colnames(recipe), columns = NULL, + shift_grid = NULL, + latency_adjusted = FALSE, skip = skip, id = id ) @@ -132,7 +142,7 @@ step_epi_ahead <- step_epi_lag_new <- function(terms, role, trained, lag, prefix, default, keys, - columns, skip, id) { + columns, shift_grid, latency_adjusted, skip, id) { recipes::step( subclass = "epi_lag", terms = terms, @@ -143,6 +153,8 @@ step_epi_lag_new <- default = default, keys = keys, columns = columns, + shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = skip, id = id ) @@ -150,7 +162,7 @@ step_epi_lag_new <- step_epi_ahead_new <- function(terms, role, trained, ahead, prefix, default, keys, - columns, skip, id) { + columns, shift_grid, latency_adjusted, skip, id) { recipes::step( subclass = "epi_ahead", terms = terms, @@ -161,6 +173,8 @@ step_epi_ahead_new <- default = default, keys = keys, columns = columns, + shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = skip, id = id ) @@ -170,6 +184,22 @@ step_epi_ahead_new <- #' @export prep.step_epi_lag <- function(x, training, info = NULL, ...) { + columns <- recipes::recipes_eval_select(x$terms, training, info) + if (!x$latency_adjusted) { + tmp <- create_shift_grid( + x$prefix, + x$lag, + get_sign(x), + columns, + attributes(training)$metadata$latency_table, + attributes(training)$metadata$latency_sign + ) + shift_grid <- tmp[[1]] + latency_adjusted <- tmp[[2]] + } else { + shift_grid <- x$shift_grid + } + step_epi_lag_new( terms = x$terms, role = x$role, @@ -178,7 +208,9 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) 
{ prefix = x$prefix, default = x$default, keys = x$keys, - columns = recipes::recipes_eval_select(x$terms, training, info), + columns = columns, + shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = x$skip, id = x$id ) @@ -186,6 +218,22 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { #' @export prep.step_epi_ahead <- function(x, training, info = NULL, ...) { + columns <- recipes::recipes_eval_select(x$terms, training, info) + if (!x$latency_adjusted) { + tmp <- create_shift_grid( + x$prefix, + x$ahead, + get_sign(x), + columns, + attributes(training)$metadata$latency_table, + attributes(training)$metadata$latency_sign + ) + shift_grid <- tmp[[1]] + latency_adjusted <- tmp[[2]] + } else { + shift_grid <- x$shift_grid + } + step_epi_ahead_new( terms = x$terms, role = x$role, @@ -194,7 +242,9 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { prefix = x$prefix, default = x$default, keys = x$keys, - columns = recipes::recipes_eval_select(x$terms, training, info), + columns = columns, + shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = x$skip, id = x$id ) @@ -204,81 +254,41 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { #' @export bake.step_epi_lag <- function(object, new_data, ...) { - grid <- tidyr::expand_grid(col = object$columns, lag = object$lag) %>% - mutate( - newname = glue::glue("{object$prefix}{lag}_{col}"), - shift_val = lag, - lag = NULL - ) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - cli_abort(c( - "Name collision occured in {.cls {class(object)[1]}}", - "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." 
- )) - } - ok <- object$keys - shifted <- reduce( - pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - full_join, - by = ok - ) - - full_join(new_data, shifted, by = ok) %>% - group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() + add_shifted_columns(new_data, object) } #' @export bake.step_epi_ahead <- function(object, new_data, ...) { - grid <- tidyr::expand_grid(col = object$columns, ahead = object$ahead) %>% - mutate( - newname = glue::glue("{object$prefix}{ahead}_{col}"), - shift_val = -ahead, - ahead = NULL - ) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - cli_abort(c( - "Name collision occured in {.cls {class(object)[1]}}", - "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." - )) - } - ok <- object$keys - shifted <- reduce( - pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - full_join, - by = ok - ) - - full_join(new_data, shifted, by = ok) %>% - group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() + add_shifted_columns(new_data, object) } - #' @export print.step_epi_lag <- function(x, width = max(20, options()$width - 30), ...) { + if (x$latency_adjusted && x$trained) { + lag <- x$shift_grid$shift_val + lag <- c(lag, "(lat adj)") + } else { + lag <- x$lag + } print_epi_step(x$columns, x$terms, x$trained, "Lagging", conjunction = "by", - extra_text = x$lag + extra_text = lag ) invisible(x) } + #' @export print.step_epi_ahead <- function(x, width = max(20, options()$width - 30), ...) 
{ + if (x$latency_adjusted && x$trained) { + ahead <- x$shift_grid$shift_val + ahead <- c(ahead, "(lat adj)") + } else { + ahead <- x$ahead + } print_epi_step(x$columns, x$terms, x$trained, "Leading", conjunction = "by", - extra_text = x$ahead + extra_text = ahead ) invisible(x) } diff --git a/R/step_epi_slide.R b/R/step_epi_slide.R index c7d3f9fbd..564d525d9 100644 --- a/R/step_epi_slide.R +++ b/R/step_epi_slide.R @@ -1,8 +1,9 @@ #' Calculate a rolling window transformation #' -#' `step_epi_slide()` creates a *specification* of a recipe step -#' that will generate one or more new columns of derived data by "sliding" -#' a computation along existing data. +#' `step_epi_slide()` creates a *specification* of a recipe step that will +#' generate one or more new columns of derived data by "sliding" a computation +#' along existing data. This is a wrapper around `epiprocess::epi_slide()` +#' to allow its use within an `epi_recipe()`. #' #' @inheritParams step_epi_lag #' @param .f A function in one of the following formats: @@ -36,8 +37,7 @@ #' #' @export #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-01-01"), geo_value %in% c("ca", "ny")) #' rec <- epi_recipe(jhu) %>% #' step_epi_slide(case_rate, death_rate, diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index 06f8da4cf..80b5bf682 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -1,7 +1,8 @@ #' Calculate a growth rate #' -#' `step_growth_rate()` creates a *specification* of a recipe step -#' that will generate one or more new columns of derived data. +#' `step_growth_rate()` creates a *specification* of a recipe step that will +#' generate one or more new columns of derived data. This is a wrapper around +#' `epiprocess::growth_rate()` to allow its use within an `epi_recipe()`. #' #' #' @inheritParams step_epi_lag @@ -22,9 +23,7 @@ #' being removed from the data. 
Alternatively, you could specify arbitrary #' large values, or perhaps zero. Setting this argument to `NULL` will result #' in no replacement. -#' @param additional_gr_args_list A list of additional arguments used by -#' [epiprocess::growth_rate()]. All `...` arguments may be passed here along -#' with `dup_rm` and `na_rm`. +#' @inheritParams epiprocess::growth_rate #' @template step-return #' #' @@ -32,13 +31,17 @@ #' @importFrom epiprocess growth_rate #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' tiny_geos <- c("as", "mp", "vi", "gu", "pr") +#' rates <- covid_case_death_rates %>% +#' filter(time_value >= as.Date("2021-11-01"), !(geo_value %in% tiny_geos)) +#' +#' r <- epi_recipe(rates) %>% #' step_growth_rate(case_rate, death_rate) #' r #' #' r %>% -#' prep(case_death_rate_subset) %>% -#' bake(case_death_rate_subset) +#' prep(rates) %>% +#' bake(new_data = NULL) step_growth_rate <- function(recipe, ..., @@ -46,11 +49,11 @@ step_growth_rate <- horizon = 7, method = c("rel_change", "linear_reg"), log_scale = FALSE, + na_rm = TRUE, replace_Inf = NA, prefix = "gr_", skip = FALSE, - id = rand_id("growth_rate"), - additional_gr_args_list = list()) { + id = rand_id("growth_rate")) { if (!is_epi_recipe(recipe)) { cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") } @@ -58,20 +61,12 @@ step_growth_rate <- arg_is_pos_int(horizon) arg_is_scalar(horizon) if (!is.null(replace_Inf)) { - if (length(replace_Inf) != 1L) cli_abort("replace_Inf must be a scalar.") + if (length(replace_Inf) != 1L) cli_abort("`replace_Inf` must be a scalar.") if (!is.na(replace_Inf)) arg_is_numeric(replace_Inf) } arg_is_chr(role) arg_is_chr_scalar(prefix, id) - arg_is_lgl_scalar(log_scale, skip) - - - if (!is.list(additional_gr_args_list)) { - cli_abort(c( - "`additional_gr_args_list` must be a {.cls list}.", - i = "See `?epiprocess::growth_rate` for available options." 
- )) - } + arg_is_lgl_scalar(log_scale, skip, na_rm) recipes::add_step( recipe, @@ -82,13 +77,13 @@ step_growth_rate <- horizon = horizon, method = method, log_scale = log_scale, + na_rm = na_rm, replace_Inf = replace_Inf, prefix = prefix, keys = key_colnames(recipe), columns = NULL, skip = skip, - id = id, - additional_gr_args_list = additional_gr_args_list + id = id ) ) } @@ -101,13 +96,13 @@ step_growth_rate_new <- horizon, method, log_scale, + na_rm, replace_Inf, prefix, keys, columns, skip, - id, - additional_gr_args_list) { + id) { recipes::step( subclass = "growth_rate", terms = terms, @@ -116,13 +111,13 @@ step_growth_rate_new <- horizon = horizon, method = method, log_scale = log_scale, + na_rm = na_rm, replace_Inf = replace_Inf, prefix = prefix, keys = keys, columns = columns, skip = skip, - id = id, - additional_gr_args_list = additional_gr_args_list + id = id ) } @@ -137,13 +132,13 @@ prep.step_growth_rate <- function(x, training, info = NULL, ...) { horizon = x$horizon, method = x$method, log_scale = x$log_scale, + na_rm = x$na_rm, replace_Inf = x$replace_Inf, prefix = x$prefix, keys = x$keys, columns = recipes::recipes_eval_select(x$terms, training, info), skip = x$skip, - id = x$id, - additional_gr_args_list = x$additional_gr_args_list + id = x$id ) } @@ -177,10 +172,12 @@ bake.step_growth_rate <- function(object, new_data, ...) 
{ across( all_of(object$columns), ~ epiprocess::growth_rate( - time_value, .x, + .x, + x = time_value, method = object$method, - h = object$horizon, log_scale = object$log_scale, - !!!object$additional_gr_args_list + h = object$horizon, + log_scale = object$log_scale, + na_rm = object$na_rm ), .names = "{object$prefix}{object$horizon}_{object$method}_{.col}" ) diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 39ae1ba59..1c38b0659 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -1,7 +1,14 @@ #' Calculate a lagged difference #' -#' `step_lag_difference()` creates a *specification* of a recipe step -#' that will generate one or more new columns of derived data. +#' `step_lag_difference()` creates a *specification* of a recipe step that will +#' generate one or more new columns of derived data. For each column in the +#' specification, `step_lag_difference()` will calculate the difference +#' between the values at a distance of `horizon`. For example, with +#' `horizon=1`, this would simply be the difference between adjacent days. +#' +#' Much like `step_epi_lag()` this step works with the actual time values (so if +#' there are gaps it will fill with `NA` values), and respects the grouping +#' inherent in the `epi_df()` as specified by `geo_value` and `other_keys`. 
#' #' #' @inheritParams step_epi_lag @@ -15,14 +22,14 @@ #' @family row operation steps #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) %>% #' step_epi_naomit() #' r #' #' r %>% -#' prep(case_death_rate_subset) %>% -#' bake(case_death_rate_subset) +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) step_lag_difference <- function(recipe, ..., diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 4515920b2..0721b30c0 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -1,27 +1,34 @@ #' Convert raw scale predictions to per-capita #' -#' `step_population_scaling` creates a specification of a recipe step -#' that will perform per-capita scaling. Typical usage would -#' load a dataset that contains state-level population, and use it to convert -#' predictions made from a raw scale model to rate-scale by dividing by -#' the population. -#' Although, it is worth noting that there is nothing special about "population". -#' The function can be used to scale by any variable. Population is the -#' standard use case in the epidemiology forecasting scenario. Any value -#' passed will *divide* the selected variables while the `rate_rescaling` -#' argument is a common *multiplier* of the selected variables. +#' `step_population_scaling()` creates a specification of a recipe step that +#' will perform per-capita scaling. Typical usage would set `df` to be a dataset +#' that contains population for each `geo_value`, and use it to convert +#' predictions made from a raw scale model to rate-scale by dividing by the +#' population. Although, it is worth noting that there is nothing special about +#' "population", and the function can be used to scale by any variable. +#' Population is the standard use case in the epidemiology forecasting scenario. 
+#' Any value passed will *divide* the selected variables while the +#' `rate_rescaling` argument is a common *multiplier* of the selected variables. #' #' @inheritParams step_epi_lag -#' @param df a data frame that contains the population data to be used for -#' inverting the existing scaling. -#' @param by A (possibly named) character vector of variables to join by. +#' @param role For model terms created by this step, what analysis role should +#' they be assigned? +#' @param df a data frame containing the scaling data (typically population). The +#' target column is divided by the value in `df_pop_col`. +#' @param by A (possibly named) character vector of variables by which to join +#' `df` to the `epi_df`. #' -#' If `NULL`, the default, the function will perform a natural join, using all -#' variables in common across the `epi_df` produced by the `predict()` call -#' and the user-provided dataset. -#' If columns in that `epi_df` and `df` have the same name (and aren't -#' included in `by`), `.df` is added to the one from the user-provided data -#' to disambiguate. +#' If `NULL`, the default, the function will try to infer a reasonable set of +#' columns. First, it will try to join by all variables in the training/test +#' data with roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in +#' `df`; these roles are automatically set if you are using an `epi_df`, or you +#' can use, e.g., `update_role`. If no such roles are set, it will try to +#' perform a natural join, using variables in common between the training/test +#' data and population data. +#' +#' If columns in the training/testing data and `df` have the same name (and +#' aren't included in `by`), a `.df` suffix is added to the one from the +#' user-provided data to disambiguate. #' #' To join by different variables on the `epi_df` and `df`, use a named vector. 
#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value` @@ -29,14 +36,14 @@ #' For example, `by = c("geo_value" = "states", "county" = "county")` will match #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`. #' -#' See [dplyr::left_join()] for more details. +#' See [dplyr::inner_join()] for more details. #' @param df_pop_col the name of the column in the data frame `df` that #' contains the population data and will be used for scaling. #' This should be one column. #' @param rate_rescaling Sometimes raw scales are "per 100K" or "per 1M". #' Adjustments can be made here. For example, if the original #' scale is "per 100K", then set `rate_rescaling = 1e5` to get rates. -#' @param create_new TRUE to create a new column and keep the original column +#' @param create_new `TRUE` to create a new column and keep the original column #' in the `epi_df` #' @param suffix a character. The suffix added to the column name if #' `create_new = TRUE`. Default to "_scaled". @@ -44,7 +51,6 @@ #' @return Scales raw data by the population #' @export #' @examples -#' library(dplyr) #' jhu <- cases_deaths_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% #' select(geo_value, time_value, cases) @@ -89,13 +95,25 @@ step_population_scaling <- suffix = "_scaled", skip = FALSE, id = rand_id("population_scaling")) { - arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, id) - arg_is_lgl(create_new, skip) - arg_is_chr(df_pop_col, suffix, id) + if (rlang::dots_n(...) == 0L) { + cli_abort(c( + "`...` must not be empty.", + ">" = "Please provide one or more tidyselect expressions in `...` + specifying the columns to which scaling should be applied.", + ">" = "If you really want to list `step_population_scaling` in your + recipe but not have it do anything, you can use a tidyselection + that selects zero variables, such as `c()`." 
+ )) + } + arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, skip, id) + arg_is_chr(role, df_pop_col, suffix, id) + hardhat::validate_column_names(df, df_pop_col) + arg_is_chr(by, allow_null = TRUE) + arg_is_numeric(rate_rescaling) if (rate_rescaling <= 0) { cli_abort("`rate_rescaling` must be a positive number.") } + arg_is_lgl(create_new, skip) recipes::add_step( recipe, @@ -138,6 +156,42 @@ step_population_scaling_new <- #' @export prep.step_population_scaling <- function(x, training, info = NULL, ...) { + if (is.null(x$by)) { + rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col) + lhs_potential_keys <- info %>% + filter(role %in% c("geo_value", "key", "time_value")) %>% + extract2("variable") %>% + unique() # in case of weird var with multiple of above roles + if (length(lhs_potential_keys) == 0L) { + # We're working with a recipe and tibble, and *_role hasn't set up any of + # the above roles. Let's say any column could actually act as a key, and + # lean on `intersect` below to make this something reasonable. + lhs_potential_keys <- names(training) + } + suggested_min_keys <- info %>% + filter(role %in% c("geo_value", "key")) %>% + extract2("variable") %>% + unique() + # (0 suggested keys if we weren't given any epikeytime var info.) + x$by <- intersect(lhs_potential_keys, rhs_potential_keys) + if (length(x$by) == 0L) { + cli_abort(c( + "Couldn't guess a default for `by`", + ">" = "Please rename columns in your population data to match those in your training data, + or manually specify `by =` in `step_population_scaling()`."
+ ), class = "epipredict__step_population_scaling__default_by_no_intersection") + } + if (!all(suggested_min_keys %in% x$by)) { + cli_warn(c( + "{setdiff(suggested_min_keys, x$by)} {?was an/were} epikey column{?s} in the training data, + but {?wasn't/weren't} found in the population `df`.", + "i" = "Defaulting to join by {x$by}.", + ">" = "Double-check whether column names on the population `df` match those for your training data.", + ">" = "Consider using population data with breakdowns by {suggested_min_keys}.", + ">" = "Manually specify `by =` to silence." + ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys") + } + } step_population_scaling_new( terms = x$terms, role = x$role, @@ -156,16 +210,20 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) { #' @export bake.step_population_scaling <- function(object, new_data, ...) { - object$by <- object$by %||% intersect( - epi_keys_only(new_data), - colnames(select(object$df, !object$df_pop_col)) - ) + if (is.null(object$by)) { + cli::cli_abort(c( + "`by` was not set and no default was filled in", + ">" = "If this was a fit recipe generated from an older version + of epipredict that you loaded in from a file, + please regenerate with the current version of epipredict." + )) + } joinby <- list(x = names(object$by) %||% object$by, y = object$by) hardhat::validate_column_names(new_data, joinby$x) hardhat::validate_column_names(object$df, joinby$y) if (object$suffix != "_scaled" && object$create_new == FALSE) { - cli::cli_warn(c( + cli_warn(c( "Custom `suffix` {.val {object$suffix}} was ignored in `step_population_scaling`.", i = "Perhaps `create_new` should be {.val {TRUE}}?" )) @@ -177,7 +235,10 @@ bake.step_population_scaling <- function(object, new_data, ...) 
{ suffix <- ifelse(object$create_new, object$suffix, "") col_to_remove <- setdiff(colnames(object$df), colnames(new_data)) - left_join(new_data, object$df, by = object$by, suffix = c("", ".df")) %>% + inner_join(new_data, object$df, + by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"), + suffix = c("", ".df") + ) %>% mutate( across( all_of(object$columns), diff --git a/R/step_training_window.R b/R/step_training_window.R index eafc076c7..dec380567 100644 --- a/R/step_training_window.R +++ b/R/step_training_window.R @@ -14,8 +14,12 @@ #' @inheritParams step_epi_lag #' @template step-return #' -#' @details Note that `step_epi_lead()` and `step_epi_lag()` should come -#' after any filtering step. +#' @details It is recommended to do this after any `step_epi_ahead()`, +#' `step_epi_lag()`, or `step_epi_naomit()` steps. If `step_training_window()` +#' happens first, there will be less than `n_training` remaining examples, +#' since either leading or lagging will introduce `NA`'s later removed by +#' `step_epi_naomit()`. Typical usage will use this step last in an +#' `epi_recipe()`. #' #' @export #' diff --git a/R/tidy.R b/R/tidy.R index 61b298411..3969f9dd6 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -26,8 +26,7 @@ #' `type` (the method, e.g. "predict", "naomit"), and a character column `id`. #' #' @examples -#' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -37,7 +36,7 @@ #' #' wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) #' latest <- get_test_data(recipe = r, x = jhu) - +#' #' f <- frosting() %>% #' layer_predict() %>% #' layer_naomit(.pred) @@ -96,7 +95,7 @@ tidy.frosting <- function(x, number = NA, id = NA, ...) { #' @export tidy.layer <- function(x, ...) { - cli::cli_abort( + cli_abort( "No `tidy()` method exists for a layer with class: {.cls {class(x)}}." 
) } diff --git a/R/utils-enframer.R b/R/utils-enframer.R deleted file mode 100644 index 0a8152906..000000000 --- a/R/utils-enframer.R +++ /dev/null @@ -1,23 +0,0 @@ -enframer <- function(df, x, fill = NA) { - stopifnot(is.data.frame(df)) - stopifnot(length(fill) == 1 || length(fill) == nrow(df)) - arg_is_chr(x, allow_null = TRUE) - if (is.null(x)) { - return(df) - } - if (any(names(df) %in% x)) { - stop("In enframer: some new cols match existing column names") - } - for (v in x) df <- dplyr::mutate(df, !!v := fill) - df -} - -enlist <- function(...) { - # converted to thin wrapper around - rlang::dots_list( - ..., - .homonyms = "error", - .named = TRUE, - .check_assign = TRUE - ) -} diff --git a/R/utils-latency.R b/R/utils-latency.R new file mode 100644 index 000000000..be947eb0e --- /dev/null +++ b/R/utils-latency.R @@ -0,0 +1,500 @@ +#' create a table of the columns to modify, their shifts, and their prefixes +#' @keywords internal +#' @importFrom dplyr tibble +#' @importFrom tidyr unnest +construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name) { + # for the right step types (either "step_epi_lag" or "step_epi_shift"), grab + # the useful parameters, including the evaluated column names + extract_named_rates <- function(recipe_step) { + if (inherits(recipe_step, rel_step_type)) { + recipe_columns <- recipes_eval_select(recipe_step$terms, recipe$template, recipe$term_info) + if (any(recipe_columns %in% terms_used)) { + return(list(term = recipe_columns, shift = recipe_step[shift_name], prefix = recipe_step$prefix)) + } + } + return(NULL) + } + rel_list <- recipe$steps %>% + map(extract_named_rates) %>% + unlist(recursive = FALSE) %>% + split(c("term", "shift", "prefix")) + relevant_shifts <- tibble( + terms = lapply(rel_list$term, unname), + shift = lapply(rel_list$shift, unname), + prefix = unname(unlist(rel_list$prefix)) + ) %>% + unnest(c(terms, shift)) %>% + unnest(shift) + return(relevant_shifts) +} + +#' Extract the as_of for the 
forecast date, and make sure there's nothing very off about it. +#' @keywords internal +#' @importFrom dplyr select +#' @importFrom tidyr drop_na +#' @importFrom utils capture.output +get_forecast_date <- function(new_data, info, epi_keys_checked, latency, columns = NULL) { + if (is.null(columns)) { + columns <- info %>% + filter(source == "original") %>% + pull(variable) + # make sure that there's enough column names + if (length(columns) < 3) { + cli_abort( + glue::glue( + "The original columns of `time_value`, ", + "`geo_value` and at least one signal. The current colums are \n", + paste(capture.output(object$info), collapse = "\n\n") + ), + class = "epipredict__get_forecast_date__too_few_data_columns" + ) + } + } + max_time <- get_max_time(new_data, epi_keys_checked, columns) + # the source data determines the actual time_values + if (is.null(latency)) { + forecast_date <- attributes(new_data)$metadata$as_of + } else { + if (is.null(max_time)) { + cli_abort("max_time is null. This likely means there is one of {columns} that is all `NA`") + } + forecast_date <- max_time + latency + } + # make sure the as_of is sane + if (!inherits(forecast_date, class(new_data$time_value)) & !inherits(forecast_date, "POSIXt")) { + cli_abort( + paste( + "the data matrix `forecast_date` value is {forecast_date}, ", + "and not a valid `time_type` with type ", + "matching `time_value`'s type of ", + "{class(max_time)}." + ), + class = "epipredict__get_forecast_date__wrong_time_value_type_error" + ) + } + if (is.null(forecast_date) || is.na(forecast_date)) { + cli_warn( + paste( + "epi_data's `forecast_date` was `NA`, setting to ", + "the latest non-`NA` time value for these columns, {max_time}." + ), + class = "epipredict__get_forecast_date__max_time_warning" + ) + forecast_date <- max_time + } else if (!is.null(max_time) && (forecast_date < max_time)) { + cli_abort( + paste( + "`forecast_date` ({(forecast_date)}) is before the most ", + "recent data ({max_time}). 
Remove before ", + "predicting." + ), + class = "epipredict__get_forecast_date__misordered_forecast_date_error" + ) + } + # TODO cover the rest of the possible types for as_of and max_time... + if (inherits(new_data$time_value, "Date")) { + forecast_date <- as.Date(forecast_date) + } + return(forecast_date) +} + +get_max_time <- function(new_data, epi_keys_checked, columns) { + # these are the non-na time_values; + # get the minimum value across the checked epi_keys' maximum time values + max_time <- new_data %>% + select(all_of(columns)) %>% + drop_na() + if (nrow(max_time) == 0) { + return(NULL) + } + # null and "" don't work in `group_by` + if (!is.null(epi_keys_checked) && all(epi_keys_checked != "")) { + max_time <- max_time %>% group_by(across(all_of(epi_keys_checked))) + } + max_time <- max_time %>% + summarise(time_value = max(time_value)) %>% + pull(time_value) %>% + min() + return(max_time) +} + + + +#' the latency is also the amount the shift is off by +#' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you +#' need to shift the data to bring the 3 day lagged value to today. +#' @keywords internal +get_latency <- function(new_data, forecast_date, column, sign_shift, epi_keys_checked) { + shift_max_date <- new_data %>% + drop_na(all_of(column)) + if (nrow(shift_max_date) == 0) { + # if everything is an NA, there's infinite latency, but shifting by that is + # untenable. 
May as well not shift at all + return(0) + } + # null and "" don't work in `group_by` + if (!is.null(epi_keys_checked) && all(epi_keys_checked != "")) { + shift_max_date <- shift_max_date %>% group_by(across(all_of(epi_keys_checked))) + } + shift_max_date <- shift_max_date %>% + summarise(time_value = max(time_value)) %>% + pull(time_value) %>% + min() + return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date))) +} + + + +#' get the target date while in a layer +#' @param this_recipe the recipe to check for `step_adjust_latency` +#' @param workflow_max_time_value the `max_time` value coming out of the fit +#' workflow (this will be the maximal time value in a potentially different +#' dataset) +#' @param new_data the data we're currently working with, from which we'll take +#' a potentially different max_time_value +#' @keywords internal +get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) { + forecast_date <- as.Date(max( + workflow_max_time_value, + this_recipe$max_time_value, + max(new_data$time_value) + )) + if (this_recipe %>% recipes::detect_step("adjust_latency")) { + # get the as_of in an `adjust_latency` step, regardless of where + handpicked_forecast_date <- map( + this_recipe$steps, + function(x) { + if (inherits(x, "step_adjust_latency")) x$forecast_date + } + ) %>% Filter(Negate(is.null), .) + if (length(handpicked_forecast_date) > 0) { + forecast_date <- handpicked_forecast_date[[1]] + } else { + # if we haven't chosen one, use either the max_time_value or the as_of + forecast_date <- max( + forecast_date, + attributes(new_data)$metadata$as_of + ) + } + } + forecast_date +} + + +#' pad every group at the right interval +#' @description +#' Perform last observation carried forward on a group by group basis. It uses +#' `guess_period` to find the appropriate interval to fill-forward by. It +#' maintains the grouping structure it receives. 
It does *not* fill any +#' "interior" `NA` values occurring in the data beforehand. +#' @param x an epi_df to be filled forward. +#' @param columns_to_complete which columns to apply completion to. By default every non-key column of an epi_df +#' @param groups the grouping by which to fill forward +#' @importFrom tidyselect all_of +#' @importFrom rlang list2 +#' @importFrom vctrs vec_cast +#' @importFrom dplyr across arrange bind_rows group_by summarise +#' @keywords internal +pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { + if (is.null(columns_to_complete)) { + columns_to_complete <- setdiff(names(x), key_colnames(x)) + } + itval <- epiprocess::guess_period(c(x$time_value, end_date), "time_value") + # get the time values we need to fill in + completed_time_values <- x %>% + group_by(across(all_of(groups))) %>% + summarise( + time_value = list2( + time_value = seq_forward(from = max(time_value) + itval, to = end_date, by = itval) + ) + ) %>% + unnest("time_value") %>% + mutate(time_value = vec_cast(time_value, x$time_value)) + # pull the last value in each group and fill forward + grouped_and_arranged <- x %>% + arrange(across(all_of(c("time_value", groups)))) %>% + group_by(across(all_of(groups))) + + values_to_fill <- grouped_and_arranged %>% + slice(min(across(all_of(columns_to_complete), count_single_column)):n()) + filled_values <- values_to_fill %>% + bind_rows(completed_time_values) %>% + arrange(across(all_of(c("time_value", groups)))) %>% + fill(all_of(columns_to_complete), .direction = "down") %>% + slice(-1) # remove the original rows + + grouped_and_arranged %>% + slice(1:min(across(all_of(columns_to_complete), count_single_column))) %>% + bind_rows(filled_values) %>% + arrange(across(all_of(key_colnames(x)))) %>% + ungroup() +} + +#' get the location of the last real value +#' @param col the relevant column +#' @keywords internal +count_single_column <- function(col) { + max(which(!is.na(col))) +} + + +#' seq, but returns 
null if from is larger +#' @keywords internal +seq_forward <- function(from, to, by) { + if (from > to) { + return(NULL) + } + seq(from = from, to = to, by = by) +} + + +#' warn when the latency is larger than would be reasonable +#' @param dataset the epi_df +#' @param latency_table the whole collection of latencies +#' @param target_columns the names of the columns that we're adjusting, and whether its unreasonably latent +#' @keywords internal +check_interminable_latency <- function(dataset, latency_table, target_columns, forecast_date, call = caller_env()) { + # check that the shift amount isn't too extreme + rel_latency_table <- latency_table %>% + filter(col_name %in% target_columns) + # no relevant columns, so this error definitely isn't happening + if (nrow(rel_latency_table) == 0) { + return() + } + latency_max <- rel_latency_table %>% + pull(latency) %>% + abs() %>% + max() + time_type <- attributes(dataset)$metadata$time_type + i_latency <- which.max(latency_table$latency) + if ( + (grepl("day", time_type) && (latency_max >= 28)) || + (grepl("week", time_type) && (latency_max >= 4)) || + ((time_type == "yearmonth") && (latency_max >= 2)) || + ((time_type == "yearquarter") && (latency_max >= 1)) || + ((time_type == "year") && (latency_max >= 1)) + ) { + max_time_value <- dataset %>% + filter(!is.na(!!(latency_table[[i_latency, "col_name"]]))) %>% + pull(time_value) %>% + max() + cli_warn( + message = c( + paste( + "The maximum latency is {latency_max}, ", + "which is questionable for it's `time_type` of ", + "{time_type}." + ), + "i" = "latency: {latency_table$latency[[i_latency]]}", + "i" = "`max_time` = {max_time_value} -> `forecast_date` = {forecast_date}" + ), + class = "epipredict__prep.step_latency__very_large_latency", + call = call + ) + } +} + +`%nin%` <- function(x, table) { + !(x %in% table) +} + +#' create the latency table +#' This is a table of column names and the latency adjustment necessary for that column. 
An example: +#' +#' col_name latency +#' +#' 1 case_rate 5 +#' 2 death_rate 5 +#' @keywords internal +#' @importFrom dplyr rowwise +get_latency_table <- function(training, columns, forecast_date, latency, + sign_shift, epi_keys_checked, keys_to_ignore, + info, terms) { + if (is.null(columns)) { + columns <- recipes_eval_select(terms, training, info) + } + # construct the latency table + latency_table <- tibble(col_name = names(training)) %>% + filter(col_name %nin% key_colnames(training)) + if (length(columns) > 0) { + latency_table <- latency_table %>% filter(col_name %in% columns) + } + training_dropped <- training %>% drop_ignored_keys(keys_to_ignore) + if (is.null(latency)) { + latency_table <- latency_table %>% + rowwise() %>% + mutate(latency = get_latency( + training_dropped, + forecast_date, + col_name, + sign_shift, + epi_keys_checked + )) + } else if (length(latency) > 1) { + # if latency has a length, it must also have named elements. + # We assign based on comparing the name in the list + # with the column names, and drop any which don't have a latency assigned + latency_table <- latency_table %>% + filter(col_name %in% names(latency)) %>% + rowwise() %>% + mutate(latency = unname(latency[names(latency) == col_name])) + } else { + latency_table <- latency_table %>% + rowwise() %>% + mutate(latency = get_latency( + training %>% drop_ignored_keys(keys_to_ignore), forecast_date, col_name, sign_shift, epi_keys_checked + )) + if (latency) { + latency_table <- latency_table %>% mutate(latency = latency) + } + } + return(latency_table %>% ungroup()) +} + +#' given a list named by key columns, remove any matching key values +#' keys_to_ignore should have the form list(col_name = c("value_to_ignore", "other_value_to_ignore")) +#' @keywords internal +drop_ignored_keys <- function(training, keys_to_ignore) { + # note that the extra parenthesis black magic is described here: https://github.com/tidyverse/dplyr/issues/6194 + # and is needed to bypass an incomplete 
port of `across` functions to `if_any` + training %>% + ungroup() %>% + filter((dplyr::if_all( + names(keys_to_ignore), + ~ . %nin% keys_to_ignore[[cur_column()]] + ))) +} + + +#' checks: the recipe type, whether a previous step is the relevant epi_shift, +#' that either `fixed_latency` or `fixed_forecast_date` is non-null, and that +#' `fixed_latency` only references columns that exist at the time of the step +#' inclusion +#' @keywords internal +step_adjust_latency_checks <- function(id, method, recipe, fixed_latency, fixed_forecast_date, call = caller_env()) { + arg_is_chr_scalar(id, method) + if (detect_step(recipe, "adjust_latency")) { + cli_abort("Only one `step_adjust_latency()` can be included in a recipe.", + class = "epipredict__step_adjust_latency__multiple_steps" + ) + } + if (!is_epi_recipe(recipe)) { + cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", + class = "epipredict__step_adjust_latency__epi_recipe_only" + ) + } + if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { + cli_warn( + "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified.", + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) + } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) { + cli_warn( + "If `method` is {.val extend_lags} or {.val locf}, +then the previous `step_epi_lag`s won't work with modified data.", + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) + } else if ((method == "locf") && (length(recipe$steps) > 0)) { + cli_warn( + paste0( + "There are steps before `step_adjust_latency`.", + " With the method {.val locf}, it is recommended to include this step before any others" + ), + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) + } + if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { + cli_abort( + "Only one of `fixed_latency` and `fixed_forecast_date` can be non-`NULL` at a time!", + class = 
"epipredict__step_adjust_latency__too_many_args_error" + ) + } + if (length(fixed_latency > 1)) { + template <- recipe$template + data_names <- names(template)[!names(template) %in% key_colnames(template)] + wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] + if (length(wrong_names) > 0) { + cli_abort( + "{.val fixed_latency} contains names not in the template dataset: {wrong_names}", + class = "epipredict__step_adjust_latency__undefined_names_error" + ) + } + } +} + +compare_bake_prep_latencies <- function(object, new_data, call = caller_env()) { + latency <- object$latency + current_forecast_date <- object$fixed_forecast_date %||% + get_forecast_date( + new_data, NULL, object$epi_keys_checked, latency, + c(key_colnames(new_data), object$columns) + ) + local_latency_table <- get_latency_table( + new_data, object$columns, current_forecast_date, latency, + get_sign(object), object$epi_keys_checked, object$keys_to_ignore, NULL, NULL + ) + comparison_table <- local_latency_table %>% + ungroup() %>% + dplyr::full_join( + object$latency_table %>% ungroup(), + by = join_by(col_name), + suffix = c(".bake", ".prep") + ) %>% + mutate(bakeMprep = latency.bake - latency.prep) + if (any(comparison_table$bakeMprep > 0)) { + cli_abort( + paste0( + "There is more latency at bake time than there was at prep time.", + " You will need to fit a model with more latency to predict on this dataset." + ), + class = "epipredict__latency__bake_prep_difference_error", + latency_table = comparison_table, + call = call + ) + } + if (any(comparison_table$bakeMprep < 0)) { + cli_warn( + paste0( + "There is less latency at bake time than there was at prep time.", + " This will still fit, but will discard the most recent data." 
+ ), + class = "epipredict__latency__bake_prep_difference_warn", + latency_table = comparison_table, + call = call + ) + } + if (current_forecast_date != object$forecast_date) { + cli_warn( + paste0( + "The forecast date differs from the one set at train time; ", + " this means any dates added by `layer_forecast_date` will be inaccurate." + ), + class = "epipredict__latency__bake_prep_forecast_date_warn", + call = call + ) + } +} + + +#' @keywords internal +create_shift_grid <- function(prefix, amount, target_sign, columns, latency_table, latency_sign) { + if (!is.null(latency_table) && latency_sign == target_sign) { + # get the actually used latencies + rel_latency <- latency_table %>% filter(col_name %in% columns) + latency_adjusted <- TRUE + } else { + # adding zero if there's no latency table + rel_latency <- tibble(col_name = columns, latency = 0L) + latency_adjusted <- FALSE + } + shift_grid <- expand_grid(col = columns, amount = target_sign * amount) %>% + left_join(rel_latency, by = join_by(col == col_name), ) %>% + tidyr::replace_na(list(latency = 0)) %>% + mutate(shift_val = amount + latency) %>% + mutate( + newname = glue::glue("{prefix}{abs(shift_val)}_{col}"), # name is always positive + amount = NULL, + latency = NULL + ) + return(list(shift_grid, latency_adjusted)) +} diff --git a/R/utils-misc.R b/R/utils-misc.R index b4d1c28b7..71fd38615 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -1,60 +1,73 @@ -#' Check that newly created variable names don't overlap +# Copied from `epiprocess`: + +#' "Format" a character vector of column/variable names for cli interpolation #' -#' `check_pname` is to be used in a slather method to ensure that -#' newly created variable names don't overlap with existing names. -#' Throws an warning if check fails, and creates a random string. -#' @param res A data frame or tibble of the newly created variables. -#' @param preds An epi_df or tibble containing predictions. 
-#' @param object A layer object passed to [slather()]. -#' @param newname A string of variable names if the object doesn't contain a -#' $name element +#' Designed to give good output if interpolated with cli. Main purpose is to add +#' backticks around variable names when necessary, and something other than an +#' empty string if length 0. #' +#' @param x `chr`; e.g., `colnames` of some data frame +#' @param empty string; what should be output if `x` is of length 0? +#' @return `chr` #' @keywords internal -check_pname <- function(res, preds, object, newname = NULL) { - if (is.null(newname)) newname <- object$name - new_preds_names <- colnames(preds) - intersection <- new_preds_names %in% newname - if (any(intersection)) { - newname <- rand_id(newname) - rlang::warn( - paste0( - "Name collision occured in `", - class(object)[1], - "`. The following variable names already exists: ", - paste0(new_preds_names[intersection], collapse = ", "), - ". Result instead has randomly generated string `", - newname, "`." - ) - ) +format_varnames <- function(x, empty = "*none*") { + if (length(x) == 0L) { + empty + } else { + as.character(rlang::syms(x)) } - names(res) <- newname - res } - grab_forged_keys <- function(forged, workflow, new_data) { - forged_roles <- names(forged$extras$roles) - extras <- dplyr::bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")]) - # 1. these are the keys in the test data after prep/bake - new_keys <- names(extras) - # 2. these are the keys in the training data + # 1. keys in the training data post-prep, based on roles: old_keys <- key_colnames(workflow) - # 3. these are the keys in the test data as input - new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value"))) - if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) { - cli::cli_warn(c( - "Not all epi keys that were present in the training data are available", - "in `new_data`. 
Predictions will have only the available keys." + # 2. keys in the test data post-bake, based on roles & structure: + forged_roles <- forged$extras$roles + new_key_tbl <- bind_cols(forged_roles$geo_value, forged_roles$key, forged_roles$time_value) + new_keys <- names(new_key_tbl) + if (length(new_keys) == 0L) { + # No epikeytime role assignment; infer from all columns: + potential_new_keys <- c("geo_value", "time_value") + forged_tbl <- bind_cols(forged$extras$roles) + new_keys <- potential_new_keys[potential_new_keys %in% names(forged_tbl)] + new_key_tbl <- forged_tbl[new_keys] + } + # Softly validate: + if (!(setequal(old_keys, new_keys))) { + cli_warn(c( + "Inconsistent epikeytime identifier columns specified/inferred in training vs. in testing data.", + "i" = "training epikeytime columns, based on roles post-mold/prep: {format_varnames(old_keys)}", + "i" = "testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}", + "*" = "", + ">" = 'Some mismatches can be addressed by using `epi_df`s instead of tibbles, or by using `update_role` + to assign pre-`prep` columns the "geo_value", "key", and "time_value" roles.' )) } - if (is_epi_df(new_data)) { - meta <- attr(new_data, "metadata") - extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys %||% character()) - } else if (all(c("geo_value", "time_value") %in% new_keys)) { - if (length(new_keys) > 2) other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")] - extras <- as_epi_df(extras, other_keys = other_keys %||% character()) + # Convert `new_key_tbl` to `epi_df` if not renaming columns nor violating + # `epi_df` invariants. Require that our key is a unique key in any case. 
+ if (all(c("geo_value", "time_value") %in% new_keys)) { + maybe_as_of <- attr(new_data, "metadata")$as_of # NULL if wasn't epi_df + try(return(as_epi_df(new_key_tbl, other_keys = new_keys, as_of = maybe_as_of)), + silent = TRUE + ) + } + if (anyDuplicated(new_key_tbl)) { + duplicate_key_tbl <- new_key_tbl %>% filter(.by = everything(), dplyr::n() > 1L) + error_part1 <- cli::format_error( + c( + "Specified/inferred key columns had repeated combinations in the forged/baked test data.", + "i" = "Key columns: {format_varnames(new_keys)}", + "Duplicated keys:" + ) + ) + error_part2 <- capture.output(print(duplicate_key_tbl)) + rlang::abort( + paste(collapse = "\n", c(error_part1, error_part2)), + class = "epipredict__grab_forged_keys__nonunique_key" + ) + } else { + return(new_key_tbl) } - extras } get_parsnip_mode <- function(trainer) { @@ -75,3 +88,14 @@ is_classification <- function(trainer) { is_regression <- function(trainer) { get_parsnip_mode(trainer) %in% c("regression", "unknown") } + + +enlist <- function(...) { + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) +} diff --git a/R/weighted_interval_score.R b/R/weighted_interval_score.R index cd67bbee9..47187fe56 100644 --- a/R/weighted_interval_score.R +++ b/R/weighted_interval_score.R @@ -7,12 +7,21 @@ #' al. (2020)](https://arxiv.org/abs/2005.12881) for discussion in the context #' of COVID-19 forecasting. #' -#' @param x distribution. A vector of class distribution. Ideally, this vector -#' contains `dist_quantiles()`, though other distributions are supported when -#' `quantile_levels` is specified. See below. +#' @param x A vector of class `quantile_pred`. #' @param actual double. Actual value(s) #' @param quantile_levels probabilities. If specified, the score will be -#' computed at this set of levels. +#' computed at this set of levels. Otherwise, those present in `x` will be +#' used. +#' @param na_handling character. 
Determines how missing values are handled. +#' For `"impute"`, missing values will be +#' calculated if possible using the available quantiles. For `"drop"`, +#' explicitly missing values are ignored in the calculation of the score, but +#' implicitly missing values are imputed if possible. +#' For `"propagate"`, the resulting score will be `NA` if any missing values +#' exist. Finally, if +#' `quantile_levels` is specified, `"fail"` will result in +#' the score being `NA` when any required quantile levels (implicit or explicit) +#' do not have corresponding values. +#' @param ... not used #' #' @return a vector of nonnegative scores. @@ -20,37 +29,36 @@ #' @export #' @examples #' quantile_levels <- c(.2, .4, .6, .8) -#' predq_1 <- 1:4 # -#' predq_2 <- 8:11 -#' dstn <- dist_quantiles(list(predq_1, predq_2), quantile_levels) +#' predq1 <- 1:4 # +#' predq2 <- 8:11 +#' dstn <- quantile_pred(rbind(predq1, predq2), quantile_levels) #' actual <- c(3.3, 7.1) #' weighted_interval_score(dstn, actual) #' weighted_interval_score(dstn, actual, c(.25, .5, .75)) #' -#' library(distributional) -#' dstn <- dist_normal(c(.75, 2)) -#' weighted_interval_score(dstn, 1, c(.25, .5, .75)) -#' #' # Missing value behaviours -#' dstn <- dist_quantiles(c(1, 2, NA, 4), 1:4 / 5) +#' dstn <- quantile_pred(matrix(c(1, 2, NA, 4), nrow = 1), 1:4 / 5) #' weighted_interval_score(dstn, 2.5) #' weighted_interval_score(dstn, 2.5, 1:9 / 10) #' weighted_interval_score(dstn, 2.5, 1:9 / 10, na_handling = "drop") #' weighted_interval_score(dstn, 2.5, na_handling = "propagate") -#' weighted_interval_score(dist_quantiles(1:4, 1:4 / 5), 2.5, 1:9 / 10, +#' weighted_interval_score( +#' quantile_pred(matrix(1:4, nrow = 1), 1:4 / 5), +#' actual = 2.5, +#' quantile_levels = 1:9 / 10, #' na_handling = "fail" #' ) #' #' #' # Using some actual forecasts -------- #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' training <- covid_case_death_rates %>% +#' filter(time_value >= "2021-10-01", time_value <= 
"2021-12-01") #' preds <- flatline_forecaster( -#' jhu, "death_rate", +#' training, "death_rate", #' flatline_args_list(quantile_levels = c(.01, .025, 1:19 / 20, .975, .99)) #' )$predictions -#' actuals <- case_death_rate_subset %>% +#' actuals <- covid_case_death_rates %>% #' filter(time_value == as.Date("2021-12-01") + 7) %>% #' select(geo_value, time_value, actual = death_rate) #' preds <- left_join(preds, actuals, @@ -58,90 +66,44 @@ #' ) %>% #' mutate(wis = weighted_interval_score(.pred_distn, actual)) #' preds -weighted_interval_score <- function(x, actual, quantile_levels = NULL, ...) { +weighted_interval_score <- function( + x, + actual, + quantile_levels = NULL, + na_handling = c("impute", "drop", "propagate", "fail"), + ...) { UseMethod("weighted_interval_score") } -#' @export -weighted_interval_score.default <- function(x, actual, - quantile_levels = NULL, ...) { - cli_abort(c( - "Weighted interval score can only be calculated if `x`", - "has class {.cls distribution}." - )) -} #' @export -weighted_interval_score.distribution <- function( - x, actual, - quantile_levels = NULL, ...) { - assert_numeric(actual, finite = TRUE) - l <- vctrs::vec_recycle_common(x = x, actual = actual) - map2_dbl( - .x = vctrs::vec_data(l$x), - .y = l$actual, - .f = weighted_interval_score, - quantile_levels = quantile_levels, - ... - ) -} - -#' @export -weighted_interval_score.dist_default <- function(x, actual, - quantile_levels = NULL, ...) { - rlang::check_dots_empty() - if (is.null(quantile_levels)) { - cli_warn(c( - "Weighted interval score isn't implemented for {.cls {class(x)}}", - "as we don't know what set of quantile levels to use.", - "Use a {.cls dist_quantiles} or pass `quantile_levels`.", - "The result for this element will be `NA`." - )) - return(NA) - } - x <- extrapolate_quantiles(x, probs = quantile_levels) - weighted_interval_score(x, actual, quantile_levels = NULL) -} - -#' @param na_handling character. 
Determines how `quantile_levels` without a -#' corresponding `value` are handled. For `"impute"`, missing values will be -#' calculated if possible using the available quantiles. For `"drop"`, -#' explicitly missing values are ignored in the calculation of the score, but -#' implicitly missing values are imputed if possible. -#' For `"propogate"`, the resulting score will be `NA` if any missing values -#' exist in the original `quantile_levels`. Finally, if -#' `quantile_levels` is specified, `"fail"` will result in -#' the score being `NA` when any required quantile levels (implicit or explicit) -#' are do not have corresponding values. -#' @describeIn weighted_interval_score Weighted interval score with -#' `dist_quantiles` allows for different `NA` behaviours. -#' @export -weighted_interval_score.dist_quantiles <- function( +weighted_interval_score.quantile_pred <- function( x, actual, quantile_levels = NULL, na_handling = c("impute", "drop", "propagate", "fail"), ...) { rlang::check_dots_empty() - if (is.na(actual)) { - return(NA) - } - if (all(is.na(vctrs::field(x, "values")))) { - return(NA) - } + n <- vctrs::vec_size(x) + if (length(actual) == 1L) actual <- rep(actual, n) + assert_numeric(actual, finite = TRUE, len = n) + assert_numeric(quantile_levels, lower = 0, upper = 1, null.ok = TRUE) na_handling <- rlang::arg_match(na_handling) - old_quantile_levels <- field(x, "quantile_levels") + old_quantile_levels <- x %@% "quantile_levels" if (na_handling == "fail") { if (is.null(quantile_levels)) { cli_abort('`na_handling = "fail"` requires `quantile_levels` to be specified.') } - old_values <- field(x, "values") - if (!all(quantile_levels %in% old_quantile_levels) || any(is.na(old_values))) { - return(NA) + if (!all(quantile_levels %in% old_quantile_levels)) { + return(rep(NA_real_, n)) } } tau <- quantile_levels %||% old_quantile_levels - x <- extrapolate_quantiles(x, probs = tau, replace_na = (na_handling == "impute")) - q <- field(x, "values")[field(x, 
"quantile_levels") %in% tau] + x <- extrapolate_quantiles(x, tau, replace_na = (na_handling == "impute")) + x <- as.matrix(x)[, attr(x, "quantile_levels") %in% tau, drop = FALSE] na_rm <- (na_handling == "drop") + map2_dbl(vctrs::vec_chop(x), actual, ~ wis_one_quantile(.x, tau, .y, na_rm)) +} + +wis_one_quantile <- function(q, tau, actual, na_rm) { 2 * mean(pmax(tau * (actual - q), (1 - tau) * (q - actual)), na.rm = na_rm) } diff --git a/README.Rmd b/README.Rmd index 36af14cd9..28ae1409c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,24 +7,94 @@ output: github_document ```{r, include = FALSE} options(width = 76) knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", fig.path = "man/figures/README-", - out.width = "100%" + digits = 3, + comment = "#>", + collapse = TRUE, + cache = TRUE, + dev.args = list(bg = "transparent"), + dpi = 300, + cache.lazy = FALSE, + out.width = "90%", + fig.align = "center", + fig.width = 9, + fig.height = 6 +) +ggplot2::theme_set(ggplot2::theme_bw()) +options( + dplyr.print_min = 6, + dplyr.print_max = 6, + pillar.max_footer_lines = 2, + pillar.min_chars = 15, + stringr.view_n = 6, + pillar.bold = TRUE, + width = 77 +) +``` +```{r pkgs, include=FALSE, echo=FALSE} +library(epipredict) +library(epidatr) +library(data.table) +library(dplyr) +library(tidyr) +library(ggplot2) +library(magrittr) +library(purrr) +library(scales) +``` + +```{r coloration, include=FALSE, echo=FALSE} +base <- "#002676" +primary <- "#941120" +secondary <- "#f9c80e" +tertiary <- "#177245" +fourth_colour <- "#A393BF" +fifth_colour <- "#2e8edd" +colvec <- c( + base = base, primary = primary, secondary = secondary, + tertiary = tertiary, fourth_colour = fourth_colour, + fifth_colour = fifth_colour ) +library(epiprocess) +suppressMessages(library(tidyverse)) +theme_update(legend.position = "bottom", legend.title = element_blank()) +delphi_pal <- function(n) { + if (n > 6L) warning("Not enough colors in this palette!") + unname(colvec)[1:n] +} +scale_fill_delphi <- 
function(..., aesthetics = "fill") { + discrete_scale(aesthetics = aesthetics, palette = delphi_pal, ...) +} +scale_color_delphi <- function(..., aesthetics = "color") { + discrete_scale(aesthetics = aesthetics, palette = delphi_pal, ...) +} +scale_colour_delphi <- scale_color_delphi ``` -# epipredict +# Epipredict [![R-CMD-check](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml) -**Note:** This package is currently in development and may not work as expected. Please file bug reports as issues in this repo, and we will do our best to address them quickly. +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) is a framework for building transformation and forecasting pipelines for epidemiological and other panel time-series datasets. +In addition to tools for building forecasting pipelines, it contains a number of “canned” forecasters meant to run with little modification as an easy way to get started forecasting. + +It is designed to work well with +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/), a utility for time series handling and geographic processing in an epidemiological context. +Both of the packages are meant to work well with the panel data provided by +[`{epidatr}`](https://cmu-delphi.github.io/epidatr/). +Pre-compiled example datasets are also available in +[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/). + +If you are looking for detail beyond the package documentation, see our +[forecasting book](https://cmu-delphi.github.io/delphi-tooling-book/). + ## Installation -To install (unless you're making changes to the package, use the stable version): +Unless you’re planning on contributing to package development, we suggest using the stable version. 
+To install, run: ```r # Stable version @@ -34,93 +104,255 @@ pak::pkg_install("cmu-delphi/epipredict@main") pak::pkg_install("cmu-delphi/epipredict@dev") ``` -## Documentation +The documentation for the stable version is at +, while the development version is at +. + + +## Motivating example -You can view documentation for the `main` branch at . -## Goals for `epipredict` +
+ Required packages -**We hope to provide:** +```{r install, run = FALSE} +library(epipredict) +library(epidatr) +library(epiprocess) +library(dplyr) +library(ggplot2) +``` +
-1. A set of basic, easy-to-use forecasters that work out of the box. You should be able to do a reasonably limited amount of customization on them. For the basic forecasters, we currently provide: - * Baseline flatline forecaster - * Autoregressive forecaster - * Autoregressive classifier - * CDC FluSight flatline forecaster -2. A framework for creating custom forecasters out of modular components. There are four types of components: - * Preprocessor: do things to the data before model training - * Trainer: train a model on data, resulting in a fitted model object - * Predictor: make predictions, using a fitted model object - * Postprocessor: do things to the predictions before returning +To demonstrate using [`{epipredict}`](https://cmu-delphi.github.io/epipredict/) for forecasting, suppose we want to +predict COVID-19 deaths per 100k people for each of a subset of states -**Target audiences:** +```{r subset_geos} +used_locations <- c("ca", "ma", "ny", "tx") +``` -* Basic. Has data, calls forecaster with default arguments. -* Intermediate. Wants to examine changes to the arguments, take advantage of -built in flexibility. -* Advanced. Wants to write their own forecasters. Maybe willing to build up -from some components. +on -The Advanced user should find their task to be relatively easy. Examples of -these tasks are illustrated in the [vignettes and articles](https://cmu-delphi.github.io/epipredict). +```{r fc_date} +forecast_date <- as.Date("2021-08-01") +``` -See also the (in progress) [Forecasting Book](https://cmu-delphi.github.io/delphi-tooling-book/). +We will be using a subset of +[Johns Hopkins Center for Systems Science and Engineering deaths data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). +Below the fold, we pull the dataset from the epidata API and clean it. -## Intermediate example +
+ Creating the dataset using `{epidatr}` and `{epiprocess}` -The package comes with some built-in historical data for illustration, but -up-to-date versions of this could be downloaded with the -[`{epidatr}` package](https://cmu-delphi.github.io/epidatr/) -and processed using -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^1] +This section is intended to demonstrate some of the ubiquitous cleaning operations needed to be able to forecast. +A subset of the dataset prepared here is also included ready-to-go in [`{epipredict}`](https://cmu-delphi.github.io/epipredict/) as `covid_case_death_rates`. -[^1]: Other epidemiological signals for non-Covid related illnesses are also -available with [`{epidatr}`](https://github.com/cmu-delphi/epidatr) which -interfaces directly to Delphi's -[Epidata API](https://cmu-delphi.github.io/delphi-epidata/) +First we pull both `jhu-csse` cases and deaths data from the +[Delphi API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html) using the +[`{epidatr}`](https://cmu-delphi.github.io/epidatr/) package: -```{r epidf, message=FALSE} -library(epipredict) -case_death_rate_subset +```{r case_death, warning = FALSE, eval = TRUE} +cases <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_7dav_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, case_rate = value) + +deaths <- pub_covidcast( + source = "jhu-csse", + signals = "deaths_7dav_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, death_rate = value) +cases_deaths <- + full_join(cases, deaths, by = c("time_value", "geo_value")) |> + filter(geo_value %in% used_locations) |> + as_epi_df(as_of = as.Date("2022-01-01")) +``` + +Since visualizing the results on every geography is somewhat overwhelming, +we’ll only train on a subset of 
locations. + +```{r date, warning = FALSE} +# plotting the data as it was downloaded +cases_deaths |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` + +As with the typical dataset, we will need to do some cleaning to +make it actually usable; we’ll use some utilities from +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) for this. +Specifically we'll trim outliers, especially negative values: + +```{r outlier} +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + mutate( + outlr_death_rate = detect_outlr_rm( + time_value, death_rate, + detect_negatives = TRUE + ), + outlr_case_rate = detect_outlr_rm( + time_value, case_rate, + detect_negatives = TRUE + ) + ) |> + unnest(cols = starts_with("outlr"), names_sep = "_") |> + ungroup() |> + mutate( + death_rate = outlr_death_rate_replacement, + case_rate = outlr_case_rate_replacement + ) |> + select(geo_value, time_value, case_rate, death_rate) ``` +
+ +After downloading and cleaning deaths per capita, as well as cases per 100k people, we can plot +a subset of the states, marking the desired forecast date with a vertical line: -To create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we could use the following function. +
+ Plot + +```{r plot_locs} +forecast_date_label <- + tibble( + geo_value = rep(used_locations, 2), + .response_name = c(rep("case_rate", 4), rep("death_rate", 4)), + dates = rep(forecast_date - 7 * 2, 2 * length(used_locations)), + heights = c(rep(150, 4), rep(0.75, 4)) + ) +processed_data_plot <- + cases_deaths |> + filter(geo_value %in% used_locations) |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free" + ) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label, + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` +
+```{r show-processed-data, warning=FALSE, echo=FALSE} +processed_data_plot +``` + +To make a forecast, we will use a simple “canned” auto-regressive forecaster to +predict the death rate four weeks into the future using lagged[^3] deaths and +cases. + +[^3]: lagged by 3 in this context meaning using the value from 3 days ago. ```{r make-forecasts, warning=FALSE} -two_week_ahead <- arx_forecaster( - case_death_rate_subset, +four_week_ahead <- arx_forecaster( + cases_deaths |> filter(time_value <= forecast_date), outcome = "death_rate", predictors = c("case_rate", "death_rate"), args_list = arx_args_list( lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), - ahead = 14 + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) ) ) -two_week_ahead +four_week_ahead ``` -In this case, we have used a number of different lags for the case rate, while -only using 3 weekly lags for the death rate (as predictors). The result is both -a fitted model object which could be used any time in the future to create -different forecasts, as well as a set of predicted values (and prediction -intervals) for each location 14 days after the last available time value in the -data. +In our model setup, we are using as predictors the case rate lagged 0-3 +days, one week, and two weeks, and the death rate lagged 0-2 weeks. +The result `four_week_ahead` is both a fitted model object which could be used +any time in the future to create different forecasts, and a set of predicted +values (and prediction intervals) for each location 28 days after the forecast +date. + +Plotting the prediction intervals on the true values for our location subset[^2]: -```{r print-model} -two_week_ahead$epi_workflow +[^2]: Alternatively, you could call `autoplot(four_week_ahead, observed_response = + cases_deaths)` to get the full collection of forecasts. This is too busy for + the space we have for plotting here. + +
+ Plot +```{r plotting_forecast, warning=FALSE} +epiworkflow <- four_week_ahead$epi_workflow +restricted_predictions <- + four_week_ahead$predictions |> + rename(time_value = target_date, value = .pred) |> + mutate(.response_name = "death_rate") +forecast_plot <- + four_week_ahead |> + autoplot(observed_response = cases_deaths) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label %>% filter(.response_name == "death_rate"), + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` +
-The fitted model here involved preprocessing the data to appropriately generate -lagged predictors, estimating a linear model with `stats::lm()` and then -postprocessing the results to be meaningful for epidemiological tasks. We can -also examine the predictions. +```{r show-single-forecast, warning=FALSE, echo=FALSE} +forecast_plot +``` -```{r show-preds} -two_week_ahead$predictions +And as a tibble of quantile level-value pairs: +```{r pivot_wider} +four_week_ahead$predictions |> + select(-.pred) |> + pivot_quantiles_longer(.pred_distn) |> + select(geo_value, forecast_date, target_date, quantile = .pred_distn_quantile_level, value = .pred_distn_value) ``` -The results above show a distributional forecast produced using data through -the end of 2021 for the 14th of January 2022. A prediction for the death rate -per 100K inhabitants is available for every state (`geo_value`) along with a -90% predictive interval. +The orange dot gives the point prediction, while the blue intervals give the +25-75%, the 10-90%, and 2.5-97.5% inter-quantile ranges[^4]. +For this particular day and these locations, the forecasts are relatively +accurate, with the true data being at least within the 10-90% interval. +A couple of things to note: + +1. `epipredict` methods are primarily direct forecasters; this means we don't need to + predict 1, 2,..., 27 days ahead to then predict 28 days ahead. +2. All of our existing engines are geo-pooled, meaning the training data is + shared across geographies. This has the advantage of increasing the amount of + available training data, with the restriction that the data needs to be on + comparable scales, such as rates. + +## Getting Help +If you encounter a bug or have a feature request, feel free to file an [issue on +our GitHub page](https://github.com/cmu-delphi/epipredict/issues). 
+For other questions, feel free to reach out to the authors, either via this +[contact form](https://docs.google.com/forms/d/e/1FAIpQLScqgT1fKZr5VWBfsaSp-DNaN03aV6EoZU4YljIzHJ1Wl_zmtg/viewform), +email, or the InsightNet Slack. +[^4]: Note that these are not the same quantiles that we fit when creating + `four_week_ahead`. They are extrapolated from those quantiles using `extrapolate_quantiles()` (which assumes an exponential decay in the tails). diff --git a/README.md b/README.md index 2cd3557c5..97ee81219 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,35 @@ -# epipredict +# Epipredict [![R-CMD-check](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml) -**Note:** This package is currently in development and may not work as -expected. Please file bug reports as issues in this repo, and we will do -our best to address them quickly. +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) is a +framework for building transformation and forecasting pipelines for +epidemiological and other panel time-series datasets. In addition to +tools for building forecasting pipelines, it contains a number of +“canned” forecasters meant to run with little modification as an easy +way to get started forecasting. + +It is designed to work well with +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/), a utility +for time series handling and geographic processing in an epidemiological +context. Both of the packages are meant to work well with the panel data +provided by [`{epidatr}`](https://cmu-delphi.github.io/epidatr/). +Pre-compiled example datasets are also available in +[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/). + +If you are looking for detail beyond the package documentation, see our +[forecasting book](https://cmu-delphi.github.io/delphi-tooling-book/). 
## Installation -To install (unless you’re making changes to the package, use the stable -version): +Unless you’re planning on contributing to package development, we +suggest using the stable version. To install, run: ``` r # Stable version @@ -25,189 +39,310 @@ pak::pkg_install("cmu-delphi/epipredict@main") pak::pkg_install("cmu-delphi/epipredict@dev") ``` -## Documentation +The documentation for the stable version is at +, while the development version +is at . -You can view documentation for the `main` branch at -. +## Motivating example -## Goals for `epipredict` +
+ +Required packages + -**We hope to provide:** +``` r +library(epipredict) +library(epidatr) +library(epiprocess) +library(dplyr) +library(ggplot2) +``` -1. A set of basic, easy-to-use forecasters that work out of the box. - You should be able to do a reasonably limited amount of - customization on them. For the basic forecasters, we currently - provide: - - Baseline flatline forecaster - - Autoregressive forecaster - - Autoregressive classifier - - CDC FluSight flatline forecaster -2. A framework for creating custom forecasters out of modular - components. There are four types of components: - - Preprocessor: do things to the data before model training - - Trainer: train a model on data, resulting in a fitted model object - - Predictor: make predictions, using a fitted model object - - Postprocessor: do things to the predictions before returning +
-**Target audiences:** +To demonstrate using +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) for +forecasting, suppose we want to predict COVID-19 deaths per 100k people +for each of a subset of states -- Basic. Has data, calls forecaster with default arguments. -- Intermediate. Wants to examine changes to the arguments, take - advantage of built in flexibility. -- Advanced. Wants to write their own forecasters. Maybe willing to build - up from some components. +``` r +used_locations <- c("ca", "ma", "ny", "tx") +``` -The Advanced user should find their task to be relatively easy. Examples -of these tasks are illustrated in the [vignettes and -articles](https://cmu-delphi.github.io/epipredict). +on -See also the (in progress) [Forecasting -Book](https://cmu-delphi.github.io/delphi-tooling-book/). +``` r +forecast_date <- as.Date("2021-08-01") +``` + +We will be using a subset of [Johns Hopkins Center for Systems Science +and Engineering deaths +data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). +Below the fold, we pull the dataset from the epidata API and clean it. -## Intermediate example +
+ +Creating the dataset using `{epidatr}` and `{epiprocess}` + -The package comes with some built-in historical data for illustration, -but up-to-date versions of this could be downloaded with the -[`{epidatr}` package](https://cmu-delphi.github.io/epidatr/) and -processed using -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^1] +This section is intended to demonstrate some of the ubiquitous cleaning +operations needed to be able to forecast. A subset of the dataset +prepared here is also included ready-to-go in +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) as +`covid_case_death_rates`. + +First we pull both `jhu-csse` cases and deaths data from the [Delphi +API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html) +using the [`{epidatr}`](https://cmu-delphi.github.io/epidatr/) package: ``` r -library(epipredict) -case_death_rate_subset -#> An `epi_df` object, 20,496 x 4 with metadata: -#> * geo_type = state -#> * time_type = day -#> * as_of = 2022-05-31 12:08:25.791826 -#> -#> # A tibble: 20,496 × 4 -#> geo_value time_value case_rate death_rate -#> * -#> 1 ak 2020-12-31 35.9 0.158 -#> 2 al 2020-12-31 65.1 0.438 -#> 3 ar 2020-12-31 66.0 1.27 -#> 4 as 2020-12-31 0 0 -#> 5 az 2020-12-31 76.8 1.10 -#> 6 ca 2020-12-31 96.0 0.751 -#> 7 co 2020-12-31 35.8 0.649 -#> 8 ct 2020-12-31 52.1 0.819 -#> 9 dc 2020-12-31 31.0 0.601 -#> 10 de 2020-12-31 65.2 0.807 -#> # ℹ 20,486 more rows +cases <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_7dav_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, case_rate = value) + +deaths <- pub_covidcast( + source = "jhu-csse", + signals = "deaths_7dav_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, death_rate = value) +cases_deaths <- + full_join(cases, deaths, by = 
c("time_value", "geo_value")) |> + filter(geo_value %in% used_locations) |> + as_epi_df(as_of = as.Date("2022-01-01")) +``` + +Since visualizing the results on every geography is somewhat +overwhelming, we’ll only train on a subset of locations. + +``` r +# plotting the data as it was downloaded +cases_deaths |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` + + + +As with the typical dataset, we will need to do some cleaning to make it +actually usable; we’ll use some utilities from +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) for this. +Specifically we’ll trim outliers, especially negative values: + +``` r +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + mutate( + outlr_death_rate = detect_outlr_rm( + time_value, death_rate, + detect_negatives = TRUE + ), + outlr_case_rate = detect_outlr_rm( + time_value, case_rate, + detect_negatives = TRUE + ) + ) |> + unnest(cols = starts_with("outlr"), names_sep = "_") |> + ungroup() |> + mutate( + death_rate = outlr_death_rate_replacement, + case_rate = outlr_case_rate_replacement + ) |> + select(geo_value, time_value, case_rate, death_rate) +``` + +
+ +After downloading and cleaning deaths per capita, as well as cases per +100k people, we can plot a subset of the states, marking the desired +forecast date with a vertical line: + +
+ +Plot + + +``` r +forecast_date_label <- + tibble( + geo_value = rep(used_locations, 2), + .response_name = c(rep("case_rate", 4), rep("death_rate", 4)), + dates = rep(forecast_date - 7 * 2, 2 * length(used_locations)), + heights = c(rep(150, 4), rep(0.75, 4)) + ) +processed_data_plot <- + cases_deaths |> + filter(geo_value %in% used_locations) |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free" + ) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label, + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` -To create and train a simple auto-regressive forecaster to predict the -death rate two weeks into the future using past (lagged) deaths and -cases, we could use the following function. +
+ + + +To make a forecast, we will use a simple “canned” auto-regressive +forecaster to predict the death rate four weeks into the future using +lagged[^1] deaths and cases. ``` r -two_week_ahead <- arx_forecaster( - case_death_rate_subset, +four_week_ahead <- arx_forecaster( + cases_deaths |> filter(time_value <= forecast_date), outcome = "death_rate", predictors = c("case_rate", "death_rate"), args_list = arx_args_list( lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), - ahead = 14 + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) ) ) -two_week_ahead -#> ══ A basic forecaster of type ARX Forecaster ═══════════════════════════════ +four_week_ahead +#> ══ A basic forecaster of type ARX Forecaster ════════════════════════════════ #> -#> This forecaster was fit on 2024-11-11 11:38:31. +#> This forecaster was fit on 2025-05-22 11:56:44. #> #> Training data was an with: #> • Geography: state, #> • Time type: day, -#> • Using data up-to-date as of: 2022-05-31 12:08:25. +#> • Using data up-to-date as of: 2022-01-01. +#> • With the last data available on 2021-08-01 #> -#> ── Predictions ───────────────────────────────────────────────────────────── +#> ── Predictions ────────────────────────────────────────────────────────────── #> -#> A total of 56 predictions are available for -#> • 56 unique geographic regions, -#> • At forecast date: 2021-12-31, -#> • For target date: 2022-01-14. +#> A total of 4 predictions are available for +#> • 4 unique geographic regions, +#> • At forecast date: 2021-08-01, +#> • For target date: 2021-08-29, #> ``` -In this case, we have used a number of different lags for the case rate, -while only using 3 weekly lags for the death rate (as predictors). The -result is both a fitted model object which could be used any time in the -future to create different forecasts, as well as a set of predicted -values (and prediction intervals) for each location 14 days after the -last available time value in the data. 
+In our model setup, we are using as predictors the case rate lagged 0-3 +days, one week, and two weeks, and the death rate lagged 0-2 weeks. The +result `four_week_ahead` is both a fitted model object which could be +used any time in the future to create different forecasts, and a set of +predicted values (and prediction intervals) for each location 28 days +after the forecast date. + +Plotting the prediction intervals on the true values for our location +subset[^2]: + +
+ +Plot + ``` r -two_week_ahead$epi_workflow -#> -#> ══ Epi Workflow [trained] ══════════════════════════════════════════════════ -#> Preprocessor: Recipe -#> Model: linear_reg() -#> Postprocessor: Frosting -#> -#> ── Preprocessor ──────────────────────────────────────────────────────────── -#> -#> 6 Recipe steps. -#> 1. step_epi_lag() -#> 2. step_epi_lag() -#> 3. step_epi_ahead() -#> 4. step_naomit() -#> 5. step_naomit() -#> 6. step_training_window() -#> -#> ── Model ─────────────────────────────────────────────────────────────────── -#> -#> Call: -#> stats::lm(formula = ..y ~ ., data = data) -#> -#> Coefficients: -#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate -#> -0.0073358 0.0030365 0.0012467 0.0009536 -#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate -#> 0.0011425 0.0012481 0.0003041 0.1351769 -#> lag_7_death_rate lag_14_death_rate -#> 0.1471127 0.1062473 -#> -#> ── Postprocessor ─────────────────────────────────────────────────────────── -#> -#> 5 Frosting layers. -#> 1. layer_predict() -#> 2. layer_residual_quantiles() -#> 3. layer_add_forecast_date() -#> 4. layer_add_target_date() -#> 5. 
layer_threshold() -#> +epiworkflow <- four_week_ahead$epi_workflow +restricted_predictions <- + four_week_ahead$predictions |> + rename(time_value = target_date, value = .pred) |> + mutate(.response_name = "death_rate") +forecast_plot <- + four_week_ahead |> + autoplot(observed_response = cases_deaths) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label %>% filter(.response_name == "death_rate"), + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` -The fitted model here involved preprocessing the data to appropriately -generate lagged predictors, estimating a linear model with `stats::lm()` -and then postprocessing the results to be meaningful for epidemiological -tasks. We can also examine the predictions. +
+ + + +And as a tibble of quantile level-value pairs: ``` r -two_week_ahead$predictions -#> # A tibble: 56 × 5 -#> geo_value .pred .pred_distn forecast_date target_date -#> -#> 1 ak 0.449 quantiles(0.45)[2] 2021-12-31 2022-01-14 -#> 2 al 0.574 quantiles(0.57)[2] 2021-12-31 2022-01-14 -#> 3 ar 0.673 quantiles(0.67)[2] 2021-12-31 2022-01-14 -#> 4 as 0 quantiles(0.12)[2] 2021-12-31 2022-01-14 -#> 5 az 0.679 quantiles(0.68)[2] 2021-12-31 2022-01-14 -#> 6 ca 0.575 quantiles(0.57)[2] 2021-12-31 2022-01-14 -#> 7 co 0.862 quantiles(0.86)[2] 2021-12-31 2022-01-14 -#> 8 ct 1.07 quantiles(1.07)[2] 2021-12-31 2022-01-14 -#> 9 dc 2.12 quantiles(2.12)[2] 2021-12-31 2022-01-14 -#> 10 de 1.09 quantiles(1.09)[2] 2021-12-31 2022-01-14 -#> # ℹ 46 more rows +four_week_ahead$predictions |> + select(-.pred) |> + pivot_quantiles_longer(.pred_distn) |> + select(geo_value, forecast_date, target_date, quantile = .pred_distn_quantile_level, value = .pred_distn_value) +#> # A tibble: 20 × 5 +#> geo_value forecast_date target_date quantile value +#> +#> 1 ca 2021-08-01 2021-08-29 0.1 0.198 +#> 2 ca 2021-08-01 2021-08-29 0.25 0.285 +#> 3 ca 2021-08-01 2021-08-29 0.5 0.345 +#> 4 ca 2021-08-01 2021-08-29 0.75 0.405 +#> 5 ca 2021-08-01 2021-08-29 0.9 0.491 +#> 6 ma 2021-08-01 2021-08-29 0.1 0.0277 +#> # ℹ 14 more rows ``` -The results above show a distributional forecast produced using data -through the end of 2021 for the 14th of January 2022. A prediction for -the death rate per 100K inhabitants is available for every state -(`geo_value`) along with a 90% predictive interval. +The orange dot gives the point prediction, while the blue intervals give +the 25-75%, the 10-90%, and 2.5-97.5% inter-quantile ranges[^3]. For +this particular day and these locations, the forecasts are relatively +accurate, with the true data being at least within the 10-90% interval. +A couple of things to note: + +1. 
`epipredict` methods are primarily direct forecasters; this means we + don’t need to predict 1, 2,…, 27 days ahead to then predict 28 days + ahead. +2. All of our existing engines are geo-pooled, meaning the training + data is shared across geographies. This has the advantage of + increasing the amount of available training data, with the + restriction that the data needs to be on comparable scales, such as + rates. + +## Getting Help + +If you encounter a bug or have a feature request, feel free to file an +[issue on our GitHub +page](https://github.com/cmu-delphi/epipredict/issues). For other +questions, feel free to reach out to the authors, either via this +[contact +form](https://docs.google.com/forms/d/e/1FAIpQLScqgT1fKZr5VWBfsaSp-DNaN03aV6EoZU4YljIzHJ1Wl_zmtg/viewform), +email, or the InsightNet Slack. + +[^1]: lagged by 3 in this context meaning using the value from 3 days + ago. + +[^2]: Alternatively, you could call + `autoplot(four_week_ahead, observed_response = cases_deaths)` to get + the full collection of forecasts. This is too busy for the space we + have for plotting here. -[^1]: Other epidemiological signals for non-Covid related illnesses are - also available with - [`{epidatr}`](https://github.com/cmu-delphi/epidatr) which - interfaces directly to Delphi’s [Epidata - API](https://cmu-delphi.github.io/delphi-epidata/) +[^3]: Note that these are not the same quantiles that we fit when + creating `four_week_ahead`. They are extrapolated from those + quantiles using `extrapolate_quantiles()` (which assumes an + exponential decay in the tails). diff --git a/_pkgdown.yml b/_pkgdown.yml index f67b2b6d5..cd37dc3fd 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,29 +1,29 @@ url: https://cmu-delphi.github.io/epipredict/ -# This is to give a default value to the `mode` parameter in the -# `pkgdown::build_site()` function. This is useful when building the site locally, -# as it will default to `devel` mode. 
In practice, this should all be handled -# dynamically by the CI/CD pipeline. development: - mode: auto + mode: devel template: package: delphidocs -articles: -- title: Get started - navbar: ~ - contents: - - epipredict - - preprocessing-and-models - - arx-classifier - - update - -- title: Advanced methods - contents: - - articles/sliding - - articles/smooth-qr - - panel-data +navbar: + structure: + left: [intro, workflows, backtesting, reference, articles, news] + right: [search, github, lightswitch] + components: + workflows: + text: Epiworkflows + href: articles/custom_epiworkflows.html + backtesting: + text: Backtesting + href: articles/backtesting.html + articles: + text: Articles + menu: + - text: Using the add/update/remove/adjust functions + href: articles/update.html + - text: Using epipredict on non-epidemic panel data + href: articles/panel-data.html home: links: @@ -46,66 +46,90 @@ reference: contents: - contains("forecaster") - contains("classifier") - - title: Forecaster modifications + + - subtitle: Forecaster modifications desc: Constructors to modify forecaster arguments and utilities to produce `epi_workflow` objects contents: - contains("args_list") - contains("_epi_workflow") - - title: Helper functions for Hub submission - contents: - - flusight_hub_formatter - - title: Parsnip engines - desc: Prediction methods not available elsewhere - contents: - - quantile_reg - - smooth_quantile_reg - - grf_quantiles - - title: Custom panel data forecasting workflows - contents: - - epi_recipe - - epi_workflow - - add_epi_recipe - - adjust_epi_recipe - - Add_model - - predict.epi_workflow - - fit.epi_workflow - - augment.epi_workflow - - forecast.epi_workflow - - - title: Epi recipe preprocessing steps + ########################## + - title: Steps and Layers + + - subtitle: Epi recipe preprocessing steps + desc: > + Note that any `{recipes}` + [`step`](https://recipes.tidymodels.org/reference/index.html) is also valid contents: - starts_with("step_") - - 
contains("bake") - - title: Epi recipe verification checks + + - subtitle: Frosting post-processing layers contents: - - check_enough_train_data - - title: Forecast postprocessing - desc: Create a series of postprocessing operations + - starts_with("layer_") + + ########################## + - title: Epiworkflows + - subtitle: Basic forecasting workflow functions + contents: + - epi_recipe + - epi_workflow + - add_epi_recipe + - fit.epi_workflow + + - subtitle: Forecast post-processing workflow functions + desc: Create and apply series of post-processing operations contents: - frosting - ends_with("_frosting") - - get_test_data - tidy.frosting - - title: Frosting layers - contents: - - contains("layer") - contains("slather") + + - subtitle: Prediction + desc: Methods for prediction and modifying predictions + contents: + - predict.epi_workflow + - augment.epi_workflow + - get_test_data + - forecast.epi_workflow + + - subtitle: Modifying forecasting epiworkflows + desc: > + Modify or inspect an existing recipe, workflow, or frosting. 
See also [the + article on the topic](../articles/update.html) + contents: + - adjust_epi_recipe + - Add_model + - add_layer + - layer-processors + - update.layer + + ########################## - title: Automatic forecast visualization contents: - autoplot.epi_workflow - autoplot.canned_epipred - - title: Utilities for quantile distribution processing + + ########################## + - title: Parsnip engines + desc: > + Prediction methods not available in the [general parsnip + repository](https://www.tidymodels.org/find/parsnip/) + contents: + - quantile_reg + - smooth_quantile_reg + - grf_quantiles + + ########################## + - title: Utilities contents: + - flusight_hub_formatter + - clean_f_name + - check_enough_data + + - subtitle: Utilities for quantile distribution processing + contents: + - pivot_quantiles - dist_quantiles + - contains("quantile_pred") - extrapolate_quantiles - nested_quantiles - weighted_interval_score - - starts_with("pivot_quantiles") - - title: Other utilities - contents: - - clean_f_name - - title: Included datasets - contents: - - case_death_rate_subset - - state_census - - grad_employ_subset diff --git a/data-raw/case_death_rate_subset.R b/data-raw/case_death_rate_subset.R deleted file mode 100644 index 4bf416dbf..000000000 --- a/data-raw/case_death_rate_subset.R +++ /dev/null @@ -1,32 +0,0 @@ -library(tidyverse) -library(covidcast) -library(epidatr) -library(epiprocess) - -x <- covidcast( - data_source = "jhu-csse", - signals = "confirmed_7dav_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20201231, 20211231), - geo_values = "*" -) %>% - fetch() %>% - select(geo_value, time_value, case_rate = value) - -y <- covidcast( - data_source = "jhu-csse", - signals = "deaths_7dav_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20201231, 20211231), - geo_values = "*" -) %>% - fetch() %>% - select(geo_value, time_value, death_rate = value) - -case_death_rate_subset 
<- x %>% - full_join(y, by = c("geo_value", "time_value")) %>% - as_epi_df() - -usethis::use_data(case_death_rate_subset, overwrite = TRUE) diff --git a/data-raw/grad_employ_subset.R b/data-raw/grad_employ_subset.R deleted file mode 100644 index 38719a02e..000000000 --- a/data-raw/grad_employ_subset.R +++ /dev/null @@ -1,106 +0,0 @@ -library(epipredict) -library(epiprocess) -library(cansim) -library(dplyr) -library(stringr) -library(tidyr) - -# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501 -statcan_grad_employ <- get_cansim("37-10-0115-01") - -gemploy <- statcan_grad_employ %>% - select(c( - "REF_DATE", - "GEO", - # "DGUID", - # "UOM", - # "UOM_ID", - # "SCALAR_FACTOR", - # "SCALAR_ID", - # "VECTOR", - # "COORDINATE", - "VALUE", - "STATUS", - # "SYMBOL", - # "TERMINATED", - # "DECIMALS", - # "GeoUID", - # "Hierarchy for GEO", - # "Classification Code for Educational qualification", - # "Hierarchy for Educational qualification", - # "Classification Code for Field of study", - # "Hierarchy for Field of study", - # "Classification Code for Gender", - # "Hierarchy for Gender", - # "Classification Code for Age group", - # "Hierarchy for Age group", - # "Classification Code for Status of student in Canada", - # "Hierarchy for Status of student in Canada", - # "Classification Code for Characteristics after graduation", - # "Hierarchy for Characteristics after graduation", - # "Classification Code for Graduate statistics", - # "Hierarchy for Graduate statistics", - # "val_norm", - # "Date", - "Educational qualification", - "Field of study", - "Gender", - "Age group", - "Status of student in Canada", - "Characteristics after graduation", - "Graduate statistics" - )) %>% - rename( - "geo_value" = "GEO", - "time_value" = "REF_DATE", - "value" = "VALUE", - "status" = "STATUS", - "edu_qual" = "Educational qualification", - "fos" = "Field of study", - "gender" = "Gender", - "age_group" = "Age group", - "student_status" = "Status of student in Canada", - 
"grad_charac" = "Characteristics after graduation", - "grad_stat" = "Graduate statistics" - ) %>% - mutate( - grad_stat = recode_factor( - grad_stat, - `Number of graduates` = "num_graduates", - `Median employment income two years after graduation` = "med_income_2y", - `Median employment income five years after graduation` = "med_income_5y" - ), - time_value = as.integer(time_value) - ) %>% - pivot_wider(names_from = grad_stat, values_from = value) %>% - filter( - # Drop aggregates for some columns - geo_value != "Canada" & - age_group != "15 to 64 years" & - edu_qual != "Total, educational qualification" & - # Keep aggregates for keys we don't want to keep - fos == "Total, field of study" & - gender == "Total, gender" & - student_status == "Canadian and international students" & - # Since we're looking at 2y and 5y employment income, the only - # characteristics remaining are: - # - Graduates reporting employment income - # - Graduates reporting wages, salaries, and commissions only - # For simplicity, keep the first one only - grad_charac == "Graduates reporting employment income" & - # Only keep "good" data - is.na(status) & - # Drop NA value rows - !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y) - ) %>% - select(-c(status, gender, student_status, grad_charac, fos)) - -nrow(gemploy) -ncol(gemploy) - -grad_employ_subset <- gemploy %>% - as_epi_df( - as_of = "2022-07-19", - other_keys = c("age_group", "edu_qual") - ) -usethis::use_data(grad_employ_subset, overwrite = TRUE) diff --git a/data-raw/state_census.R b/data-raw/state_census.R deleted file mode 100644 index 22dde5a41..000000000 --- a/data-raw/state_census.R +++ /dev/null @@ -1,11 +0,0 @@ -library(dplyr) -library(tidyr) - -state_census <- covidcast::state_census %>% - select(STATE, NAME, POPESTIMATE2019, ABBR) %>% - rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>% - mutate(abbr = tolower(abbr)) %>% - as_tibble() - - -usethis::use_data(state_census, overwrite = 
TRUE) diff --git a/data/case_death_rate_subset.rda b/data/case_death_rate_subset.rda deleted file mode 100644 index 2e5ced29e..000000000 Binary files a/data/case_death_rate_subset.rda and /dev/null differ diff --git a/data/grad_employ_subset.rda b/data/grad_employ_subset.rda deleted file mode 100644 index 9380b43b5..000000000 Binary files a/data/grad_employ_subset.rda and /dev/null differ diff --git a/data/state_census.rda b/data/state_census.rda deleted file mode 100644 index 1118db0d0..000000000 Binary files a/data/state_census.rda and /dev/null differ diff --git a/epipredict.Rproj b/epipredict.Rproj index 69fafd4b6..0c2659d89 100644 --- a/epipredict.Rproj +++ b/epipredict.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: a71c2044-10c8-46a9-9702-f4bfc95042c8 RestoreWorkspace: No SaveWorkspace: No diff --git a/inst/extdata/can_prov_cases.rds b/inst/extdata/can_prov_cases.rds deleted file mode 100644 index b6a10a422..000000000 Binary files a/inst/extdata/can_prov_cases.rds and /dev/null differ diff --git a/inst/extdata/canada-case-rates.R b/inst/extdata/canada-case-rates.R deleted file mode 100644 index 7cf88d602..000000000 --- a/inst/extdata/canada-case-rates.R +++ /dev/null @@ -1,23 +0,0 @@ -path_to_csvs <- here::here("../../COVID-BC/Covid19Canada/updates.nosync/") -files <- list.files(path_to_csvs) -ca_as_ofs <- as.Date(substr(files, 1, 10)) %>% - intersect(fc_time_values) %>% - as.Date(origin = "1970-01-01") - -can <- purrr::map(ca_as_ofs, ~ { - readr::read_csv(here::here(path_to_csvs, paste0(.x, ".csv"))) %>% - left_join(ca_pop) %>% - mutate(time_value = lubridate::dmy(date_report)) %>% - filter(province %in% ca_pop$province, time_value > "2020-04-01") %>% - mutate( - geo_value = province, - case_rate = cases / population * 1e5 - ) %>% - select(geo_value, time_value, case_rate) %>% - as_epi_df(geo_type = "province", as_of = .x) -}) -names(can) <- ca_as_ofs -can <- can %>% - bind_rows(.id = "version") %>% - mutate(version = lubridate::ymd(version)) -saveRDS(can, 
"inst/extdata/can_prov_cases.rds") diff --git a/inst/extdata/epi_archive.rds b/inst/extdata/epi_archive.rds deleted file mode 100644 index 8ca52be76..000000000 Binary files a/inst/extdata/epi_archive.rds and /dev/null differ diff --git a/inst/other-vignettes/symptom-surveys.Rmd b/inst/other-vignettes/symptom-surveys.Rmd index f480db575..cf8013572 100644 --- a/inst/other-vignettes/symptom-surveys.Rmd +++ b/inst/other-vignettes/symptom-surveys.Rmd @@ -145,22 +145,40 @@ own forecaster under the `epipredict` framework, we could easily add steps to re-scale and transform the signals to our `epi_recipe`. This would make the code more succinct and self-contained. +We will compare two CLI-in-community indicators from +different sources. The data are available in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)), +and can be loaded with: + ```{r, message = FALSE, warning = FALSE} -library(epidatr) library(dplyr) library(purrr) library(epipredict) library(recipes) case_num <- 200 -as_of_date <- "2020-05-14" -geo_values <- pub_covidcast( +z <- epidatasets::county_smoothed_cli_comparison +``` + +The data can also be constructed from data the Delphi API with the following code: + +```{r, message = FALSE, warning = FALSE, eval = FALSE} +library(epidatr) + +d <- "2020-09-21" + +case_num <- 200 +geos_date <- "2020-05-14" + +# Find counties that on 2020-05-14 had >= 200 cases reported. +# For later datasets, we will only keep data for these geos. 
+geo_values_initial <- pub_covidcast( source = "jhu-csse", signals = "confirmed_cumulative_num", geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(20200514, 20200514) + time_values = epirange(geos_date, geos_date), + as_of = d ) %>% filter(value >= case_num) %>% pull(geo_value) %>% @@ -177,9 +195,10 @@ goog_sm_cli <- pub_covidcast( geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(start_day, end_day) + time_values = epirange(start_day, end_day), + as_of = d ) %>% - filter(geo_value %in% geo_values) %>% + filter(geo_value %in% geo_values_initial) %>% select(geo_value, time_value, value) %>% rename(goog = value) @@ -189,9 +208,10 @@ fb_survey <- pub_covidcast( geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(start_day, end_day) + time_values = epirange(start_day, end_day), + as_of = d ) %>% - filter(geo_value %in% geo_values) %>% + filter(geo_value %in% geo_values_initial) %>% select(geo_value, time_value, value) %>% rename(fb = value) @@ -201,26 +221,31 @@ jhu_7dav_incid <- pub_covidcast( geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(start_day, end_day) + time_values = epirange(start_day, end_day), + as_of = d ) %>% - filter(geo_value %in% geo_values) %>% + filter(geo_value %in% geo_values_initial) %>% select(geo_value, time_value, value) %>% rename(case = value) -# Find "complete" counties, present in all three data signals at all times +# Find "complete" counties, present in all three data signals, and also +# present in the `geo_values_initial` object. 
geo_values_complete <- intersect( intersect(goog_sm_cli$geo_value, fb_survey$geo_value), jhu_7dav_incid$geo_value ) -# Make one big matrix by joining these three data frames -z <- full_join(full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")), +# Join the three data frames together +z <- full_join( + full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")), jhu_7dav_incid, by = c("geo_value", "time_value") ) %>% filter(geo_value %in% geo_values_complete) %>% - as_epi_df() + as_epi_df(as_of = d) +``` +```{r, message = FALSE, warning = FALSE} Logit <- function(x, a = 0.01) log((x + a) / (1 - x + a)) Sigmd <- function(y, a = 0.01) (exp(y) * (1 + a) - a) / (1 + exp(y)) @@ -423,7 +448,7 @@ res_err4 <- res_all4 %>% knitr::kable( res_err4 %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -472,7 +497,7 @@ res_dif4 <- res_all4 %>% knitr::kable( res_dif4 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -501,7 +526,7 @@ ggplot_colors <- c("#FC4E07", "#00AFBB", "#E7B800") ggplot(res_dif4 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -649,7 +674,7 @@ knitr::kable( res_err2 %>% select(-ends_with("diff")) %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -684,7 +709,7 @@ to incorporate during periods of time where forecasting is easier. 
ggplot( res_err2 %>% group_by(model, lead, forecast_date) %>% - summarize(err = median(err)) %>% ungroup(), + summarise(err = median(err)) %>% ungroup(), aes(x = forecast_date, y = err) ) + geom_line(aes(color = model)) + @@ -722,7 +747,7 @@ res_dif2 <- res_all2 %>% knitr::kable( res_dif2 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -739,7 +764,7 @@ quite small. ```{r} ggplot(res_dif2 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -802,7 +827,7 @@ err_by_lead <- res %>% ) %>% mutate(model = factor(model, labels = model_names[1:2])) %>% group_by(model, lead) %>% - summarize(err = median(err)) %>% + summarise(err = median(err)) %>% ungroup() ggplot(err_by_lead, aes(x = lead, y = err)) + diff --git a/inst/templates/layer.R b/inst/templates/layer.R deleted file mode 100644 index 59556db5f..000000000 --- a/inst/templates/layer.R +++ /dev/null @@ -1,44 +0,0 @@ -layer_{{ name }} <- function(frosting, # mandatory - ..., - args, # add as many as you need - more_args, - id = rand_id("{{{ name }}}")) { - - # validate any additional arguments here - - # if you don't need ... then uncomment the line below - ## rlang::check_dots_empty() - add_layer( - frosting, - layer_{{{ name }}}_new( - terms = dplyr::enquos(...), # remove if ... should be empty - args, - id = id - ) - ) -} - -layer_{{{ name }}}_new <- function(terms, args, more_args, id) { - layer("{{{ name }}}", - terms = terms, - args = args, - more_args = more_args, - id = id) -} - -#' @export -slather.layer_{{{ name }}} <- - function(object, components, workflow, new_data, ...) { - rlang::check_dots_empty() - - # if layer_ used ... 
in tidyselect, we need to evaluate it now - exprs <- rlang::expr(c(!!!object$terms)) - pos <- tidyselect::eval_select(exprs, components$predictions) - col_names <- names(pos) - # now can select with `tidyselect::all_of(col_names)` - - # add additional necessary processing steps here - - # always return components - components - } diff --git a/man/Add_model.Rd b/man/Add_model.Rd index 17b65793c..8f316532e 100644 --- a/man/Add_model.Rd +++ b/man/Add_model.Rd @@ -71,8 +71,7 @@ aliases with the lower-case names. However, in the event that properly. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/add_epi_recipe.Rd b/man/add_epi_recipe.Rd index 0da2d55b3..e750c72ba 100644 --- a/man/add_epi_recipe.Rd +++ b/man/add_epi_recipe.Rd @@ -4,7 +4,7 @@ \alias{add_epi_recipe} \alias{remove_epi_recipe} \alias{update_epi_recipe} -\title{Add an \code{epi_recipe} to a workflow} +\title{Add/remove/update the \code{epi_recipe} of an \code{epi_workflow}} \usage{ add_epi_recipe(x, recipe, ..., blueprint = default_epi_recipe_blueprint()) @@ -30,27 +30,28 @@ might be done automatically by the underlying model.} \code{x}, updated with a new recipe preprocessor. } \description{ -Add an \code{epi_recipe} to a workflow +\itemize{ +\item \code{add_recipe()} specifies the terms of the model and any preprocessing that +is required through the usage of a recipe. +\item \code{remove_recipe()} removes the recipe as well as any downstream objects +\item \code{update_recipe()} first removes the recipe, then replaces the previous +recipe with the new one. +} } \details{ -\code{add_epi_recipe} has the same behaviour as -\code{\link[workflows:add_recipe]{workflows::add_recipe()}} but sets a different -default blueprint to automatically handle \link[epiprocess:epi_df]{epiprocess::epi_df} data. 
+\code{add_epi_recipe()} has the same behaviour as \code{\link[workflows:add_recipe]{workflows::add_recipe()}} but +sets a different default blueprint to automatically handle +\code{epiprocess::epi_df()} data. } \examples{ -library(dplyr) -library(recipes) - -jhu <- case_death_rate_subset \%>\% - filter(time_value > "2021-08-01") \%>\% - arrange(geo_value, time_value) +jhu <- covid_case_death_rates \%>\% + filter(time_value > "2021-08-01") r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% - step_naomit(all_predictors()) \%>\% - step_naomit(all_outcomes(), skip = TRUE) + step_epi_naomit() workflow <- epi_workflow() \%>\% add_epi_recipe(r) @@ -70,11 +71,4 @@ workflow } \seealso{ \code{\link[workflows:add_recipe]{workflows::add_recipe()}} -\itemize{ -\item \code{add_recipe()} specifies the terms of the model and any preprocessing that -is required through the usage of a recipe. -\item \code{remove_recipe()} removes the recipe as well as any downstream objects -\item \code{update_recipe()} first removes the recipe, then replaces the previous -recipe with the new one. -} } diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index 94812cbe2..3e7be7647 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -4,7 +4,7 @@ \alias{add_frosting} \alias{remove_frosting} \alias{update_frosting} -\title{Add frosting to a workflow} +\title{Add/remove/update the \code{frosting} of an \code{epi_workflow}} \usage{ add_frosting(x, frosting, ...) @@ -23,11 +23,10 @@ update_frosting(x, frosting, ...) 
\code{x}, updated with a new frosting postprocessor } \description{ -Add frosting to a workflow +Add/remove/update the \code{frosting} of an \code{epi_workflow} } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/add_shifted_columns.Rd b/man/add_shifted_columns.Rd new file mode 100644 index 000000000..aad22e805 --- /dev/null +++ b/man/add_shifted_columns.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{add_shifted_columns} +\alias{add_shifted_columns} +\title{backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the +checks missing in \code{epi_shift_single}} +\usage{ +add_shifted_columns(new_data, object) +} +\description{ +backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the +checks missing in \code{epi_shift_single} +} +\keyword{internal} diff --git a/man/adjust_epi_recipe.Rd b/man/adjust_epi_recipe.Rd index 7468c4ce2..b1dc6c60e 100644 --- a/man/adjust_epi_recipe.Rd +++ b/man/adjust_epi_recipe.Rd @@ -52,10 +52,9 @@ must be inputted as \code{...}. See the examples below for brief illustrations of the different types of updates. } \examples{ -library(dplyr) library(workflows) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/adjust_frosting.Rd b/man/adjust_frosting.Rd index c089b3443..e6b33f3d1 100644 --- a/man/adjust_frosting.Rd +++ b/man/adjust_frosting.Rd @@ -35,8 +35,7 @@ must be inputted as \code{...}. See the examples below for brief illustrations of the different types of updates. 
} \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/apply_frosting.Rd b/man/apply_frosting.Rd index ef18796cc..ece3261e8 100644 --- a/man/apply_frosting.Rd +++ b/man/apply_frosting.Rd @@ -5,7 +5,7 @@ \alias{apply_frosting.default} \alias{apply_frosting.epi_recipe} \alias{apply_frosting.epi_workflow} -\title{Apply postprocessing to a fitted workflow} +\title{Apply post-processing to a fitted workflow} \usage{ apply_frosting(workflow, ...) @@ -39,6 +39,6 @@ and predict on} \code{\link[=slather]{slather()}} for supported layers} } \description{ -This function is intended for internal use. It implements postprocessing +This function is intended for internal use. It implements post-processing inside of the \code{predict()} method for a fitted workflow. } diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index c9ae4e733..ce4d57fce 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -10,11 +10,12 @@ arx_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, - quantile_levels = c(0.05, 0.95), + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ... @@ -32,20 +33,45 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. 
+The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} + +\item{adjust_latency}{Character. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{"none"} (in which case there is no adjustment). +If the \code{forecast_date} is after the last day of data, this determines how +to shift the model to account for this difference. The options are: +\itemize{ +\item \code{"none"} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to +the last day of data. For example, if the last day of data was 3 days ago, +the ahead becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual +forecast date. For example, if the lags are \code{c(0, 7, 14)} and the last day of +data was 3 days ago, the lags become \code{c(3, 10, 17)}. +}} + +\item{warn_latency}{by default, \code{step_adjust_latency} warns the user if the +latency is large. If this is \code{FALSE}, that warning is turned off.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of training residuals. A \code{NULL} value will result in point forecasts only.} -\item{symmetrize}{Logical. The default \code{TRUE} calculates -symmetric prediction intervals. This argument only applies when -residual quantiles are used. 
It is not applicable with -\code{trainer = quantile_reg()}, for example.} +\item{symmetrize}{Logical. The default \code{TRUE} calculates symmetric prediction +intervals. This argument only applies when residual quantiles are used. It +is not applicable with \code{trainer = quantile_reg()}, for example. Typically, one +would only want non-symmetric quantiles when increasing trajectories are +quite different from decreasing ones, such as a strictly postive variable +near zero.} \item{nonneg}{Logical. The default \code{TRUE} enforces nonnegative predictions by hard-thresholding at 0.} @@ -57,16 +83,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{check_enough_data_n}{Integer. A lower limit for the number of rows per epi_key that are required for training. 
If \code{NULL}, this check is ignored.} @@ -87,3 +103,6 @@ arx_args_list() arx_args_list(symmetrize = FALSE) arx_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) } +\seealso{ +\code{\link[=arx_forecaster]{arx_forecaster()}} +} diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 311950d62..7359c8764 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -10,13 +10,13 @@ arx_class_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, method = c("rel_change", "linear_reg"), log_scale = FALSE, - additional_gr_args = list(), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ... @@ -34,17 +34,40 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} + +\item{adjust_latency}{Character. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{"none"} (in which case there is no adjustment). 
+If the \code{forecast_date} is after the last day of data, this determines how +to shift the model to account for this difference. The options are: +\itemize{ +\item \code{"none"} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to +the last day of data. For example, if the last day of data was 3 days ago, +the ahead becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual +forecast date. For example, if the lags are \code{c(0, 7, 14)} and the last day of +data was 3 days ago, the lags become \code{c(3, 10, 17)}. +}} + +\item{warn_latency}{by default, \code{step_adjust_latency} warns the user if the +latency is large. If this is \code{FALSE}, that warning is turned off.} \item{outcome_transform}{Scalar character. Whether the outcome should be created using growth rates (as the predictors are) or lagged differences. The second case is closer to the requirements for the \href{https://github.com/cdcepi/Flusight-forecast-data/blob/745511c436923e1dc201dea0f4181f21a8217b52/data-experimental/README.md}{2022-23 CDC Flusight Hospitalization Experimental Target}. -See the Classification Vignette for details of how to create a reasonable +See the \href{https://cmu-delphi.github.io/delphi-tooling-book/arx-classifier.html}{Classification chapter from the forecasting book} Vignette for details of how to create a reasonable baseline for this case. Selecting \code{"growth_rate"} (the default) uses \code{\link[epiprocess:growth_rate]{epiprocess::growth_rate()}} to create the outcome using some of the additional arguments below. Choosing \code{"lag_difference"} instead simply @@ -72,20 +95,6 @@ calculate the growth rate.} \item{log_scale}{Scalar logical. Whether to compute growth rates on the log scale.} -\item{additional_gr_args}{List. Optional arguments controlling growth rate -calculation. 
See \code{\link[epiprocess:growth_rate]{epiprocess::growth_rate()}} and the related Vignette for -more details.} - -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{check_enough_data_n}{Integer. A lower limit for the number of rows per epi_key that are required for training. If \code{NULL}, this check is ignored.} diff --git a/man/arx_class_epi_workflow.Rd b/man/arx_class_epi_workflow.Rd index 713365f17..aec29a286 100644 --- a/man/arx_class_epi_workflow.Rd +++ b/man/arx_class_epi_workflow.Rd @@ -23,10 +23,9 @@ If discrete classes are already in the \code{epi_df}, it is recommended to code up a classifier from scratch using \code{\link[=epi_recipe]{epi_recipe()}}.} \item{predictors}{A character vector giving column(s) of predictor variables. -This defaults to the \code{outcome}. However, if manually specified, only those variables -specifically mentioned will be used. (The \code{outcome} will not be added.) -By default, equals the outcome. If manually specified, does not add the -outcome variable, so make sure to specify it.} +This defaults to the \code{outcome}. However, if manually specified, only those +variables specifically mentioned will be used, and the \code{outcome} will not be +added.} \item{trainer}{A \code{{parsnip}} model describing the type of estimation. For now, we enforce \code{mode = "classification"}. 
Typical values are @@ -47,8 +46,7 @@ before fitting and predicting. Supplying a trainer to the function may alter the returned \code{epi_workflow} object but can be omitted. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) arx_class_epi_workflow(jhu, "death_rate", c("case_rate", "death_rate")) @@ -65,5 +63,5 @@ arx_class_epi_workflow( ) } \seealso{ -\code{\link[=arx_classifier]{arx_classifier()}} +\code{\link[=arx_classifier]{arx_classifier()}} \code{\link[=arx_class_args_list]{arx_class_args_list()}} } diff --git a/man/arx_classifier.Rd b/man/arx_classifier.Rd index c7c2cf059..48ed72fcc 100644 --- a/man/arx_classifier.Rd +++ b/man/arx_classifier.Rd @@ -23,10 +23,9 @@ If discrete classes are already in the \code{epi_df}, it is recommended to code up a classifier from scratch using \code{\link[=epi_recipe]{epi_recipe()}}.} \item{predictors}{A character vector giving column(s) of predictor variables. -This defaults to the \code{outcome}. However, if manually specified, only those variables -specifically mentioned will be used. (The \code{outcome} will not be added.) -By default, equals the outcome. If manually specified, does not add the -outcome variable, so make sure to specify it.} +This defaults to the \code{outcome}. However, if manually specified, only those +variables specifically mentioned will be used, and the \code{outcome} will not be +added.} \item{trainer}{A \code{{parsnip}} model describing the type of estimation. For now, we enforce \code{mode = "classification"}. Typical values are @@ -43,14 +42,127 @@ and (2) \code{epi_workflow}, a list that encapsulates the entire estimation workflow } \description{ -This is an autoregressive classification model for -\link[epiprocess:epi_df]{epiprocess::epi_df} data. It does "direct" forecasting, meaning -that it estimates a class at a particular target horizon. 
+This is an autoregressive classification model for continuous data. It does +"direct" forecasting, meaning that it estimates a class at a particular +target horizon. +} +\details{ +The \code{arx_classifier()} is an autoregressive classification model for \code{epi_df} +data that is used to predict a discrete class for each case under +consideration. It is a direct forecaster in that it estimates the classes +at a specific horizon or ahead value. + +To get a sense of how the \code{arx_classifier()} works, let's consider a simple +example with minimal inputs. For this, we will use the built-in +\code{covid_case_death_rates} that contains confirmed COVID-19 cases and deaths +from JHU CSSE for all states over Dec 31, 2020 to Dec 31, 2021. From this, +we'll take a subset of data for five states over June 4, 2021 to December +31, 2021. Our objective is to predict whether the case rates are increasing +when considering the 0, 7 and 14 day case rates: + +\if{html}{\out{
}}\preformatted{jhu <- covid_case_death_rates \%>\% + filter( + time_value >= "2021-06-04", + time_value <= "2021-12-31", + geo_value \%in\% c("ca", "fl", "tx", "ny", "nj") + ) + +out <- arx_classifier(jhu, outcome = "case_rate", predictors = "case_rate") + +out$predictions +#> # A tibble: 5 x 4 +#> geo_value .pred_class forecast_date target_date +#> +#> 1 ca (-Inf,0.25] 2021-12-31 2022-01-07 +#> 2 fl (-Inf,0.25] 2021-12-31 2022-01-07 +#> 3 nj (-Inf,0.25] 2021-12-31 2022-01-07 +#> 4 ny (-Inf,0.25] 2021-12-31 2022-01-07 +#> 5 tx (-Inf,0.25] 2021-12-31 2022-01-07 +}\if{html}{\out{
}}
+
+The key takeaway from the predictions is that there are two prediction
+classes: \verb{(-Inf, 0.25]} and \verb{(0.25, Inf)}: the classes to predict must be
+discrete. The discretization of the real-valued outcome is controlled by
+the \code{breaks} argument, which defaults to \code{0.25}. Such breaks will be
+automatically extended to cover the entire real line. For example, the
+default break of \code{0.25} is silently extended to \code{breaks = c(-Inf, .25, Inf)} and, therefore, results in two classes: \verb{(-Inf, 0.25]} and \verb{(0.25, Inf)}. These two classes are used to discretize the outcome. The conversion
+of the outcome to such classes is handled internally. So if discrete
+classes already exist for the outcome in the \code{epi_df}, then we recommend to
+code a classifier from scratch using the \code{epi_workflow} framework for more
+control.
+
+The \code{trainer} is a \code{parsnip} model describing the type of estimation such
+that \code{mode = "classification"} is enforced. The two typical trainers that
+are used are \code{parsnip::logistic_reg()} for two classes or
+\code{parsnip::multinom_reg()} for more than two classes.
+
+\if{html}{\out{
}}\preformatted{workflows::extract_spec_parsnip(out$epi_workflow) +#> Logistic Regression Model Specification (classification) +#> +#> Computational engine: glm +}\if{html}{\out{
}} + +From the parsnip model specification, we can see that the trainer used is +logistic regression, which is expected for our binary outcome. More +complicated trainers like \code{parsnip::naive_Bayes()} or +\code{parsnip::rand_forest()} may also be used (however, we will stick to the +basics in this gentle introduction to the classifier). + +If you use the default trainer of logistic regression for binary +classification and you decide against using the default break of 0.25, then +you should only input one break so that there are two classification bins +to properly dichotomize the outcome. For example, let's set a break of 0.5 +instead of relying on the default of 0.25. We can do this by passing 0.5 to +the \code{breaks} argument in \code{arx_class_args_list()} as follows: + +\if{html}{\out{
}}\preformatted{out_break_0.5 <- arx_classifier( + jhu, + outcome = "case_rate", + predictors = "case_rate", + args_list = arx_class_args_list( + breaks = 0.5 + ) +) +#> Warning: glm.fit: algorithm did not converge +#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred + +out_break_0.5$predictions +#> # A tibble: 5 x 4 +#> geo_value .pred_class forecast_date target_date +#> +#> 1 ca (-Inf,0.5] 2021-12-31 2022-01-07 +#> 2 fl (-Inf,0.5] 2021-12-31 2022-01-07 +#> 3 nj (-Inf,0.5] 2021-12-31 2022-01-07 +#> 4 ny (-Inf,0.5] 2021-12-31 2022-01-07 +#> 5 tx (-Inf,0.5] 2021-12-31 2022-01-07 +}\if{html}{\out{
}} + +Indeed, we can observe that the two \code{.pred_class} are now (-Inf, 0.5] and +(0.5, Inf). See \code{help(arx_class_args_list)} for other available +modifications. + +Additional arguments that may be supplied to \code{arx_class_args_list()} include +the expected \code{lags} and \code{ahead} arguments for an autoregressive-type model. +These have default values of 0, 7, and 14 days for the lags of the +predictors and 7 days ahead of the forecast date for predicting the +outcome. There is also \code{n_training} to indicate the upper bound for the +number of training rows per key. If you would like some practice with using +this, then remove the filtering command to obtain data within "2021-06-04" +and "2021-12-31" and instead set \code{n_training} to be the number of days +between these two dates, inclusive of the end points. The end results +should be the same. In addition to \code{n_training}, there are \code{forecast_date} +and \code{target_date} to specify the date that the forecast is created and +intended, respectively. We will not dwell on such arguments here as they +are not unique to this classifier or absolutely essential to understanding +how it operates. The remaining arguments will be discussed organically, as +they are needed to serve our purposes. For information on any remaining +arguments that are not discussed here, please see the function +documentation for a complete list and their definitions. 
} \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% - filter(time_value >= as.Date("2021-11-01")) +tiny_geos <- c("as", "mp", "vi", "gu", "pr") +jhu <- covid_case_death_rates \%>\% + filter(time_value >= as.Date("2021-11-01"), !(geo_value \%in\% tiny_geos)) out <- arx_classifier(jhu, "death_rate", c("case_rate", "death_rate")) diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd index 4070a3337..97fa16273 100644 --- a/man/arx_fcast_epi_workflow.Rd +++ b/man/arx_fcast_epi_workflow.Rd @@ -15,21 +15,19 @@ arx_fcast_epi_workflow( \arguments{ \item{epi_data}{An \code{epi_df} object} -\item{outcome}{A character (scalar) specifying the outcome (in the -\code{epi_df}).} +\item{outcome}{A character (scalar) specifying the outcome (in the \code{epi_df}).} \item{predictors}{A character vector giving column(s) of predictor variables. -This defaults to the \code{outcome}. However, if manually specified, only those variables -specifically mentioned will be used. (The \code{outcome} will not be added.) -By default, equals the outcome. If manually specified, does not add the -outcome variable, so make sure to specify it.} +This defaults to the \code{outcome}. However, if manually specified, only those +variables specifically mentioned will be used, and the \code{outcome} will not be +added.} \item{trainer}{A \code{{parsnip}} model describing the type of estimation. For now, we enforce \code{mode = "regression"}. May be \code{NULL} if you'd like to decide later.} -\item{args_list}{A list of customization arguments to determine -the type of forecasting model. See \code{\link[=arx_args_list]{arx_args_list()}}.} +\item{args_list}{A list of customization arguments to determine the type of +forecasting model. See \code{\link[=arx_args_list]{arx_args_list()}}.} } \value{ An unfitted \code{epi_workflow}. 
@@ -42,8 +40,7 @@ may alter the returned \code{epi_workflow} object (e.g., if you intend to use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-12-01")) arx_fcast_epi_workflow( @@ -58,5 +55,5 @@ arx_fcast_epi_workflow(jhu, "death_rate", ) } \seealso{ -\code{\link[=arx_forecaster]{arx_forecaster()}} +\code{\link[=arx_forecaster]{arx_forecaster()}}, \code{\link[=arx_args_list]{arx_args_list()}} } diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index d8c7671dc..63b5cfb8c 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -15,45 +15,49 @@ arx_forecaster( \arguments{ \item{epi_data}{An \code{epi_df} object} -\item{outcome}{A character (scalar) specifying the outcome (in the -\code{epi_df}).} +\item{outcome}{A character (scalar) specifying the outcome (in the \code{epi_df}).} \item{predictors}{A character vector giving column(s) of predictor variables. -This defaults to the \code{outcome}. However, if manually specified, only those variables -specifically mentioned will be used. (The \code{outcome} will not be added.) -By default, equals the outcome. If manually specified, does not add the -outcome variable, so make sure to specify it.} +This defaults to the \code{outcome}. However, if manually specified, only those +variables specifically mentioned will be used, and the \code{outcome} will not be +added.} -\item{trainer}{A \code{{parsnip}} model describing the type of estimation. -For now, we enforce \code{mode = "regression"}.} +\item{trainer}{A \code{{parsnip}} model describing the type of estimation. For +now, we enforce \code{mode = "regression"}.} -\item{args_list}{A list of customization arguments to determine -the type of forecasting model. 
See \code{\link[=arx_args_list]{arx_args_list()}}.} +\item{args_list}{A list of customization arguments to determine the type of +forecasting model. See \code{\link[=arx_args_list]{arx_args_list()}}.} } \value{ -A list with (1) \code{predictions} an \code{epi_df} of predicted values -and (2) \code{epi_workflow}, a list that encapsulates the entire estimation -workflow +An \code{arx_fcast}, with the fields \code{predictions} and \code{epi_workflow}. +\code{predictions} is a \code{tibble} of predicted values while \code{epi_workflow()} is +the fit workflow used to make those predictions } \description{ This is an autoregressive forecasting model for -\link[epiprocess:epi_df]{epiprocess::epi_df} data. It does "direct" forecasting, meaning -that it estimates a model for a particular target horizon. +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It does "direct" +forecasting, meaning that it estimates a model for a particular target +horizon of the \code{outcome} based on the lags of the \code{predictors}. See the \href{../articles/epipredict.html}{Get started vignette} for some worked examples and +\href{../articles/custom_epiworkflows.html}{Custom epi_workflows vignette} for a +recreation using a custom \code{epi_workflow()}. 
} \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) out <- arx_forecaster( - jhu, "death_rate", + jhu, + "death_rate", c("case_rate", "death_rate") ) -out <- arx_forecaster(jhu, "death_rate", +out <- arx_forecaster(jhu, + "death_rate", c("case_rate", "death_rate"), trainer = quantile_reg(), args_list = arx_args_list(quantile_levels = 1:9 / 10) ) +out } \seealso{ \code{\link[=arx_fcast_epi_workflow]{arx_fcast_epi_workflow()}}, \code{\link[=arx_args_list]{arx_args_list()}} diff --git a/man/augment.epi_workflow.Rd b/man/augment.epi_workflow.Rd index 8007a4d30..cbc807ebb 100644 --- a/man/augment.epi_workflow.Rd +++ b/man/augment.epi_workflow.Rd @@ -17,5 +17,8 @@ new_data with additional columns containing the predicted values } \description{ -Augment data with predictions +\code{augment()}, unlike \code{forecast()}, has the goal of modifying the training +data, rather than just producing new forecasts. It does a prediction on +\code{new_data}, which will produce a prediction for most \code{time_values}, and then +adds \code{.pred} as a column to \code{new_data} and returns the resulting join. 
} diff --git a/man/autoplot-epipred.Rd b/man/autoplot-epipred.Rd index 27bfdf5f7..03cba3e2d 100644 --- a/man/autoplot-epipred.Rd +++ b/man/autoplot-epipred.Rd @@ -4,36 +4,49 @@ \alias{autoplot-epipred} \alias{autoplot.epi_workflow} \alias{autoplot.canned_epipred} +\alias{plot.epi_workflow} +\alias{plot.canned_epipred} \title{Automatically plot an \code{epi_workflow} or \code{canned_epipred} object} \usage{ \method{autoplot}{epi_workflow}( object, predictions = NULL, - .levels = c(0.5, 0.8, 0.95), + observed_response = NULL, + .levels = c(0.5, 0.8, 0.9), ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "dodgerblue4", .point_pred_color = "orange", - .max_facets = Inf + .facet_filter = NULL, + .max_facets = deprecated() ) \method{autoplot}{canned_epipred}( object, + observed_response = NULL, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "dodgerblue4", .point_pred_color = "orange", - .max_facets = Inf + .facet_filter = NULL, + .max_facets = deprecated() ) + +\method{plot}{epi_workflow}(x, ...) + +\method{plot}{canned_epipred}(x, ...) } \arguments{ -\item{object}{An \code{epi_workflow}} +\item{object, x}{An \code{epi_workflow}} \item{predictions}{A data frame with predictions. If \code{NULL}, only the original data is shown.} +\item{observed_response}{An epi_df of the data to plot against. This is for the case +where you have the actual results to compare the forecast against.} + \item{.levels}{A numeric vector of levels to plot for any prediction bands. More than 3 levels begins to be difficult to see.} @@ -60,8 +73,16 @@ color.} \item{.point_pred_color}{If available, point forecasts will be shown with this color.} -\item{.max_facets}{Cut down of the number of facets displayed. 
Especially -useful for testing when there are many \code{geo_value}'s or keys.} +\item{.facet_filter}{Select which facets will be displayed. Especially +useful for when there are many \code{geo_value}'s or keys. This is a +<\code{\link[=args_data_masking]{rlang}}> expression along the lines of \code{\link[dplyr:filter]{dplyr::filter()}}. +However, it must be a single expression combined with the \code{&} operator. This +contrasts to the typical use case which allows multiple comma-separated expressions +which are implicitly combined with \code{&}. When multiple variables are selected +with \code{...}, their names can be filtered in combination with other factors +by using \code{.response_name}. See the examples below.} + +\item{.max_facets}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}} } \description{ For a fit workflow, the training data will be displayed, the response by @@ -70,8 +91,7 @@ will be shown as well. Unfit workflows will result in an error, (you can simply call \code{autoplot()} on the original \code{epi_df}). 
} \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) r <- epi_recipe(jhu) \%>\% @@ -81,9 +101,7 @@ r <- epi_recipe(jhu) \%>\% step_epi_naomit() f <- frosting() \%>\% - layer_residual_quantiles( - quantile_levels = c(.025, .1, .25, .75, .9, .975) - ) \%>\% + layer_residual_quantiles() \%>\% layer_threshold(starts_with(".pred")) \%>\% layer_add_target_date() @@ -93,7 +111,7 @@ autoplot(wf) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) preds <- predict(wf, latest) -autoplot(wf, preds, .max_facets = 4) +autoplot(wf, preds, .facet_filter = geo_value \%in\% c("ca", "ny", "de", "mt")) # ------- Show multiple horizons @@ -108,17 +126,17 @@ p <- lapply(c(7, 14, 21, 28), function(h) { }) p <- do.call(rbind, p) -autoplot(wf, p, .max_facets = 4) +autoplot(wf, p, .facet_filter = geo_value \%in\% c("ca", "ny", "de", "mt")) # ------- Plotting canned forecaster output -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) flat <- flatline_forecaster(jhu, "death_rate") -autoplot(flat, .max_facets = 4) +autoplot(flat, .facet_filter = geo_value \%in\% c("ca", "ny", "de", "mt")) arx <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), args_list = arx_args_list(ahead = 14L) ) -autoplot(arx, .max_facets = 6) +autoplot(arx, .facet_filter = geo_value \%in\% c("ca", "ny", "de", "mt", "mo", "in")) } diff --git a/man/case_death_rate_subset.Rd b/man/case_death_rate_subset.Rd deleted file mode 100644 index 119c8ee26..000000000 --- a/man/case_death_rate_subset.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{case_death_rate_subset} -\alias{case_death_rate_subset} -\title{Subset of JHU daily state cases and deaths} -\format{ -A tibble with 20,496 rows and 4 variables: -\describe{ -\item{geo_value}{the geographic value 
associated with each row -of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{case_rate}{7-day average signal of number of new -confirmed COVID-19 cases per 100,000 population, daily} -\item{death_rate}{7-day average signal of number of new confirmed -deaths due to COVID-19 per 100,000 population, daily} -} -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems Science -in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. The 7-day average signals are computed by Delphi by -calculating moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -} -} -\usage{ -case_death_rate_subset -} -\description{ -This data source of confirmed COVID-19 cases and deaths -is based on reports made available by the Center for -Systems Science and Engineering at Johns Hopkins University. -This example data ranges from Dec 31, 2020 to Dec 31, 2021, -and includes all states. 
-} -\keyword{datasets} diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 2f9300572..a837bd7a0 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -14,7 +14,6 @@ cdc_baseline_args_list( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value", - nafill_buffer = Inf, ... ) } @@ -34,8 +33,14 @@ set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_ key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of @@ -46,10 +51,12 @@ These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in linear interpolation on the X scale. This is achieved with \code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} -\item{symmetrize}{Logical. The default \code{TRUE} calculates -symmetric prediction intervals. This argument only applies when -residual quantiles are used. It is not applicable with -\code{trainer = quantile_reg()}, for example.} +\item{symmetrize}{Logical. The default \code{TRUE} calculates symmetric prediction +intervals. This argument only applies when residual quantiles are used. It +is not applicable with \code{trainer = quantile_reg()}, for example. 
Typically, one +would only want non-symmetric quantiles when increasing trajectories are +quite different from decreasing ones, such as a strictly postive variable +near zero.} \item{nonneg}{Logical. Force all predictive intervals be non-negative. Because non-negativity is forced \emph{before} propagating forward, this @@ -63,16 +70,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{...}{Space to handle future expansions (unused).} } \value{ diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 0c7f1e436..c201bd99d 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -38,7 +38,8 @@ This forecaster is meant to produce exactly the CDC Baseline used for } \examples{ library(dplyr) -weekly_deaths <- case_death_rate_subset \%>\% +library(epiprocess) +weekly_deaths <- covid_case_death_rates \%>\% select(geo_value, time_value, death_rate) \%>\% left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% @@ -51,23 +52,22 @@ weekly_deaths <- case_death_rate_subset \%>\% cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths_7dsum") preds <- 
pivot_quantiles_wider(cdc$predictions, .pred_distn) -if (require(ggplot2)) { - forecast_date <- unique(preds$forecast_date) - four_states <- c("ca", "pa", "wa", "ny") - preds \%>\% - filter(geo_value \%in\% four_states) \%>\% - ggplot(aes(target_date)) + - geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + - geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + - geom_line(aes(y = .pred), color = "orange") + - geom_line( - data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), - aes(x = time_value, y = deaths_7dsum) - ) + - scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + - labs(x = "Date", y = "Weekly deaths") + - facet_wrap(~geo_value, scales = "free_y") + - theme_bw() + - geom_vline(xintercept = forecast_date) -} +library(ggplot2) +forecast_date <- unique(preds$forecast_date) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = deaths_7dsum) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Weekly deaths") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) } diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd new file mode 100644 index 000000000..969caa1d2 --- /dev/null +++ b/man/check_enough_data.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_enough_data.R +\name{check_enough_data} +\alias{check_enough_data} +\title{Check the dataset contains enough data points.} +\usage{ +check_enough_data( + recipe, + ..., + min_observations = NULL, + epi_keys = NULL, + drop_na = TRUE, + role 
= NA, + trained = FALSE, + skip = TRUE, + id = rand_id("enough_data") +) +} +\arguments{ +\item{recipe}{A recipe object. The check will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this check. +See \code{\link[=selections]{selections()}} for more details. You will usually want to use +\code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.} + +\item{min_observations}{The minimum number of data points required for +training. If this is NULL, the total number of predictors will be used.} + +\item{epi_keys}{A character vector of column names on which to group the data +and check threshold within each group. Useful if your forecaster trains +per group (for example, per geo_value).} + +\item{drop_na}{A logical for whether to count NA values as valid rows.} + +\item{role}{Not used by this check since no new variables are +created.} + +\item{trained}{A logical for whether the selectors in \code{...} +have been resolved by \code{\link[=prep]{prep()}}.} + +\item{skip}{A logical. If \code{TRUE}, only training data is checked, while if +\code{FALSE}, both training and predicting data is checked. Technically, this +answers the question "should the check be skipped when the recipe is baked +by \code{\link[=bake]{bake()}}?" While all operations are baked when \code{\link[=prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this check to identify it.} +} +\description{ +\code{check_enough_data} creates a \emph{specification} of a recipe +operation that will check if variables contain enough data. 
+} +\details{ +This check will break the \code{prep} and/or bake function if any of the +checked columns have not enough non-NA values. If the check passes, nothing +is changed in the data. It is best used after every other step. + +For checking training data, it is best to set \code{...} to be +\verb{all_predictors(), all_outcomes()}, while for checking prediction data, it +is best to set \code{...} to be \code{all_predictors()} only, with \code{n = 1}. +} +\section{tidy() results}{ +When you \code{\link[=tidy.recipe]{tidy()}} this check, a tibble with column +\code{terms} (the selectors or variables selected) is returned. +} + +\concept{checks} diff --git a/man/check_enough_train_data.Rd b/man/check_enough_train_data.Rd deleted file mode 100644 index 57a4a9d78..000000000 --- a/man/check_enough_train_data.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_enough_train_data.R -\name{check_enough_train_data} -\alias{check_enough_train_data} -\title{Check the dataset contains enough data points.} -\usage{ -check_enough_train_data( - recipe, - ..., - n = NULL, - epi_keys = NULL, - drop_na = TRUE, - role = NA, - trained = FALSE, - columns = NULL, - skip = TRUE, - id = rand_id("enough_train_data") -) -} -\arguments{ -\item{recipe}{A recipe object. The check will be added to the -sequence of operations for this recipe.} - -\item{...}{One or more selector functions to choose variables for this check. -See \code{\link[=selections]{selections()}} for more details. You will usually want to use -\code{\link[recipes:has_role]{recipes::all_predictors()}} here.} - -\item{n}{The minimum number of data points required for training. If this is -NULL, the total number of predictors will be used.} - -\item{epi_keys}{A character vector of column names on which to group the data -and check threshold within each group. 
Useful if your forecaster trains -per group (for example, per geo_value).} - -\item{drop_na}{A logical for whether to count NA values as valid rows.} - -\item{role}{Not used by this check since no new variables are -created.} - -\item{trained}{A logical for whether the selectors in \code{...} -have been resolved by \code{\link[=prep]{prep()}}.} - -\item{columns}{An internal argument that tracks which columns are evaluated -for this check. Should not be used by the user.} - -\item{skip}{A logical. Should the check be skipped when the -recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked -when \code{\link[=prep]{prep()}} is run, some operations may not be able to be -conducted on new data (e.g. processing the outcome variable(s)). -Care should be taken when using \code{skip = TRUE} as it may affect -the computations for subsequent operations.} - -\item{id}{A character string that is unique to this check to identify it.} -} -\description{ -\code{check_enough_train_data} creates a \emph{specification} of a recipe -operation that will check if variables contain enough data. -} -\details{ -This check will break the \code{bake} function if any of the checked -columns have not enough non-NA values. If the check passes, nothing is -changed to the data. -} -\section{tidy() results}{ -When you \code{\link[=tidy.recipe]{tidy()}} this check, a tibble with column -\code{terms} (the selectors or variables selected) is returned. 
-} - -\concept{checks} diff --git a/man/check_interminable_latency.Rd b/man/check_interminable_latency.Rd new file mode 100644 index 000000000..8ae9ec6df --- /dev/null +++ b/man/check_interminable_latency.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{check_interminable_latency} +\alias{check_interminable_latency} +\title{warn when the latency is larger than would be reasonable} +\usage{ +check_interminable_latency( + dataset, + latency_table, + target_columns, + forecast_date, + call = caller_env() +) +} +\arguments{ +\item{dataset}{the epi_df} + +\item{latency_table}{the whole collection of latencies} + +\item{target_columns}{the names of the columns that we're adjusting, and whether its unreasonably latent} +} +\description{ +warn when the latency is larger than would be reasonable +} +\keyword{internal} diff --git a/man/check_pname.Rd b/man/check_pname.Rd deleted file mode 100644 index 6740c6523..000000000 --- a/man/check_pname.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-misc.R -\name{check_pname} -\alias{check_pname} -\title{Check that newly created variable names don't overlap} -\usage{ -check_pname(res, preds, object, newname = NULL) -} -\arguments{ -\item{res}{A data frame or tibble of the newly created variables.} - -\item{preds}{An epi_df or tibble containing predictions.} - -\item{object}{A layer object passed to \code{\link[=slather]{slather()}}.} - -\item{newname}{A string of variable names if the object doesn't contain a -$name element} -} -\description{ -\code{check_pname} is to be used in a slather method to ensure that -newly created variable names don't overlap with existing names. -Throws an warning if check fails, and creates a random string. 
-} -\keyword{internal} diff --git a/man/climate_args_list.Rd b/man/climate_args_list.Rd new file mode 100644 index 000000000..b2b91061b --- /dev/null +++ b/man/climate_args_list.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/climatological_forecaster.R +\name{climate_args_list} +\alias{climate_args_list} +\title{Climatological forecaster argument constructor} +\usage{ +climate_args_list( + forecast_date = NULL, + forecast_horizon = 0:4, + time_type = c("epiweek", "week", "month", "day"), + center_method = c("median", "mean"), + window_size = 3L, + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), + symmetrize = FALSE, + nonneg = TRUE, + quantile_by_key = character(0L), + ... +) +} +\arguments{ +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} + +\item{forecast_horizon}{Vector of integers giving the number of time steps, +in units of the \code{time_type}, +from the \code{reference_date} for which predictions should be produced.} + +\item{time_type}{The duration over which time aggregation should be performed.} + +\item{center_method}{The measure of center to be calculated over the time +window.} + +\item{window_size}{Scalar integer. How many time units on each side should +be included. For example, if \code{window_size = 3} and \code{time_type = "day"}, +then on each day in the data, the center will be calculated using 3 days +before and three days after. So, in this case, it operates like a weekly +rolling average, centered at each day.} + +\item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce +prediction intervals. 
These are created by computing the quantiles of +training residuals. A \code{NULL} value will result in point forecasts only.} + +\item{symmetrize}{Logical. The default \code{TRUE} calculates symmetric prediction +intervals. This argument only applies when residual quantiles are used. It +is not applicable with \code{trainer = quantile_reg()}, for example. Typically, one +would only want non-symmetric quantiles when increasing trajectories are +quite different from decreasing ones, such as a strictly positive variable +near zero.} + +\item{nonneg}{Logical. The default \code{TRUE} enforces nonnegative predictions +by hard-thresholding at 0.} + +\item{quantile_by_key}{Character vector. Groups residuals by listed keys +before calculating residual quantiles. See the \code{by_key} argument to +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} for more information. The default, +\code{character(0)} performs no grouping. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{...}{Further arguments passed to or from other methods (not currently +used).} +} +\value{ +A list containing updated parameter choices with class \code{climate_alist}. 
+} +\description{ +Climatological forecaster argument constructor +} +\examples{ +climate_args_list() +climate_args_list( + forecast_horizon = 0:10, + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99) +) + +} +\seealso{ +\code{\link[=climatological_forecaster]{climatological_forecaster()}}, \code{\link[=step_climate]{step_climate()}} +} diff --git a/man/climatological_forecaster.Rd b/man/climatological_forecaster.Rd new file mode 100644 index 000000000..d773ebce6 --- /dev/null +++ b/man/climatological_forecaster.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/climatological_forecaster.R +\name{climatological_forecaster} +\alias{climatological_forecaster} +\title{Climatological forecaster} +\usage{ +climatological_forecaster(epi_data, outcome, args_list = climate_args_list()) +} +\arguments{ +\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}} + +\item{outcome}{A scalar character for the column name we wish to predict.} + +\item{args_list}{A list of additional arguments as created by the +\code{\link[=climate_args_list]{climate_args_list()}} constructor function.} +} +\value{ +A data frame of point and interval forecasts at all horizons +for each unique combination of \code{key_vars}. +} +\description{ +This is another "baseline" type forecaster, but it is especially appropriate +for strongly seasonal diseases (e.g., influenza). The idea is to predict +the "typical season" by summarizing over all available history in the +\code{epi_data}. This is analogous to a "climate" forecast rather than a "weather" +forecast, essentially predicting "typical January" behavior by relying on a +long history of such periods rather than heavily using recent data. 
+} +\details{ +The point forecast is either the mean or median of the \code{outcome} in a small +window around the target period, computed over the entire available history, +separately for each key in the \code{epi_df} (\code{geo_value} and any additional keys). +The forecast quantiles are computed from the residuals for this point prediction. +By default, the residuals are ungrouped, meaning every key will have the same +shape distribution (though different centers). Note that if your data is not +on comparable scales across keys, this default is likely inappropriate. In that +case, you can choose by which keys quantiles are computed using +\code{climate_args_list(quantile_by_key = ...)}. +} +\examples{ +cases <- cases_deaths_subset +# set as_of to the last day in the data +# "case_rate_7d_av" is on the same scale for all geographies +attr(cases, "metadata")$as_of <- as.Date("2021-12-31") +fcast <- climatological_forecaster(cases, "case_rate_7d_av") +autoplot(fcast) + +# Compute quantiles separately by location, and a backcast +# "cases" is on different scales by geography, due to population size +# so, it is better to compute quantiles separately +backcast <- climatological_forecaster( + cases, "case_rate_7d_av", + climate_args_list( + quantile_by_key = "geo_value", + forecast_date = as.Date("2021-06-01") + ) +) +autoplot(backcast) + +# compute the climate "daily" rather than "weekly" +# use a two week window (on both sides) +# "cases" is on different scales by geography, due to population size +daily_fcast <- climatological_forecaster( + cases, "cases", + climate_args_list( + quantile_by_key = "geo_value", + time_type = "day", + window_size = 14L, + forecast_horizon = 0:30 + ) +) +autoplot(daily_fcast) + + ggplot2::coord_cartesian(xlim = c(as.Date("2021-10-01"), NA)) +} +\seealso{ +\code{\link[=step_climate]{step_climate()}} +} diff --git a/man/construct_shift_tibble.Rd b/man/construct_shift_tibble.Rd new file mode 100644 index 000000000..619583f1d --- 
/dev/null +++ b/man/construct_shift_tibble.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{construct_shift_tibble} +\alias{construct_shift_tibble} +\title{create a table of the columns to modify, their shifts, and their prefixes} +\usage{ +construct_shift_tibble(terms_used, recipe, rel_step_type, shift_name) +} +\description{ +create a table of the columns to modify, their shifts, and their prefixes +} +\keyword{internal} diff --git a/man/count_single_column.Rd b/man/count_single_column.Rd new file mode 100644 index 000000000..7922511e7 --- /dev/null +++ b/man/count_single_column.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{count_single_column} +\alias{count_single_column} +\title{get the location of the last real value} +\usage{ +count_single_column(col) +} +\arguments{ +\item{col}{the relevant column} +} +\description{ +get the location of the last real value +} +\keyword{internal} diff --git a/man/dist_quantiles.Rd b/man/dist_quantiles.Rd index 1a3226e36..1bfa437b0 100644 --- a/man/dist_quantiles.Rd +++ b/man/dist_quantiles.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R +% Please edit documentation in R/quantile_pred-methods.R \name{dist_quantiles} \alias{dist_quantiles} \title{A distribution parameterized by a set of quantiles} @@ -19,19 +19,10 @@ different distributions, the sizes must match. See the examples below.} A vector of class \code{"distribution"}. 
} \description{ -A distribution parameterized by a set of quantiles +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} } -\examples{ -dist_quantiles(1:4, 1:4 / 5) -dist_quantiles(list(1:3, 1:4), list(1:3 / 4, 1:4 / 5)) -dstn <- dist_quantiles(list(1:4, 8:11), c(.2, .4, .6, .8)) -dstn - -quantile(dstn, p = c(.1, .25, .5, .9)) -median(dstn) - -# it's a bit annoying to inspect the data -distributional::parameters(dstn[1]) -nested_quantiles(dstn[1])[[1]] - +\details{ +This function is deprecated. The recommended alternative is +\code{\link[hardhat:quantile_pred]{hardhat::quantile_pred()}}. } +\keyword{internal} diff --git a/man/drop_ignored_keys.Rd b/man/drop_ignored_keys.Rd new file mode 100644 index 000000000..6adeb9983 --- /dev/null +++ b/man/drop_ignored_keys.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{drop_ignored_keys} +\alias{drop_ignored_keys} +\title{given a list named by key columns, remove any matching key values +keys_to_ignore should have the form list(col_name = c("value_to_ignore", "other_value_to_ignore"))} +\usage{ +drop_ignored_keys(training, keys_to_ignore) +} +\description{ +given a list named by key columns, remove any matching key values +keys_to_ignore should have the form list(col_name = c("value_to_ignore", "other_value_to_ignore")) +} +\keyword{internal} diff --git a/man/epi_recipe.Rd b/man/epi_recipe.Rd index d0105d1ec..fefd162e1 100644 --- a/man/epi_recipe.Rd +++ b/man/epi_recipe.Rd @@ -9,19 +9,37 @@ \usage{ epi_recipe(x, ...) -\method{epi_recipe}{default}(x, ...) +\method{epi_recipe}{epi_df}( + x, + reference_date = NULL, + formula = NULL, + ..., + vars = NULL, + roles = NULL +) -\method{epi_recipe}{epi_df}(x, formula = NULL, ..., vars = NULL, roles = NULL) - -\method{epi_recipe}{formula}(formula, data, ...) 
+\method{epi_recipe}{formula}(formula, data, reference_date = NULL, ...) } \arguments{ -\item{x, data}{A data frame, tibble, or epi_df of the \emph{template} data set -(see below). This is always coerced to the first row to avoid memory issues} +\item{x, data}{An epi_df of the \emph{template} data set (see below).} \item{...}{Further arguments passed to or from other methods (not currently used).} +\item{reference_date}{Either a date of the same class as the \code{time_value} +column in the \code{epi_df} or \code{NULL}. If a date, it gives the date to which all +operations are relative. Typically, in real-time tasks this is the date that +the model is created (and presumably trained). In forecasting, this is +often the same as the most recent date of +data availability, but when data is "latent" (reported after the date to +which it corresponds), or if performing a nowcast, the \code{reference_date} may +be later than this. Setting \code{reference_date} +to a value BEFORE the most recent data is not a true "forecast", +because future data is being used to create the model, but this may be +reasonable in model building, nowcasting (predicting finalized values from +preliminary data), or if producing a backcast. If \code{NULL}, it will be set +to the \code{as_of} date of the \code{epi_df}.} + \item{formula}{A model formula. No in-line functions should be used here (e.g. \code{log(x)}, \code{x:y}, etc.) and minus signs are not allowed. These types of transformations should be enacted using \code{step} functions in this package. @@ -38,17 +56,17 @@ anything but common roles are \code{"outcome"}, \code{"predictor"}, } \value{ An object of class \code{recipe} with sub-objects: -\item{var_info}{A tibble containing information about the original data -set columns} +\item{var_info}{A tibble containing information about the original data set +columns.} \item{term_info}{A tibble that contains the current set of terms in the data set. 
This initially defaults to the same data contained in \code{var_info}.} -\item{steps}{A list of \code{step} or \code{check} objects that define the sequence of -preprocessing operations that will be applied to data. The default value is -\code{NULL}} -\item{template}{A tibble of the data. This is initialized to be the same -as the data given in the \code{data} argument but can be different after -the recipe is trained.} +\item{steps}{A list of \code{step} or \code{check} objects that define the sequence +of preprocessing operations that will be applied to data. The default value +is \code{NULL}.} +\item{template}{A tibble of the data. This is initialized to be the same as +the data given in the \code{data} argument but can be different after the recipe +is trained.} } \description{ A recipe is a description of the steps to be applied to a data set in @@ -57,19 +75,14 @@ around \code{\link[recipes:recipe]{recipes::recipe()}} to properly handle the ad columns present in an \code{epi_df} } \examples{ -library(dplyr) -library(recipes) -jhu <- case_death_rate_subset \%>\% - filter(time_value > "2021-08-01") \%>\% - arrange(geo_value, time_value) +jhu <- covid_case_death_rates \%>\% + filter(time_value > "2021-08-01") r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% - step_naomit(all_predictors()) \%>\% - # below, `skip` means we don't do this at predict time - step_naomit(all_outcomes(), skip = TRUE) + step_epi_naomit() r } diff --git a/man/epi_shift.Rd b/man/epi_shift.Rd deleted file mode 100644 index 14316a8db..000000000 --- a/man/epi_shift.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_shift.R -\name{epi_shift} -\alias{epi_shift} -\title{Shift predictors while maintaining grouping and time_value ordering} -\usage{ -epi_shift(x, shifts, time_value, keys = NULL, out_name = "x") -} 
-\arguments{ -\item{x}{Data frame. Variables to shift} - -\item{shifts}{List. Each list element is a vector of shifts. -Negative values produce leads. The list should have the same -length as the number of columns in \code{x}.} - -\item{time_value}{Vector. Same length as \code{x} giving time stamps.} - -\item{keys}{Data frame, vector, or \code{NULL}. Additional grouping vars.} - -\item{out_name}{Chr. The output list will use this as a prefix.} -} -\value{ -a list of tibbles -} -\description{ -This is a lower-level function. As such it performs no error checking. -} -\keyword{internal} diff --git a/man/epi_shift_single.Rd b/man/epi_shift_single.Rd new file mode 100644 index 000000000..871879004 --- /dev/null +++ b/man/epi_shift_single.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{epi_shift_single} +\alias{epi_shift_single} +\title{Shift predictors while maintaining grouping and time_value ordering} +\usage{ +epi_shift_single(x, col, shift_val, newname, key_cols) +} +\arguments{ +\item{x}{Data frame.} + +\item{shift_val}{a single integer. Negative values produce leads.} + +\item{newname}{the name for the newly shifted column} + +\item{key_cols}{vector, or \code{NULL}. Additional grouping vars.} +} +\value{ +a list of tibbles +} +\description{ +This is a lower-level function. As such it performs no error checking. +} +\keyword{internal} diff --git a/man/epi_workflow.Rd b/man/epi_workflow.Rd index b29078d52..c84626363 100644 --- a/man/epi_workflow.Rd +++ b/man/epi_workflow.Rd @@ -25,15 +25,15 @@ A new \code{epi_workflow} object. } \description{ This is a container object that unifies preprocessing, fitting, prediction, -and postprocessing for predictive modeling on epidemiological data. It extends -the functionality of a \code{\link[workflows:workflow]{workflows::workflow()}} to handle the typical panel -data structures found in this field. 
This extension is handled completely -internally, and should be invisible to the user. For all intents and purposes, -this operates exactly like a \code{\link[workflows:workflow]{workflows::workflow()}}. For more details -and numerous examples, see there. +and post-processing for predictive modeling on epidemiological data. It +extends the functionality of a \code{\link[workflows:workflow]{workflows::workflow()}} to handle the typical +panel data structures found in this field. This extension is handled +completely internally, and should be invisible to the user. For all intents +and purposes, this operates exactly like a \code{\link[workflows:workflow]{workflows::workflow()}}. For some +\code{{epipredict}} specific examples, see the \href{../articles/custom_epiworkflows.html}{custom epiworkflows vignette}. } \examples{ -jhu <- case_death_rate_subset +jhu <- covid_case_death_rates r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% @@ -46,5 +46,5 @@ wf <- epi_workflow(r, parsnip::linear_reg()) wf } \seealso{ -workflows::workflow +\code{\link[workflows:workflow]{workflows::workflow()}} } diff --git a/man/epiweek_leap.Rd b/man/epiweek_leap.Rd new file mode 100644 index 000000000..4fecbf2ca --- /dev/null +++ b/man/epiweek_leap.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_climate.R +\name{epiweek_leap} +\alias{epiweek_leap} +\title{epiweek, but it assigns week 53 the value of 999 instead so it mirrors the assignments in yday_leap} +\usage{ +epiweek_leap(time_value) +} +\description{ +epiweek, but it assigns week 53 the value of 999 instead so it mirrors the assignments in yday_leap +} +\keyword{internal} diff --git a/man/extract_argument.Rd b/man/extract_argument.Rd index 69c610c98..a276d59a6 100644 --- a/man/extract_argument.Rd +++ b/man/extract_argument.Rd @@ -24,7 +24,7 @@ Extract an argument made to a frosting layer or recipe step \examples{ f <- frosting() \%>\% layer_predict() 
\%>\% - layer_residual_quantiles(quantile_levels = c(0.0275, 0.975), symmetrize = FALSE) \%>\% + layer_residual_quantiles(symmetrize = FALSE) \%>\% layer_naomit(.pred) extract_argument(f, "layer_residual_quantiles", "symmetrize") diff --git a/man/extrapolate_quantiles.Rd b/man/extrapolate_quantiles.Rd index 4b1d1282c..68def9b2e 100644 --- a/man/extrapolate_quantiles.Rd +++ b/man/extrapolate_quantiles.Rd @@ -2,42 +2,50 @@ % Please edit documentation in R/extrapolate_quantiles.R \name{extrapolate_quantiles} \alias{extrapolate_quantiles} -\title{Summarize a distribution with a set of quantiles} +\title{Extrapolate the quantiles to new quantile levels} \usage{ extrapolate_quantiles(x, probs, replace_na = TRUE, ...) } \arguments{ -\item{x}{a \code{distribution} vector} +\item{x}{A vector of class \code{quantile_pred}.} \item{probs}{a vector of probabilities at which to calculate quantiles} \item{replace_na}{logical. If \code{x} contains \code{NA}'s, these are imputed if -possible (if \code{TRUE}) or retained (if \code{FALSE}). This only effects -elements of class \code{dist_quantiles}.} +possible (if \code{TRUE}) or retained (if \code{FALSE}).} \item{...}{additional arguments passed on to the \code{quantile} method} } \value{ -a \code{distribution} vector containing \code{dist_quantiles}. Any elements -of \code{x} which were originally \code{dist_quantiles} will now have a superset +a \code{quantile_pred} vector. Each element +of \code{x} will now have a superset of the original \code{quantile_values} (the union of those and \code{probs}). } \description{ -Summarize a distribution with a set of quantiles +This both interpolates between quantile levels already defined in \code{x} and +extrapolates quantiles outside their bounds. The interpolation method is +determined by the \code{quantile} argument \code{middle}, which can be either \code{"cubic"} +for a (Hyman) cubic spline interpolation, or \code{"linear"} for simple linear +interpolation. 
} -\examples{ -library(distributional) -dstn <- dist_normal(c(10, 2), c(5, 10)) -extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) +\details{ +There is only one extrapolation method for values greater than the largest +available quantile level or smaller than the smallest available quantile +level. It assumes a roughly exponential tail, whose decay rate and offset is +derived from the slope of the two most extreme quantile levels on a logistic +scale. See the internal function \code{tail_extrapolate()} for the exact +implementation. -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) -# because this distribution is already quantiles, any extra quantiles are -# appended -extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) +This function takes a \code{quantile_pred} vector and returns the same +type of object, expanded to include +\emph{additional} quantiles computed at \code{probs}. If you want behaviour more +similar to \code{\link[stats:quantile]{stats::quantile()}}, then \code{quantile(x,...)} may be more +appropriate. 
+} +\examples{ +dstn <- quantile_pred(rbind(1:4, 8:11), c(.2, .4, .6, .8)) +# extra quantiles are appended +as_tibble(extrapolate_quantiles(dstn, probs = c(0.25, 0.5, 0.75))) -dstn <- c( - dist_normal(c(10, 2), c(5, 10)), - dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) -) -extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) +extrapolate_quantiles(dstn, probs = c(0.0001, 0.25, 0.5, 0.75, 0.99999)) } diff --git a/man/figures/README-date-1.png b/man/figures/README-date-1.png new file mode 100644 index 000000000..b66ec04c8 Binary files /dev/null and b/man/figures/README-date-1.png differ diff --git a/man/figures/README-show-processed-data-1.png b/man/figures/README-show-processed-data-1.png new file mode 100644 index 000000000..e312c9fd4 Binary files /dev/null and b/man/figures/README-show-processed-data-1.png differ diff --git a/man/figures/README-show-single-forecast-1.png b/man/figures/README-show-single-forecast-1.png new file mode 100644 index 000000000..8ec4f90a1 Binary files /dev/null and b/man/figures/README-show-single-forecast-1.png differ diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index 3dfa0029a..ef0f4ff40 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -22,13 +22,13 @@ The \code{epi_workflow} object, updated with a fit parsnip model in the \code{object$fit$fit} slot. } \description{ -This is the \code{fit()} method for an \code{epi_workflow} object that +This is the \code{fit()} method for an \code{epi_workflow()} object that estimates parameters for a given model from a set of data. -Fitting an \code{epi_workflow} involves two main steps, which are +Fitting an \code{epi_workflow()} involves two main steps, which are preprocessing the data and fitting the underlying parsnip model. 
} \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% @@ -40,5 +40,5 @@ wf } \seealso{ -workflows::fit-workflow +\code{\link[workflows:fit-workflow]{workflows::fit-workflow()}} } diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 059dfa038..368b8d141 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -9,11 +9,10 @@ flatline_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, - quantile_levels = c(0.05, 0.95), + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, ... ) } @@ -29,20 +28,28 @@ So for example, \code{ahead = 7} will create residuals by comparing values key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of training residuals. 
A \code{NULL} value will result in point forecasts only.} -\item{symmetrize}{Logical. The default \code{TRUE} calculates -symmetric prediction intervals. This argument only applies when -residual quantiles are used. It is not applicable with -\code{trainer = quantile_reg()}, for example.} +\item{symmetrize}{Logical. The default \code{TRUE} calculates symmetric prediction +intervals. This argument only applies when residual quantiles are used. It +is not applicable with \code{trainer = quantile_reg()}, for example. Typically, one +would only want non-symmetric quantiles when increasing trajectories are +quite different from decreasing ones, such as a strictly positive variable +near zero.} \item{nonneg}{Logical. The default \code{TRUE} enforces nonnegative predictions by hard-thresholding at 0.} @@ -54,16 +61,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. 
For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{...}{Space to handle future expansions (unused).} } \value{ diff --git a/man/flatline_forecaster.Rd b/man/flatline_forecaster.Rd index 1803f1078..cc789bac2 100644 --- a/man/flatline_forecaster.Rd +++ b/man/flatline_forecaster.Rd @@ -11,7 +11,7 @@ flatline_forecaster(epi_data, outcome, args_list = flatline_args_list()) \item{outcome}{A scalar character for the column name we wish to predict.} -\item{args_list}{A list of dditional arguments as created by the +\item{args_list}{A list of additional arguments as created by the \code{\link[=flatline_args_list]{flatline_args_list()}} constructor function.} } \value{ @@ -21,12 +21,10 @@ ahead (unique horizon) for each unique combination of \code{key_vars}. \description{ This is a simple forecasting model for \link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent -observation as the -forecast for any future date, and produces intervals based on the quantiles -of the residuals of such a "flatline" forecast over all available training -data. -} -\details{ +observation as the forecast for any future date, and produces intervals +based on the quantiles of the residuals of such a "flatline" forecast over +all available training data. + By default, the predictive intervals are computed separately for each combination of key values (\code{geo_value} + any additional keys) in the \code{epi_data} argument. @@ -34,8 +32,31 @@ combination of key values (\code{geo_value} + any additional keys) in the This forecaster is very similar to that used by the \href{https://covid19forecasthub.org}{COVID19ForecastHub} } +\details{ +Here is (roughly) the code for the \code{flatline_forecaster()} applied to the +\code{case_rate} for \code{epidatasets::covid_case_death_rates}. + +\if{html}{\out{
}}\preformatted{jhu <- covid_case_death_rates \%>\% + filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) +r <- epi_recipe(covid_case_death_rates) \%>\% + step_epi_ahead(case_rate, ahead = 7, skip = TRUE) \%>\% + recipes::update_role(case_rate, new_role = "predictor") \%>\% + recipes::add_role(all_of(key_colnames(jhu)), new_role = "predictor") + +f <- frosting() \%>\% + layer_predict() \%>\% + layer_residual_quantiles() \%>\% + layer_add_forecast_date() \%>\% + layer_add_target_date() \%>\% + layer_threshold(starts_with(".pred")) + +eng <- linear_reg() \%>\% set_engine("flatline") +wf <- epi_workflow(r, eng, f) \%>\% fit(jhu) +preds <- forecast(wf) +}\if{html}{\out{
}} +} \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) out <- flatline_forecaster(jhu, "death_rate") diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd index b2be9b4fe..2c283d7b5 100644 --- a/man/flusight_hub_formatter.Rd +++ b/man/flusight_hub_formatter.Rd @@ -42,7 +42,8 @@ format for this forecast task is \href{https://github.com/cdcepi/FluSight-foreca } \examples{ library(dplyr) -weekly_deaths <- case_death_rate_subset \%>\% +library(epiprocess) +weekly_deaths <- covid_case_death_rates \%>\% filter( time_value >= as.Date("2021-09-01"), geo_value \%in\% c("ca", "ny", "dc", "ga", "vt") diff --git a/man/forecast.epi_workflow.Rd b/man/forecast.epi_workflow.Rd index b9f6870b8..1792fe1f9 100644 --- a/man/forecast.epi_workflow.Rd +++ b/man/forecast.epi_workflow.Rd @@ -2,31 +2,33 @@ % Please edit documentation in R/epi_workflow.R \name{forecast.epi_workflow} \alias{forecast.epi_workflow} -\title{Produce a forecast from an epi workflow} +\title{Produce a forecast from an epi workflow and its training data} \usage{ -\method{forecast}{epi_workflow}(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) +\method{forecast}{epi_workflow}(object, ...) } \arguments{ \item{object}{An epi workflow.} \item{...}{Not used.} - -\item{fill_locf}{Logical. Should we use locf to fill in missing data?} - -\item{n_recent}{Integer or NULL. If filling missing data with locf = TRUE, -how far back are we willing to tolerate missing data? Larger values allow -more filling. The default NULL will determine this from the the recipe. For -example, suppose n_recent = 3, then if the 3 most recent observations in any -geo_value are all NA’s, we won’t be able to fill anything, and an error -message will be thrown. (See details.)} - -\item{forecast_date}{By default, this is set to the maximum time_value in x. 
-But if there is data latency such that recent NA's should be filled, this may -be after the last available time_value.} } \value{ A forecast tibble. } \description{ -Produce a forecast from an epi workflow +\code{forecast.epi_workflow} predicts by restricting the training data to the +latest available data, and predicting on that. It binds together +\code{get_test_data()} and \code{predict()}. +} +\examples{ +jhu <- covid_case_death_rates \%>\% + filter(time_value > "2021-08-01") + +r <- epi_recipe(jhu) \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() + +epi_workflow(r, parsnip::linear_reg()) \%>\% + fit(jhu) \%>\% + forecast() } diff --git a/man/format_varnames.Rd b/man/format_varnames.Rd new file mode 100644 index 000000000..59fe218af --- /dev/null +++ b/man/format_varnames.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-misc.R +\name{format_varnames} +\alias{format_varnames} +\title{"Format" a character vector of column/variable names for cli interpolation} +\usage{ +format_varnames(x, empty = "*none*") +} +\arguments{ +\item{x}{\code{chr}; e.g., \code{colnames} of some data frame} + +\item{empty}{string; what should be output if \code{x} is of length 0?} +} +\value{ +\code{chr} +} +\description{ +Designed to give good output if interpolated with cli. Main purpose is to add +backticks around variable names when necessary, and something other than an +empty string if length 0. 
+} +\keyword{internal} diff --git a/man/frosting.Rd b/man/frosting.Rd index a75f21b61..c00875125 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/frosting.R \name{frosting} \alias{frosting} -\title{Create frosting for postprocessing predictions} +\title{Create frosting for post-processing predictions} \usage{ frosting(layers = NULL, requirements = NULL) } @@ -15,20 +15,19 @@ frosting(layers = NULL, requirements = NULL) A frosting object. } \description{ -This generates a postprocessing container (much like \code{recipes::recipe()}) -to hold steps for postprocessing predictions. +This generates a post-processing container (much like \code{recipes::recipe()}) +to hold steps for post-processing predictions. } \details{ The arguments are currently placeholders and must be NULL } \examples{ -library(dplyr) -# Toy example to show that frosting can be created and added for postprocessing +# Toy example to show that frosting can be created and added for post-processing f <- frosting() wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/get_forecast_date.Rd b/man/get_forecast_date.Rd new file mode 100644 index 000000000..6a35fff04 --- /dev/null +++ b/man/get_forecast_date.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_forecast_date} +\alias{get_forecast_date} +\title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.} +\usage{ +get_forecast_date(new_data, info, epi_keys_checked, latency, columns = NULL) +} +\description{ +Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
+} +\keyword{internal} diff --git a/man/get_forecast_date_in_layer.Rd b/man/get_forecast_date_in_layer.Rd new file mode 100644 index 000000000..c866a88e7 --- /dev/null +++ b/man/get_forecast_date_in_layer.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_forecast_date_in_layer} +\alias{get_forecast_date_in_layer} +\title{get the target date while in a layer} +\usage{ +get_forecast_date_in_layer(this_recipe, workflow_max_time_value, new_data) +} +\arguments{ +\item{this_recipe}{the recipe to check for \code{step_adjust_latency}} + +\item{workflow_max_time_value}{the \code{max_time} value coming out of the fit +workflow (this will be the maximal time value in a potentially different +dataset)} + +\item{new_data}{the data we're currently working with, from which we'll take +a potentially different max_time_value} +} +\description{ +get the target date while in a layer +} +\keyword{internal} diff --git a/man/get_latency.Rd b/man/get_latency.Rd new file mode 100644 index 000000000..5d6d7190f --- /dev/null +++ b/man/get_latency.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_latency} +\alias{get_latency} +\title{the latency is also the amount the shift is off by} +\usage{ +get_latency(new_data, forecast_date, column, sign_shift, epi_keys_checked) +} +\arguments{ +\item{sign_shift}{integer. 1 if lag and -1 if ahead. 
These represent how you +need to shift the data to bring the 3 day lagged value to today.} +} +\description{ +the latency is also the amount the shift is off by +} +\keyword{internal} diff --git a/man/get_latency_table.Rd b/man/get_latency_table.Rd new file mode 100644 index 000000000..853918b23 --- /dev/null +++ b/man/get_latency_table.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_latency_table} +\alias{get_latency_table} +\title{create the latency table +This is a table of column names and the latency adjustment necessary for that column. An example:} +\usage{ +get_latency_table( + training, + columns, + forecast_date, + latency, + sign_shift, + epi_keys_checked, + keys_to_ignore, + info, + terms +) +} +\description{ +col_name latency +\if{html}{\out{}} \if{html}{\out{}} +1 case_rate 5 +2 death_rate 5 +} +\keyword{internal} diff --git a/man/get_sign.Rd b/man/get_sign.Rd new file mode 100644 index 000000000..0be3e6306 --- /dev/null +++ b/man/get_sign.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{get_sign} +\alias{get_sign} +\title{lags move columns forward to bring the past up to today, while aheads drag +the future back to today} +\usage{ +get_sign(object) +} +\description{ +lags move columns forward to bring the past up to today, while aheads drag +the future back to today +} +\keyword{internal} diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index b18685d89..8d99a50a7 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -4,61 +4,38 @@ \alias{get_test_data} \title{Get test data for prediction based on longest lag period} \usage{ -get_test_data( - recipe, - x, - fill_locf = FALSE, - n_recent = NULL, - forecast_date = max(x$time_value) -) +get_test_data(recipe, x) } \arguments{ \item{recipe}{A recipe object.} \item{x}{An epi_df. 
The typical usage is to pass the same data as that used for fitting the recipe.} - -\item{fill_locf}{Logical. Should we use \code{locf} to fill in missing data?} - -\item{n_recent}{Integer or NULL. If filling missing data with \code{locf = TRUE}, -how far back are we willing to tolerate missing data? Larger values allow -more filling. The default \code{NULL} will determine this from the -the \code{recipe}. For example, suppose \code{n_recent = 3}, then if the -3 most recent observations in any \code{geo_value} are all \code{NA}’s, we won’t be -able to fill anything, and an error message will be thrown. (See details.)} - -\item{forecast_date}{By default, this is set to the maximum -\code{time_value} in \code{x}. But if there is data latency such that recent \code{NA}'s -should be filled, this may be \emph{after} the last available \code{time_value}.} } \value{ An object of the same type as \code{x} with columns \code{geo_value}, \code{time_value}, any additional keys, as well other variables in the original dataset. } \description{ -Based on the longest lag period in the recipe, -\code{get_test_data()} creates an \link[epiprocess:epi_df]{epi_df} -with columns \code{geo_value}, \code{time_value} -and other variables in the original dataset, -which will be used to create features necessary to produce forecasts. +If \code{predict()} is given the full training dataset, it will produce a +prediction for every \code{time_value} which has enough data. For most cases, this +generates predictions for \code{time_values} where the \code{outcome} has already been +observed. \code{get_test_data()} is designed to restrict the given dataset to the +minimum amount needed to produce a forecast on the \code{forecast_date} for future +data, rather than a prediction on past \code{time_value}s. Primarily this is +based on the longest lag period in the recipe. 
} \details{ The minimum required (recent) data to produce a forecast is equal to the maximum lag requested (on any predictor) plus the longest horizon used if growth rate calculations are requested by the recipe. This is calculated internally. - -It also optionally fills missing values -using the last-observation-carried-forward (LOCF) method. If this -is not possible (say because there would be only \code{NA}'s in some location), -it will produce an error suggesting alternative options to handle missing -values with more advanced techniques. } \examples{ # create recipe -rec <- epi_recipe(case_death_rate_subset) \%>\% +rec <- epi_recipe(covid_case_death_rates) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) -get_test_data(recipe = rec, x = case_death_rate_subset) +get_test_data(recipe = rec, x = covid_case_death_rates) } diff --git a/man/grad_employ_subset.Rd b/man/grad_employ_subset.Rd deleted file mode 100644 index 46ba36913..000000000 --- a/man/grad_employ_subset.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{grad_employ_subset} -\alias{grad_employ_subset} -\title{Subset of Statistics Canada median employment income for postsecondary graduates} -\format{ -An \link[epiprocess:epi_df]{epiprocess::epi_df} with 10193 rows and 8 variables: -\describe{ -\item{geo_value}{The province in Canada associated with each -row of measurements.} -\item{time_value}{The time value, a year integer in YYYY format} -\item{edu_qual}{The education qualification} -\item{fos}{The field of study} -\item{age_group}{The age group; either 15 to 34 or 35 to 64} -\item{num_graduates}{The number of graduates for the given row of characteristics} -\item{med_income_2y}{The median employment income two years after graduation} -\item{med_income_5y}{The median employment income five years after graduation} -} 
-} -\source{ -This object contains modified data from the following Statistics Canada -data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501}{ -Characteristics and median employment income of longitudinal cohorts of postsecondary -graduates two and five years after graduation, by educational qualification and -field of study (primary groupings) -} - -Modifications: -\itemize{ -\item Only provincial-level geo_values are kept -\item Only age group, field of study, and educational qualification are kept as -covariates. For the remaining covariates, we keep aggregated values and -drop the level-specific rows. -\item No modifications were made to the time range of the data -} -} -\usage{ -grad_employ_subset -} -\description{ -Subset of Statistics Canada median employment income for postsecondary graduates -} -\keyword{datasets} diff --git a/man/grf_quantiles.Rd b/man/grf_quantiles.Rd index e6852a55b..2e4b8bcbb 100644 --- a/man/grf_quantiles.Rd +++ b/man/grf_quantiles.Rd @@ -52,8 +52,8 @@ details, see \href{https://grf-labs.github.io/grf/articles/categorical_inputs.ht #> Model fit template: #> grf::quantile_forest(X = missing_arg(), Y = missing_arg(), mtry = min_cols(~integer(1), #> x), num.trees = integer(1), min.node.size = min_rows(~integer(1), -#> x), quantiles = c(0.1, 0.5, 0.9), num.threads = 1L, seed = stats::runif(1, -#> 0, .Machine$integer.max)) +#> x), quantiles = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), +#> num.threads = 1L, seed = stats::runif(1, 0, .Machine$integer.max)) }\if{html}{\out{}} } @@ -84,8 +84,7 @@ predict(out, new_data = tib[1:5, ]) \%>\% # -- a more complicated task -library(dplyr) -dat <- case_death_rate_subset \%>\% +dat <- covid_case_death_rates \%>\% filter(time_value > as.Date("2021-10-01")) rec <- epi_recipe(dat) \%>\% step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/isoweek_leap.Rd b/man/isoweek_leap.Rd new file mode 100644 index 000000000..355d27b1e --- /dev/null +++ 
b/man/isoweek_leap.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_climate.R +\name{isoweek_leap} +\alias{isoweek_leap} +\title{isoweek, but it assigns week 53 the value of 999 instead so it mirrors the assignments in yday_leap} +\usage{ +isoweek_leap(time_value) +} +\description{ +isoweek, but it assigns week 53 the value of 999 instead so it mirrors the assignments in yday_leap +} +\keyword{internal} diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index e27f2bacd..6370962cc 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/layer_add_forecast_date.R \name{layer_add_forecast_date} \alias{layer_add_forecast_date} -\title{Postprocessing step to add the forecast date} +\title{Post-processing step to add the forecast date} \usage{ layer_add_forecast_date( frosting, @@ -14,10 +14,12 @@ layer_add_forecast_date( \item{frosting}{a \code{frosting} postprocessor} \item{forecast_date}{The forecast date to add as a column to the \code{epi_df}. -For most cases, this should be specified in the form "yyyy-mm-dd". Note that -when the forecast date is left unspecified, it is set to the maximum time -value from the data used in pre-processing, fitting the model, and -postprocessing.} +For most cases, this should be specified in the form "yyyy-mm-dd". Note +that when the forecast date is left unspecified, it is set to one of two +values. If there is a \code{step_adjust_latency} step present, it uses the +\code{forecast_date} as set in that function. 
Otherwise, it uses the maximum +\code{time_value} across the data used for pre-processing, fitting the model, +and post-processing.} \item{id}{a random id string} } @@ -25,19 +27,18 @@ postprocessing.} an updated \code{frosting} postprocessor } \description{ -Postprocessing step to add the forecast date +Post-processing step to add the forecast date } \details{ To use this function, either specify a forecast date or leave the forecast date unspecifed here. In the latter case, the forecast date will be set as the maximum time value from the data used in pre-processing, -fitting the model, and postprocessing. In any case, when the forecast date is +fitting the model, and post-processing. In any case, when the forecast date is less than the maximum \code{as_of} value (from the data used pre-processing, -model fitting, and postprocessing), an appropriate warning will be thrown. +model fitting, and post-processing), an appropriate warning will be thrown. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index dc0d2f190..f10178898 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/layer_add_target_date.R \name{layer_add_target_date} \alias{layer_add_target_date} -\title{Postprocessing step to add the target date} +\title{Post-processing step to add the target date} \usage{ layer_add_target_date( frosting, @@ -13,14 +13,14 @@ layer_add_target_date( \arguments{ \item{frosting}{a \code{frosting} postprocessor} -\item{target_date}{The target date to add as a column to the -\code{epi_df}. 
If there's a forecast date specified in a layer, then -it is the forecast date plus \code{ahead} (from \code{step_epi_ahead} in -the \code{epi_recipe}). Otherwise, it is the maximum \code{time_value} -(from the data used in pre-processing, fitting the model, and -postprocessing) plus \code{ahead}, where \code{ahead} has been specified in -preprocessing. The user may override these by specifying a -target date of their own (of the form "yyyy-mm-dd").} +\item{target_date}{The target date to add as a column to the \code{epi_df}. If +there's a forecast date specified upstream (either in a +\code{step_adjust_latency} or in a \code{layer_forecast_date}), then it is the +forecast date plus \code{ahead} (from \code{step_epi_ahead} in the \code{epi_recipe}). +Otherwise, it is the maximum \code{time_value} (from the data used in +pre-processing, fitting the model, and post-processing) plus \code{ahead}, where +\code{ahead} has been specified in preprocessing. The user may override these by +specifying a target date of their own (of the form "yyyy-mm-dd").} \item{id}{a random id string} } @@ -28,17 +28,23 @@ target date of their own (of the form "yyyy-mm-dd").} an updated \code{frosting} postprocessor } \description{ -Postprocessing step to add the target date +Post-processing step to add the target date } \details{ -By default, this function assumes that a value for \code{ahead} -has been specified in a preprocessing step (most likely in -\code{step_epi_ahead}). Then, \code{ahead} is added to the maximum \code{time_value} -in the test data to get the target date. +By default, this function assumes that a value for \code{ahead} has been +specified in a preprocessing step (most likely in \code{step_epi_ahead}). Then, +\code{ahead} is added to the \code{forecast_date} in the test data to get the target +date. 
\code{forecast_date} itself can be set in 3 ways: +\enumerate{ +\item The default \code{forecast_date} is simply the maximum \code{time_value} over every +dataset used (prep, training, and prediction). +\item If \code{step_adjust_latency} is present, it will typically use the training +\code{epi_df}'s \code{as_of} +\item \code{layer_add_forecast_date}, which inherits from 2 if not manually specified +} } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% @@ -58,8 +64,14 @@ wf1 <- wf \%>\% add_frosting(f) p <- forecast(wf1) p -# Use ahead + max time value from pre, fit, post -# which is the same if include `layer_add_forecast_date()` +# Use ahead + forecast_date from adjust_latency +# setting the `as_of` to something realistic +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +r <- epi_recipe(jhu) \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_adjust_latency(method = "extend_ahead") \%>\% + step_epi_naomit() f2 <- frosting() \%>\% layer_predict() \%>\% layer_add_target_date() \%>\% @@ -69,13 +81,24 @@ wf2 <- wf \%>\% add_frosting(f2) p2 <- forecast(wf2) p2 -# Specify own target date +# Use ahead + max time value from pre, fit, post +# which is the same if include `layer_add_forecast_date()` f3 <- frosting() \%>\% layer_predict() \%>\% - layer_add_target_date(target_date = "2022-01-08") \%>\% + layer_add_target_date() \%>\% layer_naomit(.pred) wf3 <- wf \%>\% add_frosting(f3) -p3 <- forecast(wf3) -p3 +p3 <- forecast(wf3) +p3 + +# Specify own target date +f4 <- frosting() \%>\% + layer_predict() \%>\% + layer_add_target_date(target_date = "2022-01-08") \%>\% + layer_naomit(.pred) +wf4 <- wf \%>\% add_frosting(f4) + +p4 <- forecast(wf4) +p4 } diff --git a/man/layer_cdc_flatline_quantiles.Rd 
b/man/layer_cdc_flatline_quantiles.Rd index c3bc4f257..12b173da1 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -62,7 +62,7 @@ an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict( in an additional \verb{} named \code{.pred_distn_all} containing 2-column \code{\link[tibble:tibble]{tibble::tibble()}}'s. For each desired combination of \code{key}'s, the tibble will contain one row per ahead -with the associated \code{\link[=dist_quantiles]{dist_quantiles()}}. +with the associated \code{\link[=quantile_pred]{quantile_pred()}}. } \description{ This layer creates quantile forecasts by taking a sample from the @@ -84,15 +84,15 @@ the future. This version continues to use the same set of residuals, and adds them on to produce wider intervals as \code{ahead} increases. } \examples{ -library(dplyr) -r <- epi_recipe(case_death_rate_subset) \%>\% +library(recipes) +r <- epi_recipe(covid_case_death_rates) \%>\% # data is "daily", so we fit this to 1 ahead, the result will contain # 1 day ahead residuals step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% - recipes::update_role(death_rate, new_role = "predictor") \%>\% - recipes::add_role(time_value, geo_value, new_role = "predictor") + update_role(death_rate, new_role = "predictor") \%>\% + add_role(time_value, geo_value, new_role = "predictor") -forecast_date <- max(case_death_rate_subset$time_value) +forecast_date <- max(covid_case_death_rates$time_value) f <- frosting() \%>\% layer_predict() \%>\% @@ -100,7 +100,7 @@ f <- frosting() \%>\% eng <- linear_reg(engine = "flatline") -wf <- epi_workflow(r, eng, f) \%>\% fit(case_death_rate_subset) +wf <- epi_workflow(r, eng, f) \%>\% fit(covid_case_death_rates) preds <- forecast(wf) \%>\% select(-time_value) \%>\% mutate(forecast_date = forecast_date) @@ -111,22 +111,21 @@ preds <- preds \%>\% pivot_quantiles_wider(.pred_distn) \%>\% mutate(target_date = forecast_date + ahead) -if (require("ggplot2")) 
{ - four_states <- c("ca", "pa", "wa", "ny") - preds \%>\% - filter(geo_value \%in\% four_states) \%>\% - ggplot(aes(target_date)) + - geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + - geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + - geom_line(aes(y = .pred), color = "orange") + - geom_line( - data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), - aes(x = time_value, y = death_rate) - ) + - scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + - labs(x = "Date", y = "Death rate") + - facet_wrap(~geo_value, scales = "free_y") + - theme_bw() + - geom_vline(xintercept = forecast_date) -} +library(ggplot2) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = covid_case_death_rates \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Death rate") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) } diff --git a/man/layer_naomit.Rd b/man/layer_naomit.Rd index d77112f95..eea81959f 100644 --- a/man/layer_naomit.Rd +++ b/man/layer_naomit.Rd @@ -24,8 +24,7 @@ an updated \code{frosting} postprocessor Omit \code{NA}s from predictions or other columns } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_point_from_distn.Rd b/man/layer_point_from_distn.Rd index 276f7cb17..3e770e912 100644 --- a/man/layer_point_from_distn.Rd +++ b/man/layer_point_from_distn.Rd @@ -28,14 +28,13 @@ will 
overwrite the \code{.pred} column, removing the distribution information.} an updated \code{frosting} postprocessor. } \description{ -This function adds a postprocessing layer to extract a point forecast from +This function adds a post-processing layer to extract a point forecast from a distributional forecast. NOTE: With default arguments, this will remove information, so one should usually call this AFTER \code{layer_quantile_distn()} or set the \code{name} argument to something specific. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index 253116694..6a08f57cd 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -26,14 +26,20 @@ for this step. See \code{\link[recipes:selections]{recipes::selections()}} for m \item{df}{a data frame that contains the population data to be used for inverting the existing scaling.} -\item{by}{A (possibly named) character vector of variables to join by. +\item{by}{A (possibly named) character vector of variables to join \code{df} onto +the \code{epi_df} by. -If \code{NULL}, the default, the function will perform a natural join, using all -variables in common across the \code{epi_df} produced by the \code{predict()} call -and the user-provided dataset. -If columns in that \code{epi_df} and \code{df} have the same name (and aren't -included in \code{by}), \code{.df} is added to the one from the user-provided data -to disambiguate. +If \code{NULL}, the default, the function will try to infer a reasonable set of +columns. 
First, it will try to join by all variables in the test data with +roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in \code{df}; +these roles are automatically set if you are using an \code{epi_df}, or you can +use, e.g., \code{update_role}. If no such roles are set, it will try to perform a +natural join, using variables in common between the training/test data and +population data. + +If columns in the training/testing data and \code{df} have the same name (and +aren't included in \code{by}), a \code{.df} suffix is added to the one from the +user-provided data to disambiguate. To join by different variables on the \code{epi_df} and \code{df}, use a named vector. For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value} @@ -62,19 +68,18 @@ in the \code{epi_df}.} an updated \code{frosting} postprocessor } \description{ -\code{layer_population_scaling} creates a specification of a frosting layer -that will "undo" per-capita scaling. Typical usage would -load a dataset that contains state-level population, and use it to convert -predictions made from a rate-scale model to raw scale by multiplying by -the population. -Although, it is worth noting that there is nothing special about "population". -The function can be used to scale by any variable. Population is the -standard use case in the epidemiology forecasting scenario. Any value -passed will \emph{multiply} the selected variables while the \code{rate_rescaling} -argument is a common \emph{divisor} of the selected variables. +\code{layer_population_scaling} creates a specification of a frosting layer that +will "undo" per-capita scaling done in \code{step_population_scaling()}. +Typical usage would set \code{df} to be a dataset that contains a list of +population for the \code{geo_value}s, and use it to convert predictions made from +a raw scale model to rate-scale by dividing by the population. 
+Although, it is worth noting that there is nothing special about +"population", and the function can be used to scale by any variable. +Population is the standard use case in the epidemiology forecasting scenario. +Any value passed will \emph{multiply} the selected variables while the +\code{rate_rescaling} argument is a common \emph{divisor} of the selected variables. } \examples{ -library(dplyr) jhu <- cases_deaths_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% select(geo_value, time_value, cases) diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 8ae92f4c8..d678c9fae 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/layer_predict.R \name{layer_predict} \alias{layer_predict} -\title{Prediction layer for postprocessing} +\title{Prediction layer for post-processing} \usage{ layer_predict( frosting, @@ -54,12 +54,11 @@ An updated \code{frosting} object \description{ Implements prediction on a fitted \code{epi_workflow}. One may want different types of prediction, and to potentially apply this after some amount of -postprocessing. This would typically be the first layer in a \code{frosting} +post-processing. This would typically be the first layer in a \code{frosting} postprocessor. 
} \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% @@ -85,5 +84,5 @@ p2 <- predict(wf2, latest) p2 } \seealso{ -\code{parsnip::predict.model_fit()} +\code{\link[parsnip:predict.model_fit]{parsnip::predict.model_fit()}} } diff --git a/man/layer_predictive_distn.Rd b/man/layer_predictive_distn.Rd index 240db5f5b..7f5464513 100644 --- a/man/layer_predictive_distn.Rd +++ b/man/layer_predictive_distn.Rd @@ -31,31 +31,15 @@ an updated \code{frosting} postprocessor with additional columns of the residual quantiles added to the prediction } \description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} +} +\details{ This function calculates an \emph{approximation} to a parametric predictive distribution. Predictive distributions from linear models require \verb{x* (X'X)^\{-1\} x*} -along with the degrees of freedom. This function approximates both. It -should be reasonably accurate for models fit using \code{lm} when the new point -\verb{x*} isn't too far from the bulk of the data. -} -\examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% - filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) - -r <- epi_recipe(jhu) \%>\% - step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_naomit() - -wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) - -f <- frosting() \%>\% - layer_predict() \%>\% - layer_predictive_distn() \%>\% - layer_naomit(.pred) -wf1 <- wf \%>\% add_frosting(f) - -p <- forecast(wf1) -p +along with the degrees of freedom. This function approximates both. It should +be reasonably accurate for models fit using \code{lm} when the new point \verb{x*} +isn't too far from the bulk of the data. 
Outside of that specific case, it is +recommended to use \code{layer_residual_quantiles()}, or if you are working with a +model that produces distributional predictions, use \code{layer_quantile_distn()}. } diff --git a/man/layer_quantile_distn.Rd b/man/layer_quantile_distn.Rd index 68192deee..9ce3cd57d 100644 --- a/man/layer_quantile_distn.Rd +++ b/man/layer_quantile_distn.Rd @@ -7,7 +7,7 @@ layer_quantile_distn( frosting, ..., - quantile_levels = c(0.25, 0.75), + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), truncate = c(-Inf, Inf), name = ".pred_distn", id = rand_id("quantile_distn") @@ -32,6 +32,8 @@ quantiles will be added to the predictions. } \description{ This function calculates quantiles when the prediction was \emph{distributional}. +If the model producing the forecast is not distributional, it is recommended +to use \code{layer_residual_quantiles()} instead. } \details{ Currently, the only distributional modes/engines are @@ -45,8 +47,7 @@ If these engines were used, then this layer will grab out estimated (or extrapolated) quantiles at the requested quantile values. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_residual_quantiles.Rd b/man/layer_residual_quantiles.Rd index 39e1ecfbe..ab19a9511 100644 --- a/man/layer_residual_quantiles.Rd +++ b/man/layer_residual_quantiles.Rd @@ -7,7 +7,7 @@ layer_residual_quantiles( frosting, ..., - quantile_levels = c(0.05, 0.95), + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), symmetrize = TRUE, by_key = character(0L), name = ".pred_distn", @@ -20,9 +20,13 @@ layer_residual_quantiles( \item{...}{Unused, include for consistency with other layers.} \item{quantile_levels}{numeric vector of probabilities with values in (0,1) -referring to the desired quantile.} +referring to the desired quantile. 
Note that 0.5 will always be included +even if left out by the user.} -\item{symmetrize}{logical. If \code{TRUE} then interval will be symmetric.} +\item{symmetrize}{logical. If \code{TRUE} then the interval will be symmetric. +Typically, one would only want non-symmetric quantiles when increasing +trajectories are quite different from decreasing ones, such as a strictly +postive variable near zero.} \item{by_key}{A character vector of keys to group the residuals by before calculating quantiles. The default, \code{c()} performs no grouping.} @@ -36,11 +40,13 @@ an updated \code{frosting} postprocessor with additional columns of the residual quantiles added to the prediction } \description{ -Creates predictions based on residual quantiles +This function calculates predictive quantiles based on the empirical +quantiles of the model's residuals. If the model producing the forecast is +distributional, it is recommended to use \code{layer_residual_quantiles()} +instead, as those will be more accurate. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% @@ -53,7 +59,7 @@ wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) f <- frosting() \%>\% layer_predict() \%>\% layer_residual_quantiles( - quantile_levels = c(0.0275, 0.975), + quantile_levels = c(0.025, 0.975), symmetrize = FALSE ) \%>\% layer_naomit(.pred) diff --git a/man/layer_threshold.Rd b/man/layer_threshold.Rd index 0f4b1dfb7..e89347d9d 100644 --- a/man/layer_threshold.Rd +++ b/man/layer_threshold.Rd @@ -35,14 +35,18 @@ Default value is \code{Inf}.} an updated \code{frosting} postprocessor } \description{ -This postprocessing step is used to set prediction values that are -smaller than the lower threshold or higher than the upper threshold equal -to the threshold values. 
+This post-processing step is used to set prediction values that are smaller +than the lower threshold or higher than the upper threshold equal to the +threshold values. +} +\details{ +Making case count predictions strictly positive is a typical example usage. +It must be called after there is a column containing quantiles. This means at earliest it can be called after \code{layer_predict()} for distributional models, or after \code{layer_residual_quantiles()} for point prediction models. Typical best practice will use \code{starts_with(".pred")} as the variables to threshold. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value < "2021-03-08", geo_value \%in\% c("ak", "ca", "ar")) + r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% @@ -51,7 +55,7 @@ wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) f <- frosting() \%>\% layer_predict() \%>\% - layer_threshold(.pred, lower = 0.180, upper = 0.310) + layer_threshold(starts_with(".pred"), lower = 0.180, upper = 0.310) wf <- wf \%>\% add_frosting(f) p <- forecast(wf) p diff --git a/man/layer_unnest.Rd b/man/layer_unnest.Rd index 3c25608a6..f50ce6ccd 100644 --- a/man/layer_unnest.Rd +++ b/man/layer_unnest.Rd @@ -20,5 +20,40 @@ be used to select a range of variables.} an updated \code{frosting} postprocessor } \description{ -Unnest prediction list-cols +For any model that produces forecasts for multiple outcomes, such as multiple +aheads, the resulting prediction is a list of forecasts inside a column of +the prediction tibble, which may not be desirable. This layer "lengthens" +the result, moving each outcome to a separate row, in the same manner as +\code{tidyr::unnest()} would. At the moment, the only such engine is +\code{smooth_quantile_reg()}. 
+} +\examples{ +jhu <- covid_case_death_rates \%>\% + filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) + +aheads <- 1:7 + +r <- epi_recipe(jhu) \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = aheads) \%>\% + step_epi_naomit() + +wf <- epi_workflow( + r, + smooth_quantile_reg( + quantile_levels = c(.05, .1, .25, .5, .75, .9, .95), + outcome_locations = aheads + ) +) \%>\% + fit(jhu) + +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit() \%>\% + layer_unnest(.pred) + +wf1 <- wf \%>\% add_frosting(f) + +p <- forecast(wf1) +p } diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index b34b718ca..c8c05d94f 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -13,14 +13,19 @@ nested_quantiles(x) a list-col } \description{ -Turn a vector of quantile distributions into a list-col +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} +} +\details{ +This function is deprecated. 
The recommended alternative is +\code{\link[hardhat:quantile_pred]{hardhat::quantile_pred()}} with \code{\link[tibble:as_tibble]{tibble::as_tibble()}} } \examples{ -library(dplyr) -library(tidyr) -edf <- case_death_rate_subset[1:3, ] -edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) +pred_quantile <- quantile_pred(matrix(rnorm(20), 5), c(.2, .4, .6, .8)) +nested_quantiles(pred_quantile) + +pred_quantile \%>\% + as_tibble() \%>\% + tidyr::nest(.by = .row) \%>\% + dplyr::select(-.row) -edf_nested <- edf \%>\% mutate(q = nested_quantiles(q)) -edf_nested \%>\% unnest(q) } diff --git a/man/pad_to_end.Rd b/man/pad_to_end.Rd new file mode 100644 index 000000000..b9cde372e --- /dev/null +++ b/man/pad_to_end.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{pad_to_end} +\alias{pad_to_end} +\title{pad every group at the right interval} +\usage{ +pad_to_end(x, groups, end_date, columns_to_complete = NULL) +} +\arguments{ +\item{x}{an epi_df to be filled forward.} + +\item{groups}{the grouping by which to fill forward} + +\item{end_date}{the date to fill forward to} + +\item{columns_to_complete}{which columns to apply completion to. By default every non-key column of an epi_df} +} +\description{ +Perform last observation carried forward on a group by group basis. It uses +\code{guess_period} to find the appropriate interval to fill-forward by. It +maintains the grouping structure it receives. It does \emph{not} fill any +"interior" \code{NA} values occurring in the data beforehand. 
+} +\keyword{internal} diff --git a/man/pivot_quantiles.Rd b/man/pivot_quantiles.Rd new file mode 100644 index 000000000..fcf8ded67 --- /dev/null +++ b/man/pivot_quantiles.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles} +\alias{pivot_quantiles} +\alias{pivot_quantiles_longer} +\alias{pivot_quantiles_wider} +\title{Pivot a column containing \code{quantile_pred} to explicit rows or columns} +\usage{ +pivot_quantiles_longer(.data, ...) + +pivot_quantiles_wider(.data, ...) +} +\arguments{ +\item{.data}{A data frame, or a data frame extension such as a tibble or +epi_df.} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One unquoted +expression. Variable names can be used as if they +were positions in the data frame. Note that only one variable +can be selected for this operation.} +} +\value{ +An object of the same class as \code{.data}. +} +\description{ +Both functions expand a column of \code{quantile_pred}s into the separate +quantiles. Since each consists of a set of names (quantiles) and values, +these operate analogously with \code{pivot_wider} and \code{pivot_longer}. +} +\details{ +\code{pivot_quantiles_wider} creates a new column for each \code{quantile_level}, with +the values as the corresponding quantile values. When pivoting multiple +columns, the original column name will be used as a prefix. + +Similarly, \code{pivot_quantiles_longer} assigns the selected columns +\code{quantile_level}s in one column and the \code{value}s in another. If multiple +columns are selected, these will be prefixed with the column name. 
+} +\examples{ +d1 <- quantile_pred(rbind(1:3, 2:4), 1:3 / 4) +d2 <- quantile_pred(rbind(2:4, 3:5), 2:4 / 5) +tib <- tibble(g = c("a", "b"), d1 = d1, d2 = d2) + +pivot_quantiles_longer(tib, "d1") +pivot_quantiles_longer(tib, dplyr::ends_with("1")) +pivot_quantiles_longer(tib, d2) + +pivot_quantiles_wider(tib, "d1") +pivot_quantiles_wider(tib, dplyr::ends_with("2")) +pivot_quantiles_wider(tib, d2) +} diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd deleted file mode 100644 index 9879d5d07..000000000 --- a/man/pivot_quantiles_longer.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pivot_quantiles.R -\name{pivot_quantiles_longer} -\alias{pivot_quantiles_longer} -\title{Pivot columns containing \code{dist_quantile} longer} -\usage{ -pivot_quantiles_longer(.data, ..., .ignore_length_check = FALSE) -} -\arguments{ -\item{.data}{A data frame, or a data frame extension such as a tibble or -epi_df.} - -\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted -expressions separated by commas. Variable names can be used as if they -were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables.} - -\item{.ignore_length_check}{If multiple columns are selected, as long as -each row has contains the same number of quantiles, the result will be -reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2[1]} -has 7, then the only option would be to recycle everything, creating a -\emph{very} long result. By default, this would throw an error. But if this is -really the goal, then the error can be bypassed by setting this argument -to \code{TRUE}. The quantiles in the first selected column will vary the fastest.} -} -\value{ -An object of the same class as \code{.data}. 
-} -\description{ -Selected columns that contain \code{dist_quantiles} will be "lengthened" with -the quantile levels serving as 1 column and the values as another. If -multiple columns are selected, these will be prefixed with the column name. -} -\examples{ -d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) -tib <- tibble(g = c("a", "b"), d1 = d1, d2 = d2) - -pivot_quantiles_longer(tib, "d1") -pivot_quantiles_longer(tib, dplyr::ends_with("1")) -pivot_quantiles_longer(tib, d1, d2) -} diff --git a/man/pivot_quantiles_wider.Rd b/man/pivot_quantiles_wider.Rd deleted file mode 100644 index e477777ca..000000000 --- a/man/pivot_quantiles_wider.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pivot_quantiles.R -\name{pivot_quantiles_wider} -\alias{pivot_quantiles_wider} -\title{Pivot columns containing \code{dist_quantile} wider} -\usage{ -pivot_quantiles_wider(.data, ...) -} -\arguments{ -\item{.data}{A data frame, or a data frame extension such as a tibble or -epi_df.} - -\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted -expressions separated by commas. Variable names can be used as if they -were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables.} -} -\value{ -An object of the same class as \code{.data} -} -\description{ -Any selected columns that contain \code{dist_quantiles} will be "widened" with -the "taus" (quantile) serving as names and the values in the data frame. -When pivoting multiple columns, the original column name will be used as -a prefix. 
-} -\examples{ -d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) -tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) - -pivot_quantiles_wider(tib, c("d1", "d2")) -pivot_quantiles_wider(tib, dplyr::starts_with("d")) -pivot_quantiles_wider(tib, d2) -} diff --git a/man/predict-epi_workflow.Rd b/man/predict-epi_workflow.Rd index 130279249..577b20abb 100644 --- a/man/predict-epi_workflow.Rd +++ b/man/predict-epi_workflow.Rd @@ -47,26 +47,31 @@ time points at which the survival probability or hazard is estimated. } \value{ A data frame of model predictions, with as many rows as \code{new_data} has. -If \code{new_data} is an \code{epi_df} or a data frame with \code{time_value} or +If \code{new_data} is an \code{epiprocess::epi_df} or a data frame with \code{time_value} or \code{geo_value} columns, then the result will have those as well. } \description{ -This is the \code{predict()} method for a fit epi_workflow object. The nice thing -about predicting from an epi_workflow is that it will: +This is the \code{predict()} method for a fit epi_workflow object. The 3 steps that this implements are: \itemize{ \item Preprocess \code{new_data} using the preprocessing method specified when the workflow was created and fit. This is accomplished using \code{\link[hardhat:forge]{hardhat::forge()}}, which will apply any formula preprocessing or call \code{\link[recipes:bake]{recipes::bake()}} if a recipe was supplied. -\item Call \code{\link[parsnip:predict.model_fit]{parsnip::predict.model_fit()}} for you using the underlying fit +\item Preprocessing \code{new_data} using the preprocessing method specified when the +epi_workflow was created and fit. This is accomplished using +\code{hardhat::bake()} if a recipe was supplied (passing through +\code{\link[hardhat:forge]{hardhat::forge()}}, which is used for non-recipe preprocessors). 
Note that +this is a slightly different \code{bake} operation than the one occurring during +the fit. Any \code{step} that has \code{skip = TRUE} isn't applied during prediction; +for example in \code{step_epi_naomit()}, \code{all_outcomes()} isn't \code{NA} omitted, +since doing so would drop the exact \code{time_values} we are trying to predict. +\item Calling \code{parsnip::predict.model_fit()} for you using the underlying fit parsnip model. -\item Ensure that the returned object is an \link[epiprocess:epi_df]{epiprocess::epi_df} where -possible. Specifically, the output will have \code{time_value} and -\code{geo_value} columns as well as the prediction. +\item \code{slather()} any frosting that has been included in the \code{epi_workflow}. } } \examples{ -jhu <- case_death_rate_subset +jhu <- covid_case_death_rates r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/quantile.quantile_pred.Rd b/man/quantile.quantile_pred.Rd new file mode 100644 index 000000000..1c23daed4 --- /dev/null +++ b/man/quantile.quantile_pred.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantile_pred-methods.R +\name{quantile.quantile_pred} +\alias{quantile.quantile_pred} +\title{Quantiles from a distribution} +\usage{ +\method{quantile}{quantile_pred}( + x, + probs = seq(0, 1, 0.25), + na.rm = FALSE, + lower = -Inf, + upper = Inf, + middle = c("cubic", "linear"), + ... +) +} +\arguments{ +\item{x}{numeric vector whose sample quantiles are wanted, or an + object of a class for which a method has been defined (see also + \sQuote{details}). \code{\link{NA}} and \code{NaN} values are not + allowed in numeric vectors unless \code{na.rm} is \code{TRUE}.} + +\item{probs}{numeric vector of probabilities with values in + \eqn{[0,1]}. 
(Values up to \samp{2e-14} outside that + range are accepted and moved to the nearby endpoint.)} + +\item{na.rm}{logical; if true, any \code{\link{NA}} and \code{NaN}'s + are removed from \code{x} before the quantiles are computed.} + +\item{lower}{Scalar. Optional lower bound.} + +\item{upper}{Scalar. Optional upper bound.} + +\item{middle}{Controls how extrapolation to "interior" probabilities is +performed. "cubic" attempts to use \code{\link[stats:splinefun]{stats::splinefun()}} while "linear" +uses \code{\link[stats:approxfun]{stats::approx()}}. The "linear" method is used as a fallback if +"cubic" should fail for some reason.} + +\item{...}{unused} +} +\value{ +a matrix with one row for each entry in \code{x} and one column for each +value in \code{probs} +} +\description{ +Given a \link[hardhat:quantile_pred]{hardhat::quantile_pred} object, users may wish to compute additional +\code{quantile_levels} that are not part of the object. This function attempts +to estimate these quantities under some assumptions. Interior probabilities, +those contained within existing probabilities, are interpolated in a manner +controlled by the \code{middle} argument. Those outside existing probabilities +are extrapolated under the assumption that the tails of the distribution +decay exponentially. Optionally, one may constrain \emph{all} quantiles to be +within some support (say, \verb{[0, Inf)}). 
+} +\examples{ +qp <- quantile_pred(matrix(1:8, nrow = 2, byrow = TRUE), 1:4 / 5) +quantile(qp) +quantile(qp, lower = 0) +quantile(qp, probs = 0.5) +quantile(qp, probs = 1:9 / 10) +} +\seealso{ +\code{\link[=extrapolate_quantiles]{extrapolate_quantiles()}} +} diff --git a/man/quantile_reg.Rd b/man/quantile_reg.Rd index 5079c3434..31a5dd123 100644 --- a/man/quantile_reg.Rd +++ b/man/quantile_reg.Rd @@ -7,7 +7,7 @@ quantile_reg( mode = "regression", engine = "rq", - quantile_levels = 0.5, + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), method = "br" ) } @@ -19,7 +19,7 @@ The only possible value for this model is "regression".} "rq" and "grf" are supported.} \item{quantile_levels}{A scalar or vector of values in (0, 1) to determine which -quantiles to estimate (default is 0.5).} +quantiles to estimate (default is the set 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95).} \item{method}{A fitting method used by \code{\link[quantreg:rq]{quantreg::rq()}}. See the documentation for a list of options.} diff --git a/man/reexports.Rd b/man/reexports.Rd index f6849a53c..8d3e360f4 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -1,5 +1,6 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/autoplot.R, R/reexports-tidymodels.R +% Please edit documentation in R/autoplot.R, R/reexports-tidymodels.R, +% R/reexports.R \docType{import} \name{reexports} \alias{reexports} @@ -10,7 +11,19 @@ \alias{bake} \alias{rand_id} \alias{tibble} +\alias{as_tibble} \alias{tidy} +\alias{quantile_pred} +\alias{extract_quantile_levels} +\alias{filter} +\alias{mutate} +\alias{rename} +\alias{select} +\alias{as_epi_df} +\alias{key_colnames} +\alias{pivot_longer} +\alias{pivot_wider} +\alias{unnest} \title{Objects exported from other packages} \keyword{internal} \description{ @@ -18,12 +31,20 @@ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ - \item{generics}{\code{\link[generics]{fit}}, \code{\link[generics]{forecast}}, \code{\link[generics]{tidy}}} + \item{dplyr}{\code{\link[dplyr]{filter}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{select}}} + + \item{epiprocess}{\code{\link[epiprocess:epi_df]{as_epi_df}}, \code{\link[epiprocess]{key_colnames}}} + + \item{generics}{\code{\link[generics]{fit}}, \code{\link[generics]{fit}}, \code{\link[generics]{forecast}}, \code{\link[generics]{forecast}}, \code{\link[generics]{tidy}}, \code{\link[generics]{tidy}}} \item{ggplot2}{\code{\link[ggplot2]{autoplot}}} - \item{recipes}{\code{\link[recipes]{bake}}, \code{\link[recipes]{prep}}, \code{\link[recipes]{rand_id}}} + \item{hardhat}{\code{\link[hardhat:quantile_pred]{extract_quantile_levels}}, \code{\link[hardhat:quantile_pred]{extract_quantile_levels}}, \code{\link[hardhat]{quantile_pred}}, \code{\link[hardhat]{quantile_pred}}} + + \item{recipes}{\code{\link[recipes]{bake}}, \code{\link[recipes]{bake}}, \code{\link[recipes]{prep}}, \code{\link[recipes]{prep}}, \code{\link[recipes]{rand_id}}, \code{\link[recipes]{rand_id}}} + + \item{tibble}{\code{\link[tibble]{as_tibble}}, \code{\link[tibble]{as_tibble}}, \code{\link[tibble]{tibble}}, \code{\link[tibble]{tibble}}} - \item{tibble}{\code{\link[tibble]{tibble}}} + \item{tidyr}{\code{\link[tidyr]{pivot_longer}}, \code{\link[tidyr]{pivot_wider}}, \code{\link[tidyr]{unnest}}} }} diff --git a/man/roll_modular_multivec.Rd b/man/roll_modular_multivec.Rd new file mode 100644 index 000000000..83c32aa41 --- /dev/null +++ b/man/roll_modular_multivec.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_climate.R +\name{roll_modular_multivec} +\alias{roll_modular_multivec} +\title{group col by .idx values and sum windows around each .idx value} +\usage{ +roll_modular_multivec(col, idx_in, weights, aggr, window_size, modulus) +} +\arguments{ +\item{col}{the list of values indexed by 
\code{idx_in}} + +\item{idx_in}{the relevant periodic part of time value, e.g. the week number, +limited to the relevant range} + +\item{weights}{how much to weigh each particular datapoint (also indexed by +\code{idx_in})} + +\item{aggr}{the aggregation function, probably Quantile, mean, or median} + +\item{window_size}{the number of .idx entries before and after to include in +the aggregation} + +\item{modulus}{the number of days/weeks/months in the year, not including any +leap days/weeks} +} +\description{ +group col by .idx values and sum windows around each .idx value +} +\keyword{internal} diff --git a/man/seq_forward.Rd b/man/seq_forward.Rd new file mode 100644 index 000000000..9b3da6e55 --- /dev/null +++ b/man/seq_forward.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{seq_forward} +\alias{seq_forward} +\title{seq, but returns null if from is larger} +\usage{ +seq_forward(from, to, by) +} +\description{ +seq, but returns null if from is larger +} +\keyword{internal} diff --git a/man/slather.Rd b/man/slather.Rd index dd556b629..3219d0e32 100644 --- a/man/slather.Rd +++ b/man/slather.Rd @@ -7,7 +7,7 @@ slather(object, components, workflow, new_data, ...) } \arguments{ -\item{object}{a workflow with \code{frosting} postprocessing steps} +\item{object}{a workflow with \code{frosting} post-processing steps} \item{components}{a list of components containing model information. These will be updated and returned by the layer. These should be @@ -31,10 +31,14 @@ and predict on} \item{...}{additional arguments used by methods. Currently unused.} } \value{ -The \code{components} list. In the same format after applying any updates. +The \code{components} list, in the same format as before, after applying +any updates. } \description{ -Slathering frosting means to implement a postprocessing layer. 
When -creating a new postprocessing layer, you must implement an S3 method -for this function +Slathering frosting means to implement a post-processing layer. It is the +post-processing equivalent of \code{bake} for a recipe. Given a layer, it applies +the actual transformation of that layer. When creating a new post-processing +layer, you must implement an S3 method for this function. Generally, you will +not need to call this function directly, as it will be used indirectly during +\code{predict}. } diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index c6b17dd86..5d401c795 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -8,7 +8,7 @@ smooth_quantile_reg( mode = "regression", engine = "smoothqr", outcome_locations = NULL, - quantile_levels = 0.5, + quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), degree = 3L ) } @@ -17,7 +17,7 @@ smooth_quantile_reg( The only possible value for this model is "regression".} \item{engine}{Character string naming the fitting function. Currently, only -"smooth_qr" is supported.} +"smoothqr" is supported.} \item{outcome_locations}{Defaults to the vector \code{1:ncol(y)} but if the responses are observed at a different spacing (or appear in a different @@ -25,7 +25,7 @@ order), that information should be used here. This argument will be mapped to the \code{ahead} argument of \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}}.} \item{quantile_levels}{A scalar or vector of values in (0, 1) to determine which -quantiles to estimate (default is 0.5).} +quantiles to estimate (default is the set 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95).} \item{degree}{the number of polynomials used for response smoothing. 
Must be no more than the number of responses.} @@ -50,25 +50,21 @@ x <- -99:99 / 100 * 2 * pi y <- sin(x) + rnorm(length(x), sd = .1) fd <- x[length(x) - 20] XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) -XY <- tibble::as_tibble(XY) +XY <- as_tibble(XY) qr_spec <- smooth_quantile_reg(quantile_levels = c(.2, .5, .8), outcome_locations = 20:1) tt <- qr_spec \%>\% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) -library(tidyr) -library(dplyr) pl <- predict( object = tt, new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] ) pl <- pl \%>\% unnest(.pred) \%>\% - mutate(distn = nested_quantiles(distn)) \%>\% - unnest(distn) \%>\% + pivot_quantiles_wider(distn) \%>\% mutate( x = x[length(x) - 20] + ahead / 100 * 2 * pi, ahead = NULL - ) \%>\% - pivot_wider(names_from = quantile_levels, values_from = values) + ) plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") curve(sin(x), add = TRUE) abline(v = fd, lty = 2) @@ -78,11 +74,11 @@ lines(pl$x, pl$`0.5`, col = "red") library(ggplot2) ggplot(data.frame(x = x, y = y), aes(x)) + - geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "cornflowerblue") + geom_point(aes(y = y), colour = "grey") + # observed data geom_function(fun = sin, colour = "black") + # truth geom_vline(xintercept = fd, linetype = "dashed") + # end of training data - geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction + geom_line(data = pl, aes(y = `0.5`), colour = "orange") + # median prediction theme_bw() + coord_cartesian(xlim = c(0, NA)) + ylab("y") diff --git a/man/snap.Rd b/man/snap.Rd new file mode 100644 index 000000000..abe4421e8 --- /dev/null +++ b/man/snap.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_threshold_preds.R +\name{snap} +\alias{snap} +\title{restrict various objects to the interval [lower, upper]} +\usage{ +snap(x, lower, upper, ...) 
+} +\arguments{ +\item{x}{the object to restrict} + +\item{lower}{numeric, the lower bound} + +\item{upper}{numeric, the upper bound} + +\item{...}{unused} +} +\description{ +restrict various objects to the interval [lower, upper] +} +\keyword{internal} diff --git a/man/state_census.Rd b/man/state_census.Rd deleted file mode 100644 index eec13eb53..000000000 --- a/man/state_census.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{state_census} -\alias{state_census} -\title{State population data} -\format{ -Data frame with 57 rows (including one for the United States as a -whole, plus the District of Columbia, Puerto Rico Commonwealth, -American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana, -Islands). - -\describe{ -\item{fips}{FIPS code} -\item{name}{Full name of the state or territory} -\item{pop}{Estimate of the location's resident population in -2019.} -\item{abbr}{Postal abbreviation for the location} -} -} -\source{ -United States Census Bureau, at -\url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf}, -\url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html}, -and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html} -} -\usage{ -state_census -} -\description{ -Data set on state populations, from the 2019 US Census. 
-} -\keyword{datasets} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd new file mode 100644 index 000000000..019434409 --- /dev/null +++ b/man/step_adjust_latency.Rd @@ -0,0 +1,305 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_adjust_latency.R +\name{step_adjust_latency} +\alias{step_adjust_latency} +\title{Adapt the model to latent data} +\usage{ +step_adjust_latency( + recipe, + ..., + method = c("extend_ahead", "locf", "extend_lags"), + epi_keys_checked = NULL, + keys_to_ignore = c(), + fixed_latency = NULL, + fixed_forecast_date = NULL, + check_latency_length = TRUE, + id = rand_id("adjust_latency") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[recipes:selections]{selections()}} for more details.} + +\item{method}{a character. Determines the method by which the +forecast handles latency. The options are: +\itemize{ +\item \code{"extend_ahead"}: Lengthen the ahead so that forecasting from the last +observation results in a forecast \code{ahead} after the \code{forecast_date} date. +E.g. if there are 3 days of latency between the last observation and the +\code{forecast_date} date for a 4 day ahead forecast, the ahead used in practice +is actually 7. +\item \code{"locf"}: carries forward the last observed value(s) up to the forecast +date. +\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that +the shortest lag at predict time is at the last observation. E.g. if the +lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used +become \code{c(3,10,17)}. +}} + +\item{epi_keys_checked}{a character vector. A list of keys to group by before +finding the \code{max_time_value} (the last day of data), defaulting to +\code{geo_value}. 
Different locations may have different latencies; to produce a +forecast at every location, we need to guarantee data at every location by +using the largest latency across every location; this means taking +\code{max_time_value} to be the minimum of the \code{max_time_value}s for each set of +key values (so the earliest date). If \code{NULL} or an empty character vector, +it will take the maximum across all values, irrespective of any keys. + +Note that this is a separate concern from different latencies across +different \emph{data columns}, which is only handled by the choice of \code{method}.} + +\item{keys_to_ignore}{a list of character vectors. Set this to avoid using +specific key values in the \code{epi_keys_checked} to set latency. For example, +say you have two locations \code{pr} and \code{gu} which have useful training data, +but have stopped providing up-to-date information, and so are no longer +part of the test set. Setting \code{keys_to_ignore = list(geo_value = c("pr", "gu"))} will exclude them from the latency calculation.} + +\item{fixed_latency}{either a positive integer, or a labeled positive integer +vector. Cannot be set at the same time as \code{fixed_forecast_date}. If +non-\code{NULL}, the amount to offset the ahead or lag by. If a single integer, +this is used for all columns; if a labeled vector, the labels must +correspond to the base column names (before lags/aheads). If \code{NULL}, the +latency is the distance between the \code{epi_df}'s \code{max_time_value} and the \code{forecast_date}.} + +\item{fixed_forecast_date}{either a date of the same kind used in the +\code{epi_df}, or \code{NULL}. Exclusive with \code{fixed_latency}. If a date, it gives +the date from which the forecast is actually occurring. 
If \code{NULL}, the +\code{forecast_date} is determined either via the \code{fixed_latency}, or is set to +the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NULL}.} + +\item{check_latency_length}{bool, determines whether to warn if the latency +is unusually high. Turn off if you know your forecast is going to be far +into the future.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +In the standard case, the arx models assume that the last observation is also +the day from which the forecast is being made. But if the data has latency, +then you may wish to adjust the predictors (lags) and/or the outcome (ahead) +to compensate. +This is most useful in realtime and +pseudo-prospective forecasting for data where there is some delay between the +event occurring and the event being reported. +} +\details{ +This step allows the user to create models on the most recent +data, automatically accounting for latency patterns. Instead of using the last observation +date, \code{step_adjust_latency} uses the \code{as_of} date of the \code{epi_df} as the +\code{forecast_date}, and adjusts the model so that there is data available. To +demonstrate some of the subtleties, let's consider a toy dataset: + +\if{html}{\out{
}}\preformatted{toy_df <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "ma", as.Date("2015-01-11"), 20, 6, + "ma", as.Date("2015-01-12"), 23, NA, + "ma", as.Date("2015-01-13"), 25, NA, + "ca", as.Date("2015-01-11"), 100, 5, + "ca", as.Date("2015-01-12"), 103, 10, +) \%>\% + as_epi_df(as_of = as.Date("2015-01-14")) +}\if{html}{\out{
}} + +If we're looking to predict the value on the 15th, forecasting from the 14th +(the \code{as_of} date above), there are two issues we will need to address: +\enumerate{ +\item \code{"ca"} is latent by 2 days, whereas \code{"ma"} is latent by 1 +\item if we want to use \code{b} as an exogenous variable, for \code{"ma"} it is latent by +3 days instead of just 1. +} + +Regardless of \code{method}, \code{epi_keys_checked="geo_value"} guarantees that the +difference between \code{"ma"} and \code{"ca"} is accounted for by making the latency +adjustment at least 2. For some comparison, here's what the various methods +will do: +\subsection{\code{locf}}{ + +Short for "last observation carried forward", \code{locf} assumes that every day +between the last observation and the forecast day is exactly the same. +This is a very straightforward assumption, but wrecks any features that +depend on changes in value over time, such as the growth rate, or even +adjacent lags. A more robust version of this falls under the heading of +nowcasting, an eventual aim for this package. On the toy dataset, it +doesn't matter which day we're trying to predict, since it just fills +forward to the \code{forecast_date}: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_adjust_latency(has_role("raw"), method="locf") + +toy_recipe \%>\% + prep(toy_df) \%>\% + bake(toy_df) \%>\% + arrange(geo_value, time_value) +#> An `epi_df` object, 8 x 4 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2015-01-14 +#> +#> # A tibble: 8 x 4 +#> geo_value time_value a b +#> +#> 1 ca 2015-01-11 100 5 +#> 2 ca 2015-01-12 103 10 +#> 3 ca 2015-01-13 103 10 +#> 4 ca 2015-01-14 103 10 +#> 5 ma 2015-01-11 20 6 +#> 6 ma 2015-01-12 23 6 +#> 7 ma 2015-01-13 25 6 +#> 8 ma 2015-01-14 25 6 +}\if{html}{\out{
}} +} + +\subsection{\code{extend_lags}}{ + +\code{extend_lags} increases the lags so that they are guaranteed to have +data. This has the advantage of being applicable on +a per-column basis; if cases and deaths are reported at different +latencies, the lags for each are adjusted separately. In the toy example: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_adjust_latency(has_role("raw"), method = "extend_lags") \%>\% + step_epi_lag(a, lag=1) \%>\% + step_epi_lag(b, lag=1) \%>\% + step_epi_ahead(a, ahead=1) + +toy_recipe \%>\% + prep(toy_df) \%>\% + bake(toy_df) \%>\% + arrange(geo_value, time_value) +#> An `epi_df` object, 21 x 7 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2015-01-14 +#> +#> # A tibble: 21 x 7 +#> geo_value time_value a b lag_3_a lag_4_b ahead_1_a +#> +#> 1 ca 2015-01-10 NA NA NA NA 100 +#> 2 ca 2015-01-11 100 5 NA NA 103 +#> 3 ca 2015-01-12 103 10 NA NA NA +#> 4 ca 2015-01-13 NA NA NA NA NA +#> 5 ca 2015-01-14 NA NA 100 NA NA +#> 6 ca 2015-01-15 NA NA 103 5 NA +#> 7 ca 2015-01-16 NA NA NA 10 NA +#> 8 ca 2015-01-17 NA NA NA NA NA +#> 9 ca 2015-01-18 NA NA NA NA NA +#> 10 ca 2015-01-19 NA NA NA NA NA +#> # i 11 more rows +}\if{html}{\out{
}} + +The maximum latency in column \code{a} is 2 days, so the lag is increased to 3, +while the max latency in column \code{b} is 3, so the same lag is increased to +4; both of these changes are reflected in the column names. Meanwhile the +ahead is unaffected. + +As a side-note, lag/ahead can be somewhat ambiguous about direction. Here, +the values are brought forward in time, so that for a given row, column +\code{lag_3_a} represents the value 3 days before. +} + +\subsection{\code{extend_ahead}}{ + +\code{extend_ahead} increases the ahead, turning a 3 day ahead forecast +into a 7 day one; this has the advantage of simplicity and is reflective of +the actual modelling task, but potentially leaves information unused if +different data sources have different latencies; it must use the latency of +the most latent data to ensure there is data available. In the toy example: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_adjust_latency(has_role("raw"), method="extend_ahead") \%>\% + step_epi_lag(a, lag=0) \%>\% + step_epi_ahead(a, ahead=1) + +toy_recipe \%>\% + prep(toy_df) \%>\% + bake(toy_df) \%>\% + arrange(geo_value, time_value) +#> An `epi_df` object, 10 x 6 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2015-01-14 +#> +#> # A tibble: 10 x 6 +#> geo_value time_value a b lag_0_a ahead_3_a +#> +#> 1 ca 2015-01-08 NA NA NA 100 +#> 2 ca 2015-01-09 NA NA NA 103 +#> 3 ca 2015-01-11 100 5 100 NA +#> 4 ca 2015-01-12 103 10 103 NA +#> 5 ma 2015-01-08 NA NA NA 20 +#> 6 ma 2015-01-09 NA NA NA 23 +#> 7 ma 2015-01-10 NA NA NA 25 +#> 8 ma 2015-01-11 20 6 20 NA +#> 9 ma 2015-01-12 23 NA 23 NA +#> 10 ma 2015-01-13 25 NA 25 NA +}\if{html}{\out{
}} + +Even though we're doing a 1 day ahead forecast, because our worst latency +is 3 days from column \code{b}'s \code{"ma"} data, our outcome column is \code{ahead_4_a} +(so 4 days ahead). If we want to ignore any latency in column \code{b}, we need +to explicitly set the columns to consider while adjusting like this: +\code{step_adjust_latency(a, method="extend_ahead")}. +} +} +\section{Programmatic details}{ +\code{step_adjust_latency} uses the metadata, such as \code{time_type} and \code{as_of}, of +the \code{epi_df} used in the initial prep step, rather than baking or +prediction. This means reusing the same forecaster on new data is not +advised, though typically it is not advised in general. + +The latency adjustment only applies to columns created after this step, so +this step should go before both \code{step_epi_ahead} and \code{step_epi_lag}. This will work: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + # non-lag steps + step_adjust_latency(a, method = "extend_lags") \%>\% + step_epi_lag(a, lag=0) # other steps +}\if{html}{\out{
}} + +while this will not: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_epi_lag(a, lag=0) \%>\% + step_adjust_latency(a, method = "extend_lags") +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't +#> work with modified data. +}\if{html}{\out{
}} + +If you create columns that you then apply lags to (such as +\code{step_growth_rate()}), these should be created before +\code{step_adjust_latency}, so any subsequent latency can be addressed. +} + +\examples{ +rates <- covid_case_death_rates \%>\% + dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) +# setting the `as_of` to something realistic +attributes(rates)$metadata$as_of <- max(rates$time_value) + 3 + +r <- epi_recipe(rates) \%>\% + step_adjust_latency(recipes::has_role("raw"), method = "extend_ahead") \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) +r + +rates_fit <- epi_workflow() \%>\% + add_epi_recipe(r) \%>\% + add_model(linear_reg()) \%>\% + fit(data = rates) +rates_fit + +} +\seealso{ +Other row operation steps: +\code{\link{step_epi_lag}()}, +\code{\link{step_growth_rate}()}, +\code{\link{step_lag_difference}()} +} +\concept{row operation steps} diff --git a/man/step_adjust_latency_checks.Rd b/man/step_adjust_latency_checks.Rd new file mode 100644 index 000000000..baed1fb9b --- /dev/null +++ b/man/step_adjust_latency_checks.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{step_adjust_latency_checks} +\alias{step_adjust_latency_checks} +\title{checks: the recipe type, whether a previous step is the relevant epi_shift, +that either \code{fixed_latency} or \code{fixed_forecast_date} is non-null, and that +\code{fixed_latency} only references columns that exist at the time of the step +inclusion} +\usage{ +step_adjust_latency_checks( + id, + method, + recipe, + fixed_latency, + fixed_forecast_date, + call = caller_env() +) +} +\description{ +checks: the recipe type, whether a previous step is the relevant epi_shift, +that either \code{fixed_latency} or \code{fixed_forecast_date} is non-null, and that +\code{fixed_latency} only references columns that exist at the time of the step +inclusion +} 
+\keyword{internal} diff --git a/man/step_climate.Rd b/man/step_climate.Rd new file mode 100644 index 000000000..9b29fdc42 --- /dev/null +++ b/man/step_climate.Rd @@ -0,0 +1,156 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_climate.R +\name{step_climate} +\alias{step_climate} +\title{Calculate a climatological variable based on the history} +\usage{ +step_climate( + recipe, + ..., + forecast_ahead = "detect", + role = "predictor", + time_type = c("detect", "epiweek", "week", "month", "day"), + center_method = c("median", "mean"), + window_size = 3L, + epi_keys = NULL, + prefix = "climate_", + skip = FALSE, + id = rand_id("climate") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} + +\item{forecast_ahead}{The forecast horizon. By default, this step will try to +detect whether a forecast horizon has already been specified with +\code{\link[=step_epi_ahead]{step_epi_ahead()}}. Alternatively, one can specify an explicit +horizon with a scalar integer. Auto-detection is only possible +when the time type of the \code{epi_df} used to create the \code{epi_recipe} is the +same as the aggregation +\code{time_type} specified in this step (say, both daily or both weekly). If, +for example, daily data is used with monthly time aggregation, then +auto-detection is not possible (and may in fact lead to strange behaviour +even if \code{forecast_ahead} is specified with an integer). See details below.} + +\item{role}{What role should be assigned for any variables created by this +step? 
"predictor" is the most likely choice.} + +\item{time_type}{The duration over which time aggregation should be performed.} + +\item{center_method}{The measure of center to be calculated over the time +window.} + +\item{window_size}{Scalar integer. How many time units on each side should +be included. For example, if \code{window_size = 3} and \code{time_type = "day"}, +then on each day in the data, the center will be calculated using 3 days +before and three days after. So, in this case, it operates like a weekly +rolling average, centered at each day.} + +\item{epi_keys}{Character vector or \code{NULL}. Any columns mentioned will be +grouped before performing any center calculation. So for example, given +state-level data, a national climate would be calculated if \code{NULL}, but +passing \code{epi_keys = "geo_value"} would calculate the climate separately +by state.} + +\item{prefix}{A character string that will be prefixed to the new column.} + +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). +Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} + +\item{id}{A unique identifier for the step} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +\code{step_climate()} creates a \emph{specification} of a recipe step that will +generate one or more new columns of derived data. This step examines all +available seasons in the training data and calculates the a measure of center +for the "typical" season. 
Think of this like with the weather: to predict the +temperature in January in Pittsburgh, PA, I might look at all previous +January's on record, average their temperatures, and include that in my +model. So it is important to \emph{align} the forecast horizon with the climate. +This step will work best if added after \code{step_epi_ahead()}, but that is not +strictly required. See the details for more information. +} +\details{ +Construction of a climate predictor can be helpful with strongly seasonal +data. But its utility is greatest when the estimated "climate" is aligned +to the forecast horizon. +For example, if today is December 1, and we want +to make a prediction for December 15, we want to know the climate for the +week of December 15 to use in our model. But we also want to align the rest +of our training data with the climate \emph{2 weeks after} those dates. + +To accomplish +this, if we have daily data, we could use \code{time_type = "week"} and +\code{forecast_ahead = 2}. The climate predictor would be created by taking +averages over each week (with a window of a few weeks before and after, as +determined by \code{window_size}), and then aligning these with the appropriate dates +in the training data so that each \code{time_value} will "see" the typical climate 2 +weeks in the future. + +Alternatively, in the same scenario, we could use \code{time_type = "day"} and +\code{forecast_ahead = 14}. The climate predictor would be created by taking +averages over a small window around each \emph{day}, and then aligning these with +the appropriate dates in the training data so that each \code{time_value} will +"see" the climate 14 days in the future. + +The only differences between these options is the type of averaging performed +over the historical data. 
In the first case, days in the same week will get +the same value of the climate predictor (because we're looking at weekly +windows), while in the second case, every day in the data will have the +average climate for the \emph{day} that happens 14 days in the future. + +Autodetecting the forecast horizon can only be guaranteed to work correctly +when the time types are the same: for example using daily data for training +and daily climate calculations. However, using weekly data, predicting 4 +weeks ahead, and setting \code{time_type = "month"} is perfectly reasonable. It's +just that the climate is calculated over \emph{months} (January, February, March, +etc.) so how to properly align this when producing a forecast for the 5th week +in the year is challenging. For scenarios like these, it may be best to +approximately match the times with \code{forecast_ahead = 1}, for example. +} +\examples{ +# automatically detects the horizon +r <- epi_recipe(covid_case_death_rates) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_climate(death_rate, time_type = "day") +r + +r \%>\% + prep(covid_case_death_rates) \%>\% + bake(new_data = NULL) + +# same idea, but using weekly climate +r <- epi_recipe(covid_case_death_rates) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_climate(death_rate, + forecast_ahead = 1, time_type = "epiweek", + window_size = 1L + ) +r + +r \%>\% + prep(covid_case_death_rates) \%>\% + bake(new_data = NULL) + +# switching the order is possible if you specify `forecast_ahead` +r <- epi_recipe(covid_case_death_rates) \%>\% + step_climate(death_rate, forecast_ahead = 7, time_type = "day") \%>\% + step_epi_ahead(death_rate, ahead = 7) +r + +r \%>\% + prep(covid_case_death_rates) \%>\% + bake(new_data = NULL) +} diff --git a/man/step_epi_naomit.Rd b/man/step_epi_naomit.Rd index b579dd6d6..aa9208d89 100644 --- a/man/step_epi_naomit.Rd +++ b/man/step_epi_naomit.Rd @@ -10,16 +10,21 @@ step_epi_naomit(recipe) \item{recipe}{Recipe to be used 
for omission steps} } \value{ -Omits NA's from both predictors and outcomes at training time -to fit the model. Also only omits associated predictors and not -outcomes at prediction time due to lack of response and avoidance -of data loss. +Omits NA's from both predictors and outcomes at training time to fit +the model. Also only omits associated predictors and not outcomes at +prediction time due to lack of response and avoidance of data loss. Given a +\code{recipe}, this step is literally equivalent to + +\if{html}{\out{
}}\preformatted{ recipe \%>\% + recipes::step_naomit(all_predictors(), skip = FALSE) \%>\% + recipes::step_naomit(all_outcomes(), skip = TRUE) +}\if{html}{\out{
}} } \description{ Unified NA omission wrapper function for recipes } \examples{ -case_death_rate_subset \%>\% +covid_case_death_rates \%>\% epi_recipe() \%>\% step_epi_naomit() } diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 2bf22c15d..e53c94a07 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -61,16 +61,18 @@ sequence of any existing operations. } \description{ \code{step_epi_lag} and \code{step_epi_ahead} create a \emph{specification} of a recipe step -that will add new columns of shifted data. The former will created a lag -column, while the latter will create a lead column. Shifted data will -by default include NA values where the shift was induced. -These can be properly removed with \code{\link[=step_epi_naomit]{step_epi_naomit()}}, or you may -specify an alternative filler value with the \code{default} -argument. +that will add new columns of shifted data. The \code{step_epi_lag} will create +a lagged \code{predictor} column, while \code{step_epi_ahead} will create a leading +\code{outcome} column. Shifted data will by default include NA values where the +shift was induced. These can be properly removed with \code{\link[=step_epi_naomit]{step_epi_naomit()}}, +or you may specify an alternative value with the \code{default} argument. } \details{ -The step assumes that the data are already \emph{in the proper sequential -order} for shifting. +Our \code{lag/ahead} functions respect the \code{geo_value} and \code{other_keys} of the +\code{epi_df}, and allow for discontiguous \code{time_value}s. Both of these features +are noticably lacking from \code{recipe::step_lag()}. +Our \code{lag/ahead} functions also appropriately adjust the amount of data to +avoid accidentally dropping recent predictors from the test data. The \code{prefix} and \code{id} arguments are unchangeable to ensure that the code runs properly and to avoid inconsistency with naming. 
For \code{step_epi_ahead}, they @@ -78,17 +80,19 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) r } \seealso{ Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_growth_rate}()}, \code{\link{step_lag_difference}()} Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_growth_rate}()}, \code{\link{step_lag_difference}()} } diff --git a/man/step_epi_slide.Rd b/man/step_epi_slide.Rd index 242f8e312..1a104c1bc 100644 --- a/man/step_epi_slide.Rd +++ b/man/step_epi_slide.Rd @@ -75,13 +75,13 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_epi_slide()} creates a \emph{specification} of a recipe step -that will generate one or more new columns of derived data by "sliding" -a computation along existing data. +\code{step_epi_slide()} creates a \emph{specification} of a recipe step that will +generate one or more new columns of derived data by "sliding" a computation +along existing data. This is a wrapper around \code{epiprocess::epi_slide()} +to allow its use within an \code{epi_recipe()}. 
} \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-01-01"), geo_value \%in\% c("ca", "ny")) rec <- epi_recipe(jhu) \%>\% step_epi_slide(case_rate, death_rate, diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index bc6da0bef..999d818ed 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -11,11 +11,11 @@ step_growth_rate( horizon = 7, method = c("rel_change", "linear_reg"), log_scale = FALSE, + na_rm = TRUE, replace_Inf = NA, prefix = "gr_", skip = FALSE, - id = rand_id("growth_rate"), - additional_gr_args_list = list() + id = rand_id("growth_rate") ) } \arguments{ @@ -41,6 +41,9 @@ growth rates). See \code{\link[epiprocess:growth_rate]{epiprocess::growth_rate() \item{log_scale}{Should growth rates be estimated using the parameterization on the log scale? See details for an explanation. Default is \code{FALSE}.} +\item{na_rm}{Should missing values be removed before the computation? Default +is \code{FALSE}.} + \item{replace_Inf}{Sometimes, the growth rate calculation can result in infinite values (if the denominator is zero, for example). In this case, most prediction methods will fail. This argument specifies potential @@ -59,30 +62,32 @@ Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{id}{A unique identifier for the step} - -\item{additional_gr_args_list}{A list of additional arguments used by -\code{\link[epiprocess:growth_rate]{epiprocess::growth_rate()}}. All \code{...} arguments may be passed here along -with \code{dup_rm} and \code{na_rm}.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_growth_rate()} creates a \emph{specification} of a recipe step -that will generate one or more new columns of derived data. 
+\code{step_growth_rate()} creates a \emph{specification} of a recipe step that will +generate one or more new columns of derived data. This is a wrapper around +\code{epiprocess::growth_rate()} to allow its use within an \code{epi_recipe()}. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +tiny_geos <- c("as", "mp", "vi", "gu", "pr") +rates <- covid_case_death_rates \%>\% + filter(time_value >= as.Date("2021-11-01"), !(geo_value \%in\% tiny_geos)) + +r <- epi_recipe(rates) \%>\% step_growth_rate(case_rate, death_rate) r r \%>\% - prep(case_death_rate_subset) \%>\% - bake(case_death_rate_subset) + prep(rates) \%>\% + bake(new_data = NULL) } \seealso{ Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_epi_lag}()}, \code{\link{step_lag_difference}()} } diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index 7969ea3a7..325bcf05c 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -43,21 +43,30 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_lag_difference()} creates a \emph{specification} of a recipe step -that will generate one or more new columns of derived data. +\code{step_lag_difference()} creates a \emph{specification} of a recipe step that will +generate one or more new columns of derived data. For each column in the +specification, \code{step_lag_difference()} will calculate the difference +between the values at a distance of \code{horizon}. For example, with +\code{horizon=1}, this would simply be the difference between adjacent days. +} +\details{ +Much like \code{step_epi_lag()} this step works with the actual time values (so if +there are gaps it will fill with \code{NA} values), and respects the grouping +inherent in the \code{epi_df()} as specified by \code{geo_value} and \code{other_keys}. 
} \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) \%>\% step_epi_naomit() r r \%>\% - prep(case_death_rate_subset) \%>\% - bake(case_death_rate_subset) + prep(covid_case_death_rates) \%>\% + bake(new_data = NULL) } \seealso{ Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_epi_lag}()}, \code{\link{step_growth_rate}()} } diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 004c2c823..733e1a9f8 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -26,19 +26,25 @@ sequence of operations for this recipe.} for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} \item{role}{For model terms created by this step, what analysis role should -they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} +they be assigned?} -\item{df}{a data frame that contains the population data to be used for -inverting the existing scaling.} +\item{df}{a data frame containing the scaling data (typically population). The +target column is divided by the value in \code{df_pop_col}.} -\item{by}{A (possibly named) character vector of variables to join by. +\item{by}{A (possibly named) character vector of variables by which to join +\code{df} to the \code{epi_df}. -If \code{NULL}, the default, the function will perform a natural join, using all -variables in common across the \code{epi_df} produced by the \code{predict()} call -and the user-provided dataset. -If columns in that \code{epi_df} and \code{df} have the same name (and aren't -included in \code{by}), \code{.df} is added to the one from the user-provided data -to disambiguate. +If \code{NULL}, the default, the function will try to infer a reasonable set of +columns. 
First, it will try to join by all variables in the training/test +data with roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in +\code{df}; these roles are automatically set if you are using an \code{epi_df}, or you +can use, e.g., \code{update_role}. If no such roles are set, it will try to +perform a natural join, using variables in common between the training/test +data and population data. + +If columns in the training/testing data and \code{df} have the same name (and +aren't included in \code{by}), a \code{.df} suffix is added to the one from the +user-provided data to disambiguate. To join by different variables on the \code{epi_df} and \code{df}, use a named vector. For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value} @@ -46,7 +52,7 @@ to \code{df$states}. To join by multiple variables, use a vector with length > 1 For example, \code{by = c("geo_value" = "states", "county" = "county")} will match \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}. -See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.} +See \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}} for more details.} \item{df_pop_col}{the name of the column in the data frame \code{df} that contains the population data and will be used for scaling. @@ -56,7 +62,7 @@ This should be one column.} Adjustments can be made here. For example, if the original scale is "per 100K", then set \code{rate_rescaling = 1e5} to get rates.} -\item{create_new}{TRUE to create a new column and keep the original column +\item{create_new}{\code{TRUE} to create a new column and keep the original column in the \code{epi_df}} \item{suffix}{a character. 
The suffix added to the column name if @@ -75,19 +81,17 @@ the computations for subsequent operations.} Scales raw data by the population } \description{ -\code{step_population_scaling} creates a specification of a recipe step -that will perform per-capita scaling. Typical usage would -load a dataset that contains state-level population, and use it to convert -predictions made from a raw scale model to rate-scale by dividing by -the population. -Although, it is worth noting that there is nothing special about "population". -The function can be used to scale by any variable. Population is the -standard use case in the epidemiology forecasting scenario. Any value -passed will \emph{divide} the selected variables while the \code{rate_rescaling} -argument is a common \emph{multiplier} of the selected variables. +\code{step_population_scaling()} creates a specification of a recipe step that +will perform per-capita scaling. Typical usage would set \code{df} to be a dataset +that contains population for each \code{geo_value}, and use it to convert +predictions made from a raw scale model to rate-scale by dividing by the +population. Although, it is worth noting that there is nothing special about +"population", and the function can be used to scale by any variable. +Population is the standard use case in the epidemiology forecasting scenario. +Any value passed will \emph{divide} the selected variables while the +\code{rate_rescaling} argument is a common \emph{multiplier} of the selected variables. } \examples{ -library(dplyr) jhu <- cases_deaths_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% select(geo_value, time_value, cases) diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 42f6b9a95..d10b0e41c 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -40,8 +40,12 @@ observations in \code{time_value} per group, where the groups are formed based on the remaining \code{epi_keys}. 
} \details{ -Note that \code{step_epi_lead()} and \code{step_epi_lag()} should come -after any filtering step. +It is recommended to do this after any \code{step_epi_ahead()}, +\code{step_epi_lag()}, or \code{step_epi_naomit()} steps. If \code{step_training_window()} +happens first, there will be less than \code{n_training} remaining examples, +since either leading or lagging will introduce \code{NA}'s later removed by +\code{step_epi_naomit()}. Typical usage will use this step last in an +\code{epi_recipe()}. } \examples{ tib <- tibble( diff --git a/man/tidy.frosting.Rd b/man/tidy.frosting.Rd index ba3c0f3d5..3f9b0e377 100644 --- a/man/tidy.frosting.Rd +++ b/man/tidy.frosting.Rd @@ -37,8 +37,7 @@ method for the operation exists). Note that this is a modified version of the \code{tidy} method for a recipe. } \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% @@ -48,6 +47,7 @@ r <- epi_recipe(jhu) \%>\% wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- get_test_data(recipe = r, x = jhu) + f <- frosting() \%>\% layer_predict() \%>\% layer_naomit(.pred) diff --git a/man/update.layer.Rd b/man/update.layer.Rd index 9604992e1..878c9bb11 100644 --- a/man/update.layer.Rd +++ b/man/update.layer.Rd @@ -18,8 +18,7 @@ will replace the elements of the same name in the actual post-processing layer. Analogous to \code{update.step()} from the \code{recipes} package. 
} \examples{ -library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd index 4907e2724..ef09d4da1 100644 --- a/man/weighted_interval_score.Rd +++ b/man/weighted_interval_score.Rd @@ -2,12 +2,9 @@ % Please edit documentation in R/weighted_interval_score.R \name{weighted_interval_score} \alias{weighted_interval_score} -\alias{weighted_interval_score.dist_quantiles} \title{Compute weighted interval score} \usage{ -weighted_interval_score(x, actual, quantile_levels = NULL, ...) - -\method{weighted_interval_score}{dist_quantiles}( +weighted_interval_score( x, actual, quantile_levels = NULL, @@ -16,27 +13,26 @@ weighted_interval_score(x, actual, quantile_levels = NULL, ...) ) } \arguments{ -\item{x}{distribution. A vector of class distribution. Ideally, this vector -contains \code{dist_quantiles()}, though other distributions are supported when -\code{quantile_levels} is specified. See below.} +\item{x}{A vector of class \code{quantile_pred}.} \item{actual}{double. Actual value(s)} \item{quantile_levels}{probabilities. If specified, the score will be -computed at this set of levels.} - -\item{...}{not used} +computed at this set of levels. Otherwise, those present in \code{x} will be +used.} -\item{na_handling}{character. Determines how \code{quantile_levels} without a -corresponding \code{value} are handled. For \code{"impute"}, missing values will be +\item{na_handling}{character. Determines missing values are handled. +For \code{"impute"}, missing values will be calculated if possible using the available quantiles. For \code{"drop"}, explicitly missing values are ignored in the calculation of the score, but implicitly missing values are imputed if possible. 
For \code{"propogate"}, the resulting score will be \code{NA} if any missing values -exist in the original \code{quantile_levels}. Finally, if +exist. Finally, if \code{quantile_levels} is specified, \code{"fail"} will result in the score being \code{NA} when any required quantile levels (implicit or explicit) -are do not have corresponding values.} +do not have corresponding values.} + +\item{...}{not used} } \value{ a vector of nonnegative scores. @@ -48,45 +44,38 @@ approximation of the commonly-used continuous ranked probability score generalization of absolute error. For example, see \href{https://arxiv.org/abs/2005.12881}{Bracher et al. (2020)} for discussion in the context of COVID-19 forecasting. } -\section{Methods (by class)}{ -\itemize{ -\item \code{weighted_interval_score(dist_quantiles)}: Weighted interval score with -\code{dist_quantiles} allows for different \code{NA} behaviours. - -}} \examples{ quantile_levels <- c(.2, .4, .6, .8) -predq_1 <- 1:4 # -predq_2 <- 8:11 -dstn <- dist_quantiles(list(predq_1, predq_2), quantile_levels) +predq1 <- 1:4 # +predq2 <- 8:11 +dstn <- quantile_pred(rbind(predq1, predq2), quantile_levels) actual <- c(3.3, 7.1) weighted_interval_score(dstn, actual) weighted_interval_score(dstn, actual, c(.25, .5, .75)) -library(distributional) -dstn <- dist_normal(c(.75, 2)) -weighted_interval_score(dstn, 1, c(.25, .5, .75)) - # Missing value behaviours -dstn <- dist_quantiles(c(1, 2, NA, 4), 1:4 / 5) +dstn <- quantile_pred(matrix(c(1, 2, NA, 4), nrow = 1), 1:4 / 5) weighted_interval_score(dstn, 2.5) weighted_interval_score(dstn, 2.5, 1:9 / 10) weighted_interval_score(dstn, 2.5, 1:9 / 10, na_handling = "drop") weighted_interval_score(dstn, 2.5, na_handling = "propagate") -weighted_interval_score(dist_quantiles(1:4, 1:4 / 5), 2.5, 1:9 / 10, +weighted_interval_score( + quantile_pred(matrix(1:4, nrow = 1), 1:4 / 5), + actual = 2.5, + quantile_levels = 1:9 / 10, na_handling = "fail" ) # Using some actual forecasts -------- 
library(dplyr) -jhu <- case_death_rate_subset \%>\% +training <- covid_case_death_rates \%>\% filter(time_value >= "2021-10-01", time_value <= "2021-12-01") preds <- flatline_forecaster( - jhu, "death_rate", + training, "death_rate", flatline_args_list(quantile_levels = c(.01, .025, 1:19 / 20, .975, .99)) )$predictions -actuals <- case_death_rate_subset \%>\% +actuals <- covid_case_death_rates \%>\% filter(time_value == as.Date("2021-12-01") + 7) \%>\% select(geo_value, time_value, actual = death_rate) preds <- left_join(preds, actuals, diff --git a/man/within_window.Rd b/man/within_window.Rd new file mode 100644 index 000000000..b0b824690 --- /dev/null +++ b/man/within_window.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_climate.R +\name{within_window} +\alias{within_window} +\title{generate the idx values within \code{window_size} of \code{target_idx} given that our +time value is of the type matching modulus} +\usage{ +within_window(target_idx, window_size, modulus) +} +\arguments{ +\item{target_idx}{the time index which we're drawing the window around} + +\item{window_size}{the size of the window on one side of \code{target_idx}} + +\item{modulus}{the number of days/weeks/months in the year, not including any leap days/weeks} +} +\description{ +generate the idx values within \code{window_size} of \code{target_idx} given that our +time value is of the type matching modulus +} +\keyword{internal} diff --git a/man/yday_leap.Rd b/man/yday_leap.Rd new file mode 100644 index 000000000..1dc873a38 --- /dev/null +++ b/man/yday_leap.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_climate.R +\name{yday_leap} +\alias{yday_leap} +\title{a function that assigns Feb 29th to 999, and aligns all other dates the same +number in the year, regardless of whether it's a leap year} +\usage{ +yday_leap(time_value) +} +\description{ +a function that assigns Feb 29th to 
999, and aligns all other dates the same +number in the year, regardless of whether it's a leap year +} +\keyword{internal} diff --git a/pkgdown-watch.R b/pkgdown-watch.R new file mode 100644 index 000000000..bd23406a3 --- /dev/null +++ b/pkgdown-watch.R @@ -0,0 +1,65 @@ +# Run with: Rscript pkgdown-watch.R +# +# Modifying this: https://gist.github.com/gadenbuie/d22e149e65591b91419e41ea5b2e0621 +# - Removed docopts cli interface and various configs/features I didn't need. +# - Sped up reference building by not running examples. +# +# Note that the `pattern` regex is case sensitive, so make sure your Rmd files +# end in `.Rmd` and not `.rmd`. +# +# Also I had issues with `pkgdown::build_reference()` not working, so I just run +# it manually when I need to. + +rlang::check_installed(c("pkgdown", "servr", "devtools", "here", "cli", "fs")) +library(pkgdown) +pkg <- pkgdown::as_pkgdown(here::here()) +devtools::document(here::here()) +devtools::build_readme() +pkgdown::build_articles(pkg) +pkgdown::build_site(pkg, lazy = FALSE, examples = FALSE, devel = TRUE, preview = FALSE) + +servr::httw( + dir = here::here("docs"), + watch = here::here(), + pattern = "[.](Rm?d|y?ml|s[ac]ss|css|js)$", + handler = function(files) { + devtools::load_all() + + files_rel <- fs::path_rel(files, start = getwd()) + cli::cli_inform("{cli::col_yellow('Updated')} {.val {files_rel}}") + + articles <- grep("vignettes.+Rmd$", files, value = TRUE) + + if (length(articles) == 1) { + name <- fs::path_ext_remove(fs::path_rel(articles, fs::path(pkg$src_path, "vignettes"))) + pkgdown::build_article(name, pkg) + } else if (length(articles) > 1) { + pkgdown::build_articles(pkg, preview = FALSE) + } + + refs <- grep("man.+R(m?d)?$", files, value = TRUE) + if (length(refs)) { + # Doesn't work for me, so I run it manually. 
+ # pkgdown::build_reference(pkg) # nolint: commented_code_linter + } + + pkgdown <- grep("pkgdown", files, value = TRUE) + if (length(pkgdown) && !pkgdown %in% c(articles, refs)) { + pkgdown::init_site(pkg) + } + + pkgdown_index <- grep("index[.]Rmd$", files_rel, value = TRUE) + if (length(pkgdown_index)) { + devtools::build_rmd(pkgdown_index) + pkgdown::build_home(pkg) + } + + readme <- grep("README[.]rmd$", files, value = TRUE, ignore.case = TRUE) + if (length(readme)) { + devtools::build_readme() + pkgdown::build_site(pkg, lazy = TRUE, examples = FALSE, devel = TRUE, preview = FALSE) + } + + cli::cli_alert("Site rebuild done!") + } +) diff --git a/tests/testthat.R b/tests/testthat.R index 296d916e8..27254bec7 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,8 @@ library(testthat) library(epipredict) +library(parsnip) +library(workflows) +library(dplyr) + test_check("epipredict") diff --git a/tests/testthat/_snaps/arx_args_list.md b/tests/testthat/_snaps/arx_args_list.md index 959a5e25b..2579c5f0c 100644 --- a/tests/testthat/_snaps/arx_args_list.md +++ b/tests/testthat/_snaps/arx_args_list.md @@ -124,6 +124,15 @@ # arx forecaster disambiguates quantiles + Code + compare_quantile_args(alist / 10, 1:9 / 10, "grf") + Condition + Error in `compare_quantile_args()`: + ! You have specified different, non-default, quantiles in the trainier and `arx_args` options. + i Please only specify quantiles in one location. + +--- + Code compare_quantile_args(alist, tlist) Condition diff --git a/tests/testthat/_snaps/bake-method.md b/tests/testthat/_snaps/bake-method.md index eee28cc4b..0e8aeb78b 100644 --- a/tests/testthat/_snaps/bake-method.md +++ b/tests/testthat/_snaps/bake-method.md @@ -3,7 +3,7 @@ Code bake(prep(r, edf), NULL, composition = "matrix") Condition - Error in `hardhat::recompose()`: + Error in `juice()`: ! `data` must only contain numeric columns. i These columns aren't numeric: "geo_value" and "time_value". 
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md new file mode 100644 index 000000000..4a6ff336d --- /dev/null +++ b/tests/testthat/_snaps/check_enough_data.md @@ -0,0 +1,54 @@ +# check_enough_data works on pooled data + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n + 1, + drop_na = FALSE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +--- + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 1, + drop_na = TRUE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x. + +# check_enough_data works on unpooled data + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = n + 1, + epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +--- + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 3, + epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +# check_enough_data only checks train data when skip = FALSE + + Code + forecaster %>% predict(new_data = toy_test_data %>% filter(time_value > + "2020-01-08")) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to predict: x. + +# check_enough_data works with all_predictors() downstream of constructed terms + + Code + epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data( + all_predictors(), y, min_observations = 2 * n - 4) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! 
The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y. + diff --git a/tests/testthat/_snaps/check_enough_train_data.md b/tests/testthat/_snaps/check_enough_train_data.md deleted file mode 100644 index 8f2389acb..000000000 --- a/tests/testthat/_snaps/check_enough_train_data.md +++ /dev/null @@ -1,46 +0,0 @@ -# check_enough_train_data works on pooled data - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n + 1, - drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - ---- - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 1, - drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - -# check_enough_train_data works on unpooled data - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", - drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - ---- - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 3, - epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - -# check_enough_train_data works with all_predictors() downstream of constructed terms - - Code - epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% prep( - toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y. 
- diff --git a/tests/testthat/_snaps/climatological_forecaster.md b/tests/testthat/_snaps/climatological_forecaster.md new file mode 100644 index 000000000..78043a20b --- /dev/null +++ b/tests/testthat/_snaps/climatological_forecaster.md @@ -0,0 +1,112 @@ +# climate args list validates properly + + Code + climate_args_list(forecast_date = 12345) + Condition + Error in `climate_args_list()`: + ! `forecast_date` must be a date. + +--- + + Code + climate_args_list(forecast_date = as.Date(c("2021-01-10", "2024-01-22"))) + Condition + Error in `climate_args_list()`: + ! `forecast_date` must be a scalar. + +--- + + Code + climate_args_list(forecast_horizon = 1.3) + Condition + Error in `climate_args_list()`: + ! `forecast_horizon` must be a integer. + +--- + + Code + climate_args_list(window_size = -1) + Condition + Error in `climate_args_list()`: + ! `window_size` must be a non-negative integer. + +--- + + Code + climate_args_list(window_size = 2.5) + Condition + Error in `climate_args_list()`: + ! `window_size` must be a non-negative integer. + +--- + + Code + climate_args_list(window_size = 1:3) + Condition + Error in `climate_args_list()`: + ! `window_size` must be a scalar. + +--- + + Code + climate_args_list(quantile_levels = -1) + Condition + Error in `climate_args_list()`: + ! `quantile_levels` must lie in [0, 1]. + +--- + + Code + climate_args_list(quantile_levels = 1.3) + Condition + Error in `climate_args_list()`: + ! `quantile_levels` must lie in [0, 1]. + +--- + + Code + climate_args_list(symmetrize = 2.5) + Condition + Error in `climate_args_list()`: + ! `symmetrize` must be of type . + +--- + + Code + climate_args_list(symmetrize = c(TRUE, TRUE)) + Condition + Error in `climate_args_list()`: + ! `symmetrize` must be a scalar. + +--- + + Code + climate_args_list(nonneg = 2.5) + Condition + Error in `climate_args_list()`: + ! `nonneg` must be of type . 
+ +--- + + Code + climate_args_list(nonneg = c(TRUE, TRUE)) + Condition + Error in `climate_args_list()`: + ! `nonneg` must be a scalar. + +--- + + Code + climate_args_list(quantile_by_key = TRUE) + Condition + Error in `climate_args_list()`: + ! `quantile_by_key` must be of type . + +--- + + Code + climate_args_list(quantile_by_key = 2:3) + Condition + Error in `climate_args_list()`: + ! `quantile_by_key` must be of type . + diff --git a/tests/testthat/_snaps/dist_quantiles.md b/tests/testthat/_snaps/dist_quantiles.md deleted file mode 100644 index 1d626e089..000000000 --- a/tests/testthat/_snaps/dist_quantiles.md +++ /dev/null @@ -1,56 +0,0 @@ -# constructor returns reasonable quantiles - - Code - new_quantiles(rnorm(5), c(-2, -1, 0, 1, 2)) - Condition - Error in `new_quantiles()`: - ! `quantile_levels` must lie in [0, 1]. - ---- - - Code - new_quantiles(sort(rnorm(5)), sort(runif(2))) - Condition - Error in `new_quantiles()`: - ! length(values) == length(quantile_levels) is not TRUE - ---- - - Code - new_quantiles(c(2, 1, 3, 4, 5), c(0.1, 0.1, 0.2, 0.5, 0.8)) - Condition - Error in `new_quantiles()`: - ! !vctrs::vec_duplicate_any(quantile_levels) is not TRUE - ---- - - Code - new_quantiles(c(2, 1, 3, 4, 5), c(0.1, 0.15, 0.2, 0.5, 0.8)) - Condition - Error in `new_quantiles()`: - ! `values[order(quantile_levels)]` produces unsorted quantiles. - ---- - - Code - new_quantiles(c(1, 2, 3), c(0.1, 0.2, 3)) - Condition - Error in `new_quantiles()`: - ! `quantile_levels` must lie in [0, 1]. - -# arithmetic works on quantiles - - Code - sum(dstn) - Condition - Error in `mapply()`: - ! You can't perform arithmetic between two distributions like this. - ---- - - Code - suppressWarnings(dstn + distributional::dist_normal()) - Condition - Error: - ! 
non-numeric argument to binary operator - diff --git a/tests/testthat/_snaps/enframer.md b/tests/testthat/_snaps/enframer.md deleted file mode 100644 index 4b05dbff3..000000000 --- a/tests/testthat/_snaps/enframer.md +++ /dev/null @@ -1,32 +0,0 @@ -# enframer errors/works as needed - - Code - enframer(1:5, letters[1]) - Condition - Error in `enframer()`: - ! is.data.frame(df) is not TRUE - ---- - - Code - enframer(data.frame(a = 1:5), 1:3) - Condition - Error in `enframer()`: - ! `x` must be of type . - ---- - - Code - enframer(data.frame(a = 1:5), letters[1:3]) - Condition - Error in `enframer()`: - ! In enframer: some new cols match existing column names - ---- - - Code - enframer(data.frame(aa = 1:5), letters[1:2], fill = 1:4) - Condition - Error in `enframer()`: - ! length(fill) == 1 || length(fill) == nrow(df) is not TRUE - diff --git a/tests/testthat/_snaps/epi_recipe.md b/tests/testthat/_snaps/epi_recipe.md index 24b046678..c63bf8f07 100644 --- a/tests/testthat/_snaps/epi_recipe.md +++ b/tests/testthat/_snaps/epi_recipe.md @@ -3,8 +3,8 @@ Code epi_recipe(tib) Condition - Error in `epi_recipe()`: - ! `x` must be an or a , not a . + Error in `UseMethod()`: + ! no applicable method for 'epi_recipe' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')" --- @@ -19,8 +19,8 @@ Code epi_recipe(m) Condition - Error in `epi_recipe()`: - ! `x` must be an or a , not a . + Error in `UseMethod()`: + ! 
no applicable method for 'epi_recipe' applied to an object of class "c('matrix', 'array', 'character')" # add/update/adjust/remove epi_recipe works as intended diff --git a/tests/testthat/_snaps/get_test_data.md b/tests/testthat/_snaps/get_test_data.md index e65b0715c..22d0c942a 100644 --- a/tests/testthat/_snaps/get_test_data.md +++ b/tests/testthat/_snaps/get_test_data.md @@ -1,7 +1,7 @@ # expect insufficient training data error Code - get_test_data(recipe = r, x = case_death_rate_subset) + get_test_data(recipe = r, x = covid_case_death_rates) Condition Error in `get_test_data()`: ! You supplied insufficient recent data for this recipe. diff --git a/tests/testthat/_snaps/layers.md b/tests/testthat/_snaps/layers.md index a0474eab6..7f208f2e6 100644 --- a/tests/testthat/_snaps/layers.md +++ b/tests/testthat/_snaps/layers.md @@ -3,7 +3,7 @@ Code update(f$layers[[1]], lower = 100) Condition - Error in `recipes:::update_fields()`: + Error in `update()`: ! The step you are trying to update, `layer_predict()`, does not have the lower field. --- @@ -19,6 +19,6 @@ Code update(f$layers[[2]], bad_param = 100) Condition - Error in `recipes:::update_fields()`: + Error in `update()`: ! The step you are trying to update, `layer_threshold()`, does not have the bad_param field. diff --git a/tests/testthat/_snaps/pivot_quantiles.md b/tests/testthat/_snaps/pivot_quantiles.md index 184eb62a6..ea027d50c 100644 --- a/tests/testthat/_snaps/pivot_quantiles.md +++ b/tests/testthat/_snaps/pivot_quantiles.md @@ -3,49 +3,56 @@ Code pivot_quantiles_wider(tib, a) Condition - Error in `UseMethod()`: - ! no applicable method for 'family' applied to an object of class "c('integer', 'numeric')" + Error in `pivot_quantiles_wider()`: + ! `a` is not <`quantile_pred`>. Cannot pivot it. --- Code - pivot_quantiles_wider(tib, c) + pivot_quantiles_wider(tib, d1, d2) Condition - Error in `validate_pivot_quantiles()`: - ! Variables(s) `c` are not `dist_quantiles`. Cannot pivot them. 
+ Error in `pivot_quantiles_wider()`: + ! Only one column can be pivotted. Can not pivot all of: `d1` and `d2`. --- Code - pivot_quantiles_wider(tib, d1) + pivot_quantiles_longer(tib, d1, d2) Condition - Error in `pivot_quantiles_wider()`: - ! Quantiles must be the same length and have the same set of taus. - i Check failed for variables(s) `d1`. + Error in `pivot_quantiles_longer()`: + ! Only one column can be pivotted. Can not pivot all of: `d1` and `d2`. # quantile pivotting longer behaves Code pivot_quantiles_longer(tib, a) Condition - Error in `UseMethod()`: - ! no applicable method for 'family' applied to an object of class "c('integer', 'numeric')" + Error in `pivot_quantiles_longer()`: + ! `a` is not <`quantile_pred`>. Cannot pivot it. --- Code - pivot_quantiles_longer(tib, c) + pivot_quantiles_longer(tib, d1, d2) Condition - Error in `validate_pivot_quantiles()`: - ! Variables(s) `c` are not `dist_quantiles`. Cannot pivot them. + Error in `pivot_quantiles_longer()`: + ! Only one column can be pivotted. Can not pivot all of: `d1` and `d2`. + +# nested_quantiles is deprecated, but works where possible + + Code + d <- dist_quantiles(list(1:4, 2:5), 1:4 / 5) + Condition + Warning: + `dist_quantiles()` was deprecated in epipredict 0.1.11. + i Please use `hardhat::quantile_pred()` instead. --- Code - pivot_quantiles_longer(tib, d1, d3) + o <- nested_quantiles(d) Condition - Error in `pivot_quantiles_longer()`: - ! Some selected columns contain different numbers of quantiles. - The result would be a very long . - To do this anyway, rerun with `.ignore_length_check = TRUE`. + Warning: + `nested_quantiles()` was deprecated in epipredict 0.1.11. + i Please use `hardhat::quantile_pred()` instead. 
diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md index 9263e8e1e..99714e71c 100644 --- a/tests/testthat/_snaps/population_scaling.md +++ b/tests/testthat/_snaps/population_scaling.md @@ -4,7 +4,7 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) Condition Error in `hardhat::validate_column_names()`: - ! The following required columns are missing: 'a'. + ! The required column "a" is missing. --- @@ -12,5 +12,5 @@ forecast(wf) Condition Error in `hardhat::validate_column_names()`: - ! The following required columns are missing: 'nothere'. + ! The required column "nothere" is missing. diff --git a/tests/testthat/_snaps/quantile_pred.md b/tests/testthat/_snaps/quantile_pred.md new file mode 100644 index 000000000..dd13dcb86 --- /dev/null +++ b/tests/testthat/_snaps/quantile_pred.md @@ -0,0 +1,8 @@ +# arithmetic works on quantiles + + Code + sum(dstn) + Condition + Error in `vec_math()`: + ! `sum()` is not a supported operation for . 
+ diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 84abf57d2..b2972a8a0 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -2,100 +2,76 @@ structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.1393442, 0.103199, 0.3121244, 0.4218461, 0.7319844, - 0.1975426), .pred_distn = structure(list(structure(list(values = c(0, - 0.34820911), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.31206391), quantile_levels = c(0.05, 0.95 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.10325949, 0.52098931 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.21298119, 0.63071101), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.52311949, 0.94084931 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.40640751), quantile_levels = c(0.05, 0.95 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", "list" - )), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(18999, 18999, - 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, + 0.1975426), .pred_distn = structure(list(c(0, 0.00989957999999999, + 0.09353595, 0.1393442, 0.18515245, 0.26878882, 0.34820911), c(0, + 0, 0.05739075, 0.103199, 0.14900725, 0.23264362, 0.31206391), + c(0.10325949, 0.18267978, 0.26631615, 0.3121244, 0.35793265, + 0.44156902, 0.52098931), c(0.21298119, 0.29240148, 0.37603785, + 0.4218461, 0.46765435, 0.55129072, 0.63071101), 
c(0.52311949, + 0.60253978, 0.68617615, 0.7319844, 0.77779265, 0.86142902, + 0.94084931), c(0, 0.06809798, 0.15173435, 0.1975426, 0.24335085, + 0.32698722, 0.40640751)), quantile_levels = c(0.05, 0.1, + 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", + "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, + 18992, 18992), class = "Date"), target_date = structure(c(18999, + 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) --- structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.1393442, 0.103199, 0.3121244, 0.4218461, 0.7319844, - 0.1975426), .pred_distn = structure(list(structure(list(values = c(0.084583345, - 0.194105055), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.048438145, 0.157959855), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.257363545, 0.366885255 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.367085245, 0.476606955), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.677223545, 0.786745255 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.142781745, 0.252303455), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", "list" - )), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(18993, 18993, - 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, + 0.1975426), .pred_distn = 
structure(list(c(0.084583345, 0.1073314, + 0.1292864, 0.1393442, 0.149402, 0.171357, 0.194105055), c(0.048438145, + 0.0711862, 0.0931412, 0.103199, 0.1132568, 0.1352118, 0.157959855 + ), c(0.257363545, 0.2801116, 0.3020666, 0.3121244, 0.3221822, + 0.3441372, 0.366885255), c(0.367085245, 0.3898333, 0.4117883, + 0.4218461, 0.4319039, 0.4538589, 0.476606955), c(0.677223545, + 0.6999716, 0.7219266, 0.7319844, 0.7420422, 0.7639972, 0.786745255 + ), c(0.142781745, 0.1655298, 0.1874848, 0.1975426, 0.2076004, + 0.2295554, 0.252303455)), quantile_levels = c(0.05, 0.1, 0.25, + 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", + "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, + 18992, 18992), class = "Date"), target_date = structure(c(18993, + 18993, 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) --- structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.1393442, 0.103199, 0.3121244, 0.4218461, 0.7319844, - 0.1975426), .pred_distn = structure(list(structure(list(values = c(0, - 0.34820911), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.31206391), quantile_levels = c(0.05, 0.95 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.10325949, 0.52098931 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.21298119, 0.63071101), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.52311949, 0.94084931 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.40640751), quantile_levels = c(0.05, 0.95 - )), class = c("dist_quantiles", 
"dist_default", "vctrs_rcrd", - "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", "list" - )), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(18999, 18999, - 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, + 0.1975426), .pred_distn = structure(list(c(0, 0.00989957999999999, + 0.09353595, 0.1393442, 0.18515245, 0.26878882, 0.34820911), c(0, + 0, 0.05739075, 0.103199, 0.14900725, 0.23264362, 0.31206391), + c(0.10325949, 0.18267978, 0.26631615, 0.3121244, 0.35793265, + 0.44156902, 0.52098931), c(0.21298119, 0.29240148, 0.37603785, + 0.4218461, 0.46765435, 0.55129072, 0.63071101), c(0.52311949, + 0.60253978, 0.68617615, 0.7319844, 0.77779265, 0.86142902, + 0.94084931), c(0, 0.06809798, 0.15173435, 0.1975426, 0.24335085, + 0.32698722, 0.40640751)), quantile_levels = c(0.05, 0.1, + 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", + "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, + 18992, 18992), class = "Date"), target_date = structure(c(18999, + 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) --- structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.1393442, 0.103199, 0.3121244, 0.4218461, 0.7319844, - 0.1975426), .pred_distn = structure(list(structure(list(values = c(0, - 0.34820911), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.31206391), quantile_levels = c(0.05, 0.95 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.10325949, 0.52098931 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.21298119, 0.63071101), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", 
"vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.52311949, 0.94084931 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.40640751), quantile_levels = c(0.05, 0.95 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", "list" - )), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(18993, 18993, - 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, + 0.1975426), .pred_distn = structure(list(c(0, 0.00989957999999999, + 0.09353595, 0.1393442, 0.18515245, 0.26878882, 0.34820911), c(0, + 0, 0.05739075, 0.103199, 0.14900725, 0.23264362, 0.31206391), + c(0.10325949, 0.18267978, 0.26631615, 0.3121244, 0.35793265, + 0.44156902, 0.52098931), c(0.21298119, 0.29240148, 0.37603785, + 0.4218461, 0.46765435, 0.55129072, 0.63071101), c(0.52311949, + 0.60253978, 0.68617615, 0.7319844, 0.77779265, 0.86142902, + 0.94084931), c(0, 0.06809798, 0.15173435, 0.1975426, 0.24335085, + 0.32698722, 0.40640751)), quantile_levels = c(0.05, 0.1, + 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", + "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, + 18992, 18992), class = "Date"), target_date = structure(c(18993, + 18993, 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) # cdc_baseline_forecaster snapshots @@ -110,279 +86,160 @@ 0.7319844, 0.7319844, 0.7319844, 0.1975426, 0.1975426, 0.1975426, 0.1975426, 0.1975426), ahead = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, - 4L, 5L, 1L, 2L, 3L, 4L, 5L), .pred_distn = structure(list(structure(list( - values = c(0, 0, 0, 0.05519342, 0.082372705, 0.0936219, 0.1048711, - 0.1157573, 0.12317806, 0.1302723, 0.1353526, 0.1393442, 0.1433358, - 0.1484161, 
0.15551034, 0.1629311, 0.1738173, 0.1850665, 0.196315695, - 0.22349498, 0.309768685, 0.3567520625, 0.439580229), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0335550493877939, + 4L, 5L, 1L, 2L, 3L, 4L, 5L), .pred_distn = structure(list(c(0, + 0, 0, 0.05519342, 0.082372705, 0.0936219, 0.1048711, 0.1157573, + 0.12317806, 0.1302723, 0.1353526, 0.1393442, 0.1433358, 0.1484161, + 0.15551034, 0.1629311, 0.1738173, 0.1850665, 0.196315695, 0.22349498, + 0.309768685, 0.3567520625, 0.439580229), c(0, 0, 0, 0, 0.0335550493877939, 0.0604073208819088, 0.0796881899581496, 0.0945180888333883, 0.107218788833388, 0.118830788833388, 0.129717088833388, 0.1393442, 0.148949488833388, 0.159110072060821, 0.171080110623306, 0.184009705322953, 0.19866346102411, 0.218798896615666, 0.250961850618106, 0.300471354816148, 0.368582781136862, - 0.43909595699107, 0.520101234797705), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.0310685196688967, + 0.43909595699107, 0.520101234797705), c(0, 0, 0, 0, 0, 0.0310685196688967, 0.0565901050435504, 0.0768417663716637, 0.0947104815343153, 0.110553706525765, 0.125192081534315, 0.1393442, 0.153133424194392, 0.167807181271713, 0.183769310145952, 0.202099979390294, 0.224139947221972, 0.252840918770688, 0.291417895572206, 0.341073550318203, 0.420604597710477, 0.494523225410904, - 0.573647294116801), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", 
"vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.00623643594225938, 0.0360877950479505, - 0.0604332430739307, 0.0824028153516535, 0.102509343235732, - 0.121439405653606, 0.1393442, 0.15780837904264, 0.176333479766098, - 0.1971089199637, 0.219859545844459, 0.246500872561225, 0.279163385675357, - 0.320379296602716, 0.374497727839579, 0.458894379633346, - 0.535545067037845, 0.628776504364044), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0, 0.0192048017017668, - 0.0478501821296211, 0.0723167026720766, 0.0958385084225842, 0.11812331897399, - 0.1393442, 0.161074539705197, 0.184026763327133, 0.207844848454635, - 0.23407004803228, 0.265166265836908, 0.302137478236883, 0.346008752873429, - 0.403205598400084, 0.495260096430714, 0.574198142463125, 0.672941852619816 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0, 0, 0.016465765, 0.03549514, 0.05225675, 0.0644172, 0.0749343, + 0.573647294116801), c(0, 0, 0, 0, 0, 0.00623643594225938, 0.0360877950479505, + 0.0604332430739307, 0.0824028153516535, 0.102509343235732, 0.121439405653606, + 0.1393442, 0.15780837904264, 0.176333479766098, 0.1971089199637, + 0.219859545844459, 0.246500872561225, 0.279163385675357, 0.320379296602716, + 0.374497727839579, 0.458894379633346, 0.535545067037845, 0.628776504364044 + ), c(0, 0, 0, 0, 0, 0, 0.0192048017017668, 0.0478501821296211, + 0.0723167026720766, 0.0958385084225842, 0.11812331897399, 0.1393442, + 0.161074539705197, 0.184026763327133, 0.207844848454635, 0.23407004803228, + 0.265166265836908, 0.302137478236883, 
0.346008752873429, 0.403205598400084, + 0.495260096430714, 0.574198142463125, 0.672941852619816), c(0, + 0, 0, 0, 0.016465765, 0.03549514, 0.05225675, 0.0644172, 0.0749343, 0.0847941, 0.0966258, 0.103199, 0.1097722, 0.1216039, 0.1314637, 0.1419808, 0.15414125, 0.17090286, 0.189932235, 0.22848398, 0.30542311, - 0.40216399, 0.512353658), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.00331296053340532, 0.0234804643776438, - 0.0414109089650896, 0.0579040140087902, 0.0738391473860739, - 0.0882882738549385, 0.103199, 0.118522737211872, 0.134217143129031, - 0.15174910202592, 0.17076597900759, 0.192368859892349, 0.218887, - 0.254338497855279, 0.307871753369934, 0.407530532639726, - 0.506824682189646, 0.607973477267732), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0, 0, - 0.0185864520320203, 0.0411215858914089, 0.062281046686267, 0.0828222124563246, - 0.103199, 0.123575888447284, 0.144785989158292, 0.167277039342293, - 0.192536265178252, 0.221677797769728, 0.256887836856768, 0.302366681512415, - 0.3669383199518, 0.476508917333523, 0.574293059865274, 0.69194511433946 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0, 0, 0, 0, 0, 0, 0.0271019287070871, 0.0535555494987951, 0.0785514374097741, - 0.103199, 0.128043832742677, 0.154157375592856, 0.181874602598776, - 0.212708648669987, 0.247608381738568, 
0.289082621291513, 0.342486159511745, - 0.41300665395314, 0.52870334697862, 0.634316186092986, 0.767614547228429 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0, 0, 0, 0, 0, 0, 0.0118725894981448, 0.0439446210512103, 0.0736366703227029, - 0.103199, 0.133138617710077, 0.16357656105121, 0.19575459701827, - 0.230475760859608, 0.269323345322203, 0.314976554734947, 0.373424338576786, - 0.452807955824158, 0.578141866759416, 0.690542571738594, 0.837295153768033 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0.0813658, 0.14899276, 0.1960782, 0.22542314, 0.2414296, 0.25890318, - 0.2747762, 0.2881148, 0.3027873, 0.3121244, 0.3214615, 0.336134, - 0.3494726, 0.36534562, 0.3828192, 0.39882566, 0.4281706, 0.47525604, - 0.542883, 0.682805397499999, 0.798878314999999), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0.0706949, - 0.1267172, 0.1667331, 0.198582473624236, 0.225423180397104, 0.2494327, - 0.2707747, 0.292116312116921, 0.3121244, 0.3321324, 0.353072222341423, - 0.375089999249792, 0.3988256, 0.425831930221552, 0.459232792604326, - 0.501467782274773, 0.562188443556836, 0.685648485782108, 0.80647163752115, - 0.939224788489265), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - 
"dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0.0704696868359684, 0.121407167925079, - 0.161930580284053, 0.197682797539976, 0.228361656891269, - 0.257706650923509, 0.285717780926109, 0.3121244, 0.338115598498035, - 0.365749693067931, 0.395921877240673, 0.427437934626446, - 0.462388578749537, 0.504066064225642, 0.558443518811788, - 0.636013559040791, 0.771225883005179, 0.89210797204162, 1.02314875759509 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, - 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, - 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0.0247190015881658, 0.0834693973257732, - 0.131490031120311, 0.173258318827988, 0.211213742349423, - 0.246202447408474, 0.279926744217642, 0.3121244, 0.344908347408474, - 0.378255200773608, 0.412935547408474, 0.45191576510605, 0.494757615230152, - 0.545060918490786, 0.609312182129471, 0.69704881099591, 0.838550239412991, - 0.962653262246773, 1.11351403170759), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.0501392705767058, - 0.104248897713977, 0.151994400390804, 0.195087767727627, 0.235544124698047, - 0.274058107118071, 0.3121244, 0.350571341810268, 0.390274666572666, - 0.43048632300908, 0.474320393891039, 0.523839613390634, 0.581010268149082, - 0.652137495469405, 0.748428674762348, 0.898563270096551, 1.03273295410124, - 1.19211145220822), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0.2148017, 0.31250597, 
0.350183905, 0.3745639, - 0.3884161, 0.39746621, 0.404854, 0.4115031, 0.417413315, - 0.4218461, 0.426278885, 0.4321891, 0.4388382, 0.44622599, - 0.4552761, 0.4691283, 0.493508295, 0.53118623, 0.628890499999999, - 1.22043540499999, 1.95905017899999), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0.212369462232823, - 0.289571577546325, 0.324446887783878, 0.351262144469445, 0.37087, - 0.3863844, 0.399682509835098, 0.411036898891089, 0.4218461, 0.432927818676137, - 0.444338520819208, 0.4573077, 0.4728222, 0.492817749438994, 0.519442857224172, - 0.556165331447064, 0.635946057886079, 1.18402232252562, 1.7827032389242, - 2.5561261649726), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.0413098183761837, 0.216633655848608, - 0.28006329699657, 0.3175577049983, 0.345923291761818, 0.368957399144641, - 0.38804556403724, 0.405400893204282, 0.4218461, 0.43864616004845, - 0.456105937661177, 0.475585378227632, 0.499018124730147, - 0.5270891900114, 0.564293444378844, 0.630730263388634, 0.898212235100651, - 1.53976520159876, 2.08228809477582, 2.80588762256078), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.114729892920429, - 0.227785958288583, 0.282278878729037, 0.320407599201492, 0.350577823459785, - 0.37665230304923, 0.39981364198757, 0.4218461, 0.444009706175862, - 0.466962725214852, 0.493098379685547, 
0.523708407392674, 0.562100740111401, - 0.619050517814778, 0.754868363055733, 1.1177263295869, 1.76277018354499, - 2.37278671910076, 2.9651652434047), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0100954501382014, - 0.165091099860099, 0.244964334392844, 0.294577054174442, 0.333357739419644, - 0.365251480804308, 0.394198909379894, 0.4218461, 0.449607812233022, - 0.479120513116631, 0.511271131674317, 0.5506402899964, 0.60295411796593, - 0.690751300611906, 0.913578722060166, 1.30856988553206, 1.94020220543606, - 2.57104934168037, 3.07139639379724), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.303454977, 0.3982330425, - 0.46791125, 0.57642367, 0.631462275, 0.6694025, 0.685048, 0.69857015, - 0.7085162, 0.71633898, 0.7252792, 0.7319844, 0.7386896, 0.74762982, - 0.7554526, 0.76539865, 0.7789208, 0.7945663, 0.832506525, 0.88754513, - 0.99605755, 1.0657357575, 1.160513823), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.188727136659627, - 0.292714653217782, 0.380882595473705, 0.476427609604196, 0.5464739, - 0.6001155, 0.636506664263643, 0.6638148, 0.684726301742618, 0.701811, - 0.7174565, 0.7319844, 0.7465124, 0.7621578, 0.779322149415794, - 0.800154, 0.826981204292293, 0.8649709, 0.918345662372574, 0.987315641681917, - 1.08210087899389, 1.17564510102166, 1.27428433325155), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 
0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.0928040444059739, - 0.212569233904214, 0.310718449102641, 0.418013562853928, 0.489917936424114, - 0.546885925424654, 0.593410228218282, 0.631406259421094, 0.661579628218282, - 0.687282906872069, 0.710456666258662, 0.7319844, 0.754131389282943, - 0.776685628218282, 0.802388976168662, 0.832758896293562, 0.869440928218282, - 0.916359694097141, 0.97403912794778, 1.04529048496565, 1.15710382277548, - 1.25675656404419, 1.37098330871205), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.0108404989744699, - 0.144337973117581, 0.250292371898569, 0.367310419323293, 0.44444044802193, - 0.506592035751958, 0.558428768125431, 0.602035095628756, 0.64112383905529, - 0.674354964141041, 0.703707875219752, 0.7319844, 0.760702196782168, - 0.78975826405844, 0.823427572594726, 0.860294897090771, 0.904032120658957, - 0.955736581115011, 1.0165945004053, 1.09529786576616, 1.21614421175967, - 1.32331604019295, 1.45293812780298), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0.0783919019408445, - 0.19440762901709, 0.323264916745368, 0.407999619319143, 0.474764568463685, - 0.530890671381964, 0.580852443909739, 0.623441748828038, 0.661393469870099, - 0.69827126098506, 0.7319844, 0.766440770218252, 0.802260162496625, - 0.840536805657307, 0.883133954556946, 0.931565607767828, 0.98815401699637, - 1.05406790404239, 1.138596250043, 1.27030064370239, 
1.39007785503355, - 1.5343628053761), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0.012845105, 0.07040502, 0.09495188, 0.12669976, - 0.1502248, 0.1659163, 0.1761341, 0.18586528, 0.191290375, - 0.1975426, 0.203794825, 0.20921992, 0.2189511, 0.2291689, - 0.2448604, 0.26838544, 0.30013332, 0.32468018, 0.382240095, - 0.5020427625, 0.590302013999998), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0.0133856545472455, + 0.40216399, 0.512353658), c(0, 0, 0, 0, 0, 0.00331296053340532, + 0.0234804643776438, 0.0414109089650896, 0.0579040140087902, 0.0738391473860739, + 0.0882882738549385, 0.103199, 0.118522737211872, 0.134217143129031, + 0.15174910202592, 0.17076597900759, 0.192368859892349, 0.218887, + 0.254338497855279, 0.307871753369934, 0.407530532639726, 0.506824682189646, + 0.607973477267732), c(0, 0, 0, 0, 0, 0, 0, 0.0185864520320203, + 0.0411215858914089, 0.062281046686267, 0.0828222124563246, 0.103199, + 0.123575888447284, 0.144785989158292, 0.167277039342293, 0.192536265178252, + 0.221677797769728, 0.256887836856768, 0.302366681512415, 0.3669383199518, + 0.476508917333523, 0.574293059865274, 0.69194511433946), c(0, + 0, 0, 0, 0, 0, 0, 0, 0.0271019287070871, 0.0535555494987951, + 0.0785514374097741, 0.103199, 0.128043832742677, 0.154157375592856, + 0.181874602598776, 0.212708648669987, 0.247608381738568, 0.289082621291513, + 0.342486159511745, 0.41300665395314, 0.52870334697862, 0.634316186092986, + 0.767614547228429), c(0, 0, 0, 0, 0, 0, 0, 0, 0.0118725894981448, + 0.0439446210512103, 0.0736366703227029, 0.103199, 
0.133138617710077, + 0.16357656105121, 0.19575459701827, 0.230475760859608, 0.269323345322203, + 0.314976554734947, 0.373424338576786, 0.452807955824158, 0.578141866759416, + 0.690542571738594, 0.837295153768033), c(0, 0, 0.0813658, 0.14899276, + 0.1960782, 0.22542314, 0.2414296, 0.25890318, 0.2747762, 0.2881148, + 0.3027873, 0.3121244, 0.3214615, 0.336134, 0.3494726, 0.36534562, + 0.3828192, 0.39882566, 0.4281706, 0.47525604, 0.542883, 0.682805397499999, + 0.798878314999999), c(0, 0, 0, 0.0706949, 0.1267172, 0.1667331, + 0.198582473624236, 0.225423180397104, 0.2494327, 0.2707747, 0.292116312116921, + 0.3121244, 0.3321324, 0.353072222341423, 0.375089999249792, 0.3988256, + 0.425831930221552, 0.459232792604326, 0.501467782274773, 0.562188443556836, + 0.685648485782108, 0.80647163752115, 0.939224788489265), c(0, + 0, 0, 0, 0.0704696868359684, 0.121407167925079, 0.161930580284053, + 0.197682797539976, 0.228361656891269, 0.257706650923509, 0.285717780926109, + 0.3121244, 0.338115598498035, 0.365749693067931, 0.395921877240673, + 0.427437934626446, 0.462388578749537, 0.504066064225642, 0.558443518811788, + 0.636013559040791, 0.771225883005179, 0.89210797204162, 1.02314875759509 + ), c(0, 0, 0, 0, 0.0247190015881658, 0.0834693973257732, 0.131490031120311, + 0.173258318827988, 0.211213742349423, 0.246202447408474, 0.279926744217642, + 0.3121244, 0.344908347408474, 0.378255200773608, 0.412935547408474, + 0.45191576510605, 0.494757615230152, 0.545060918490786, 0.609312182129471, + 0.69704881099591, 0.838550239412991, 0.962653262246773, 1.11351403170759 + ), c(0, 0, 0, 0, 0, 0.0501392705767058, 0.104248897713977, 0.151994400390804, + 0.195087767727627, 0.235544124698047, 0.274058107118071, 0.3121244, + 0.350571341810268, 0.390274666572666, 0.43048632300908, 0.474320393891039, + 0.523839613390634, 0.581010268149082, 0.652137495469405, 0.748428674762348, + 0.898563270096551, 1.03273295410124, 1.19211145220822), c(0, + 0, 0.2148017, 0.31250597, 0.350183905, 0.3745639, 
0.3884161, + 0.39746621, 0.404854, 0.4115031, 0.417413315, 0.4218461, 0.426278885, + 0.4321891, 0.4388382, 0.44622599, 0.4552761, 0.4691283, 0.493508295, + 0.53118623, 0.628890499999999, 1.22043540499999, 1.95905017899999 + ), c(0, 0, 0, 0.212369462232823, 0.289571577546325, 0.324446887783878, + 0.351262144469445, 0.37087, 0.3863844, 0.399682509835098, 0.411036898891089, + 0.4218461, 0.432927818676137, 0.444338520819208, 0.4573077, 0.4728222, + 0.492817749438994, 0.519442857224172, 0.556165331447064, 0.635946057886079, + 1.18402232252562, 1.7827032389242, 2.5561261649726), c(0, 0, + 0, 0.0413098183761837, 0.216633655848608, 0.28006329699657, 0.3175577049983, + 0.345923291761818, 0.368957399144641, 0.38804556403724, 0.405400893204282, + 0.4218461, 0.43864616004845, 0.456105937661177, 0.475585378227632, + 0.499018124730147, 0.5270891900114, 0.564293444378844, 0.630730263388634, + 0.898212235100651, 1.53976520159876, 2.08228809477582, 2.80588762256078 + ), c(0, 0, 0, 0, 0.114729892920429, 0.227785958288583, 0.282278878729037, + 0.320407599201492, 0.350577823459785, 0.37665230304923, 0.39981364198757, + 0.4218461, 0.444009706175862, 0.466962725214852, 0.493098379685547, + 0.523708407392674, 0.562100740111401, 0.619050517814778, 0.754868363055733, + 1.1177263295869, 1.76277018354499, 2.37278671910076, 2.9651652434047 + ), c(0, 0, 0, 0, 0.0100954501382014, 0.165091099860099, 0.244964334392844, + 0.294577054174442, 0.333357739419644, 0.365251480804308, 0.394198909379894, + 0.4218461, 0.449607812233022, 0.479120513116631, 0.511271131674317, + 0.5506402899964, 0.60295411796593, 0.690751300611906, 0.913578722060166, + 1.30856988553206, 1.94020220543606, 2.57104934168037, 3.07139639379724 + ), c(0.303454977, 0.3982330425, 0.46791125, 0.57642367, 0.631462275, + 0.6694025, 0.685048, 0.69857015, 0.7085162, 0.71633898, 0.7252792, + 0.7319844, 0.7386896, 0.74762982, 0.7554526, 0.76539865, 0.7789208, + 0.7945663, 0.832506525, 0.88754513, 0.99605755, 1.0657357575, + 1.160513823), 
c(0.188727136659627, 0.292714653217782, 0.380882595473705, + 0.476427609604196, 0.5464739, 0.6001155, 0.636506664263643, 0.6638148, + 0.684726301742618, 0.701811, 0.7174565, 0.7319844, 0.7465124, + 0.7621578, 0.779322149415794, 0.800154, 0.826981204292293, 0.8649709, + 0.918345662372574, 0.987315641681917, 1.08210087899389, 1.17564510102166, + 1.27428433325155), c(0.0928040444059739, 0.212569233904214, 0.310718449102641, + 0.418013562853928, 0.489917936424114, 0.546885925424654, 0.593410228218282, + 0.631406259421094, 0.661579628218282, 0.687282906872069, 0.710456666258662, + 0.7319844, 0.754131389282943, 0.776685628218282, 0.802388976168662, + 0.832758896293562, 0.869440928218282, 0.916359694097141, 0.97403912794778, + 1.04529048496565, 1.15710382277548, 1.25675656404419, 1.37098330871205 + ), c(0.0108404989744699, 0.144337973117581, 0.250292371898569, + 0.367310419323293, 0.44444044802193, 0.506592035751958, 0.558428768125431, + 0.602035095628756, 0.64112383905529, 0.674354964141041, 0.703707875219752, + 0.7319844, 0.760702196782168, 0.78975826405844, 0.823427572594726, + 0.860294897090771, 0.904032120658957, 0.955736581115011, 1.0165945004053, + 1.09529786576616, 1.21614421175967, 1.32331604019295, 1.45293812780298 + ), c(0, 0.0783919019408445, 0.19440762901709, 0.323264916745368, + 0.407999619319143, 0.474764568463685, 0.530890671381964, 0.580852443909739, + 0.623441748828038, 0.661393469870099, 0.69827126098506, 0.7319844, + 0.766440770218252, 0.802260162496625, 0.840536805657307, 0.883133954556946, + 0.931565607767828, 0.98815401699637, 1.05406790404239, 1.138596250043, + 1.27030064370239, 1.39007785503355, 1.5343628053761), c(0, 0, + 0.012845105, 0.07040502, 0.09495188, 0.12669976, 0.1502248, 0.1659163, + 0.1761341, 0.18586528, 0.191290375, 0.1975426, 0.203794825, 0.20921992, + 0.2189511, 0.2291689, 0.2448604, 0.26838544, 0.30013332, 0.32468018, + 0.382240095, 0.5020427625, 0.590302013999998), c(0, 0, 0, 0.0133856545472455, 0.0528330564916649, 
0.0825071163605637, 0.107217748074731, 0.130397558147181, 0.151367721571716, 0.1688357, 0.183736649076791, 0.1975426, 0.2111662, 0.226622576069161, 0.244738709634746, 0.265660771838618, 0.289502, 0.3157762, 0.347933515877459, 0.395446576674467, 0.494033943284933, - 0.586036939413118, 0.696507800090321), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0119984314577645, + 0.586036939413118, 0.696507800090321), c(0, 0, 0, 0, 0.0119984314577645, 0.0497573816250162, 0.081255049503995, 0.108502307388674, 0.132961558931189, 0.156011650575706, 0.177125892134071, 0.1975426, 0.217737120618906, 0.239458499211792, 0.263562581820818, 0.289525383565136, 0.31824420000725, 0.35141305194052, 0.393862560773808, 0.453538799225292, 0.558631806850418, - 0.657452391363313, 0.767918764883928), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.0189057930465303, + 0.657452391363313, 0.767918764883928), c(0, 0, 0, 0, 0, 0.0189057930465303, 0.0558619823820737, 0.0885055048481483, 0.117823094349893, 0.145878789120691, 0.171852417645726, 0.1975426, 0.222526993865839, 0.249029206661066, 0.27731797305948, 0.306704680469104, 0.340659034209842, 0.379550761828618, 0.429562304567396, 0.499209921951019, 0.612206099576094, 0.713714149138691, - 0.835600324727346), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 
0.0331956220262204, 0.0710455499705998, - 0.105140687231072, 0.136976315413355, 0.167518817907279, - 0.1975426, 0.226974062486675, 0.257640196272163, 0.289459502055271, - 0.323342029611596, 0.361500312536625, 0.407123841331413, - 0.46286764504675, 0.538379175655057, 0.659249503348734, 0.768470658367656, - 0.898774707571334), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr"))), class = c("distribution", - "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, + 0.835600324727346), c(0, 0, 0, 0, 0, 0, 0.0331956220262204, 0.0710455499705998, + 0.105140687231072, 0.136976315413355, 0.167518817907279, 0.1975426, + 0.226974062486675, 0.257640196272163, 0.289459502055271, 0.323342029611596, + 0.361500312536625, 0.407123841331413, 0.46286764504675, 0.538379175655057, + 0.659249503348734, 0.768470658367656, 0.898774707571334)), quantile_levels = c(0.01, + 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, + 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 + ), class = c("quantile_pred", "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(18999, 19006, - 19013, 19020, 19027, 18999, 19006, 19013, 19020, 19027, 18999, + 18992, 18992), class = "Date"), target_date = structure(c(18999, 19006, 19013, 19020, 19027, 18999, 19006, 19013, 19020, 19027, 18999, 19006, 19013, 19020, 19027, 18999, 19006, 19013, 19020, - 19027), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df", + 19027, 18999, 19006, 19013, 19020, 19027, 18999, 19006, 19013, + 19020, 19027), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df", 
"tbl", "data.frame")) --- @@ -397,291 +254,167 @@ 0.7319844, 0.7319844, 0.7319844, 0.1975426, 0.1975426, 0.1975426, 0.1975426, 0.1975426), ahead = c(2L, 3L, 4L, 5L, 6L, 2L, 3L, 4L, 5L, 6L, 2L, 3L, 4L, 5L, 6L, 2L, 3L, 4L, 5L, 6L, 2L, 3L, 4L, - 5L, 6L, 2L, 3L, 4L, 5L, 6L), .pred_distn = structure(list(structure(list( - values = c(0, 0, 0, 0, 0.0344362435566855, 0.0610170086495865, - 0.0798865084778347, 0.0944014546310463, 0.107339121226462, - 0.11899734099851, 0.129600408649586, 0.1393442, 0.149195708649586, - 0.159627982246122, 0.170968308649587, 0.184031805880359, - 0.198909658094331, 0.219058736130861, 0.250692448549235, - 0.300646382944129, 0.368938143197633, 0.440038195052124, - 0.51997011826723), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.0303364052628526, 0.0557306728227282, - 0.0766736159703596, 0.0942284381264812, 0.11050757203172, - 0.125214601455714, 0.1393442, 0.15359732398729, 0.168500447692877, - 0.184551468093631, 0.202926420944109, 0.22476606802393, 0.253070223293233, - 0.29122995395109, 0.341963643747938, 0.419747975311502, 0.495994046054689, - 0.5748791770223), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.00603076915889168, 0.0356039073625737, - 0.0609470811194113, 0.0833232869645198, 0.103265350891109, - 0.121507077706427, 0.1393442, 0.157305073932789, 0.176004666813668, - 0.196866917086671, 0.219796529731897, 0.247137200365254, - 0.280371254591746, 0.320842872758278, 0.374783454750148, - 0.461368597638526, 0.539683256474915, 0.632562403391324), - quantile_levels = c(0.01, 0.025, 0.05, 
0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, - 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0.018869505399304, 0.0471517885822858, - 0.0732707765908659, 0.0969223475714758, 0.118188509171441, - 0.1393442, 0.161036861715017, 0.183255665579256, 0.207206810683007, - 0.23409988698267, 0.265549713886389, 0.302197074524145, 0.346715970732557, - 0.40460690801818, 0.498076490174802, 0.580016068409433, 0.680138975526255 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, - 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, - 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0.00232218982614828, 0.0342017690820909, - 0.062828756299263, 0.0893725834453345, 0.114623710996309, - 0.1393442, 0.163790622390774, 0.189495107256772, 0.216754530328403, - 0.247065337260473, 0.281410456107061, 0.32037037400004, 0.367018829587046, - 0.431198706165962, 0.52829547296083, 0.619021148955337, 0.728730172315724 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, - 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, - 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.00233673672776743, 0.0223488000000001, - 0.040304673503435, 0.0576262998104982, 0.0732741199141993, - 0.088455610793058, 0.103199, 0.118707592060121, 0.134185928864089, - 0.151183139276793, 0.1702454, 0.191937, 0.2182298, 0.253577609846549, - 0.307351538752588, 0.407165223924639, 0.502529513927214, - 0.605582108686126), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", 
"vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0, 0.0190621000375005, 0.0420071558734088, - 0.0629230825705257, 0.0833688260410605, 0.103199, 0.124118509153392, - 0.145401945823358, 0.168667287877079, 0.1939090000375, 0.222597428173282, - 0.256984900377504, 0.301709122144422, 0.366495424858649, - 0.475152766217062, 0.572497835146252, 0.693762274318904), - quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, - 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0, 0, 0.0269530072946728, 0.0530040092850928, - 0.0782481277003769, 0.103199, 0.12816325599641, 0.154866111682517, - 0.182302899107341, 0.213783044306043, 0.248363904708547, - 0.28995690796288, 0.341627908394784, 0.413707680386504, 0.528381820556805, - 0.635771182105746, 0.77652465912812), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0, 0, + 5L, 6L, 2L, 3L, 4L, 5L, 6L), .pred_distn = structure(list(c(0, + 0, 0, 0, 0.0344362435566855, 0.0610170086495865, 0.0798865084778347, + 0.0944014546310463, 0.107339121226462, 0.11899734099851, 0.129600408649586, + 0.1393442, 0.149195708649586, 0.159627982246122, 0.170968308649587, + 0.184031805880359, 0.198909658094331, 0.219058736130861, 0.250692448549235, + 0.300646382944129, 0.368938143197633, 0.440038195052124, 0.51997011826723 + ), c(0, 0, 0, 0, 0, 0.0303364052628526, 0.0557306728227282, 0.0766736159703596, + 0.0942284381264812, 0.11050757203172, 0.125214601455714, 0.1393442, + 0.15359732398729, 0.168500447692877, 0.184551468093631, 0.202926420944109, + 0.22476606802393, 0.253070223293233, 0.29122995395109, 0.341963643747938, + 0.419747975311502, 
0.495994046054689, 0.5748791770223), c(0, + 0, 0, 0, 0, 0.00603076915889168, 0.0356039073625737, 0.0609470811194113, + 0.0833232869645198, 0.103265350891109, 0.121507077706427, 0.1393442, + 0.157305073932789, 0.176004666813668, 0.196866917086671, 0.219796529731897, + 0.247137200365254, 0.280371254591746, 0.320842872758278, 0.374783454750148, + 0.461368597638526, 0.539683256474915, 0.632562403391324), c(0, + 0, 0, 0, 0, 0, 0.018869505399304, 0.0471517885822858, 0.0732707765908659, + 0.0969223475714758, 0.118188509171441, 0.1393442, 0.161036861715017, + 0.183255665579256, 0.207206810683007, 0.23409988698267, 0.265549713886389, + 0.302197074524145, 0.346715970732557, 0.40460690801818, 0.498076490174802, + 0.580016068409433, 0.680138975526255), c(0, 0, 0, 0, 0, 0, 0.00232218982614828, + 0.0342017690820909, 0.062828756299263, 0.0893725834453345, 0.114623710996309, + 0.1393442, 0.163790622390774, 0.189495107256772, 0.216754530328403, + 0.247065337260473, 0.281410456107061, 0.32037037400004, 0.367018829587046, + 0.431198706165962, 0.52829547296083, 0.619021148955337, 0.728730172315724 + ), c(0, 0, 0, 0, 0, 0.00233673672776743, 0.0223488000000001, + 0.040304673503435, 0.0576262998104982, 0.0732741199141993, 0.088455610793058, + 0.103199, 0.118707592060121, 0.134185928864089, 0.151183139276793, + 0.1702454, 0.191937, 0.2182298, 0.253577609846549, 0.307351538752588, + 0.407165223924639, 0.502529513927214, 0.605582108686126), c(0, + 0, 0, 0, 0, 0, 0, 0.0190621000375005, 0.0420071558734088, 0.0629230825705257, + 0.0833688260410605, 0.103199, 0.124118509153392, 0.145401945823358, + 0.168667287877079, 0.1939090000375, 0.222597428173282, 0.256984900377504, + 0.301709122144422, 0.366495424858649, 0.475152766217062, 0.572497835146252, + 0.693762274318904), c(0, 0, 0, 0, 0, 0, 0, 0, 0.0269530072946728, + 0.0530040092850928, 0.0782481277003769, 0.103199, 0.12816325599641, + 0.154866111682517, 0.182302899107341, 0.213783044306043, 0.248363904708547, + 0.28995690796288, 
0.341627908394784, 0.413707680386504, 0.528381820556805, + 0.635771182105746, 0.77652465912812), c(0, 0, 0, 0, 0, 0, 0, 0, 0.0133969262208122, 0.0447913089328894, 0.0739787251314013, 0.103199, 0.132965213784838, 0.163644939246192, 0.196475575572506, 0.231647450729907, 0.271208219491195, 0.317741925837459, 0.376214875186902, 0.454693715463155, 0.578781950822058, 0.695278060333427, 0.835521146843828 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0, 0, 0, 0, 0, 0, 0.000725156354313476, 0.036290207696477, 0.0701157049196494, - 0.103199, 0.136581757676227, 0.170980571439515, 0.20778982998995, - 0.247087076718167, 0.291689672899979, 0.343587258527985, 0.406717577407724, - 0.490437549306793, 0.620305872542078, 0.740730855925609, 0.888992767585756 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0, 0.0701359181289814, 0.126021564763798, 0.165542973066331, - 0.197412078824538, 0.2254231, 0.24849244896414, 0.271074448350284, - 0.292116376731667, 0.3121244, 0.3321324, 0.3534741, 0.375505591313813, - 0.4001594, 0.4268368, 0.459466546351464, 0.501142770839258, 0.562143084394445, - 0.686511993260583, 0.808747521078011, 0.936070949770187), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0.00157374045240457, - 0.0698662194634446, 0.120287640452405, 0.16090076400914, 0.195966561494315, - 0.227802919628796, 
0.257250456567366, 0.284352940452404, 0.3121244, - 0.338954445099751, 0.366682808562485, 0.395431772465525, 0.428410340452405, - 0.464424683613586, 0.505774640452405, 0.559060310062401, 0.635868688255882, - 0.771213743700187, 0.895124744284645, 1.02835689610128), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0203251909788099, - 0.0807941084801849, 0.131156594663197, 0.173483742579226, 0.211670557196072, - 0.246244078609487, 0.278363918673537, 0.3121244, 0.345057130768308, - 0.378403757196072, 0.414130127568126, 0.451969178608786, 0.495598517595426, - 0.545136665227352, 0.60807806098831, 0.695394235571256, 0.837130344811698, - 0.966111057134121, 1.11185508502426), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.0477276069251695, - 0.103509981435814, 0.15221877094871, 0.195952578625286, 0.236147272793828, - 0.274650521629366, 0.3121244, 0.349346986282313, 0.388561057230272, - 0.429378978625286, 0.474721256740267, 0.523806740641156, 0.581962784214742, - 0.652062951302463, 0.746838578625286, 0.896492945755508, 1.0340527654686, - 1.19219029825678), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.0166039560593608, 0.0776387168354182, - 0.132003170161801, 0.180530886857168, 0.22594722201882, 0.268822337600976, - 0.3121244, 0.354489864523245, 0.398378553881739, 0.444274543339083, - 
0.494499388431484, 0.548837448212482, 0.612239188685087, - 0.690272902609576, 0.790473599123991, 0.950950996975469, - 1.09638828065763, 1.26930966690442), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0.214450885057551, - 0.288864871241312, 0.3250653, 0.3516615, 0.3716087, 0.386718885323753, - 0.399682691320713, 0.411042976158862, 0.4218461, 0.4329278, 0.444139278140181, - 0.456951313505885, 0.4720835, 0.4920307, 0.518626803531635, 0.555566110165902, - 0.636745822624727, 1.18069710590251, 1.79487371178211, 2.55270530204625 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, + ), c(0, 0, 0, 0, 0, 0, 0, 0, 0.000725156354313476, 0.036290207696477, + 0.0701157049196494, 0.103199, 0.136581757676227, 0.170980571439515, + 0.20778982998995, 0.247087076718167, 0.291689672899979, 0.343587258527985, + 0.406717577407724, 0.490437549306793, 0.620305872542078, 0.740730855925609, + 0.888992767585756), c(0, 0, 0, 0.0701359181289814, 0.126021564763798, + 0.165542973066331, 0.197412078824538, 0.2254231, 0.24849244896414, + 0.271074448350284, 0.292116376731667, 0.3121244, 0.3321324, 0.3534741, + 0.375505591313813, 0.4001594, 0.4268368, 0.459466546351464, 0.501142770839258, + 0.562143084394445, 0.686511993260583, 0.808747521078011, 0.936070949770187 + ), c(0, 0, 0, 0.00157374045240457, 0.0698662194634446, 0.120287640452405, + 0.16090076400914, 0.195966561494315, 0.227802919628796, 0.257250456567366, + 0.284352940452404, 0.3121244, 0.338954445099751, 0.366682808562485, + 0.395431772465525, 0.428410340452405, 0.464424683613586, 0.505774640452405, + 
0.559060310062401, 0.635868688255882, 0.771213743700187, 0.895124744284645, + 1.02835689610128), c(0, 0, 0, 0, 0.0203251909788099, 0.0807941084801849, + 0.131156594663197, 0.173483742579226, 0.211670557196072, 0.246244078609487, + 0.278363918673537, 0.3121244, 0.345057130768308, 0.378403757196072, + 0.414130127568126, 0.451969178608786, 0.495598517595426, 0.545136665227352, + 0.60807806098831, 0.695394235571256, 0.837130344811698, 0.966111057134121, + 1.11185508502426), c(0, 0, 0, 0, 0, 0.0477276069251695, 0.103509981435814, + 0.15221877094871, 0.195952578625286, 0.236147272793828, 0.274650521629366, + 0.3121244, 0.349346986282313, 0.388561057230272, 0.429378978625286, + 0.474721256740267, 0.523806740641156, 0.581962784214742, 0.652062951302463, + 0.746838578625286, 0.896492945755508, 1.0340527654686, 1.19219029825678 + ), c(0, 0, 0, 0, 0, 0.0166039560593608, 0.0776387168354182, 0.132003170161801, + 0.180530886857168, 0.22594722201882, 0.268822337600976, 0.3121244, + 0.354489864523245, 0.398378553881739, 0.444274543339083, 0.494499388431484, + 0.548837448212482, 0.612239188685087, 0.690272902609576, 0.790473599123991, + 0.950950996975469, 1.09638828065763, 1.26930966690442), c(0, + 0, 0, 0.214450885057551, 0.288864871241312, 0.3250653, 0.3516615, + 0.3716087, 0.386718885323753, 0.399682691320713, 0.411042976158862, + 0.4218461, 0.4329278, 0.444139278140181, 0.456951313505885, 0.4720835, + 0.4920307, 0.518626803531635, 0.555566110165902, 0.636745822624727, + 1.18069710590251, 1.79487371178211, 2.55270530204625), c(0, 0, 0, 0.0412188277837779, 0.218851219710947, 0.281178109847399, 0.318187061211362, 0.346336916208562, 0.368500427783778, 0.387753955899259, 0.405439627783778, 0.4218461, 0.438238911502765, 0.455473161565916, 0.474946888792488, 0.497793222697627, 0.526600327783778, 0.565677321171112, 0.632773149305243, 0.891087255237454, 1.53723873883164, 2.07877430490449, - 2.80265665435411), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 
0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0.11916637099981, 0.229217761668717, - 0.283591182792578, 0.32089403701397, 0.351025234947199, 0.376764238355684, - 0.399580647158371, 0.4218461, 0.44387311299288, 0.466809871716417, - 0.493008689720547, 0.523409488360383, 0.563157298622986, - 0.621505313473235, 0.756485815282202, 1.12190615310943, 1.76010655352564, - 2.36678033794496, 2.94420631979259), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0166944520132201, - 0.165418069472795, 0.245206977511275, 0.294705591133411, 0.333122440419504, - 0.365628706470365, 0.393898304736197, 0.4218461, 0.449111464628896, - 0.478419567119571, 0.511583967360174, 0.551380591704217, 0.602914542469175, - 0.695207681738717, 0.912006796599716, 1.31516316514125, 1.94296465866439, - 2.56528565211139, 3.07364144272118), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.095868346511765, - 0.20216012803078, 0.267545492825128, 0.314290150935209, 0.353895445422154, - 0.388115128404834, 0.4218461, 0.455823761272913, 0.49135719600286, - 0.53249009905049, 0.582341165610556, 0.654473427614026, 0.784511194125441, - 1.05644872659752, 1.47044175860169, 2.09183984013705, 2.69484857437112, - 3.1694157654766), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - 
"dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.189889609612846, 0.28831400446517, 0.378590156778518, - 0.474951757151471, 0.546550271666467, 0.599713541496415, - 0.636994072140471, 0.663814888730087, 0.6839305, 0.701811, - 0.71711131701917, 0.7319844, 0.746512343291783, 0.7621579, - 0.7800383, 0.800154, 0.826974702066021, 0.86472325100111, - 0.918612458720487, 0.988605006042461, 1.08324298909714, 1.1736324426019, - 1.27400190201593), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.0970521814156041, 0.21019273451422, 0.3073217, - 0.418096666577866, 0.489016664299943, 0.547102113575136, - 0.594490775323003, 0.63162246104581, 0.661579866583116, 0.687283, - 0.709633785855109, 0.7319844, 0.754030577281223, 0.776967707389074, - 0.802389, 0.832791429272493, 0.870576437517875, 0.917019363782438, - 0.973069487834329, 1.04481411391714, 1.15502640396814, 1.25613855529213, - 1.37419193312441), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.0121672025865257, 0.139873460696682, 0.245836896475015, - 0.366700877088971, 0.445024777793378, 0.506295707796278, - 0.557812941319663, 0.601634091201612, 0.639324955546405, - 0.673001603565436, 0.702827370737707, 0.7319844, 0.760387153293983, - 0.790515252114921, 0.823330663438584, 0.86065768198682, 0.904468070814958, - 0.954989716167962, 1.01626566701207, 1.09352836237872, 1.21548452077266, - 1.32239947141536, 1.46006378366371), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), 
class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0.0755189873928237, - 0.192404624794198, 0.322282766861868, 0.409749729479745, 0.475729034228042, - 0.531171513462134, 0.579442333436034, 0.623023292701627, 0.662178609529395, - 0.697968947885378, 0.7319844, 0.766345465406154, 0.80256496503135, - 0.841452466611966, 0.884524366576965, 0.93218174000415, 0.988252217755677, - 1.05297410373014, 1.13838991320473, 1.27210128334768, 1.38822119412612, - 1.53603026586717), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0.0137515321313713, 0.140785106599616, 0.283710273212032, - 0.374321519596796, 0.446394180252102, 0.505830587319873, - 0.559570052916329, 0.606684360953109, 0.65111343293503, 0.692845474832798, - 0.7319844, 0.771333743893139, 0.812267094081241, 0.855930534362644, - 0.903545840608706, 0.955193592261423, 1.01560313647486, 1.08583632750787, - 1.17818451335943, 1.31856131315813, 1.44615719776698, 1.60468791291453 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, - 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, - 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.0124103998425985, 0.0518320161167612, - 0.0822283734557346, 0.106956582246572, 0.130236689538895, - 0.150852198845738, 0.168835673455735, 0.183678547429124, - 0.1975426, 0.211166273455735, 0.226249473455735, 0.243919155834858, - 0.265304527061771, 0.289781663064881, 0.315985067670677, - 0.347644682675627, 0.394981842425824, 0.491215248628636, - 0.584975102439074, 0.694697494489265), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 
0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0106056685868359, + 2.80265665435411), c(0, 0, 0, 0, 0.11916637099981, 0.229217761668717, + 0.283591182792578, 0.32089403701397, 0.351025234947199, 0.376764238355684, + 0.399580647158371, 0.4218461, 0.44387311299288, 0.466809871716417, + 0.493008689720547, 0.523409488360383, 0.563157298622986, 0.621505313473235, + 0.756485815282202, 1.12190615310943, 1.76010655352564, 2.36678033794496, + 2.94420631979259), c(0, 0, 0, 0, 0.0166944520132201, 0.165418069472795, + 0.245206977511275, 0.294705591133411, 0.333122440419504, 0.365628706470365, + 0.393898304736197, 0.4218461, 0.449111464628896, 0.478419567119571, + 0.511583967360174, 0.551380591704217, 0.602914542469175, 0.695207681738717, + 0.912006796599716, 1.31516316514125, 1.94296465866439, 2.56528565211139, + 3.07364144272118), c(0, 0, 0, 0, 0, 0.095868346511765, 0.20216012803078, + 0.267545492825128, 0.314290150935209, 0.353895445422154, 0.388115128404834, + 0.4218461, 0.455823761272913, 0.49135719600286, 0.53249009905049, + 0.582341165610556, 0.654473427614026, 0.784511194125441, 1.05644872659752, + 1.47044175860169, 2.09183984013705, 2.69484857437112, 3.1694157654766 + ), c(0.189889609612846, 0.28831400446517, 0.378590156778518, + 0.474951757151471, 0.546550271666467, 0.599713541496415, 0.636994072140471, + 0.663814888730087, 0.6839305, 0.701811, 0.71711131701917, 0.7319844, + 0.746512343291783, 0.7621579, 0.7800383, 0.800154, 0.826974702066021, + 0.86472325100111, 0.918612458720487, 0.988605006042461, 1.08324298909714, + 1.1736324426019, 1.27400190201593), c(0.0970521814156041, 0.21019273451422, + 0.3073217, 0.418096666577866, 0.489016664299943, 0.547102113575136, + 0.594490775323003, 0.63162246104581, 0.661579866583116, 0.687283, + 0.709633785855109, 0.7319844, 0.754030577281223, 0.776967707389074, + 0.802389, 0.832791429272493, 0.870576437517875, 0.917019363782438, + 
0.973069487834329, 1.04481411391714, 1.15502640396814, 1.25613855529213, + 1.37419193312441), c(0.0121672025865257, 0.139873460696682, 0.245836896475015, + 0.366700877088971, 0.445024777793378, 0.506295707796278, 0.557812941319663, + 0.601634091201612, 0.639324955546405, 0.673001603565436, 0.702827370737707, + 0.7319844, 0.760387153293983, 0.790515252114921, 0.823330663438584, + 0.86065768198682, 0.904468070814958, 0.954989716167962, 1.01626566701207, + 1.09352836237872, 1.21548452077266, 1.32239947141536, 1.46006378366371 + ), c(0, 0.0755189873928237, 0.192404624794198, 0.322282766861868, + 0.409749729479745, 0.475729034228042, 0.531171513462134, 0.579442333436034, + 0.623023292701627, 0.662178609529395, 0.697968947885378, 0.7319844, + 0.766345465406154, 0.80256496503135, 0.841452466611966, 0.884524366576965, + 0.93218174000415, 0.988252217755677, 1.05297410373014, 1.13838991320473, + 1.27210128334768, 1.38822119412612, 1.53603026586717), c(0, 0.0137515321313713, + 0.140785106599616, 0.283710273212032, 0.374321519596796, 0.446394180252102, + 0.505830587319873, 0.559570052916329, 0.606684360953109, 0.65111343293503, + 0.692845474832798, 0.7319844, 0.771333743893139, 0.812267094081241, + 0.855930534362644, 0.903545840608706, 0.955193592261423, 1.01560313647486, + 1.08583632750787, 1.17818451335943, 1.31856131315813, 1.44615719776698, + 1.60468791291453), c(0, 0, 0, 0.0124103998425985, 0.0518320161167612, + 0.0822283734557346, 0.106956582246572, 0.130236689538895, 0.150852198845738, + 0.168835673455735, 0.183678547429124, 0.1975426, 0.211166273455735, + 0.226249473455735, 0.243919155834858, 0.265304527061771, 0.289781663064881, + 0.315985067670677, 0.347644682675627, 0.394981842425824, 0.491215248628636, + 0.584975102439074, 0.694697494489265), c(0, 0, 0, 0, 0.0106056685868359, 0.0491424720812208, 0.0803975947094471, 0.108060576398464, 0.133638500841809, 0.155968088623186, 0.177107275224252, 0.1975426, 0.218180906543366, 0.239601831646016, 0.262811949904799, 
0.28886838404664, 0.317235975224252, 0.350545157867879, 0.393998327257523, 0.454550976564066, 0.558555075803007, - 0.656859449317743, 0.763718974419534), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.0185370189554894, + 0.656859449317743, 0.763718974419534), c(0, 0, 0, 0, 0, 0.0185370189554894, 0.0562218087603375, 0.0890356919950198, 0.118731362266373, 0.146216910144001, 0.172533896645116, 0.1975426, 0.223021121504065, 0.249412654553045, 0.277680444480195, 0.308522683806638, 0.342270845449704, 0.382702709814398, 0.433443929063141, 0.501610622734127, 0.61417580106326, 0.715138862353848, - 0.833535553075286), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0.0346528073343234, 0.0723584880324803, - 0.106222897173122, 0.138467941096611, 0.167844669490445, - 0.1975426, 0.227591504589096, 0.258479799230192, 0.290862843650987, - 0.325718759418194, 0.364163081687565, 0.409581315443156, - 0.46531554698862, 0.54043504498905, 0.659111642885379, 0.761453612496025, - 0.889794566241181), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0.0134397969692197, 0.0557212574100741, - 0.0941597345954959, 0.130401776157262, 0.164200585080601, - 0.1975426, 0.231566981332063, 0.265597088493385, 0.30192115798073, - 0.341652226704467, 0.384249568152932, 0.43541812199952, 0.495340659591346, - 
0.575765691755518, 0.703032070294999, 0.815605113815338, - 0.955488202108743), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr"))), class = c("distribution", - "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, + 0.833535553075286), c(0, 0, 0, 0, 0, 0, 0.0346528073343234, 0.0723584880324803, + 0.106222897173122, 0.138467941096611, 0.167844669490445, 0.1975426, + 0.227591504589096, 0.258479799230192, 0.290862843650987, 0.325718759418194, + 0.364163081687565, 0.409581315443156, 0.46531554698862, 0.54043504498905, + 0.659111642885379, 0.761453612496025, 0.889794566241181), c(0, + 0, 0, 0, 0, 0, 0.0134397969692197, 0.0557212574100741, 0.0941597345954959, + 0.130401776157262, 0.164200585080601, 0.1975426, 0.231566981332063, + 0.265597088493385, 0.30192115798073, 0.341652226704467, 0.384249568152932, + 0.43541812199952, 0.495340659591346, 0.575765691755518, 0.703032070294999, + 0.815605113815338, 0.955488202108743)), quantile_levels = c(0.01, + 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, + 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 + ), class = c("quantile_pred", "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(19006, 19013, - 19020, 19027, 19034, 19006, 19013, 19020, 19027, 19034, 19006, + 18992, 18992), class = "Date"), target_date = structure(c(19006, 19013, 19020, 19027, 19034, 19006, 19013, 19020, 19027, 19034, 19006, 19013, 19020, 19027, 19034, 19006, 19013, 19020, 19027, - 19034), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df", + 19034, 19006, 19013, 19020, 19027, 19034, 
19006, 19013, 19020, + 19027, 19034), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame")) --- @@ -696,287 +429,180 @@ 0.7319844, 0.7319844, 0.7319844, 0.1975426, 0.1975426, 0.1975426, 0.1975426, 0.1975426), ahead = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, - 4L, 5L, 1L, 2L, 3L, 4L, 5L), .pred_distn = structure(list(structure(list( - values = c(0, 0, 0.00812835000000001, 0.07297428, 0.0936219, - 0.10421786, 0.1121285, 0.1201118, 0.1273693, 0.1317238, 0.1360783, - 0.1393442, 0.1426101, 0.1469646, 0.1513191, 0.1585766, 0.1665599, - 0.17447054, 0.1850665, 0.20571412, 0.27056005, 0.313941744999999, - 0.384931126999997), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.0250982954899548, 0.0576421230361804, - 0.0776985410529105, 0.0929731777892779, 0.104205115094451, - 0.114209292598776, 0.123365027741977, 0.131496226094211, - 0.1393442, 0.147007648291083, 0.154990950042, 0.16406284204392, - 0.173835548288583, 0.185472494222942, 0.200167568392984, - 0.221760005190952, 0.260313716029161, 0.318794320716957, - 0.376941794597195, 0.461705276864399), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.028693230499105, - 0.055453963203632, 0.0755679534410344, 0.0913921813275133, 0.104804902302573, - 0.117142722458225, 0.128444430213702, 0.1393442, 0.150479535783308, - 0.161776522458225, 0.173925041831968, 0.187540579925299, 0.204200618941439, - 0.225353161205212, 0.253695961466565, 0.294498109305393, 0.358245879234942, - 0.427563795224327, 
0.501665748776186), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.00587171510650109, - 0.0364866623781238, 0.0602683002957529, 0.0794861096145961, 0.0963414561651617, - 0.111439230212802, 0.125394639614746, 0.1393442, 0.153216527502025, - 0.167801944181742, 0.183359587288923, 0.200880434888349, 0.221656465706657, - 0.24743726609676, 0.279449270180852, 0.322415149384594, 0.395367499639696, - 0.464904880713406, 0.539558052669137), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0.019055042091221, - 0.0457625510440105, 0.068309473710537, 0.087945102194822, 0.106033592330923, - 0.123045226382564, 0.1393442, 0.155351600131351, 0.172491058371384, - 0.19101350900654, 0.211425349928599, 0.234936300692507, 0.264303292652126, - 0.299599722715327, 0.346282638921389, 0.423857010226352, 0.494689091614341, - 0.577833814673327), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.00138033000000002, 0.030893965, 0.0479842, - 0.059815975, 0.07118759, 0.0815075, 0.0926819, 0.0992551, - 0.103199, 0.1071429, 0.1137161, 0.1248905, 0.13521041, 0.146582025, - 0.1584138, 0.175504035, 0.20501767, 0.25694586, 0.335051815, - 0.436709474), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - 
"dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0.0179658025100251, 0.0356060154111541, + 4L, 5L, 1L, 2L, 3L, 4L, 5L), .pred_distn = structure(list(c(0, + 0, 0.00812835000000001, 0.07297428, 0.0936219, 0.10421786, 0.1121285, + 0.1201118, 0.1273693, 0.1317238, 0.1360783, 0.1393442, 0.1426101, + 0.1469646, 0.1513191, 0.1585766, 0.1665599, 0.17447054, 0.1850665, + 0.20571412, 0.27056005, 0.313941744999999, 0.384931126999997), + c(0, 0, 0, 0.0250982954899548, 0.0576421230361804, 0.0776985410529105, + 0.0929731777892779, 0.104205115094451, 0.114209292598776, + 0.123365027741977, 0.131496226094211, 0.1393442, 0.147007648291083, + 0.154990950042, 0.16406284204392, 0.173835548288583, 0.185472494222942, + 0.200167568392984, 0.221760005190952, 0.260313716029161, + 0.318794320716957, 0.376941794597195, 0.461705276864399), + c(0, 0, 0, 0, 0.028693230499105, 0.055453963203632, 0.0755679534410344, + 0.0913921813275133, 0.104804902302573, 0.117142722458225, + 0.128444430213702, 0.1393442, 0.150479535783308, 0.161776522458225, + 0.173925041831968, 0.187540579925299, 0.204200618941439, + 0.225353161205212, 0.253695961466565, 0.294498109305393, + 0.358245879234942, 0.427563795224327, 0.501665748776186), + c(0, 0, 0, 0, 0.00587171510650109, 0.0364866623781238, 0.0602683002957529, + 0.0794861096145961, 0.0963414561651617, 0.111439230212802, + 0.125394639614746, 0.1393442, 0.153216527502025, 0.167801944181742, + 0.183359587288923, 0.200880434888349, 0.221656465706657, + 0.24743726609676, 0.279449270180852, 0.322415149384594, 0.395367499639696, + 0.464904880713406, 0.539558052669137), c(0, 0, 0, 0, 0, 0.019055042091221, + 0.0457625510440105, 0.068309473710537, 0.087945102194822, + 0.106033592330923, 0.123045226382564, 0.1393442, 0.155351600131351, + 0.172491058371384, 0.19101350900654, 0.211425349928599, 0.234936300692507, + 0.264303292652126, 0.299599722715327, 0.346282638921389, + 0.423857010226352, 0.494689091614341, 0.577833814673327), + 
c(0, 0, 0, 0.00138033000000002, 0.030893965, 0.0479842, 0.059815975, + 0.07118759, 0.0815075, 0.0926819, 0.0992551, 0.103199, 0.1071429, + 0.1137161, 0.1248905, 0.13521041, 0.146582025, 0.1584138, + 0.175504035, 0.20501767, 0.25694586, 0.335051815, 0.436709474 + ), c(0, 0, 0, 0, 0, 0.0179658025100251, 0.0356060154111541, 0.050834301692017, 0.0650050989327893, 0.0784417069434695, 0.0916422518458685, 0.103199, 0.115251501692017, 0.128398001692017, 0.142201701692017, 0.157319973859039, 0.174980914065641, 0.196101805086251, 0.223989860848608, 0.266334685464555, 0.354050965519204, 0.437948459272293, 0.520203978940639), - quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, - 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 0, 0.0134241653129031, 0.0338447112456125, + c(0, 0, 0, 0, 0, 0, 0.0134241653129031, 0.0338447112456125, 0.052643303388484, 0.0699345638167383, 0.0866373614747148, 0.103199, 0.119627111136411, 0.137401026927169, 0.156056395793358, 0.175781901322513, 0.198564535163602, 0.226934571881819, 0.263862501322513, 0.317121769745397, 0.412419996940619, - 0.491470213131306, 0.580892509639735), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0, 0, - 0.0170903, 0.0403385023363734, 0.0616387632732329, 0.0827585779094291, - 0.103199, 0.123094939420544, 0.14464638301663, 0.1669589, 0.191770645535455, - 0.220735117412174, 0.254231042750228, 0.296807527848978, 0.357153759489695, - 0.45347931404539, 0.538725322834228, 0.636530647411066), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 
0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0, 0, 0, - 0.0026415954262542, 0.0297423239924899, 0.0555402340406406, 0.0792255827466275, - 0.103199, 0.127366925585556, 0.151700351432014, 0.177708522618176, - 0.206088123699737, 0.238712707453825, 0.277708313715037, 0.325132239647296, - 0.390468252727729, 0.490417296529864, 0.578557086846368, 0.688679948593326 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0.0320461375000001, - 0.129384955, 0.18940881, 0.2200878, 0.2427634, 0.2587698, 0.2734423, - 0.2841133, 0.296118, 0.3041212, 0.3121244, 0.3201276, 0.3281308, - 0.3401355, 0.3508065, 0.365479, 0.3814854, 0.404161, 0.43483999, - 0.494863845, 0.592202662499998, 0.737413847999994), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0.0319186440152902, - 0.118606588418984, 0.166386434627046, 0.198884154069741, 0.224089313858389, - 0.245418255377554, 0.2641052, 0.281445422925429, 0.297451875378704, - 0.3121244, 0.327667648091081, 0.343487967727477, 0.360314881408664, - 0.379575527422374, 0.400991145952209, 0.426605204088841, 0.4588495, - 0.506128350755908, 0.604640728888889, 0.713520019350718, 0.848429920658984 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0, 0, - 0, 0.0628145244703447, 0.119951261697167, 0.161800708429584, - 0.194481529786298, 
0.221976473503235, 0.246382528361484, 0.268661795456855, - 0.29099237601426, 0.3121244, 0.332687273503235, 0.354487379145491, - 0.376704773503235, 0.401222379758598, 0.428725473503235, 0.462071908680987, - 0.503745448659536, 0.564825512591627, 0.677307126205362, 0.788889302835928, - 0.92389000979736), quantile_levels = c(0.01, 0.025, 0.05, 0.1, - 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, - 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.0154147362739629, 0.0815589624901754, - 0.130419447103471, 0.16933591200637, 0.202296191455315, 0.23230661698317, - 0.260103744489245, 0.28583424396924, 0.3121244, 0.337226511153312, - 0.3628113, 0.3894886, 0.419049975899859, 0.453339140405904, - 0.492830630339104, 0.542883079890499, 0.613577832767128, - 0.73571689900399, 0.853844909059791, 0.988010467319443), - quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, - 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0.0493531737111374, 0.104172112803728, - 0.147940700281253, 0.185518687303273, 0.220197034594646, - 0.2521005, 0.282477641919719, 0.3121244, 0.3414694, 0.371435390499905, - 0.402230766363414, 0.436173824348844, 0.474579164424894, - 0.519690345185252, 0.57667375206677, 0.655151246845668, 0.78520792902029, - 0.90968118047453, 1.05112182091783), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0.28439515, 0.33688581, - 0.369872555, 0.3863845, 0.3945111, 0.40189893, 0.4078092, 0.4137194, - 0.4174134, 0.4218461, 0.4262788, 0.4299728, 0.435883, 0.44179327, - 
0.4491811, 0.4573077, 0.473819645, 0.50680639, 0.55929705, 0.9841905175, - 1.556671116), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, - 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, - 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0.003694, 0.268840486221162, 0.320208490155752, + 0.491470213131306, 0.580892509639735), c(0, 0, 0, 0, 0, 0, + 0, 0.0170903, 0.0403385023363734, 0.0616387632732329, 0.0827585779094291, + 0.103199, 0.123094939420544, 0.14464638301663, 0.1669589, + 0.191770645535455, 0.220735117412174, 0.254231042750228, + 0.296807527848978, 0.357153759489695, 0.45347931404539, 0.538725322834228, + 0.636530647411066), c(0, 0, 0, 0, 0, 0, 0, 0.0026415954262542, + 0.0297423239924899, 0.0555402340406406, 0.0792255827466275, + 0.103199, 0.127366925585556, 0.151700351432014, 0.177708522618176, + 0.206088123699737, 0.238712707453825, 0.277708313715037, + 0.325132239647296, 0.390468252727729, 0.490417296529864, + 0.578557086846368, 0.688679948593326), c(0, 0.0320461375000001, + 0.129384955, 0.18940881, 0.2200878, 0.2427634, 0.2587698, + 0.2734423, 0.2841133, 0.296118, 0.3041212, 0.3121244, 0.3201276, + 0.3281308, 0.3401355, 0.3508065, 0.365479, 0.3814854, 0.404161, + 0.43483999, 0.494863845, 0.592202662499998, 0.737413847999994 + ), c(0, 0, 0.0319186440152902, 0.118606588418984, 0.166386434627046, + 0.198884154069741, 0.224089313858389, 0.245418255377554, + 0.2641052, 0.281445422925429, 0.297451875378704, 0.3121244, + 0.327667648091081, 0.343487967727477, 0.360314881408664, + 0.379575527422374, 0.400991145952209, 0.426605204088841, + 0.4588495, 0.506128350755908, 0.604640728888889, 0.713520019350718, + 0.848429920658984), c(0, 0, 0, 0.0628145244703447, 0.119951261697167, + 0.161800708429584, 0.194481529786298, 0.221976473503235, + 0.246382528361484, 0.268661795456855, 0.29099237601426, 0.3121244, + 0.332687273503235, 0.354487379145491, 
0.376704773503235, + 0.401222379758598, 0.428725473503235, 0.462071908680987, + 0.503745448659536, 0.564825512591627, 0.677307126205362, + 0.788889302835928, 0.92389000979736), c(0, 0, 0, 0.0154147362739629, + 0.0815589624901754, 0.130419447103471, 0.16933591200637, + 0.202296191455315, 0.23230661698317, 0.260103744489245, 0.28583424396924, + 0.3121244, 0.337226511153312, 0.3628113, 0.3894886, 0.419049975899859, + 0.453339140405904, 0.492830630339104, 0.542883079890499, + 0.613577832767128, 0.73571689900399, 0.853844909059791, 0.988010467319443 + ), c(0, 0, 0, 0, 0.0493531737111374, 0.104172112803728, 0.147940700281253, + 0.185518687303273, 0.220197034594646, 0.2521005, 0.282477641919719, + 0.3121244, 0.3414694, 0.371435390499905, 0.402230766363414, + 0.436173824348844, 0.474579164424894, 0.519690345185252, + 0.57667375206677, 0.655151246845668, 0.78520792902029, 0.90968118047453, + 1.05112182091783), c(0, 0, 0.28439515, 0.33688581, 0.369872555, + 0.3863845, 0.3945111, 0.40189893, 0.4078092, 0.4137194, 0.4174134, + 0.4218461, 0.4262788, 0.4299728, 0.435883, 0.44179327, 0.4491811, + 0.4573077, 0.473819645, 0.50680639, 0.55929705, 0.9841905175, + 1.556671116), c(0, 0, 0.003694, 0.268840486221162, 0.320208490155752, 0.34804029700677, 0.368653615349654, 0.3834292, 0.3945111, 0.4041153, 0.413171785132151, 0.4218461, 0.430424661802068, 0.4395769, 0.4491812, 0.4610017, 0.47590450199302, 0.497193409669697, 0.525275921931869, 0.57616046396334, 0.97179808113241, 1.42880557869041, - 2.00265362857685), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.0925362072632727, 0.270427502912579, + 2.00265362857685), c(0, 0, 0, 0.0925362072632727, 0.270427502912579, 0.315212102423624, 0.343335698090731, 0.364285966419164, 0.381412585636556, 0.3959887, 0.4092868, 
0.4218461, 0.4344055, 0.447738051828318, 0.4632179, 0.480948870517105, 0.502553166907419, 0.531676966454865, 0.576804782629326, 0.776643061384413, - 1.21840177544959, 1.666716830807, 2.19163048441111), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, - 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, - 0.99)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.186887482630176, - 0.277238777881179, 0.317854348809488, 0.345779327332173, 0.367941987952029, - 0.38755201396574, 0.405055828677287, 0.4218461, 0.438666668060931, - 0.456611962704227, 0.476718028677287, 0.499751625882259, 0.528508989683397, - 0.569810205861059, 0.666081219804098, 0.934028445917159, 1.42658287124316, - 1.85311957889209, 2.30760254154095), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0, 0, 0.0845659921302213, - 0.228553649752897, 0.289236861333113, 0.326073140839108, 0.354785333802038, - 0.379166830409904, 0.401230227456875, 0.4218461, 0.442801275729157, - 0.465572618600986, 0.490133389090691, 0.520052318734487, 0.558588500497255, - 0.62065225601836, 0.788392143304334, 1.05428294678997, 1.55684044507063, - 2.01374350966068, 2.37954449328776), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.33818795, 0.4386877525, - 0.528816855, 0.61252005, 0.6626973, 0.6816954, 0.697340875, 0.7085162, - 0.7152214, 0.7208091, 0.72745833, 0.7319844, 0.73651047, 0.7431597, - 0.7487474, 0.7554526, 0.766627925, 0.7822734, 0.8012715, 0.85144875, - 0.935151945, 1.0252810475, 
1.12578085), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.276821846502455, - 0.354318476867519, 0.440270225449805, 0.533132934163242, 0.5900576, - 0.631102729748298, 0.660462274661497, 0.680831108876989, 0.696223359635746, - 0.7096337, 0.7219265, 0.7319844, 0.7431597, 0.7543351, 0.7677455, - 0.783391, 0.804046832839828, 0.833541896886769, 0.873735298798638, - 0.929106903073231, 1.02188617627186, 1.10971107833641, 1.18626816850867 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, - 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, - 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", "dist_default", - "vctrs_rcrd", "vctrs_vctr")), structure(list(values = c(0.202265200637946, - 0.298325094034965, 0.380907645938709, 0.481339524857949, 0.543219696138311, - 0.589507953775938, 0.6258186, 0.654874580912809, 0.6783427, 0.6984583, - 0.715655544727447, 0.7319844, 0.7487473, 0.7666278, 0.785715489951649, - 0.8090941, 0.83815, 0.873623567291473, 0.920206978680437, 0.98231174201862, - 1.08425930872329, 1.16639411427812, 1.25926838507547), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.129193504425124, - 0.241744300793533, 0.331949483165032, 0.43649858695157, 0.504472062268773, - 0.556141464729147, 0.597172505336053, 0.631406591640416, 0.660898437441874, - 0.686684727470375, 0.709633972330423, 0.7319844, 0.753217699696647, - 0.77608746100351, 0.8012715950276, 0.830327492252422, 0.86464477397774, - 0.906319686121761, 0.956815387818928, 1.02495125855129, 1.13129413647201, - 1.21644533535035, 1.32424172966634), 
quantile_levels = c(0.01, + 1.21840177544959, 1.666716830807, 2.19163048441111), c(0, + 0, 0, 0, 0.186887482630176, 0.277238777881179, 0.317854348809488, + 0.345779327332173, 0.367941987952029, 0.38755201396574, 0.405055828677287, + 0.4218461, 0.438666668060931, 0.456611962704227, 0.476718028677287, + 0.499751625882259, 0.528508989683397, 0.569810205861059, + 0.666081219804098, 0.934028445917159, 1.42658287124316, 1.85311957889209, + 2.30760254154095), c(0, 0, 0, 0, 0.0845659921302213, 0.228553649752897, + 0.289236861333113, 0.326073140839108, 0.354785333802038, + 0.379166830409904, 0.401230227456875, 0.4218461, 0.442801275729157, + 0.465572618600986, 0.490133389090691, 0.520052318734487, + 0.558588500497255, 0.62065225601836, 0.788392143304334, 1.05428294678997, + 1.55684044507063, 2.01374350966068, 2.37954449328776), c(0.33818795, + 0.4386877525, 0.528816855, 0.61252005, 0.6626973, 0.6816954, + 0.697340875, 0.7085162, 0.7152214, 0.7208091, 0.72745833, + 0.7319844, 0.73651047, 0.7431597, 0.7487474, 0.7554526, 0.766627925, + 0.7822734, 0.8012715, 0.85144875, 0.935151945, 1.0252810475, + 1.12578085), c(0.276821846502455, 0.354318476867519, 0.440270225449805, + 0.533132934163242, 0.5900576, 0.631102729748298, 0.660462274661497, + 0.680831108876989, 0.696223359635746, 0.7096337, 0.7219265, + 0.7319844, 0.7431597, 0.7543351, 0.7677455, 0.783391, 0.804046832839828, + 0.833541896886769, 0.873735298798638, 0.929106903073231, + 1.02188617627186, 1.10971107833641, 1.18626816850867), c(0.202265200637946, + 0.298325094034965, 0.380907645938709, 0.481339524857949, + 0.543219696138311, 0.589507953775938, 0.6258186, 0.654874580912809, + 0.6783427, 0.6984583, 0.715655544727447, 0.7319844, 0.7487473, + 0.7666278, 0.785715489951649, 0.8090941, 0.83815, 0.873623567291473, + 0.920206978680437, 0.98231174201862, 1.08425930872329, 1.16639411427812, + 1.25926838507547), c(0.129193504425124, 0.241744300793533, + 0.331949483165032, 0.43649858695157, 0.504472062268773, 0.556141464729147, + 
0.597172505336053, 0.631406591640416, 0.660898437441874, + 0.686684727470375, 0.709633972330423, 0.7319844, 0.753217699696647, + 0.77608746100351, 0.8012715950276, 0.830327492252422, 0.86464477397774, + 0.906319686121761, 0.956815387818928, 1.02495125855129, 1.13129413647201, + 1.21644533535035, 1.32424172966634), c(0.0667682979050189, + 0.189580042212397, 0.290485041721667, 0.402951609190092, + 0.475328740486855, 0.530590906520765, 0.575504908587586, + 0.613421932920829, 0.647285177364573, 0.678099283398734, + 0.70593862799773, 0.7319844, 0.758701322488325, 0.786639532920829, + 0.816837200234752, 0.850627936753767, 0.888963924063491, + 0.933785069065791, 0.988913131611816, 1.06240172852619, 1.16959624730917, + 1.2662008825538, 1.38860505690239), c(0, 0, 0.0419413650000001, + 0.09882005, 0.1230992, 0.14226962, 0.1600776, 0.1722416, + 0.1800265, 0.1880061, 0.1936501, 0.1975426, 0.2014351, 0.2070791, + 0.2150587, 0.2228436, 0.2350076, 0.25281558, 0.271986, 0.29626515, + 0.353143835, 0.4353357125, 0.545314878), c(0, 0, 0, 0.0438463650372504, + 0.0808594787511875, 0.106995615813358, 0.127478232938079, + 0.145480846633466, 0.1610508, 0.17461199504795, 0.186668812203222, + 0.1975426, 0.208428571374764, 0.2204108, 0.233930283744537, + 0.249894552784127, 0.267362348440485, 0.288755575723157, + 0.316120297580926, 0.355450425419354, 0.443192503687136, + 0.536871211931719, 0.636344785545224), c(0, 0, 0, 0.00188932708477086, + 0.0470905919531195, 0.079226864399944, 0.105414109111591, + 0.127225815559956, 0.146699420891509, 0.164644114298843, + 0.18142942603581, 0.1975426, 0.213933119201142, 0.231001630488804, + 0.24941229702312, 0.269578845560456, 0.292362546530965, 0.319632071367214, + 0.354433951358713, 0.406915236639266, 0.506944745332152, + 0.596044605353528, 0.695533388807317), c(0, 0, 0, 0, 0.0156342454546545, + 0.0536811248488485, 0.084228833507335, 0.110407751354614, + 0.134410113872139, 0.156669167575476, 0.177701902429674, + 0.1975426, 0.217759024165492, 
0.238897316673167, 0.261484572608426, + 0.286120039498095, 0.313065324705997, 0.345395334882349, + 0.386811116673167, 0.44780805303823, 0.550781846423163, 0.644984940689833, + 0.752937731654986), c(0, 0, 0, 0, 0, 0.0290260214229144, + 0.0653218111708617, 0.0966336637233373, 0.124670861123061, + 0.149775978614687, 0.174275935467055, 0.1975426, 0.221291415429954, + 0.246723385601356, 0.273144383515685, 0.30101566402084, 0.33204051788793, + 0.369730347126771, 0.416909038104281, 0.481925596660567, + 0.58989871202142, 0.688635568252056, 0.803906183401304)), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.0667682979050189, - 0.189580042212397, 0.290485041721667, 0.402951609190092, 0.475328740486855, - 0.530590906520765, 0.575504908587586, 0.613421932920829, 0.647285177364573, - 0.678099283398734, 0.70593862799773, 0.7319844, 0.758701322488325, - 0.786639532920829, 0.816837200234752, 0.850627936753767, 0.888963924063491, - 0.933785069065791, 0.988913131611816, 1.06240172852619, 1.16959624730917, - 1.2662008825538, 1.38860505690239), quantile_levels = c(0.01, - 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, - 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99 - )), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0, 0, 0.0419413650000001, - 0.09882005, 0.1230992, 0.14226962, 0.1600776, 0.1722416, 0.1800265, - 0.1880061, 0.1936501, 0.1975426, 0.2014351, 0.2070791, 0.2150587, - 0.2228436, 0.2350076, 0.25281558, 0.271986, 0.29626515, 0.353143835, - 0.4353357125, 0.545314878), quantile_levels = c(0.01, 0.025, - 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, - 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", 
"vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.0438463650372504, 0.0808594787511875, - 0.106995615813358, 0.127478232938079, 0.145480846633466, - 0.1610508, 0.17461199504795, 0.186668812203222, 0.1975426, - 0.208428571374764, 0.2204108, 0.233930283744537, 0.249894552784127, - 0.267362348440485, 0.288755575723157, 0.316120297580926, - 0.355450425419354, 0.443192503687136, 0.536871211931719, - 0.636344785545224), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0.00188932708477086, 0.0470905919531195, - 0.079226864399944, 0.105414109111591, 0.127225815559956, - 0.146699420891509, 0.164644114298843, 0.18142942603581, 0.1975426, - 0.213933119201142, 0.231001630488804, 0.24941229702312, 0.269578845560456, - 0.292362546530965, 0.319632071367214, 0.354433951358713, - 0.406915236639266, 0.506944745332152, 0.596044605353528, - 0.695533388807317), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0.0156342454546545, 0.0536811248488485, - 0.084228833507335, 0.110407751354614, 0.134410113872139, - 0.156669167575476, 0.177701902429674, 0.1975426, 0.217759024165492, - 0.238897316673167, 0.261484572608426, 0.286120039498095, - 0.313065324705997, 0.345395334882349, 0.386811116673167, - 0.44780805303823, 0.550781846423163, 0.644984940689833, 0.752937731654986 - ), quantile_levels = c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, - 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, - 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0, 0, 0, 0, 0, 
0.0290260214229144, 0.0653218111708617, - 0.0966336637233373, 0.124670861123061, 0.149775978614687, - 0.174275935467055, 0.1975426, 0.221291415429954, 0.246723385601356, - 0.273144383515685, 0.30101566402084, 0.33204051788793, 0.369730347126771, - 0.416909038104281, 0.481925596660567, 0.58989871202142, 0.688635568252056, - 0.803906183401304), quantile_levels = c(0.01, 0.025, 0.05, - 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, - 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr"))), class = c("distribution", - "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, + ), class = c("quantile_pred", "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992), class = "Date"), target_date = structure(c(18997, 19002, - 19007, 19012, 19017, 18997, 19002, 19007, 19012, 19017, 18997, + 18992, 18992), class = "Date"), target_date = structure(c(18997, 19002, 19007, 19012, 19017, 18997, 19002, 19007, 19012, 19017, 18997, 19002, 19007, 19012, 19017, 18997, 19002, 19007, 19012, - 19017), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df", + 19017, 18997, 19002, 19007, 19012, 19017, 18997, 19002, 19007, + 19012, 19017), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame")) # arx_forecaster snapshots @@ -984,24 +610,23 @@ structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.353013358779435, 0.648525432444877, 0.667670289394328, 1.1418673907239, 0.830448695683587, 0.329799431948649), .pred_distn = structure(list( - structure(list(values = c(0.171022956902288, 0.535003760656582 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = 
c(0.46653503056773, 0.830515834322024), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.485679887517181, - 0.849660691271475), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.959876988846753, 1.32385779260105), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.64845829380644, - 1.01243909756073), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.147809030071502, 0.511789833825796), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", - "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, - 18992, 18992), class = "Date"), target_date = structure(c(18999, + c(0.171022956902288, 0.244945899624723, 0.308032696431071, + 0.353013358779435, 0.397994021127798, 0.461080817934147, + 0.535003760656582), c(0.46653503056773, 0.540457973290166, + 0.603544770096514, 0.648525432444877, 0.693506094793241, + 0.756592891599589, 0.830515834322024), c(0.485679887517181, + 0.559602830239616, 0.622689627045964, 0.667670289394328, + 0.712650951742692, 0.77573774854904, 0.849660691271475), + c(0.959876988846753, 1.03379993156919, 1.09688672837554, + 1.1418673907239, 1.18684805307226, 1.24993484987861, 1.32385779260105 + ), c(0.64845829380644, 0.722381236528875, 0.785468033335223, + 0.830448695683587, 0.875429358031951, 0.938516154838299, + 1.01243909756073), c(0.147809030071502, 0.221731972793937, + 0.284818769600285, 0.329799431948649, 0.374780094297013, + 0.437866891103361, 0.511789833825796)), quantile_levels = c(0.05, + 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", + "vctrs_vctr", "list")), 
forecast_date = structure(c(18992, 18992, + 18992, 18992, 18992, 18992), class = "Date"), target_date = structure(c(18999, 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) @@ -1010,51 +635,666 @@ structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.149303403634373, 0.139764664505948, 0.333186321066645, 0.470345577837144, 0.725986105412008, 0.212686665274007), .pred_distn = structure(list( - structure(list(values = c(0.0961118191398634, 0.202494988128882 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.0865730800114383, 0.192956249000457), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.279994736572136, - 0.386377905561154), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.417153993342634, 0.523537162331653), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr")), structure(list(values = c(0.672794520917498, - 0.779177689906517), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c(0.159495080779498, 0.265878249768516), quantile_levels = c(0.05, - 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", - "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", + c(0.0961118191398634, 0.118312393281548, 0.13840396557592, + 0.149303403634373, 0.160202841692825, 0.180294413987198, + 0.202494988128882), c(0.0865730800114383, 0.108773654153123, + 0.128865226447495, 0.139764664505948, 0.1506641025644, 0.170755674858773, + 0.192956249000457), c(0.279994736572136, 0.30219531071382, + 0.322286883008193, 0.333186321066645, 0.344085759125097, + 
0.36417733141947, 0.386377905561154), c(0.417153993342634, + 0.439354567484319, 0.459446139778691, 0.470345577837144, + 0.481245015895596, 0.501336588189969, 0.523537162331653), + c(0.672794520917498, 0.694995095059183, 0.715086667353556, + 0.725986105412008, 0.73688554347046, 0.756977115764833, 0.779177689906517 + ), c(0.159495080779498, 0.181695654921182, 0.201787227215555, + 0.212686665274007, 0.223586103332459, 0.243677675626832, + 0.265878249768516)), quantile_levels = c(0.05, 0.1, 0.25, + 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", "list")), forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, 18992), class = "Date"), target_date = structure(c(18993, 18993, 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017742, 0.531332853311081, 0.58882794468598, + 0.98869024921623, 0.79480199700164, 0.306895457225321), .pred_distn = structure(list( + c(0.136509784083987, 0.202348949370703, 0.263837900408968, + 0.303244704017742, 0.342651507626517, 0.404140458664782, + 0.469979623951498), c(0.364597933377326, 0.430437098664042, + 0.491926049702307, 0.531332853311081, 0.570739656919856, + 0.632228607958121, 0.698067773244837), c(0.422093024752224, + 0.48793219003894, 0.549421141077205, 0.58882794468598, 0.628234748294754, + 0.689723699333019, 0.755562864619735), c(0.821955329282475, + 0.887794494569191, 0.949283445607456, 0.98869024921623, 1.028097052825, + 1.08958600386327, 1.15542516914999), c(0.628067077067884, + 0.693906242354601, 0.755395193392866, 0.79480199700164, 0.834208800610414, + 0.895697751648679, 0.961536916935395), c(0.140160537291566, + 0.205999702578282, 0.267488653616547, 0.306895457225321, + 0.346302260834096, 0.407791211872361, 0.473630377159077)), quantile_levels = c(0.05, + 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", + "vctrs_vctr", "list")), 
forecast_date = structure(c(18997, 18997, + 18997, 18997, 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017742, 0.531332853311081, 0.58882794468598, + 0.98869024921623, 0.79480199700164, 0.306895457225321), .pred_distn = structure(list( + c(0.136509784083987, 0.202348949370703, 0.263837900408968, + 0.303244704017742, 0.342651507626517, 0.404140458664782, + 0.469979623951498), c(0.364597933377326, 0.430437098664042, + 0.491926049702307, 0.531332853311081, 0.570739656919856, + 0.632228607958121, 0.698067773244837), c(0.422093024752224, + 0.48793219003894, 0.549421141077205, 0.58882794468598, 0.628234748294754, + 0.689723699333019, 0.755562864619735), c(0.821955329282475, + 0.887794494569191, 0.949283445607456, 0.98869024921623, 1.028097052825, + 1.08958600386327, 1.15542516914999), c(0.628067077067884, + 0.693906242354601, 0.755395193392866, 0.79480199700164, 0.834208800610414, + 0.895697751648679, 0.961536916935395), c(0.140160537291566, + 0.205999702578282, 0.267488653616547, 0.306895457225321, + 0.346302260834096, 0.407791211872361, 0.473630377159077)), quantile_levels = c(0.05, + 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", + "vctrs_vctr", "list")), forecast_date = structure(c(18997, 18997, + 18997, 18997, 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + +# arx_forecaster output format snapshots + + Code + out1 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 1999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2023-03-10. 
+ * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2021-12-31, + * For target date: 2022-01-07, + + +--- + + Code + out2 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 1999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2023-03-10. + * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2022-01-03, + * For target date: 2022-01-10, + + +--- + + Code + out3 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 1999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2023-03-10. 
+ * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2022-01-03, + * For target date: 2022-01-10, + + # arx_classifier snapshots structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", - "ct", "dc", "de", "fl", "ga", "gu", "hi", "ia", "id", "il", "in", - "ks", "ky", "la", "ma", "me", "mi", "mn", "mo", "mp", "ms", "mt", - "nc", "nd", "ne", "nh", "nj", "nm", "nv", "ny", "oh", "ok", "or", - "pa", "pr", "ri", "sc", "sd", "tn", "tx", "ut", "va", "vt", "wa", - "wi", "wv", "wy"), .pred_class = structure(c(1L, 1L, 1L, 1L, + "ct", "dc", "de", "fl", "ga", "hi", "ia", "id", "il", "in", "ks", + "ky", "la", "ma", "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", + "ne", "nh", "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "pr", + "ri", "sc", "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", + "wy"), .pred_class = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("(-Inf,0.25]", + "(0.25, Inf]"), class = "factor"), forecast_date = structure(c(18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992), class = "Date"), target_date = structure(c(18999, + 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, + 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, + 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, + 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 
+ 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, + 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, + -51L), class = c("tbl_df", "tbl", "data.frame")) + +--- + + structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", + "ct", "dc", "de", "fl", "ga", "hi", "ia", "id", "il", "in", "ks", + "ky", "la", "ma", "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", + "ne", "nh", "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "pr", + "ri", "sc", "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", + "wy"), .pred_class = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, - 1L), levels = c("(-Inf,0.25]", "(0.25, Inf]"), class = "factor"), - forecast_date = structure(c(18992, 18992, 18992, 18992, 18992, - 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, - 18992, 18992, 18992), class = "Date"), target_date = structure(c(18999, - 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, - 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, - 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, - 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, - 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, 18999, - 18999, 18999, 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, - -53L), class = c("tbl_df", "tbl", "data.frame")) + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("(-Inf,0.25]", + "(0.25, Inf]"), class = "factor"), forecast_date = structure(c(18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 
18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994), class = "Date"), target_date = structure(c(19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001), class = "Date")), row.names = c(NA, + -51L), class = c("tbl_df", "tbl", "data.frame")) + +# climatological_forecaster snapshots + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx", + "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", + "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", + "pa", "tx"), forecast_date = structure(c(18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992 + ), class = "Date"), target_date = structure(c(18992, 18992, 18992, + 18992, 18992, 18992, 18999, 18999, 18999, 18999, 18999, 18999, + 19006, 19006, 19006, 19006, 19006, 19006, 19013, 19013, 19013, + 19013, 19013, 19013, 19020, 19020, 19020, 19020, 19020, 19020 + ), class = "Date"), .pred = c(81.46212125, 48.9285099, 59.47903055, + 57.44900655, 62.78917715, 53.8558966, 84.3449863, 51.3826096, + 62.0540565, 67.1289331, 62.6712774, 59.47102, 92.3266162, 51.7816021, + 64.1929086, 67.80972325, 58.13744405, 63.10147305, 84.3449863, + 51.8545645, 66.9333338, 65.8523132, 55.9202576, 64.1234883, 59.50597115, + 48.9275239, 64.0481843, 63.45754255, 43.7883142, 65.37832155), + .pred_distn = structure(list(c(55.01884375, 61.9936534, 76.8558811, + 
85.8705552, 95.7238376875, 107.595355333333, 124.8294820725 + ), c(22.4852324, 29.46004205, 44.32226975, 53.33694385, 63.1902263375, + 75.0617439833333, 92.2958707225), c(33.03575305, 40.0105627, + 54.8727904, 63.8874645, 73.7407469875, 85.6122646333333, + 102.8463913725), c(31.00572905, 37.9805387, 52.8427664, 61.8574405, + 71.7107229875, 83.5822406333333, 100.8163673725), c(36.34589965, + 43.3207093, 58.182937, 67.1976111, 77.0508935875, 88.9224112333333, + 106.1565379725), c(27.4126191, 34.38742875, 49.24965645, + 58.26433055, 68.1176130375, 79.9891306833333, 97.2232574225 + ), c(59.1059257625, 65.4267824, 80.893949775, 88.512741775, + 98.9851218875, 111.446583986667, 128.951180125), c(26.1435490625, + 32.4644057, 47.931573075, 55.550365075, 66.0227451875, 78.4842072866667, + 95.988803425), c(36.8149959625, 43.1358526, 58.603019975, + 66.221811975, 76.6941920875, 89.1556541866667, 106.660250325 + ), c(41.8898725625, 48.2107292, 63.677896575, 71.296688575, + 81.7690686875, 94.2305307866667, 111.735126925), c(37.4322168625, + 43.7530735, 59.220240875, 66.839032875, 77.3114129875, 89.7728750866667, + 107.277471225), c(34.2319594625, 40.5528161, 56.019983475, + 63.638775475, 74.1111555875, 86.5726176866667, 104.077213825 + ), c(76.597008525, 83.6285187166667, 89.8466094166667, 96.39787945, + 106.5883326375, 118.488928483333, 139.8972249625), c(36.051994425, + 43.0835046166667, 49.3015953166667, 55.85286535, 66.0433185375, + 77.9439143833333, 99.3522108625), c(48.463300925, 55.4948111166667, + 61.7129018166667, 68.26417185, 78.4546250375, 90.3552208833333, + 111.7635173625), c(52.080115575, 59.1116257666667, 65.3297164666667, + 71.8809865, 82.0714396875, 93.9720355333333, 115.3803320125 + ), c(42.407836375, 49.4393465666667, 55.6574372666667, 62.2087073, + 72.3991604875, 84.2997563333333, 105.7080528125), c(47.371865375, + 54.4033755666667, 60.6214662666667, 67.1727363, 77.3631894875, + 89.2637853333333, 110.6720818125), c(75.151683565, 78.2257355633333, + 
82.1485257708333, 87.28053515, 96.1483782208333, 106.438732546667, + 113.7276053175), c(42.661261765, 45.7353137633333, 49.6581039708333, + 54.79011335, 63.6579564208333, 73.9483107466667, 81.2371835175 + ), c(57.740031065, 60.8140830633333, 64.7368732708333, 69.86888265, + 78.7367257208333, 89.0270800466667, 96.3159528175), c(56.659010465, + 59.7330624633333, 63.6558526708333, 68.78786205, 77.6557051208333, + 87.9460594466667, 95.2349322175), c(46.726954865, 49.8010068633333, + 53.7237970708333, 58.85580645, 67.7236495208333, 78.0140038466667, + 85.3028766175), c(54.930185565, 58.0042375633333, 61.9270277708333, + 67.05903715, 75.9268802208333, 86.2172345466667, 93.5061073175 + ), c(50.3425070425, 53.1286383166667, 57.7775218541667, 61.6092566, + 68.5981147666667, 77.2595451983333, 81.44671049), c(39.7640597925, + 42.5501910666667, 47.1990746041667, 51.03080935, 58.0196675166667, + 66.6810979483333, 70.86826324), c(54.8847201925, 57.6708514666667, + 62.3197350041667, 66.15146975, 73.1403279166667, 81.8017583483333, + 85.98892364), c(54.2940784425, 57.0802097166667, 61.7290932541667, + 65.560828, 72.5496861666667, 81.2111165983333, 85.39828189 + ), c(34.6248500925, 37.4109813666667, 42.0598649041667, 45.89159965, + 52.8804578166667, 61.5418882483333, 65.72905354), c(56.2148574425, + 59.0009887166667, 63.6498722541667, 67.481607, 74.4704651666667, + 83.1318955983333, 87.31906089)), quantile_levels = c(0.05, + 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", + "vctrs_vctr", "list"))), row.names = c(NA, -30L), class = c("tbl_df", + "tbl", "data.frame")) + +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx", + "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", + "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", + "pa", "tx"), forecast_date = structure(c(18779, 18779, 18779, + 18779, 18779, 18779, 18779, 18779, 18779, 18779, 18779, 18779, + 18779, 18779, 18779, 18779, 18779, 18779, 18779, 18779, 18779, + 
18779, 18779, 18779, 18779, 18779, 18779, 18779, 18779, 18779 + ), class = "Date"), target_date = structure(c(18779, 18779, 18779, + 18779, 18779, 18779, 18786, 18786, 18786, 18786, 18786, 18786, + 18793, 18793, 18793, 18793, 18793, 18793, 18800, 18800, 18800, + 18800, 18800, 18800, 18807, 18807, 18807, 18807, 18807, 18807 + ), class = "Date"), .pred = c(1782, 927.5, 577.5, 935, 635, 1321, + 1791, 934.5, 561, 765.5, 529.5, 1435, 2153.5, 946.5, 607, 673, + 476, 1663.5, 2486.5, 1138, 659.5, 637.5, 446.5, 2002, 3236, 1311, + 879.5, 666.5, 446.5, 2964), .pred_distn = structure(list(c(512, + 651.333333333334, 1098.45833333333, 1678, 2429.04166666667, 3110.91666666667, + 3621.875), c(0, 0, 365.541666666667, 927.5, 2329.29166666667, + 3263.83333333333, 3892.5), c(220.125, 253.166666666667, 342, + 540.5, 733.375, 854.166666666667, 977), c(435.375, 506.583333333333, + 567.875, 776, 1103.75, 1465.83333333333, 1673.125), c(312.25, + 377.833333333333, 453.416666666667, 584.25, 726.541666666667, + 1037.33333333333, 1516.5), c(149.5, 363.666666666667, 814.333333333333, + 1149, 2003.95833333333, 3254.66666666667, 3883.375), c(0, 450.083333333333, + 924.75, 1670.5, 2544.375, 3821.08333333334, 4407.25), c(0, 0, + 6.41666666666674, 941.5, 2380.375, 4497.16666666667, 9047.75), + c(118.25, 198.583333333333, 288.916666666667, 495.75, 736.625, + 962, 1521.5), c(313.375, 350.166666666667, 409.083333333333, + 597, 860.791666666666, 1210.41666666667, 1353.25), c(206.75, + 258.666666666667, 330.166666666667, 478.75, 598.041666666667, + 734.833333333333, 926.25), c(0, 281.083333333333, 790.666666666667, + 1242.5, 2162.125, 3593.5, 4306.625), c(0, 121.916666666668, + 1184, 2179.25, 3261.875, 4614.33333333333, 5419.75), c(0, + 0, 11, 956, 2619, 8508.83333333334, 10103.25), c(0, 176.25, + 306.791666666667, 570, 839.958333333333, 1568.83333333333, + 1901.25), c(238.125, 267.583333333333, 344.291666666667, + 511.25, 760.375, 1021.5, 1195), c(147, 190.5, 261.458333333333, + 414, 545.458333333333, 
664.583333333333, 753.5), c(0, 72.3333333333335, + 931.041666666667, 1487.5, 2731.83333333333, 4390.16666666667, + 5059.5), c(0, 0, 1270.25, 2551, 4018.70833333333, 5112.83333333333, + 6004.375), c(0, 0, 175.541666666667, 1335.5, 4165.625, 8942.58333333333, + 11505.125), c(0, 0, 321.458333333333, 564.75, 1010.20833333333, + 1795, 2183.625), c(225.125, 257.333333333333, 342.708333333333, + 551.25, 724.875, 924.416666666667, 1071.75), c(106.125, 152.083333333333, + 221.166666666667, 363, 513.208333333333, 615.166666666667, + 668), c(0, 125.166666666667, 943.833333333333, 1830, 3605.66666666667, + 5259.16666666667, 6957), c(0, 559.25, 1879.875, 3330.75, + 4889, 6122.66666666667, 7629.625), c(0, 0, 158.583333333333, + 1687.5, 5516.375, 9507.75, 12672.375), c(0, 3.41666666666674, + 500.375, 817.000000000001, 1497.16666666667, 2102.08333333333, + 2451.5), c(286.5, 311.333333333333, 399.166666666667, 629.75, + 792.041666666667, 1015, 1111), c(92.125, 139.833333333333, + 211.666666666667, 343.75, 513.208333333333, 626, 714.25), + c(0, 579.250000000001, 1838.75, 2963.25, 5057.91666666667, + 6361.33333333333, 7919)), quantile_levels = c(0.05, 0.1, + 0.25, 0.5, 0.75, 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", + "list"))), row.names = c(NA, -30L), class = c("tbl_df", "tbl", + "data.frame")) + +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx", + "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", + "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", + "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", + "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", + "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", + "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", + "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", + "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", + "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", + 
"ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", + "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", + "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", + "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", + "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", "ny", + "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", "ga", + "ny", "pa", "tx", "ca", "fl", "ga", "ny", "pa", "tx", "ca", "fl", + "ga", "ny", "pa", "tx"), forecast_date = structure(c(18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, 18992, + 18992, 18992, 18992, 18992), class = "Date"), target_date = structure(c(18992, + 18992, 18992, 18992, 18992, 18992, 18993, 18993, 18993, 18993, + 18993, 
18993, 18994, 18994, 18994, 18994, 18994, 18994, 18995, + 18995, 18995, 18995, 18995, 18995, 18996, 18996, 18996, 18996, + 18996, 18996, 18997, 18997, 18997, 18997, 18997, 18997, 18998, + 18998, 18998, 18998, 18998, 18998, 18999, 18999, 18999, 18999, + 18999, 18999, 19000, 19000, 19000, 19000, 19000, 19000, 19001, + 19001, 19001, 19001, 19001, 19001, 19002, 19002, 19002, 19002, + 19002, 19002, 19003, 19003, 19003, 19003, 19003, 19003, 19004, + 19004, 19004, 19004, 19004, 19004, 19005, 19005, 19005, 19005, + 19005, 19005, 19006, 19006, 19006, 19006, 19006, 19006, 19007, + 19007, 19007, 19007, 19007, 19007, 19008, 19008, 19008, 19008, + 19008, 19008, 19009, 19009, 19009, 19009, 19009, 19009, 19010, + 19010, 19010, 19010, 19010, 19010, 19011, 19011, 19011, 19011, + 19011, 19011, 19012, 19012, 19012, 19012, 19012, 19012, 19013, + 19013, 19013, 19013, 19013, 19013, 19014, 19014, 19014, 19014, + 19014, 19014, 19015, 19015, 19015, 19015, 19015, 19015, 19016, + 19016, 19016, 19016, 19016, 19016, 19017, 19017, 19017, 19017, + 19017, 19017, 19018, 19018, 19018, 19018, 19018, 19018, 19019, + 19019, 19019, 19019, 19019, 19019, 19020, 19020, 19020, 19020, + 19020, 19020, 19021, 19021, 19021, 19021, 19021, 19021, 19022, + 19022, 19022, 19022, 19022, 19022), class = "Date"), .pred = c(37365.5, + 13147.5, 8167, 14747.5, 8757, 15877, 37403, 13147, 8596, 14791, + 8734, 15883, 37760.5, 13433.5, 8816, 15621, 8445.5, 16014, 37403, + 13720, 9036, 15393, 8394, 16433, 37760.5, 13795.5, 8816, 15092, + 8445.5, 16289, 38118, 13871, 9036, 14791, 8394, 16145, 37723, + 13795.5, 8816, 14747.5, 8289, 16289, 37328, 13720, 8596, 14704, + 7895, 16433, 37297.5, 13719.5, 8453.5, 14747.5, 7460, 16987, + 37328, 13720, 8596, 14791, 7313, 17694, 37297.5, 13719.5, 8790.5, + 14747.5, 7460, 17951, 37267, 13719, 8596, 14704, 7313, 18208, + 37161.5, 13296, 8564.5, 14445.5, 7106, 20697.5, 36895, 12873, + 8393, 14187, 6757, 19574, 36497.5, 12312, 8283.5, 14183, 6644, + 20962, 35263, 12119, 8150, 13909, 6531, 
19574, 31371, 12311, + 8068, 13908, 5863, 19574, 29874, 12119, 8068, 13783, 5817, 19574, + 25888, 12119, 8068, 13783, 5817, 22350, 25816, 12119, 8068, 13783, + 5817, 22580, 24774, 11914, 7686, 13783, 5785, 22350, 24299, 11576, + 7629, 13694, 5662, 19574, 23250, 11543, 7454, 13410, 5632, 19112, + 23168, 11423, 7352, 12951, 5576, 19064, 21159, 11093, 6558, 12892, + 5497, 18191, 20459, 10976, 6384, 12721, 5379, 18191, 19778, 10533, + 6343, 12383, 5379, 18191, 19727, 9816, 6332, 11972, 5111, 17694, + 18581, 9594, 5706, 11492, 5058, 17565, 17672, 9535, 4961, 11209, + 4470, 16433, 17077, 8720, 4957, 11119, 4378, 15274), .pred_distn = structure(list( + c(13505.6, 16257.7333333333, 26529.25, 38488.5, 47803.5833333333, + 52943, 54849.3), c(1798.7, 2451.95, 10271.125, 14631.5, 19124.5416666667, + 37913.2833333333, 77464.75), c(2147.4, 3752.65, 6336.54166666667, + 9070.75, 11484.9166666667, 14971.85, 24055.4), c(9205.975, + 10792.0333333333, 12938.5, 15369.75, 26851.9166666667, 51192.7333333333, + 74599.425), c(4132.625, 5173.11666666667, 7461.91666666667, + 9054.75, 10303.375, 13200.8666666667, 17771.625), c(8787.125, + 9949.68333333333, 12840.2916666667, 15816.5, 21231.125, 25060.15, + 28504.575), c(13359, 17130.6666666667, 28892.9166666667, + 38798, 46440.0833333333, 52838, 54064.5), c(1795, 2370, 10126.25, + 14555, 18932.0833333333, 39212.3333333334, 77512.5), c(2529.5, + 4030, 7336.5, 9651, 11921.1666666667, 15999.5, 24553), c(9224.25, + 10790.3333333333, 12943, 15436.5, 27261.1666666667, 52706.3333333334, + 74960.25), c(4092.25, 5132.16666666667, 7303.66666666667, + 8989.5, 10295.75, 13271.6666666667, 17869.75), c(9603.5, + 11103.6666666667, 13356.5, 15877, 21436.9166666667, 25184.5, + 28772.25), c(16185.7, 19338.2, 30417.7916666667, 39255.25, + 46966.4166666667, 52746.9333333333, 54427.7), c(2078.3, 2575.05, + 10845.875, 14696, 19267.7916666667, 40798.3833333333, 77847.25 + ), c(2702.6, 4977.35, 7662.25, 9719.75, 12149.25, 16818.15, + 24841.6), c(10029.025, 
11575.1333333333, 13744, 16202.25, + 28511.875, 55006.4333333333, 76107.575), c(3786.375, 4825.71666666667, + 6922.625, 8743.25, 10023.1666666667, 13076.9666666667, 17702.375 + ), c(9716.8, 12099.6166666667, 13831.75, 16236, 21678.0833333333, + 25433.85, 29164.925), c(15758.4, 18701.4, 30977.3333333333, + 38798, 46155.3333333333, 52291.8, 54173.4), c(2361.6, 2780.1, + 11444.8333333333, 14837, 19625.6666666667, 42384.4333333333, + 78182), c(3490.5, 5374.46666666667, 7847, 10091, 12379, 17636.8, + 25130.2), c(9775.8, 11301.9333333333, 13507, 15910, 27131.6666666667, + 56248.5333333333, 76196.9), c(3717.5, 4756.26666666667, 7224.66666666667, + 8649.5, 9988.66666666667, 13119.2666666667, 17772), c(11078.7, + 12872.3333333333, 14339.6666666667, 16883, 22316.3333333333, + 25971.2, 29845.6), c(16326.8, 20830.8333333333, 32634.75, + 38883.5, 46655.2083333333, 52755.95, 54634.1), c(2433.9, + 2774.15, 11421.2083333333, 15058, 19772.5416666667, 43759.4833333333, + 78305.75), c(3146.75, 5152.86666666667, 7268.75, 9719.75, + 12133.9166666667, 18015.45, 24978.8), c(9449.575, 10955.7333333333, + 13283, 15523.25, 24825.1666666666, 57417.6333333333, 76213.225 + ), c(4012.9, 5440.66666666667, 7644.75, 8743.25, 10057.1666666667, + 13264.5666666667, 17944.625), c(7168.225, 12262.5166666667, + 14176.6666666667, 16511, 22391.5833333333, 25945.55, 29963.275 + ), c(17412.3, 22308.8, 34428, 38969, 47155.0833333333, 53220.1, + 55094.8), c(2506.2, 4444.2, 11594.6666666667, 15279, 19919.4166666667, + 45134.5333333333, 78429.5), c(3243, 5371.26666666667, 7776.5, + 10091, 12361.1666666667, 18834.1, 25267.4), c(9123.35, 10609.5333333333, + 12954.5, 15136.5, 20226.4166666667, 58586.7333333334, 76229.55 + ), c(3906.7, 5242.66666666667, 7866.58333333333, 8649.5, + 10022.6666666667, 13306.8666666667, 18014.25), c(5065.8, + 10541.3, 14013.6666666667, 16139, 21698.9166666667, 24836.8, + 30080.95), c(16960.25, 23946.75, 34087.8333333333, 38334.5, + 46928.9166666667, 52931.75, 54803), c(2427.5, 3658.75, 
11472.7916666667, + 15058, 19629.7916666667, 46358.5833333334, 78402.25), c(2899.25, + 5149.66666666667, 7495.25, 9506.5, 12163.6666666667, 19212.75, + 25116), c(9054.625, 10520.8333333333, 12870.5, 15092.25, + 19622.2916666667, 60013.3333333334, 76503.375), c(3747, 4991.16666666667, + 7655.29166666667, 8529.75, 9937.54166666667, 13295.6666666667, + 18030.375), c(5186, 10631.25, 14214.6666666667, 16511, 21953.0833333333, + 25051.5, 30486.625), c(16277.2, 23612.3, 34219.6666666667, + 37700, 46755.6666666667, 52643.4, 54511.2), c(2348.8, 2873.3, + 11338.1666666667, 15017, 19428.8333333333, 47339.3666666666, + 78375), c(2555.5, 4928.06666666667, 7162, 9117, 11738, 19591.4, + 24964.6), c(8985.9, 10432.1333333333, 12980, 15048, 18340.1666666667, + 61439.9333333333, 76777.2), c(3298.3, 4450.66666666667, 7011.66666666667, + 8050, 9489.66666666667, 12995.4666666667, 17757.5), c(5306.2, + 10508.4, 14723, 16883, 22316.3333333333, 25275, 30892.3), + c(21211.45, 28351.35, 35104.7916666667, 37619.5, 46709.625, + 52688.85, 54583.9), c(2345.1, 2399.5, 11278.5416666667, 14926.5, + 19001.6666666667, 32358.8166666666, 74049.6499999998), c(2289.25, + 4771.45, 6906.25, 8946.5, 10652.625, 19977.2166666667, 24890.7 + ), c(9004.175, 10411.75, 12967.25, 15034.25, 17727.1666666667, + 62772.8333333333, 77138.025), c(2808.6, 3912.01666666667, + 6327.04166666667, 7521.5, 8971.58333333333, 12715.05, 17443.625 + ), c(5836.4, 10729.7166666667, 15191.2916666667, 18142.75, + 23089.5833333333, 25927.9166666667, 31707.975), c(23944.6, + 31138.5333333333, 36280.1666666667, 38179, 47104.25, 52733.9, + 54717.6), c(3052.35, 9589.2, 11699.75, 15017, 19227.6666666667, + 34023.7666666667, 75556.1), c(4574.65, 4940.86666666667, + 7336.5, 9224.5, 11000.25, 20507.3666666667, 25101.8), c(9577.75, + 10880.7333333333, 13239.8333333333, 15308, 17811.6666666667, + 63744.3333333333, 77498.85), c(2606.9, 3747.06666666667, + 6793.75, 7468, 8860.83333333333, 12844.2, 17417.75), c(6519.6, + 10972.3666666667, 
15734.9166666667, 19251.5, 24015.8333333333, + 26772.6666666667, 32676.65), c(30143.55, 32933, 36254.9166666667, + 38288.5, 45981.375, 51866.35, 54783.025), c(9504.175, 10087.8, + 12112.75, 14926.5, 18563.375, 35687.7166666666, 77061.5499999999 + ), c(5122.3, 5440.33333333333, 7636.75, 9481, 11397.25, 21089.5166666666, + 25364.9), c(9483.625, 10792.0333333333, 13636.5833333333, + 15178.75, 17645.1666666667, 51192.7333333332, 71279.5), c(2699.2, + 3876.11666666667, 7081.375, 7650.5, 9037.66666666667, 13267.35, + 17685.875), c(6752.8, 10373.6333333333, 16482.0416666667, + 19689.25, 24420.75, 27167.4166666667, 33195.325), c(30054, + 32846.3333333333, 36222.6666666667, 38118, 44345.5, 51942.5, + 54863), c(9602, 10479.6666666667, 11966, 14836, 18000.6666666667, + 37351.6666666666, 78566.9999999998), c(4943, 5436, 7407, + 9224.5, 10590, 15999.5, 23867), c(10402, 11313.3333333333, + 13850.6666666667, 15049.5, 17181.3333333333, 52619.3333333333, + 71699.9999999999), c(2497.5, 4747.66666666667, 6887.5, 7468, + 8788.33333333333, 13396.5, 17660), c(6353, 7303.33333333333, + 16575.6666666667, 20127, 24683, 27562.1666666667, 33714), + c(30609, 33678.5333333333, 36115.4166666667, 38152.5, 44796.5, + 51943.65, 54882.775), c(8727.5, 10506.25, 11801.125, 14298.5, + 17316.75, 38593.1166666666, 80240.5749999997), c(5286.75, + 5818.43333333333, 7427.5, 9255, 10477.2083333333, 16246.2, + 23904.1), c(10572.9, 11566.1333333333, 13513.9166666667, + 14732.25, 16706.1666666667, 38273.0999999999, 71758.825), + c(3500.325, 5164.43333333333, 6782.58333333333, 7167.5, 8370.08333333333, + 12298.8833333333, 17550.65), c(8782.15, 9745.23333333333, + 18901.7916666667, 23672.5, 27843.3333333333, 30189.4166666667, + 38204.6499999997), c(29822, 33388.7333333333, 35722.6666666667, + 37746, 45086.5, 51783.8, 54741.55), c(7853, 10025.5, 11340, + 13761, 16558.5833333333, 25362.8666666667, 81914.1500000001 + ), c(5095.5, 5582.53333333333, 7253, 8914, 10045.1666666667, + 12628.6, 23801.2), c(10957.1, 
11891.6666666667, 13177.1666666667, + 14415, 16394.8333333333, 19398.1666666667, 71817.65), c(3087.65, + 4791.53333333333, 6409.83333333333, 6725, 7740.16666666667, + 9516.13333333334, 17299.3), c(7598.3, 8574.13333333334, 17622.3333333333, + 22350, 26059.5, 28098, 39082.3000000001), c(32519.95, 33308.7333333333, + 35436.1666666667, 37488.5, 43019.5, 50277.7666666667, 52591.25 + ), c(6840.5, 9406.75, 10705.25, 13314.5, 14638.9583333333, + 18303.3166666667, 43700.7499999999), c(4966.25, 5408.63333333333, + 7129.75, 8776.5, 9930.33333333333, 11747.7666666667, 14690.275 + ), c(10918.65, 11891.1666666667, 13407.9166666667, 14469.75, + 16427.6666666667, 18348.5166666667, 28410.3749999999), c(2910.975, + 4654.63333333333, 6265.375, 6571.5, 7712.58333333333, 8852.8, + 11127.4), c(8925.95, 9914.53333333333, 18804.5833333333, + 23309.5, 27544.8333333333, 29565.5, 42471.4499999999), c(31164.6, + 32021.5333333333, 34218.6666666667, 36394, 41517.8333333333, + 47717.1333333333, 49796.3), c(6196, 9156, 10366, 13007, 13878.6666666667, + 17648.3, 20612.7), c(4813, 5210.73333333333, 6961, 8615, + 9759, 10560.0666666667, 11782.5), c(10610.2, 11515.6666666667, + 13055.6666666667, 14137, 16014.3333333333, 16913.2666666667, + 18871), c(2734.3, 4517.73333333333, 6168.83333333333, 6499, + 7697.33333333333, 8782.4, 9804.8), c(7477.6, 8478.93333333333, + 17148.3333333333, 22205, 26049, 27669.7333333333, 29791.2 + ), c(27272.6, 28129.5333333333, 30326.6666666667, 32502, + 36595, 42381.4666666667, 45904.3), c(8994, 9834.56666666667, + 10853, 13428, 14255.6666666667, 18330.8666666667, 20967.1 + ), c(4731, 5128.73333333333, 6879, 8533, 9524.33333333333, + 10012.9333333333, 11700.5), c(10609.2, 11514.6666666667, + 13054.6666666667, 14136, 15769, 16912.2666666667, 18870), + c(2066.3, 3849.73333333333, 5500.83333333333, 6018, 7084.66666666667, + 8114.4, 9136.8), c(7477.6, 8478.93333333333, 17148.3333333333, + 22350, 26049, 27669.7333333333, 29791.2), c(25775.6, 26632.5333333333, + 
28829.6666666667, 30725, 33217, 40884.4666666667, 44407.3 + ), c(8802, 9642.56666666667, 10661, 13007, 13878.6666666667, + 17648.3, 18255.6), c(4731, 5128.73333333333, 6861, 8533, + 9524.33333333333, 10012.9333333333, 11700.5), c(10484.2, + 11389.6666666667, 12929.6666666667, 13997, 15644, 16787.2666666667, + 18745), c(2020.3, 3803.73333333333, 5559, 5972, 7038.66666666667, + 8068.4, 9090.8), c(7477.6, 9930.13333333334, 17199.3333333333, + 22350, 26049, 27669.7333333333, 29791.2), c(22459.4, 23246.3333333333, + 24843.6666666667, 26739, 29231, 36898.4666666667, 40421.3 + ), c(9153.2, 9642.56666666667, 10661, 13007, 13878.6666666667, + 17648.3, 18255.6), c(4848.6, 5776.73333333333, 6861, 8533, + 9524.33333333333, 10012.9333333333, 11700.5), c(10881.8, + 12145.2, 12929.6666666667, 13997, 15644, 16787.2666666667, + 18745), c(3192.3, 4022.2, 5559, 5972, 7038.66666666667, 8068.4, + 9090.8), c(10253.6, 12706.1333333333, 19975.3333333333, 25706, + 29214.3333333333, 31033, 38341.6), c(22387.4, 23065, 24717.6666666667, + 26667, 29159, 36826.4666666667, 40349.3), c(9153.2, 9828.53333333333, + 10712, 13236, 14046.6666666667, 17648.3, 18255.6), c(5472.2, + 6260.8, 6934, 8533, 9524.33333333333, 10012.9333333333, 11700.5 + ), c(11600.6, 12145.2, 12929.6666666667, 13997, 15644, 16787.2666666667, + 18745), c(3931.2, 4163.66666666667, 5559, 5972, 7038.66666666667, + 8068.4, 9090.8), c(10483.6, 12936.1333333333, 20205.3333333333, + 26048, 30210.6666666667, 31670.8, 38571.6), c(21345.4, 22418.3333333333, + 23729.6666666667, 25625, 28117, 35784.4666666667, 39307.3 + ), c(8948.2, 9623.53333333333, 10507, 12802, 13841.6666666667, + 17443.3, 18050.6), c(5090.2, 5878.8, 6552, 8120, 9142.33333333333, + 9630.93333333333, 11318.5), c(11098, 11842.3333333333, 12929.6666666667, + 13997, 15644, 16787.2666666667, 18745), c(3899.2, 4131.66666666667, + 5500.33333333333, 5753, 7006.66666666667, 8036.4, 9058.8), + c(10253.6, 12706.1333333333, 19975.3333333333, 25818, 29214.3333333333, + 31033, 
33266.4), c(20870.4, 21943.3333333333, 23254.6666666667, + 25150, 27642, 35309.4666666667, 38832.3), c(8610.2, 9285.53333333333, + 10169, 11576, 13335.6666666667, 17105.3, 17712.6), c(5033.2, + 5821.8, 6850, 8094, 9179.33333333333, 9573.93333333333, 11261.5 + ), c(11009, 11228.6666666667, 12538.3333333333, 13863, 15555, + 16698.2666666667, 18656), c(3776.2, 4008.66666666667, 5284, + 5549, 6725.66666666667, 7913.4, 8935.8), c(7477.6, 9930.13333333334, + 17199.3333333333, 22930, 26049, 28257, 30490.4), c(19821.4, + 20894.3333333333, 22205.6666666667, 23879, 25537.3333333333, + 32287, 35424.1), c(8577.2, 9252.53333333333, 10136, 11543, + 13302.6666666667, 16077.6666666667, 17451.2), c(4858.2, 5646.8, + 6675, 7919, 9004.33333333333, 9398.93333333333, 11086.5), + c(10725, 10944.6666666667, 12245.6666666667, 13414, 15162.3333333333, + 16256.1333333333, 18372), c(3746.2, 3978.66666666667, 5254, + 5519, 6695.66666666667, 7883.4, 8905.8), c(7015.6, 9468.13333333334, + 16737.3333333333, 21888, 24995.3333333333, 27790.7333333333, + 30028.4), c(19739.4, 20812.3333333333, 22123.6666666667, + 23797, 25455.3333333333, 29471.8333333333, 33457.4), c(8457.2, + 9132.53333333333, 10016, 11423, 13064.6666666667, 14882.3333333333, + 16487.6), c(4756.2, 5544.8, 6573, 7817, 8785, 9005.33333333333, + 9658), c(10266, 10485.6666666667, 11786.6666666667, 12955, + 14703.3333333333, 15439.2666666667, 16501.4), c(3690.2, 3922.66666666667, + 5198, 5463, 6255.33333333333, 7378.46666666667, 8258.6), + c(6967.6, 9420.13333333334, 16689.3333333333, 21695, 24506.6666666667, + 27742.7333333333, 29980.4), c(17730.4, 18803.3333333333, + 20114.6666666667, 21531, 23113, 26977, 28232), c(8127.2, + 8802.53333333333, 9686, 11042, 12587, 14552.3333333333, 16157.6 + ), c(3962.2, 4750.8, 5779, 6992, 7636.66666666667, 8200.53333333333, + 8292.8), c(10207, 10426.6666666667, 11727.6666666667, 12896, + 14513, 15178.4666666667, 16052), c(3611.2, 3843.66666666667, + 5119, 5384, 6039.33333333333, 6873.33333333333, 
7838.8), + c(6094.6, 8547.13333333334, 15152, 20127, 23361.1666666667, + 26869.7333333333, 29107.4), c(17030.4, 18103.3333333333, + 19360.6666666667, 20459, 22413, 26277, 27532), c(8010.2, + 8685.53333333333, 9518, 10925, 12470, 14435.3333333333, 16040.6 + ), c(3788.2, 4576.8, 5325.33333333333, 6818, 7462.66666666667, + 8026.53333333333, 8118.8), c(10036, 10255.6666666667, 11556.6666666667, + 12725, 14080.3333333333, 15007.4666666667, 15881), c(3493.2, + 3725.66666666667, 4809, 5266, 5921.33333333333, 6755.33333333333, + 7720.8), c(6094.6, 8547.13333333334, 14977.6666666667, 20127, + 23361.1666666667, 26869.7333333333, 29107.4), c(16349.4, + 17422.3333333333, 18679.6666666667, 19778, 21732, 25596, + 26851), c(7567.2, 8242.53333333333, 9379.66666666667, 10482, + 12027, 13992.3333333333, 15597.6), c(3747.2, 4535.8, 5711.33333333333, + 6777, 7421.66666666667, 7985.53333333333, 8077.8), c(9698, + 9917.66666666667, 11218.6666666667, 12387, 13742.3333333333, + 14669.4666666667, 15543), c(3670.6, 3908, 5001, 5347, 6121.66666666667, + 6755.33333333333, 7720.8), c(6094.6, 8547.13333333334, 15152, + 20127, 23361.1666666667, 26869.7333333333, 29107.4), c(16298.4, + 17371.3333333333, 18628.6666666667, 19727, 21681, 25545, + 26800), c(6850.2, 7525.53333333333, 8662.66666666667, 9765, + 11235, 13275.3333333333, 14880.6), c(3736.2, 4524.8, 5700.33333333333, + 6714, 7410.66666666667, 7974.53333333333, 8066.8), c(9287, + 9506.66666666667, 10599, 11883, 13331.3333333333, 14258.4666666667, + 15132), c(3402.6, 3640, 4733, 4998, 5853.66666666667, 6487.33333333333, + 7452.8), c(5597.6, 8050.13333333334, 14655, 19574, 21943, + 26372.7333333333, 28610.4), c(15152.4, 16225.3333333333, + 17482.6666666667, 18581, 20535, 24399, 25654), c(6628.2, + 7303.53333333333, 8440.66666666667, 9543, 11013, 13053.3333333333, + 14658.6), c(3110.2, 3898.8, 5074.33333333333, 6140, 6784.66666666667, + 7348.53333333333, 7440.8), c(8807, 9026.66666666667, 10119, + 11403, 13193.3333333333, 13778.4666666667, 
14652), c(3349.6, + 3587, 4680, 4945, 5712.66666666667, 6263.53333333333, 7399.8 + ), c(5468.6, 7921.13333333334, 14526, 19172, 21297.6666666667, + 26243.7333333333, 28481.4), c(14243.4, 15316.3333333333, + 16573.6666666667, 17672, 19117.3333333333, 21921.5333333333, + 24647), c(6569.2, 7244.53333333333, 8381.66666666667, 9484, + 10946.6666666667, 12994.3333333333, 14599.6), c(2365.2, 3153.8, + 4329.33333333333, 5343, 6039.66666666667, 6603.53333333333, + 6695.8), c(8524, 8743.66666666667, 9836, 11120, 12910.3333333333, + 13495.4666666667, 14369), c(2761.6, 2999, 4092, 4438, 5124.66666666667, + 5675.53333333333, 6811.8), c(4336.6, 6789.13333333334, 13394, + 17898, 20165.6666666667, 25111.7333333333, 27349.4), c(13648.4, + 14721.3333333333, 15978.6666666667, 17077, 18266, 19933.0666666667, + 22864), c(5754.2, 6429.53333333333, 7566.66666666667, 8669, + 10050.3333333333, 10882.4, 13120.6), c(2361.2, 3149.8, 4325.33333333333, + 5270, 5878.66666666667, 6580.4, 6683.8), c(8434, 8653.66666666667, + 9746, 11030, 12458, 13329.8, 13463.8), c(2669.6, 2907, 4000, + 4346, 5032.66666666667, 5583.53333333333, 6719.8), c(3177.6, + 5630.13333333334, 12235, 16653, 18812.6666666667, 23952.7333333333, + 26190.4)), quantile_levels = c(0.05, 0.1, 0.25, 0.5, 0.75, + 0.9, 0.95), class = c("quantile_pred", "vctrs_vctr", "list"))), row.names = c(NA, + -186L), class = c("tbl_df", "tbl", "data.frame")) diff --git a/tests/testthat/_snaps/step_adjust_latency.md b/tests/testthat/_snaps/step_adjust_latency.md new file mode 100644 index 000000000..cffb72703 --- /dev/null +++ b/tests/testthat/_snaps/step_adjust_latency.md @@ -0,0 +1,84 @@ +# printing step_adjust_latency results in expected output + + Code + r5 + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Operations + 1. Adj. extend_lags: has_role("raw") latency TBD at train time + 2. 
Lagging: death_rate by 0, 6, 11 + 3. Lagging: case_rate by 1, 5 + 4. Leading: death_rate by 7 + +--- + + Code + prep(r5, real_x) + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Training information + Training data contained 200 data points and no incomplete rows. + + -- Operations + 1. Adj. extend_lags: case_rate death_rate w/ forecast date 2021-07-24 | Trained + 2. Lagging: death_rate by 5, 11, 16, (lat adj) | Trained + 3. Lagging: case_rate by 6, 10, (lat adj) | Trained + 4. Leading: death_rate by 7 | Trained + +--- + + Code + r6 + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Operations + 1. Lagging: death_rate by 0, 7, 14 + 2. Adj. extend_ahead: has_role("raw") latency TBD at train time + 3. Leading: death_rate by 7 + +--- + + Code + prep(r6, covid_case_death_rates) + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Training information + Training data contained 20496 data points and no incomplete rows. + + -- Operations + 1. Lagging: death_rate by 0, 7, 14 | Trained + 2. Adj. extend_ahead: case_rate, ... w/ forecast date 2023-03-10 | Trained + 3. Leading: death_rate by -441, (lat adj) | Trained + diff --git a/tests/testthat/_snaps/step_epi_shift.md b/tests/testthat/_snaps/step_epi_shift.md index 44c828118..4c720792c 100644 --- a/tests/testthat/_snaps/step_epi_shift.md +++ b/tests/testthat/_snaps/step_epi_shift.md @@ -4,8 +4,8 @@ r1 <- epi_recipe(x) %>% step_epi_ahead(death_rate, ahead = 3.6) %>% step_epi_lag(death_rate, lag = 1.9) Condition - Error in `step_epi_ahead()`: - ! `ahead` must be a non-negative integer. + Error in `step_epi_lag()`: + ! 
`lag` must be a non-negative integer. # A negative lag value should should throw an error @@ -21,16 +21,13 @@ Code r3 <- epi_recipe(x) %>% step_epi_ahead(death_rate, ahead = -7) %>% step_epi_lag( death_rate, lag = 7) - Condition - Error in `step_epi_ahead()`: - ! `ahead` must be a non-negative integer. # Values for ahead and lag cannot be duplicates Code slm_fit(r4) Condition - Error in `bake()`: + Error in `add_shifted_columns()`: ! Name collision occured in The following variable name already exists: "lag_7_death_rate". diff --git a/tests/testthat/_snaps/step_growth_rate.md b/tests/testthat/_snaps/step_growth_rate.md index 5a3ac6f44..409aa197e 100644 --- a/tests/testthat/_snaps/step_growth_rate.md +++ b/tests/testthat/_snaps/step_growth_rate.md @@ -89,19 +89,18 @@ --- Code - step_growth_rate(r, value, skip = 1) + step_growth_rate(r, value, na_rm = 1) Condition Error in `step_growth_rate()`: - ! `skip` must be a scalar of type . + ! `na_rm` must be a scalar of type . --- Code - step_growth_rate(r, value, additional_gr_args_list = 1:5) + step_growth_rate(r, value, skip = 1) Condition Error in `step_growth_rate()`: - ! `additional_gr_args_list` must be a . - i See `?epiprocess::growth_rate` for available options. + ! `skip` must be a scalar of type . --- @@ -117,5 +116,5 @@ step_growth_rate(r, value, replace_Inf = c(1, 2)) Condition Error in `step_growth_rate()`: - ! replace_Inf must be a scalar. + ! `replace_Inf` must be a scalar. diff --git a/tests/testthat/_snaps/wis-dist-quantiles.md b/tests/testthat/_snaps/wis-dist-quantiles.md deleted file mode 100644 index fb9cfbdf6..000000000 --- a/tests/testthat/_snaps/wis-dist-quantiles.md +++ /dev/null @@ -1,17 +0,0 @@ -# wis dispatches and produces the correct values - - Code - weighted_interval_score(1:10, 10) - Condition - Error in `weighted_interval_score()`: - ! Weighted interval score can only be calculated if `x` - has class . 
- ---- - - Code - weighted_interval_score(dist_quantiles(list(1:4, 8:11), 1:4 / 5), 1:3) - Condition - Error in `weighted_interval_score()`: - ! Can't recycle `x` (size 2) to match `actual` (size 3). - diff --git a/tests/testthat/_snaps/wis-quantile_pred.md b/tests/testthat/_snaps/wis-quantile_pred.md new file mode 100644 index 000000000..f13bd74db --- /dev/null +++ b/tests/testthat/_snaps/wis-quantile_pred.md @@ -0,0 +1,16 @@ +# wis dispatches and produces the correct values + + Code + weighted_interval_score(1:10, 10) + Condition + Error in `UseMethod()`: + ! no applicable method for 'weighted_interval_score' applied to an object of class "c('integer', 'numeric')" + +--- + + Code + weighted_interval_score(quantile_pred(rbind(1:4, 8:11), 1:4 / 5), 1:3) + Condition + Error in `weighted_interval_score.quantile_pred()`: + ! Assertion on 'actual' failed: Must have length 2, but has length 3. + diff --git a/tests/testthat/test-arg_is_.R b/tests/testthat/test-arg_is_.R index b7a0b7644..f043328c7 100644 --- a/tests/testthat/test-arg_is_.R +++ b/tests/testthat/test-arg_is_.R @@ -149,6 +149,7 @@ test_that("coerce scalar to date", { test_that("simple surface step test", { expect_snapshot( error = TRUE, - epi_recipe(cases_deaths_subset) %>% step_epi_lag(death_rate, lag = "hello") + epi_recipe(cases_deaths_subset) %>% + step_epi_lag(death_rate, lag = "hello") ) }) diff --git a/tests/testthat/test-arx_args_list.R b/tests/testthat/test-arx_args_list.R index 03cbc0025..50379357a 100644 --- a/tests/testthat/test-arx_args_list.R +++ b/tests/testthat/test-arx_args_list.R @@ -26,11 +26,14 @@ test_that("arx_args checks inputs", { expect_snapshot(error = TRUE, arx_args_list(n_training_min = "de")) expect_snapshot(error = TRUE, arx_args_list(epi_keys = 1)) - expect_warning(arx_args_list( - forecast_date = as.Date("2022-01-01"), - target_date = as.Date("2022-01-03"), - ahead = 1L - )) + expect_error( + arx_args_list( + forecast_date = as.Date("2022-01-01"), + target_date = 
as.Date("2022-01-04"), + ahead = 1L + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) }) test_that("arx forecaster disambiguates quantiles", { @@ -38,8 +41,13 @@ test_that("arx forecaster disambiguates quantiles", { tlist <- eval(formals(quantile_reg)$quantile_levels) expect_identical( # both default compare_quantile_args(alist, tlist), - sort(c(alist, tlist)) + c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95) + ) + expect_snapshot( + error = TRUE, + compare_quantile_args(alist / 10, 1:9 / 10, "grf") ) + expect_identical(compare_quantile_args(alist, 1:9 / 10, "grf"), 1:9 / 10) alist <- c(.5, alist) expect_identical( # tlist is default, should give alist compare_quantile_args(alist, tlist), diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R new file mode 100644 index 000000000..d13e6d2ea --- /dev/null +++ b/tests/testthat/test-arx_forecaster.R @@ -0,0 +1,45 @@ +train_data <- epidatasets::cases_deaths_subset +test_that("arx_forecaster warns if forecast date beyond the implicit one", { + bad_date <- max(train_data$time_value) + 300 + expect_warning( + arx1 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(forecast_date = bad_date)) + ), + class = "epipredict__arx_forecaster__forecast_date_defaulting" + ) +}) + +test_that("arx_forecaster errors if forecast date, target date, and ahead are inconsistent", { + max_date <- max(train_data$time_value) + expect_error( + arx1 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(ahead = 5, target_date = max_date, forecast_date = max_date)) + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) +}) + +test_that("warns if there's not enough data to predict", { + edf <- tibble( + geo_value = "ct", + time_value = seq(as.Date("2020-10-01"), as.Date("2023-05-31"), by = "day"), + ) %>% + 
mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>% + # Oct to May (flu season, ish) only: + filter(!dplyr::between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>% + # and actually, pretend we're around mid-October 2022: + filter(time_value <= as.Date("2022-10-12")) %>% + as_epi_df(as_of = as.Date("2022-10-12")) + edf %>% filter(time_value > "2022-08-01") + + expect_error( + edf %>% arx_forecaster("value"), + class = "epipredict__not_enough_data" + ) +}) diff --git a/tests/testthat/test-bake-method.R b/tests/testthat/test-bake-method.R index 06f861012..8e118a18d 100644 --- a/tests/testthat/test-bake-method.R +++ b/tests/testthat/test-bake-method.R @@ -1,5 +1,5 @@ test_that("bake method works in all cases", { - edf <- case_death_rate_subset %>% + edf <- covid_case_death_rates %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(edf) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-blueprint.R b/tests/testthat/test-blueprint.R index 2d22aff6e..b37bd5e4a 100644 --- a/tests/testthat/test-blueprint.R +++ b/tests/testthat/test-blueprint.R @@ -4,7 +4,7 @@ test_that("epi_recipe blueprint keeps the class, mold works", { expect_s3_class(bp, "default_epi_recipe_blueprint") expect_s3_class(refresh_blueprint(bp), "default_epi_recipe_blueprint") - jhu <- case_death_rate_subset + jhu <- covid_case_death_rates # expect_s3_class(er_check_is_data_like(jhu), "epi_df") r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-check-training-set.R b/tests/testthat/test-check-training-set.R index ad891dd45..071a5ccf3 100644 --- a/tests/testthat/test-check-training-set.R +++ b/tests/testthat/test-check-training-set.R @@ -1,5 +1,5 @@ test_that("training set validation works", { - template <- cases_deaths_subset[1, ] + template <- epidatasets::cases_deaths_subset[1, ] rec <- list(template = template) t1 <- template diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R new file 
mode 100644 index 000000000..3ca388afb --- /dev/null +++ b/tests/testthat/test-check_enough_data.R @@ -0,0 +1,139 @@ +# Setup toy data +n <- 10 +toy_epi_df <- tibble::tibble( + time_value = rep( + seq( + as.Date("2020-01-01"), + by = 1, + length.out = n + ), + times = 2 + ), + geo_value = rep(c("ca", "hi"), each = n), + x = c(1:n, c(1:(n - 2), NA, NA)), + y = 1:(2 * n) +) %>% epiprocess::as_epi_df() + +test_that("check_enough_data works on pooled data", { + # Check both columns have enough data + expect_no_error( + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n, drop_na = FALSE) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + # Check both column don't have enough data + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n + 1, drop_na = FALSE) %>% + prep(toy_epi_df) + ) + # Check drop_na works + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n - 1, drop_na = TRUE) %>% + prep(toy_epi_df) + ) +}) + +test_that("check_enough_data works on unpooled data", { + # Check both columns have enough data + expect_no_error( + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = n, epi_keys = "geo_value", drop_na = FALSE) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + # Check one column don't have enough data + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% + prep(toy_epi_df) + ) + # Check drop_na works + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% + prep(toy_epi_df) + ) +}) + +test_that("check_enough_data outputs the correct recipe values", { + expect_no_error( + p <- epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n - 2) %>% + 
prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + + expect_equal(nrow(p), 2 * n) + expect_equal(ncol(p), 4L) + expect_s3_class(p, "epi_df") + expect_named(p, c("geo_value", "time_value", "x", "y")) # order in epiprocess::new_epi_df + expect_equal( + p$time_value, + rep(seq(as.Date("2020-01-01"), by = 1, length.out = n), times = 2) + ) + expect_equal(p$geo_value, rep(c("ca", "hi"), each = n)) +}) + +test_that("check_enough_data only checks train data when skip = FALSE", { + # Check that the train data has enough data, the test data does not, but + # the check passes anyway (because it should be applied to training data) + toy_test_data <- toy_epi_df %>% + group_by(geo_value) %>% + slice(3:10) %>% + epiprocess::as_epi_df() + expect_no_error( + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = n - 2, epi_keys = "geo_value") %>% + prep(toy_epi_df) %>% + bake(new_data = toy_test_data) + ) + # Making sure `skip = TRUE` is working correctly in `predict` + expect_no_error( + epi_recipe(toy_epi_df) %>% + add_role(y, new_role = "outcome") %>% + check_enough_data(x, min_observations = n - 2, epi_keys = "geo_value") %>% + epi_workflow(linear_reg()) %>% + fit(toy_epi_df) %>% + predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) + ) + # making sure it works for skip = FALSE, where there's enough data to train + # but not enough to predict + expect_no_error( + forecaster <- epi_recipe(toy_epi_df) %>% + add_role(y, new_role = "outcome") %>% + check_enough_data(x, min_observations = 1, epi_keys = "geo_value", skip = FALSE) %>% + epi_workflow(linear_reg()) %>% + fit(toy_epi_df) + ) + expect_snapshot( + error = TRUE, + forecaster %>% + predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) + ) +}) + +test_that("check_enough_data works with all_predictors() downstream of constructed terms", { + # With a lag of 2, we will get 2 * n - 5 non-NA rows (NA's in x but not in the + # lags don't count) + expect_no_error( + 
epi_recipe(toy_epi_df) %>% + step_epi_lag(x, lag = c(1, 2)) %>% + check_enough_data(all_predictors(), y, min_observations = 2 * n - 5) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + step_epi_lag(x, lag = c(1, 2)) %>% + check_enough_data(all_predictors(), y, min_observations = 2 * n - 4) %>% + prep(toy_epi_df) + ) +}) diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_train_data.R deleted file mode 100644 index 9b2ef5f34..000000000 --- a/tests/testthat/test-check_enough_train_data.R +++ /dev/null @@ -1,127 +0,0 @@ -# Setup toy data -n <- 10 -toy_epi_df <- tibble::tibble( - time_value = rep( - seq( - as.Date("2020-01-01"), - by = 1, - length.out = n - ), - times = 2 - ), - geo_value = rep(c("ca", "hi"), each = n), - x = c(1:n, c(1:(n - 2), NA, NA)), - y = 1:(2 * n) -) %>% epiprocess::as_epi_df() - -test_that("check_enough_train_data works on pooled data", { - # Check both columns have enough data - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n, drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check both column don't have enough data - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check drop_na works - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) -}) - -test_that("check_enough_train_data works on unpooled data", { - # Check both columns have enough data - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check one column don't have enough data - expect_snapshot( - error = TRUE, - 
epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check drop_na works - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) -}) - -test_that("check_enough_train_data outputs the correct recipe values", { - expect_no_error( - p <- epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 2) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - - expect_equal(nrow(p), 2 * n) - expect_equal(ncol(p), 4L) - expect_s3_class(p, "epi_df") - expect_named(p, c("geo_value", "time_value", "x", "y")) # order in epiprocess::new_epi_df - expect_equal( - p$time_value, - rep(seq(as.Date("2020-01-01"), by = 1, length.out = n), times = 2) - ) - expect_equal(p$geo_value, rep(c("ca", "hi"), each = n)) -}) - -test_that("check_enough_train_data only checks train data", { - # Check that the train data has enough data, the test data does not, but - # the check passes anyway (because it should be applied to training data) - toy_test_data <- toy_epi_df %>% - group_by(geo_value) %>% - slice(3:10) %>% - epiprocess::as_epi_df() - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value") %>% - prep(toy_epi_df) %>% - bake(new_data = toy_test_data) - ) - # Same thing, but skip = FALSE - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(y, n = n - 2, epi_keys = "geo_value", skip = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = toy_test_data) - ) -}) - -test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", { - # With a lag of 2, we will get 2 * n - 6 non-NA rows - expect_no_error( - epi_recipe(toy_epi_df) %>% - step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n 
= 2 * n - 6) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) -}) diff --git a/tests/testthat/test-climatological_forecaster.R b/tests/testthat/test-climatological_forecaster.R new file mode 100644 index 000000000..24f99c26f --- /dev/null +++ b/tests/testthat/test-climatological_forecaster.R @@ -0,0 +1,52 @@ +test_that("climate args list validates properly", { + expect_s3_class(climate_args_list(), c("climate_fcast", "alist")) + expect_s3_class( + climate_args_list(forecast_date = as.Date("2021-01-10")), + c("climate_fcast", "alist") + ) + expect_snapshot(error = TRUE, climate_args_list(forecast_date = 12345)) + expect_snapshot( + error = TRUE, + climate_args_list(forecast_date = as.Date(c("2021-01-10", "2024-01-22"))) + ) + expect_silent(climate_args_list(forecast_horizon = 1L)) + expect_silent(climate_args_list(forecast_horizon = -1:4)) + expect_snapshot(error = TRUE, climate_args_list(forecast_horizon = 1.3)) + expect_snapshot(error = TRUE, climate_args_list(window_size = -1)) + expect_snapshot(error = TRUE, climate_args_list(window_size = 2.5)) + expect_snapshot(error = TRUE, climate_args_list(window_size = 1:3)) + expect_snapshot(error = TRUE, climate_args_list(quantile_levels = -1)) + expect_snapshot(error = TRUE, climate_args_list(quantile_levels = 1.3)) + expect_snapshot(error = TRUE, climate_args_list(symmetrize = 2.5)) + expect_snapshot(error = TRUE, climate_args_list(symmetrize = c(TRUE, TRUE))) + expect_snapshot(error = TRUE, climate_args_list(nonneg = 2.5)) + expect_snapshot(error = TRUE, climate_args_list(nonneg = c(TRUE, TRUE))) + expect_snapshot(error = TRUE, climate_args_list(quantile_by_key = TRUE)) + expect_snapshot(error = TRUE, climate_args_list(quantile_by_key = 2:3)) +}) + +test_that("climatological_forecaster works as 
expected", { + single_yr <- seq(as.Date("2020-01-01"), as.Date("2020-12-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = rep(c("reg1", "reg2"), each = length(single_yr)), + y = rep(c(1:183, 184:2), times = 2L) + ) %>% + as_epi_df(as_of = max(single_yr)) + clim_forecast <- climatological_forecaster(x, "y", args_list = climate_args_list(time_type = "day")) + preds <- clim_forecast$predictions %>% + mutate( + quant_med = median(.pred_distn) + ) + expect_equal(preds$.pred, preds$quant_med) + + expected_res <- tibble( + geo_value = rep(c("reg1", "reg2"), 5), + forecast_date = as.Date("2020-12-31"), + target_date = c( + rep(as.Date("2020-12-31"), 2), rep(as.Date("2021-01-01"), 2), rep(as.Date("2021-01-02"), 2), rep(as.Date("2021-01-03"), 2), rep(as.Date("2021-01-04"), 2) + ), + .pred = c(rep(3, 8), rep(4, 2)) + ) + expect_equal(preds %>% select(geo_value, forecast_date, target_date, .pred), expected_res) +}) diff --git a/tests/testthat/test-dist_quantiles.R b/tests/testthat/test-dist_quantiles.R deleted file mode 100644 index 66f229956..000000000 --- a/tests/testthat/test-dist_quantiles.R +++ /dev/null @@ -1,111 +0,0 @@ -library(distributional) - -test_that("constructor returns reasonable quantiles", { - expect_snapshot(error = TRUE, new_quantiles(rnorm(5), c(-2, -1, 0, 1, 2))) - expect_silent(new_quantiles(sort(rnorm(5)), sort(runif(5)))) - expect_snapshot(error = TRUE, new_quantiles(sort(rnorm(5)), sort(runif(2)))) - expect_silent(new_quantiles(1:5, 1:5 / 10)) - expect_snapshot(error = TRUE, new_quantiles(c(2, 1, 3, 4, 5), c(.1, .1, .2, .5, .8))) - expect_snapshot(error = TRUE, new_quantiles(c(2, 1, 3, 4, 5), c(.1, .15, .2, .5, .8))) - expect_snapshot(error = TRUE, new_quantiles(c(1, 2, 3), c(.1, .2, 3))) -}) - - -test_that("single dist_quantiles works, quantiles are accessible", { - z <- new_quantiles(values = 1:5, quantile_levels = c(.2, .4, .5, .6, .8)) - expect_s3_class(z, "dist_quantiles") - expect_equal(median(z), 3) - 
expect_equal(quantile(z, c(.2, .4, .5, .6, .8)), 1:5) - expect_equal(quantile(z, c(.3, .7), middle = "linear"), c(1.5, 4.5)) - - Q <- stats::splinefun(c(.2, .4, .5, .6, .8), 1:5, method = "hyman") - expect_equal(quantile(z, c(.3, .7), middle = "cubic"), Q(c(.3, .7))) - expect_identical( - extrapolate_quantiles(z, c(.3, .7), middle = "linear"), - new_quantiles(values = c(1, 1.5, 2, 3, 4, 4.5, 5), quantile_levels = 2:8 / 10) - ) - # empty values slot results in a length zero distribution - # see issue #361 - expect_length(dist_quantiles(list(), c(.1, .9)), 0L) - expect_identical( - dist_quantiles(list(), c(.1, .9)), - distributional::dist_degenerate(double()) - ) -}) - - -test_that("quantile extrapolator works", { - dstn <- dist_normal(c(10, 2), c(5, 10)) - qq <- extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) - expect_s3_class(qq, "distribution") - expect_s3_class(vctrs::vec_data(qq[1])[[1]], "dist_quantiles") - expect_length(parameters(qq[1])$quantile_levels[[1]], 3L) - - dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) - qq <- extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) - expect_s3_class(qq, "distribution") - expect_s3_class(vctrs::vec_data(qq[1])[[1]], "dist_quantiles") - expect_length(parameters(qq[1])$quantile_levels[[1]], 7L) - - dstn <- dist_quantiles(1:4, 1:4 / 5) - qq <- extrapolate_quantiles(dstn, 1:9 / 10) - dstn_na <- dist_quantiles(c(1, 2, NA, 4), 1:4 / 5) - qq2 <- extrapolate_quantiles(dstn_na, 1:9 / 10) - expect_equal(qq, qq2) - qq3 <- extrapolate_quantiles(dstn_na, 1:9 / 10, replace_na = FALSE) - qq2_vals <- field(vec_data(qq2)[[1]], "values") - qq3_vals <- field(vec_data(qq3)[[1]], "values") - qq2_vals[6] <- NA - expect_equal(qq2_vals, qq3_vals) -}) - -test_that("small deviations of quantile requests work", { - l <- c(.05, .1, .25, .75, .9, .95) - v <- c(0.0890306, 0.1424997, 0.1971793, 0.2850978, 0.3832912, 0.4240479) - badl <- l - badl[1] <- badl[1] - 1e-14 - distn <- dist_quantiles(list(v), list(l)) - - # was 
broken before, now works - expect_equal(quantile(distn, l), quantile(distn, badl)) - - # The tail extrapolation was still poor. It needs to _always_ use - # the smallest (largest) values or we could end up unsorted - l <- 1:9 / 10 - v <- 1:9 - distn <- dist_quantiles(list(v), list(l)) - expect_equal(quantile(distn, c(.25, .75)), list(c(2.5, 7.5))) - expect_equal(quantile(distn, c(.1, .9)), list(c(1, 9))) - qv <- data.frame(q = l, v = v) - expect_equal( - unlist(quantile(distn, c(.01, .05))), - tail_extrapolate(c(.01, .05), head(qv, 2)) - ) - expect_equal( - unlist(quantile(distn, c(.99, .95))), - tail_extrapolate(c(.95, .99), tail(qv, 2)) - ) -}) - -test_that("unary math works on quantiles", { - dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) - dstn2 <- dist_quantiles(list(log(1:4), log(8:11)), list(c(.2, .4, .6, .8))) - expect_identical(log(dstn), dstn2) - - dstn2 <- dist_quantiles(list(cumsum(1:4), cumsum(8:11)), list(c(.2, .4, .6, .8))) - expect_identical(cumsum(dstn), dstn2) -}) - -test_that("arithmetic works on quantiles", { - dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) - dstn2 <- dist_quantiles(list(1:4 + 1, 8:11 + 1), list(c(.2, .4, .6, .8))) - expect_identical(dstn + 1, dstn2) - expect_identical(1 + dstn, dstn2) - - dstn2 <- dist_quantiles(list(1:4 / 4, 8:11 / 4), list(c(.2, .4, .6, .8))) - expect_identical(dstn / 4, dstn2) - expect_identical((1 / 4) * dstn, dstn2) - - expect_snapshot(error = TRUE, sum(dstn)) - expect_snapshot(error = TRUE, suppressWarnings(dstn + distributional::dist_normal())) -}) diff --git a/tests/testthat/test-enframer.R b/tests/testthat/test-enframer.R deleted file mode 100644 index 0926c587b..000000000 --- a/tests/testthat/test-enframer.R +++ /dev/null @@ -1,13 +0,0 @@ -test_that("enframer errors/works as needed", { - template1 <- data.frame(aa = 1:5, a = NA, b = NA, c = NA) - template2 <- data.frame(aa = 1:5, a = 2:6, b = 2:6, c = 2:6) - expect_snapshot(error = TRUE, enframer(1:5, letters[1])) - 
expect_snapshot(error = TRUE, enframer(data.frame(a = 1:5), 1:3)) - expect_snapshot(error = TRUE, enframer(data.frame(a = 1:5), letters[1:3])) - expect_identical(enframer(data.frame(aa = 1:5), letters[1:3]), template1) - expect_snapshot(error = TRUE, enframer(data.frame(aa = 1:5), letters[1:2], fill = 1:4)) - expect_identical( - enframer(data.frame(aa = 1:5), letters[1:3], fill = 2:6), - template2 - ) -}) diff --git a/tests/testthat/test-epi_recipe.R b/tests/testthat/test-epi_recipe.R index 1b06cf24c..b4c59c0e5 100644 --- a/tests/testthat/test-epi_recipe.R +++ b/tests/testthat/test-epi_recipe.R @@ -103,7 +103,7 @@ test_that("epi_recipe epi_df works", { test_that("add/update/adjust/remove epi_recipe works as intended", { - jhu <- case_death_rate_subset + jhu <- covid_case_death_rates r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-epi_shift.R b/tests/testthat/test-epi_shift.R index 78c9384f1..e8e843f9c 100644 --- a/tests/testthat/test-epi_shift.R +++ b/tests/testthat/test-epi_shift.R @@ -1,23 +1,3 @@ -x <- data.frame(x1 = 1:10, x2 = -10:-1) -lags <- list(c(0, 4), 1:3) - -test_that("epi shift works with NULL keys", { - time_value <- 1:10 - out <- epi_shift(x, lags, time_value) - expect_length(out, 7L) - expect_equal(nrow(out), 14L) - expect_equal(sum(complete.cases(out)), 6L) -}) - -test_that("epi shift works with groups", { - keys <- data.frame(a = rep(letters[1:2], each = 5), b = "z") - time_value <- 1:10 - out <- epi_shift(x, lags, time_value, keys) - expect_length(out, 8L) - expect_equal(nrow(out), 18L) - expect_equal(sum(complete.cases(out)), 2L) -}) - test_that("epi shift single works, renames", { tib <- tibble( x = 1:5, y = 1:5, diff --git a/tests/testthat/test-epi_workflow.R b/tests/testthat/test-epi_workflow.R index 8bb58b0bc..cce68a80f 100644 --- a/tests/testthat/test-epi_workflow.R +++ b/tests/testthat/test-epi_workflow.R @@ -1,5 +1,5 @@ test_that("postprocesser was evaluated", { - r <- 
epi_recipe(case_death_rate_subset) + r <- epi_recipe(covid_case_death_rates) s <- parsnip::linear_reg() f <- frosting() @@ -12,7 +12,7 @@ test_that("postprocesser was evaluated", { test_that("outcome of the two methods are the same", { - jhu <- case_death_rate_subset + jhu <- covid_case_death_rates r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7)) %>% @@ -33,7 +33,7 @@ test_that("outcome of the two methods are the same", { }) test_that("model can be added/updated/removed from epi_workflow", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% @@ -64,7 +64,7 @@ test_that("model can be added/updated/removed from epi_workflow", { }) test_that("forecast method works", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -79,23 +79,17 @@ test_that("forecast method works", { )) ) - args <- list( - fill_locf = TRUE, - n_recent = 360 * 3, - forecast_date = as.Date("2024-01-01") - ) expect_equal( - forecast(wf, !!!args), + forecast(wf), predict(wf, new_data = get_test_data( hardhat::extract_preprocessor(wf), - jhu, - !!!args + jhu )) ) }) test_that("forecast method errors when workflow not fit", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -109,7 +103,7 @@ test_that("forecast method errors when workflow not fit", { test_that("fit method does not silently drop the class", { # This is issue #363 - library(recipes) + suppressPackageStartupMessages(library(recipes)) tbl <- tibble::tibble( geo_value = 1, time_value = 1:100, diff --git a/tests/testthat/test-extract_argument.R 
b/tests/testthat/test-extract_argument.R index 7434763e7..e60632289 100644 --- a/tests/testthat/test-extract_argument.R +++ b/tests/testthat/test-extract_argument.R @@ -8,27 +8,27 @@ test_that("layer argument extractor works", { expect_snapshot(error = TRUE, extract_argument(f$layers[[1]], "layer_predict", "bubble")) expect_identical( extract_argument(f$layers[[2]], "layer_residual_quantiles", "quantile_levels"), - c(0.0275, 0.9750) + c(0.0275, 0.5, 0.9750) ) expect_snapshot(error = TRUE, extract_argument(f, "layer_thresh", "quantile_levels")) expect_identical( extract_argument(f, "layer_residual_quantiles", "quantile_levels"), - c(0.0275, 0.9750) + c(0.0275, 0.5, 0.9750) ) wf <- epi_workflow(postprocessor = f) expect_snapshot(error = TRUE, extract_argument(epi_workflow(), "layer_residual_quantiles", "quantile_levels")) expect_identical( extract_argument(wf, "layer_residual_quantiles", "quantile_levels"), - c(0.0275, 0.9750) + c(0.0275, 0.5, 0.9750) ) expect_snapshot(error = TRUE, extract_argument(wf, "layer_predict", c("type", "opts"))) }) test_that("recipe argument extractor works", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-08-01") %>% dplyr::arrange(geo_value, time_value) diff --git a/tests/testthat/test-frosting.R b/tests/testthat/test-frosting.R index 1bdce3b5a..cd153b200 100644 --- a/tests/testthat/test-frosting.R +++ b/tests/testthat/test-frosting.R @@ -40,7 +40,7 @@ test_that("frosting can be created/added/updated/adjusted/removed", { test_that("prediction works without any postprocessor", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -62,7 +62,7 @@ test_that("prediction works without any postprocessor", { test_that("layer_predict is added by default if missing", { - jhu <- case_death_rate_subset %>% + jhu <- 
covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% @@ -89,7 +89,7 @@ test_that("layer_predict is added by default if missing", { test_that("parsnip settings can be passed through predict.epi_workflow", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R index aa799150b..7822f5433 100644 --- a/tests/testthat/test-get_test_data.R +++ b/tests/testthat/test-get_test_data.R @@ -1,17 +1,17 @@ -library(dplyr) +suppressPackageStartupMessages(library(dplyr)) test_that("return expected number of rows and returned dataset is ungrouped", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - test <- get_test_data(recipe = r, x = case_death_rate_subset) + test <- get_test_data(recipe = r, x = covid_case_death_rates) expect_equal( nrow(test), - dplyr::n_distinct(case_death_rate_subset$geo_value) * 29 + dplyr::n_distinct(covid_case_death_rates$geo_value) * 29 ) expect_false(dplyr::is.grouped_df(test)) @@ -19,31 +19,32 @@ test_that("return expected number of rows and returned dataset is ungrouped", { test_that("expect insufficient training data error", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 367)) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - expect_snapshot(error = TRUE, get_test_data(recipe = r, x = case_death_rate_subset)) + expect_snapshot(error = TRUE, get_test_data(recipe 
= r, x = covid_case_death_rates)) }) test_that("expect error that geo_value or time_value does not exist", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value) + wrong_epi_df <- covid_case_death_rates %>% dplyr::select(-geo_value) expect_snapshot(error = TRUE, get_test_data(recipe = r, x = wrong_epi_df)) }) test_that("NA fill behaves as desired", { + testthat::skip() df <- tibble::tibble( geo_value = rep(c("ca", "ny"), each = 10), time_value = rep(1:10, times = 2), @@ -81,6 +82,7 @@ test_that("NA fill behaves as desired", { }) test_that("forecast date behaves", { + testthat::skip() df <- tibble::tibble( geo_value = rep(c("ca", "ny"), each = 10), time_value = rep(1:10, times = 2), @@ -137,7 +139,7 @@ test_that("Omit end rows according to minimum lag when that’s not lag 0", { # Ex. 
using real built-in data - ca <- case_death_rate_subset %>% + ca <- covid_case_death_rates %>% filter(geo_value == "ca") rec <- epi_recipe(ca) %>% diff --git a/tests/testthat/test-grf_quantiles.R b/tests/testthat/test-grf_quantiles.R index 2570c247d..32f581d7a 100644 --- a/tests/testthat/test-grf_quantiles.R +++ b/tests/testthat/test-grf_quantiles.R @@ -1,5 +1,5 @@ set.seed(12345) -library(grf) +suppressPackageStartupMessages(library(grf)) tib <- tibble( y = rnorm(100), x = rnorm(100), z = rnorm(100), f = factor(sample(letters[1:3], 100, replace = TRUE)) @@ -9,8 +9,11 @@ test_that("quantile_rand_forest defaults work", { spec <- rand_forest(engine = "grf_quantiles", mode = "regression") expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) - manual <- quantile_forest(as.matrix(tib[, 2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9)) - expect_identical(pars$quantiles.orig, manual$quantiles) + manual <- quantile_forest( + as.matrix(tib[, 2:3]), tib$y, + quantiles = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95) + ) + expect_identical(pars$quantiles.orig, manual$quantiles.orig) expect_identical(pars$`_num_trees`, manual$`_num_trees`) fseed <- 12345 @@ -43,10 +46,39 @@ test_that("quantile_rand_forest handles alternative quantiles", { test_that("quantile_rand_forest handles allows setting the trees and mtry", { - spec <- rand_forest(mode = "regression", mtry = 2, trees = 100, engine = "grf_quantiles") + spec <- rand_forest(mode = "regression", mtry = 2, trees = 100) %>% + set_engine(engine = "grf_quantiles", quantiles = c(0.1, 0.5, 0.9)) expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) manual <- quantile_forest(as.matrix(tib[, 2:3]), tib$y, mtry = 2, num.trees = 100) expect_identical(pars$quantiles.orig, manual$quantiles.orig) expect_identical(pars$`_num_trees`, manual$`_num_trees`) }) + +test_that("quantile_rand_forest operates with arx_forecaster", { + spec <- 
rand_forest(mode = "regression") %>% + set_engine("grf_quantiles", quantiles = c(.1, .2, .5, .8, .9)) # non-default + expect_identical(rlang::eval_tidy(spec$eng_args$quantiles), c(.1, .2, .5, .8, .9)) + tib <- as_epi_df(tibble(time_value = 1:25, geo_value = "ca", value = rnorm(25))) + o <- arx_fcast_epi_workflow(tib, "value", trainer = spec) + spec2 <- parsnip::extract_spec_parsnip(o) + expect_identical( + rlang::eval_tidy(spec2$eng_args$quantiles), + rlang::eval_tidy(spec$eng_args$quantiles) + ) + spec <- rand_forest(mode = "regression", "grf_quantiles") + expect_null(rlang::eval_tidy(spec$eng_args)) + o <- arx_fcast_epi_workflow(tib, "value", trainer = spec) + spec2 <- parsnip::extract_spec_parsnip(o) + expect_identical( + rlang::eval_tidy(spec2$eng_args$quantiles), + c(.05, .1, 0.25, .5, 0.75, .9, .95) # merged with arx_args default + ) + df <- epidatasets::counts_subset %>% filter(time_value >= "2021-10-01") + + z <- arx_forecaster(df, "cases", "cases", spec2) + expect_identical( + hardhat::extract_quantile_levels(z$predictions$.pred_distn), + c(.05, .1, 0.25, .5, 0.75, .9, .95) + ) +}) diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R index 3b3118740..021bbb50c 100644 --- a/tests/testthat/test-key_colnames.R +++ b/tests/testthat/test-key_colnames.R @@ -1,9 +1,9 @@ test_that("Extracts keys from a recipe; roles are NA, giving an empty vector", { - expect_equal(key_colnames(recipe(case_death_rate_subset)), character(0L)) + expect_equal(key_colnames(recipe(covid_case_death_rates)), character(0L)) }) test_that("key_colnames extracts time_value and geo_value, but not raw", { - my_recipe <- epi_recipe(case_death_rate_subset) %>% + my_recipe <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -14,9 +14,12 @@ test_that("key_colnames extracts time_value and geo_value, but not raw", { my_workflow <- 
epi_workflow() %>% add_epi_recipe(my_recipe) %>% add_model(linear_reg()) %>% - fit(data = case_death_rate_subset) + fit(data = covid_case_death_rates) expect_identical(key_colnames(my_workflow), c("geo_value", "time_value")) + + # `exclude =` works: + expect_identical(key_colnames(my_workflow, exclude = "geo_value"), c("time_value")) }) test_that("key_colnames extracts additional keys when they are present", { @@ -49,4 +52,7 @@ test_that("key_colnames extracts additional keys when they are present", { # order of the additional keys may be different expect_equal(key_colnames(my_workflow), c("geo_value", "state", "pol", "time_value")) + + # `exclude =` works: + expect_equal(key_colnames(my_workflow, exclude = c("time_value", "pol")), c("geo_value", "state")) }) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 428922f46..3beb87dc9 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -1,5 +1,7 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 + r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% @@ -62,11 +64,6 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { layer_naomit(.pred) wf3 <- wf %>% add_frosting(f3) - # this warning has been removed - # expect_warning( - # p3 <- predict(wf3, latest), - # "forecast_date is less than the most recent update date of the data." 
- # ) expect_silent(p3 <- predict(wf3, latest)) expect_equal(ncol(p3), 4L) expect_s3_class(p3, "epi_df") @@ -75,6 +72,34 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { expect_named(p3, c("geo_value", "time_value", ".pred", "forecast_date")) }) +test_that("`layer_add_forecast_date()` infers correct date when using `adjust_latency`", { + jhu_reasonable_date <- jhu + attributes(jhu_reasonable_date)$metadata$as_of <- as.Date("2022-01-03") + r_latent <- epi_recipe(jhu_reasonable_date) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_adjust_latency(death_rate, method = "extend_ahead") %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + frost_latent <- frosting() %>% + layer_predict() %>% + layer_add_forecast_date() %>% + layer_naomit(.pred) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% + fit(jhu_reasonable_date) %>% + add_frosting(frost_latent) + reasonable_date <- jhu %>% + dplyr::filter(time_value >= max(time_value) - 14) + p_latent <- predict(wf_latent, reasonable_date) + expect_equal( + p_latent$forecast_date, + rep(as.Date("2022-01-03"), times = 3) + ) + expect_equal( + p_latent$forecast_date - p_latent$time_value, + as.difftime(rep(3, times = 3), units = "days") + ) +}) test_that("forecast date works for daily", { f <- frosting() %>% diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 53506ad07..7cd164960 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -6,6 +6,7 @@ r <- epi_recipe(jhu) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) wf <- 
epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 latest <- jhu %>% dplyr::filter(time_value >= max(time_value) - 14) @@ -39,6 +40,27 @@ test_that("Use ahead + max time value from pre, fit, post", { expect_equal(p2$target_date, rep(as.Date("2022-01-07"), times = 3)) expect_named(p2, c("geo_value", "time_value", ".pred", "forecast_date", "target_date")) }) +test_that("latency adjust doesn't interfere with correct target date", { + r_latent <- epi_recipe(jhu) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% fit(jhu) + f_latent <- frosting() %>% + layer_predict() %>% + layer_add_target_date() %>% + layer_naomit(.pred) + wf_latent <- wf_latent %>% add_frosting(f_latent) + + expect_silent(p_latent <- predict(wf_latent, latest)) + expect_equal(ncol(p_latent), 4L) + expect_s3_class(p_latent, "epi_df") + expect_equal(nrow(p_latent), 3L) + expect_equal(p_latent$target_date, rep(as.Date("2022-01-10"), times = 3)) + expect_named(p_latent, c("geo_value", "time_value", ".pred", "target_date")) +}) test_that("Use ahead + specified forecast date", { f <- frosting() %>% diff --git a/tests/testthat/test-layer_naomit.R b/tests/testthat/test-layer_naomit.R index 1d5b4ee25..8eb597f41 100644 --- a/tests/testthat/test-layer_naomit.R +++ b/tests/testthat/test-layer_naomit.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-layer_predict.R b/tests/testthat/test-layer_predict.R index 041516b29..ae51a5ec6 100644 --- a/tests/testthat/test-layer_predict.R +++ b/tests/testthat/test-layer_predict.R @@ -1,4 +1,4 @@ -jhu <- 
case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-layer_residual_quantiles.R b/tests/testthat/test-layer_residual_quantiles.R index 09ef7c9d3..2421b8a1c 100644 --- a/tests/testthat/test-layer_residual_quantiles.R +++ b/tests/testthat/test-layer_residual_quantiles.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% @@ -23,13 +23,28 @@ test_that("Returns expected number or rows and columns", { expect_equal(nrow(p), 3L) expect_named(p, c("geo_value", "time_value", ".pred", ".pred_distn")) - nested <- p %>% dplyr::mutate(.quantiles = nested_quantiles(.pred_distn)) - unnested <- nested %>% tidyr::unnest(.quantiles) - - expect_equal(nrow(unnested), 9L) - expect_equal(unique(unnested$quantile_levels), c(.0275, .8, .95)) + unnested <- p %>% pivot_quantiles_longer(.pred_distn) + expect_equal(nrow(unnested), 12L) + expect_equal(unique(unnested$.pred_distn_quantile_level), c(.0275, 0.5, .8, .95)) }) +test_that("new name works correctly", { + f <- frosting() %>% + layer_predict() %>% + layer_naomit(.pred) %>% + layer_residual_quantiles(name = "foo") + + wf1 <- wf %>% add_frosting(f) + expect_equal(names(forecast(wf1)), c("geo_value", "time_value", ".pred", "foo")) + + f <- frosting() %>% + layer_predict() %>% + layer_naomit(.pred) %>% + layer_residual_quantiles(name = "geo_value") + + wf1 <- wf %>% add_frosting(f) + expect_error(forecast(wf1)) +}) test_that("Errors when used with a classifier", { tib <- tibble( @@ -88,7 +103,7 @@ test_that("Canned forecasters work with / without", { ) expect_silent( - arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate")) + arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), args_list = 
arx_args_list(check_enough_data_n = 1)) ) expect_silent( flatline_forecaster( diff --git a/tests/testthat/test-layer_threshold_preds.R b/tests/testthat/test-layer_threshold_preds.R index 9df7e64ab..64e8608b2 100644 --- a/tests/testthat/test-layer_threshold_preds.R +++ b/tests/testthat/test-layer_threshold_preds.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value < "2021-03-08", geo_value %in% c("ak", "ca", "ar")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -20,7 +20,7 @@ test_that("Default pred_lower and pred_upper work as intended", { expect_equal(ncol(p), 3L) expect_s3_class(p, "epi_df") expect_equal(nrow(p), 3L) - expect_equal(round(p$.pred, digits = 3), c(0.180, 0, 0.764)) + expect_equal(round(p$.pred, digits = 3), c(0.179, 0, 0.765)) # expect_named(p, c("time_value", "geo_value", ".pred")) expect_named(p, c("geo_value", "time_value", ".pred")) }) @@ -56,8 +56,10 @@ test_that("thresholds additional columns", { expect_equal(round(p$.pred, digits = 3), c(0.180, 0.180, 0.310)) expect_named(p, c("geo_value", "time_value", ".pred", ".pred_distn")) p <- p %>% - dplyr::mutate(.quantiles = nested_quantiles(.pred_distn)) %>% - tidyr::unnest(.quantiles) - expect_equal(round(p$values, digits = 3), c(0.180, 0.31, 0.180, .18, 0.310, .31)) - expect_equal(p$quantile_levels, rep(c(.1, .9), times = 3)) + pivot_quantiles_longer(.pred_distn) + expect_equal( + round(p$.pred_distn_value, digits = 3), + c(0.180, 0.180, 0.31, 0.180, .18, .18, .31, 0.310, .31) + ) + expect_equal(p$.pred_distn_quantile_level, rep(c(.1, 0.5, .9), times = 3)) }) diff --git a/tests/testthat/test-pad_to_end.R b/tests/testthat/test-pad_to_end.R deleted file mode 100644 index 6949f06ac..000000000 --- a/tests/testthat/test-pad_to_end.R +++ /dev/null @@ -1,37 +0,0 @@ -test_that("test set padding works", { - dat <- tibble::tibble( - gr1 = rep(c("a", "b"), times = c(3, 4)), - time_value = c(1:3, 1:4), - value = 1:7 - ) 
%>% arrange(time_value, gr1) - expect_identical(pad_to_end(dat, "gr1", 3), dat) - expect_equal(nrow(pad_to_end(dat, "gr1", 4)), 8L) - p <- pad_to_end(dat, "gr1", 5) - expect_equal(nrow(p), 10L) - expect_identical(p$gr1, rep(c("a", "b"), times = 5)) - expect_identical(p$time_value, rep(1:5, each = 2)) - expect_identical(p$value, as.integer(c(1, 4, 2, 5, 3, 6, NA, 7, NA, NA))) - - dat <- dat %>% arrange(gr1) - dat$gr2 <- c("c", "c", "d", "c", "c", "d", "d") - dat <- dat %>% arrange(time_value) - # don't treat it as a group - p <- pad_to_end(dat, "gr1", 4) - expect_identical(nrow(p), 8L) - expect_identical(p$gr2, c(rep("c", 4), "d", "d", NA, "d")) - - # treat it as a group (needs different time_value) - dat$time_value <- c(1, 1, 2, 2, 1, 1, 2) # double - p <- pad_to_end(dat, c("gr1", "gr2"), 2) - expect_equal(nrow(p), 8L) - expect_identical(p$gr1, rep(c("a", "a", "b", "b"), times = 2)) - expect_identical(p$gr2, rep(c("c", "d"), times = 4)) - expect_identical(p$time_value, rep(c(1, 2), each = 4)) - expect_identical(p$value, as.integer(c(1, 3, 4, 6, 2, NA, 5, 7))) - - # make sure it maintains the epi_df - dat <- dat %>% - dplyr::rename(geo_value = gr1) %>% - as_epi_df(other_keys = "gr2") - expect_s3_class(pad_to_end(dat, "geo_value", 2), "epi_df") -}) diff --git a/tests/testthat/test-pivot_quantiles.R b/tests/testthat/test-pivot_quantiles.R index 1639058e2..d32c16499 100644 --- a/tests/testthat/test-pivot_quantiles.R +++ b/tests/testthat/test-pivot_quantiles.R @@ -1,76 +1,57 @@ test_that("quantile pivotting wider behaves", { tib <- tibble::tibble(a = 1:5, b = 6:10) expect_snapshot(error = TRUE, pivot_quantiles_wider(tib, a)) - tib$c <- rep(dist_normal(), 5) - expect_snapshot(error = TRUE, pivot_quantiles_wider(tib, c)) - d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) - # different quantiles - tib <- tib[1:2, ] - tib$d1 <- d1 - expect_snapshot(error = TRUE, pivot_quantiles_wider(tib, d1)) - - d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 
2:4 / 4)) - tib$d1 <- d1 - # would want to error (mismatched quantiles), but hard to check efficiently - expect_silent(pivot_quantiles_wider(tib, d1)) + d1 <- quantile_pred(rbind(1:3, 2:4), 1:3 / 4) + d2 <- quantile_pred(rbind(2:4, 3:5), 2:4 / 5) + tib <- tibble(g = c("a", "b"), d1 = d1, d2 = d2) - d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) - d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) - tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + # too many columns + expect_snapshot(error = TRUE, pivot_quantiles_wider(tib, d1, d2)) + expect_snapshot(error = TRUE, pivot_quantiles_longer(tib, d1, d2)) - expect_length(pivot_quantiles_wider(tib, c("d1", "d2")), 7L) - expect_length(pivot_quantiles_wider(tib, tidyselect::starts_with("d")), 7L) - expect_length(pivot_quantiles_wider(tib, d2), 5L) + expect_length(pivot_quantiles_wider(tib, d1), 5L) + expect_length(pivot_quantiles_wider(tib, tidyselect::ends_with("1")), 5L) + expect_equal(vctrs::vec_size(pivot_quantiles_longer(tib, d2)), 6L) }) test_that("pivotting wider still works if there are duplicates", { # previously this would produce a warning if pivotted because the # two rows of the result are identical - tb <- tibble(.pred = dist_quantiles(list(1:3, 1:3), list(c(.1, .5, .9)))) + tb <- tibble(.pred = quantile_pred(rbind(1:3, 1:3), c(.1, .5, .9))) res <- tibble(`0.1` = c(1, 1), `0.5` = c(2, 2), `0.9` = c(3, 3)) - expect_identical(tb %>% pivot_quantiles_wider(.pred), res) + expect_equal(tb %>% pivot_quantiles_wider(.pred), res) + res_longer <- tibble( + .pred_value = rep(1:3, 2), + .pred_quantile_level = rep(c(0.1, 0.5, 0.9), 2) + ) + expect_equal(tb %>% pivot_quantiles_longer(.pred), res_longer) }) test_that("quantile pivotting longer behaves", { tib <- tibble::tibble(a = 1:5, b = 6:10) expect_snapshot(error = TRUE, pivot_quantiles_longer(tib, a)) - tib$c <- rep(dist_normal(), 5) - expect_snapshot(error = TRUE, pivot_quantiles_longer(tib, c)) - - d1 <- 
c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) - # different quantiles - tib <- tib[1:2, ] - tib$d1 <- d1 - expect_length(pivot_quantiles_longer(tib, d1), 5L) - expect_identical(nrow(pivot_quantiles_longer(tib, d1)), 7L) - expect_identical(pivot_quantiles_longer(tib, d1)$values, as.double(c(1:3, 2:5))) - d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) - tib$d1 <- d1 - expect_silent(pivot_quantiles_longer(tib, d1)) + d1 <- quantile_pred(rbind(1:3, 2:4), 1:3 / 4) + d2 <- quantile_pred(rbind(2:4, 3:5), 2:4 / 5) + tib <- tibble(g = c("a", "b"), d1 = d1, d2 = d2) - d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) - d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) - tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + # too many columns + expect_snapshot(error = TRUE, pivot_quantiles_longer(tib, d1, d2)) + # different quantiles + expect_length(pivot_quantiles_longer(tib, d1), 4L) + expect_identical(nrow(pivot_quantiles_longer(tib, d1)), 6L) + expect_identical(pivot_quantiles_longer(tib, d1)$d1_value, c(1:3, 2:4)) +}) - expect_length(pivot_quantiles_longer(tib, c("d1", "d2")), 5L) - expect_identical(nrow(pivot_quantiles_longer(tib, c("d1", "d2"))), 6L) - expect_silent(pivot_quantiles_longer(tib, tidyselect::starts_with("d"))) - expect_length(pivot_quantiles_longer(tib, d2), 4L) - - tib$d3 <- c(dist_quantiles(2:5, 2:5 / 6), dist_quantiles(3:6, 2:5 / 6)) - # now the cols have different numbers of quantiles - expect_snapshot(error = TRUE, pivot_quantiles_longer(tib, d1, d3)) - expect_length( - pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE), - 6L - ) - expect_identical( - pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE)$d1_values, - as.double(rep(c(1:3, 2:4), each = 4)) - ) +test_that("nested_quantiles is deprecated, but works where possible", { + expect_snapshot(d <- dist_quantiles(list(1:4, 2:5), 1:4 / 5)) + expect_snapshot(o <- nested_quantiles(d)) + res <- 
as_tibble(hardhat::quantile_pred( + matrix(c(1:4, 2:5), nrow = 2, byrow = TRUE), 1:4 / 5 + )) + expect_identical(o |> mutate(.row = dplyr::row_number()) |> unnest(data), res) }) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index a40e6a94d..f31473f96 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -7,7 +7,7 @@ test_that("Column names can be passed with and without the tidy way", { pop_data2 <- pop_data %>% dplyr::rename(geo_value = states) - newdata <- case_death_rate_subset %>% + newdata <- covid_case_death_rates %>% filter(geo_value %in% c("ak", "al", "ar", "as", "az", "ca")) r1 <- epi_recipe(newdata) %>% @@ -88,9 +88,9 @@ test_that("Number of columns and column names returned correctly, Upper and lowe expect_equal(ncol(b), 5L) }) -## Postprocessing -test_that("Postprocessing workflow works and values correct", { - jhu <- cases_deaths_subset %>% +## Post-processing +test_that("Post-processing workflow works and values correct", { + jhu <- epidatasets::cases_deaths_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, cases) @@ -149,8 +149,8 @@ test_that("Postprocessing workflow works and values correct", { expect_equal(p$.pred_scaled, p$.pred * c(2, 3)) }) -test_that("Postprocessing to get cases from case rate", { - jhu <- case_death_rate_subset %>% +test_that("Post-processing to get cases from case rate", { + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) @@ -193,7 +193,64 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% + dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% + dplyr::select(geo_value, time_value, case_rate) + + 
reverse_pop_data <- data.frame( + geo_value = c("ca", "ny"), + values = c(1 / 20000, 1 / 30000) + ) + + r <- epi_recipe(jhu) %>% + step_population_scaling(case_rate, + df = reverse_pop_data, + df_pop_col = "values", + by = NULL, + suffix = "_scaled" + ) %>% + step_epi_lag(case_rate_scaled, lag = c(0, 7, 14)) %>% # cases + step_epi_ahead(case_rate_scaled, ahead = 7, role = "outcome") %>% # cases + recipes::step_naomit(recipes::all_predictors()) %>% + recipes::step_naomit(recipes::all_outcomes(), skip = TRUE) + + prep <- prep(r, jhu) + + b <- bake(prep, jhu) + + f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, + df = reverse_pop_data, + by = NULL, + df_pop_col = "values" + ) + + wf <- epi_workflow( + r, + parsnip::linear_reg() + ) %>% + fit(jhu) %>% + add_frosting(f) + + latest <- get_test_data( + recipe = r, + x = covid_case_death_rates %>% + dplyr::filter( + time_value > "2021-11-01", + geo_value %in% c("ca", "ny") + ) %>% + dplyr::select(geo_value, time_value, case_rate) + ) + + + p <- predict(wf, latest) + + + + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) @@ -247,8 +304,295 @@ test_that("test joining by default columns", { }) +test_that("test joining by default columns with less common keys/classes", { + # Make a model spec that expects no predictor columns and outputs a fixed + # (rate) prediction. Based on combining two linear inequalities. 
+ fixed_rate_prediction <- 2e-6 + model_spec <- quantile_reg(quantile_levels = 0.5, method = "fnc") %>% + set_engine( + "rq", + R = matrix(c(1, -1), 2, 1), r = c(1, -1) * fixed_rate_prediction, + eps = fixed_rate_prediction * 1e-6 # prevent early stop + ) + + # Here's the typical setup + dat1 <- tibble::tibble(geo_value = 1:2, time_value = 1, y = c(3 * 5, 7 * 11)) %>% + as_epi_df() + pop1 <- tibble::tibble(geo_value = 1:2, population = c(5e6, 11e6)) + ewf1 <- epi_workflow( + epi_recipe(dat1) %>% + step_population_scaling(y, df = pop1, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf1, estimated = FALSE) %>% + prep(dat1) %>% + bake(new_data = NULL), + dat1 %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ) + expect_equal( + forecast(fit(ewf1, dat1)) %>% + pivot_quantiles_wider(.pred), + dat1 %>% + select(!"y") %>% + mutate(`0.5` = c(2 * 5, 2 * 11)) + ) + + # With geo x age in time series but only geo in population data: + dat1b <- dat1 %>% + as_tibble() %>% + mutate(age_group = geo_value, geo_value = 1) %>% + as_epi_df(other_keys = "age_group") + pop1b <- pop1 + ewf1b <- epi_workflow( + epi_recipe(dat1b) %>% + step_population_scaling(y, df = pop1b, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1b, df_pop_col = "population", create_new = FALSE) + ) + expect_warning( + expect_equal( + extract_recipe(ewf1b, estimated = FALSE) %>% + prep(dat1b) %>% + bake(new_data = NULL), + dat1b %>% + # geo 1 scaling used for both: + mutate(y_scaled = c(3e-6, 7 * 11 / 5e6), ahead_0_y_scaled = y_scaled) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ) + expect_warning( + expect_warning( + expect_equal( 
+ forecast(fit(ewf1b, dat1b)) %>% + pivot_quantiles_wider(.pred), + dat1b %>% + select(!"y") %>% + # geo 1 scaling used for both: + mutate(`0.5` = c(2 * 5, 2 * 5)) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + + # Same thing but with time series in tibble, but with role hints: + dat1b2 <- dat1 %>% + as_tibble() %>% + mutate(age_group = geo_value, geo_value = 1) + pop1b2 <- pop1b + ewf1b2 <- epi_workflow( + # Can't use epi_recipe or step_epi_ahead; adjust. + recipe(dat1b2) %>% + update_role("geo_value", new_role = "geo_value") %>% + update_role("age_group", new_role = "key") %>% + update_role("time_value", new_role = "time_value") %>% + step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome"), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1b2, df_pop_col = "population", create_new = FALSE) + ) + expect_warning( + expect_equal( + extract_recipe(ewf1b2, estimated = FALSE) %>% + prep(dat1b2) %>% + bake(new_data = NULL), + dat1b2 %>% + # geo 1 scaling used for both: + mutate(y_scaled = c(3e-6, 7 * 11 / 5e6)) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ) + expect_warning( + expect_warning( + expect_equal( + # get_test_data doesn't work with non-`epi_df`s, so provide test data manually: + predict(fit(ewf1b2, dat1b2), dat1b2) %>% + pivot_quantiles_wider(.pred) %>% + as_tibble(), + dat1b2 %>% + select(!"y") %>% + # geo 1 scaling used for both: + mutate(`0.5` = c(2 * 5, 2 * 5)) %>% + select(geo_value, age_group, time_value, `0.5`) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + + # Same thing but with time series in tibble, but no role hints -> incorrect + # key guess -> we prep without 
warning, but error when we realize we don't + # actually have a unique key for our predictions: + dat1b3 <- dat1b2 + pop1b3 <- pop1b2 + ewf1b3 <- epi_workflow( + # Can't use epi_recipe or step_epi_ahead; adjust. + recipe(dat1b3) %>% + step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome"), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1b3, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf1b3, estimated = FALSE) %>% + prep(dat1b3) %>% + bake(new_data = NULL), + dat1b3 %>% + # geo 1 scaling used for both: + mutate(y_scaled = c(3e-6, 7 * 11 / 5e6)) + ) + expect_error( + predict(fit(ewf1b3, dat1b3), dat1b3) %>% + pivot_quantiles_wider(.pred), + class = "epipredict__grab_forged_keys__nonunique_key" + ) + + # With geo x age_group breakdown on both: + dat2 <- dat1 %>% + as_tibble() %>% + mutate(age_group = geo_value, geo_value = 1) %>% + as_epi_df(other_keys = "age_group") + pop2 <- pop1 %>% + mutate(age_group = geo_value, geo_value = 1) + ewf2 <- epi_workflow( + epi_recipe(dat2) %>% + step_population_scaling(y, df = pop2, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop2, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf2, estimated = FALSE) %>% + prep(dat2) %>% + bake(new_data = NULL), + dat2 %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ) + expect_equal( + forecast(fit(ewf2, dat2)) %>% + pivot_quantiles_wider(.pred) %>% + as_tibble(), + dat2 %>% + select(!"y") %>% + as_tibble() %>% + mutate(`0.5` = c(2 * 5, 2 * 11)) + ) + + # With only an age column in population data: + dat2b <- dat2 + pop2b <- pop1 %>% + mutate(age_group = geo_value, geo_value = NULL) + ewf2b <- epi_workflow( + epi_recipe(dat2b) %>% + step_population_scaling(y, df = pop2b, df_pop_col = "population") %>% + 
step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop2b, df_pop_col = "population", create_new = FALSE) + ) + expect_warning( + expect_equal( + extract_recipe(ewf2b, estimated = FALSE) %>% + prep(dat2b) %>% + bake(new_data = NULL), + dat2b %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ) + expect_warning( + expect_warning( + expect_equal( + forecast(fit(ewf2b, dat2b)) %>% + pivot_quantiles_wider(.pred), + dat2b %>% + select(!"y") %>% + mutate(`0.5` = c(2 * 5, 2 * 11)) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + + # with geo x time_value breakdown instead: + dat3 <- dat1 %>% + as_tibble() %>% + mutate(time_value = geo_value, geo_value = 1) %>% + as_epi_df() + pop3 <- pop1 %>% + mutate(time_value = geo_value, geo_value = 1) + ewf3 <- epi_workflow( + epi_recipe(dat3) %>% + step_population_scaling(y, df = pop3, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop3, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf3, estimated = FALSE) %>% + prep(dat3) %>% + bake(new_data = NULL), + dat3 %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ) + expect_equal( + forecast(fit(ewf3, dat3)) %>% + pivot_quantiles_wider(.pred), + # slightly edited copy-pasta due to test time selection: + dat3 %>% + select(!"y") %>% + dplyr::slice_max(by = geo_value, time_value) %>% + mutate(`0.5` = 2 * 11) + ) + + # With alternative geo naming... 
we're able to successfully prep (and we're + # not missing a warning as in 1b3 since we're actually in a "correct" setup), + # but still like 1b3, we fail at prediction time as key roles have not been + # set up and inference fails: + dat4 <- dat1 %>% rename(geo = geo_value) + pop4 <- pop1 %>% rename(geo = geo_value) + ewf4 <- epi_workflow( + recipe(dat4) %>% + step_population_scaling(y, df = pop4, df_pop_col = "population", role = "outcome"), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf4, estimated = FALSE) %>% + prep(dat4) %>% + bake(new_data = NULL), + dat4 %>% + mutate(y_scaled = c(3e-6, 7e-6)) + ) + expect_error( + # get_test_data doesn't work with non-`epi_df`s, so provide test data manually: + predict(fit(ewf4, dat4), dat4) %>% + pivot_quantiles_wider(.pred), + class = "epipredict__grab_forged_keys__nonunique_key" + ) +}) + + test_that("expect error if `by` selector does not match", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) diff --git a/tests/testthat/test-quantile_pred.R b/tests/testthat/test-quantile_pred.R new file mode 100644 index 000000000..70d7c71a5 --- /dev/null +++ b/tests/testthat/test-quantile_pred.R @@ -0,0 +1,103 @@ +test_that("single quantile_pred works, quantiles are accessible", { + z <- hardhat::quantile_pred( + values = matrix(1:5, nrow = 1), + quantile_levels = c(.2, .4, .5, .6, .8) + ) + expect_equal(median(z), 3) + expect_equal(quantile(z, c(.2, .4, .5, .6, .8)), matrix(1:5, nrow = 1)) + expect_equal( + quantile(z, c(.3, .7), middle = "linear"), + matrix(c(1.5, 4.5), nrow = 1) + ) + + Q <- stats::splinefun(c(.2, .4, .5, .6, .8), 1:5, method = "hyman") + expect_equal(quantile(z, c(.3, .7)), matrix(Q(c(.3, .7)), nrow = 1)) + expect_identical( + 
extrapolate_quantiles(z, c(.3, .7), middle = "linear"), + hardhat::quantile_pred(matrix(c(1, 1.5, 2, 3, 4, 4.5, 5), nrow = 1), 2:8 / 10) + ) +}) + + +test_that("quantile extrapolator works", { + dstn <- hardhat::quantile_pred( + matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), + c(.2, .4, .6, .8) + ) + qq <- extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) + expect_s3_class(qq, c("quantile_pred", "vctrs_vctr", "list")) + expect_length(qq %@% "quantile_levels", 7L) + + dstn <- hardhat::quantile_pred(matrix(1:4, nrow = 1), 1:4 / 5) + qq <- extrapolate_quantiles(dstn, 1:9 / 10) + dstn_na <- hardhat::quantile_pred(matrix(c(1, 2, NA, 4), nrow = 1), 1:4 / 5) + qq2 <- extrapolate_quantiles(dstn_na, 1:9 / 10) + expect_equal(qq, qq2) + qq3 <- extrapolate_quantiles(dstn_na, 1:9 / 10, replace_na = FALSE) + qq2_vals <- unlist(qq2) + qq3_vals <- unlist(qq3) + qq2_vals[6] <- NA + expect_equal(qq2_vals, qq3_vals) +}) + +test_that("small deviations of quantile requests work", { + l <- c(.05, .1, .25, .75, .9, .95) + v <- c(0.0890306, 0.1424997, 0.1971793, 0.2850978, 0.3832912, 0.4240479) + badl <- l + badl[1] <- badl[1] - 1e-14 + distn <- hardhat::quantile_pred(matrix(v, nrow = 1), l) + + # was broken before, now works + expect_equal(quantile(distn, l), quantile(distn, badl)) + + # The tail extrapolation was still poor. 
It needs to _always_ use + # the smallest (largest) values or we could end up unsorted + l <- 1:9 / 10 + v <- 1:9 + distn <- hardhat::quantile_pred(matrix(v, nrow = 1), l) + expect_equal(quantile(distn, c(.25, .75)), matrix(c(2.5, 7.5), nrow = 1)) + expect_equal(quantile(distn, c(.1, .9)), matrix(c(1, 9), nrow = 1)) + qv <- data.frame(q = l, v = v) + expect_equal( + drop(quantile(distn, c(.01, .05))), + tail_extrapolate(c(.01, .05), head(qv, 2)) + ) + expect_equal( + drop(quantile(distn, c(.99, .95))), + tail_extrapolate(c(.95, .99), tail(qv, 2)) + ) +}) + +test_that("unary math works on quantiles", { + dstn <- hardhat::quantile_pred( + matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + dstn2 <- hardhat::quantile_pred( + log(matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE)), + 1:4 / 5 + ) + expect_identical(log(dstn), dstn2) +}) + +test_that("arithmetic works on quantiles", { + dstn <- hardhat::quantile_pred( + matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + dstn2 <- hardhat::quantile_pred( + matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE) + 1, + 1:4 / 5 + ) + expect_identical(dstn + 1, dstn2) + expect_identical(1 + dstn, dstn2) + + dstn2 <- hardhat::quantile_pred( + matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE) / 4, + 1:4 / 5 + ) + expect_identical(dstn / 4, dstn2) + expect_identical((1 / 4) * dstn, dstn2) + + expect_snapshot(error = TRUE, sum(dstn)) +}) diff --git a/tests/testthat/test-replace_Inf.R b/tests/testthat/test-replace_Inf.R index f9993ca13..c99a045b2 100644 --- a/tests/testthat/test-replace_Inf.R +++ b/tests/testthat/test-replace_Inf.R @@ -7,7 +7,7 @@ test_that("replace_inf works", { v1 = 1:5, v2 = c(1, 2, Inf, -Inf, NA) ) - library(dplyr) + suppressPackageStartupMessages(library(dplyr)) ok <- c("geo_value", "time_value") df2 <- df %>% mutate(across(!all_of(ok), ~ vec_replace_inf(.x, NA))) expect_identical(df[, 1:3], df2[, 1:3]) diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 83fc5322a..eb0439c10 
100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -1,4 +1,4 @@ -train_data <- cases_deaths_subset +train_data <- epidatasets::cases_deaths_subset expect_snapshot_tibble <- function(x) { expect_snapshot_value(x, style = "deparse", cran = FALSE) } @@ -68,14 +68,149 @@ test_that("arx_forecaster snapshots", { ) ) expect_snapshot_tibble(arx2$predictions) + attributes(train_data)$metadata$as_of <- max(train_data$time_value) + 5 + arx3 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "extend_ahead" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) + expect_equal( + arx3$predictions$target_date, + rep(attributes(train_data)$metadata$as_of + 1, times = 6) + ) + expect_equal( + arx3$predictions$target_date, + arx2$predictions$target_date + 5 + ) + expect_equal( + arx3$predictions$forecast_date, + arx2$predictions$forecast_date + 5 + ) + # not the same predictions + expect_false(all(arx2$predictions == arx3$predictions)) + + + arx4 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "locf" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) +}) + +test_that("arx_forecaster output format snapshots", { + jhu <- covid_case_death_rates %>% + dplyr::filter(time_value >= as.Date("2021-12-01")) + attributes(jhu)$metadata$as_of <- as.Date(attributes(jhu)$metadata$as_of) + out1 <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") + ) + expect_equal(as.Date(format(out1$metadata$forecast_created, "%Y-%m-%d")), Sys.Date()) + out1$metadata$forecast_created <- as.Date("1999-01-01") + expect_snapshot(out1) + out2 <- arx_forecaster(jhu, "case_rate", + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + quantile_levels = 1:9 / 10, + 
adjust_latency = "extend_lags", + forecast_date = as.Date("2022-01-03") + ) + ) + expect_equal(as.Date(format(out2$metadata$forecast_created, "%Y-%m-%d")), Sys.Date()) + out2$metadata$forecast_created <- as.Date("1999-01-01") + expect_snapshot(out2) + out3 <- arx_forecaster(jhu, "death_rate", + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + adjust_latency = "extend_ahead", + forecast_date = as.Date("2022-01-03") + ) + ) + expect_equal(as.Date(format(out3$metadata$forecast_created, "%Y-%m-%d")), Sys.Date()) + out3$metadata$forecast_created <- as.Date("1999-01-01") + expect_snapshot(out3) }) test_that("arx_classifier snapshots", { - arc1 <- arx_classifier( - case_death_rate_subset %>% + train <- covid_case_death_rates %>% + filter(geo_value %nin% c("as", "gu", "mp", "vi")) + expect_warning(arc1 <- arx_classifier( + train %>% dplyr::filter(time_value >= as.Date("2021-11-01")), "death_rate", c("case_rate", "death_rate") - ) + ), "fitted probabilities numerically") expect_snapshot_tibble(arc1$predictions) + max_date <- train$time_value %>% max() + arc2 <- arx_classifier( + train %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "extend_ahead", forecast_date = max_date + 2) + ) + expect_snapshot_tibble(arc2$predictions) + expect_error( + arc3 <- arx_classifier( + train %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "extend_lags", forecast_date = max_date + 2) + ), + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) + expect_error( + arc4 <- arx_classifier( + train %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "locf", forecast_date = max_date + 2) + ), + class = 
"epipredict__arx_classifier__adjust_latency_unsupported_method" + ) +}) + +test_that("climatological_forecaster snapshots", { + rates <- cases_deaths_subset + + # set as_of to the last day in the data + attr(rates, "metadata")$as_of <- as.Date("2021-12-31") + fcast <- climatological_forecaster(rates, "case_rate_7d_av") + expect_snapshot_tibble(fcast$predictions) + + # Compute quantiles separately by location, and a backcast + backcast <- climatological_forecaster( + rates, "cases", + climate_args_list( + quantile_by_key = "geo_value", + forecast_date = as.Date("2021-06-01") + ) + ) + expect_snapshot_tibble(backcast$predictions) + # compute the climate "daily" rather than "weekly" + # use a two week window (on both sides) + daily_fcast <- climatological_forecaster( + rates, "cases", + climate_args_list( + quantile_by_key = "geo_value", + time_type = "day", window_size = 14L, forecast_horizon = 0:30 + ) + ) + expect_snapshot_tibble(daily_fcast$predictions) }) diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R new file mode 100644 index 000000000..99709ecaf --- /dev/null +++ b/tests/testthat/test-step_adjust_latency.R @@ -0,0 +1,606 @@ +library(dplyr) +# Test ideas that were dropped: +# - "epi_adjust_latency works correctly when there's gaps in the time-series" +# - "epi_adjust_latency extend_ahead uses the same adjustment when predicting on new data after being baked" +# - "`step_adjust_latency` only allows one instance of itself" +# - "data with epi_df shorn off works" + +x <- tibble( + geo_value = rep("place", 200), + time_value = as.Date("2021-01-01") + 0:199, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = as.POSIXct("2024-09-17")) +max_time <- max(x$time_value) +as_of <- attributes(x)$metadata$as_of +ahead <- 7 +latency <- 5 + +testing_as_of <- max_time + latency +# create x with a plausible as_of date +real_x <- x 
+attributes(real_x)$metadata$as_of <- testing_as_of + +slm_fit <- function(recipe, data = x) { + epi_workflow() %>% + add_epi_recipe(recipe) %>% + add_model(linear_reg()) %>% + fit(data = data) +} + + +# making a toy dataset with lag between geo_values +x_lagged <- x +x_lagged$time_value <- x$time_value - 1 +x_lagged$geo_value <- "other" +x_lagged <- add_row(x, x_lagged) +x_lagged +attributes(x_lagged)$metadata$as_of <- testing_as_of + +test_that("epi_adjust_latency correctly extends the lags", { + expect_warning(epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(death_rate, method = "extend_lags")) + + r1 <- epi_recipe(x) %>% + step_adjust_latency(death_rate, case_rate, method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + # directly checking the shifts + baked_x <- r1 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_16_death_rate", max_time + 16, + "lag_11_death_rate", max_time + 11, + "lag_10_case_rate", max_time + 10, + "lag_6_case_rate", max_time + 6, + "lag_5_death_rate", max_time + 5, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) + + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error( + expect_warning( + expect_warning( + fit1 <- slm_fit(r1, data = x), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "simpleError" + ) + + # now trying with the 
as_of a reasonable distance in the future + fit1 <- slm_fit(r1, data = real_x) + expect_equal( + names(fit1$pre$mold$predictors), + c( + "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", + "lag_6_case_rate", "lag_10_case_rate" + ) + ) + latest <- get_test_data(r1, real_x) + pred <- predict(fit1, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(testing_as_of)) + + expect_equal( + names(fit1$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) + latest <- get_test_data(r1, x) + pred1 <- predict(fit1, latest) + actual_solutions <- pred1 %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, testing_as_of) + + # should have four predictors, including the intercept + expect_equal(length(fit1$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(5, 11, 16)) %>% + step_epi_lag(case_rate, lag = c(6, 10)) %>% + step_epi_ahead(death_rate, ahead = ahead) + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit1$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) + +test_that("epi_adjust_latency correctly extends the ahead", { + r2 <- epi_recipe(x) %>% + step_adjust_latency(death_rate, method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error(expect_warning(fit5 <- slm_fit(r2), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError") + # real date example + fit2 <- slm_fit(r2, data = real_x) + expect_equal( + names(fit2$pre$mold$predictors), + c( + 
"lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r2, real_x) + pred2 <- predict(fit2, latest) + point_pred2 <- pred2 %>% filter(!is.na(.pred)) + # max time is still the forecast date + expect_equal(point_pred2$time_value, as.Date(max_time)) + # target column renamed + expect_equal( + names(fit2$pre$mold$outcomes), + glue::glue("ahead_{ahead + latency}_death_rate") + ) + # fit an equivalent forecaster + equivalent <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + latency) + equiv_fit <- slm_fit(equivalent, data = real_x) + # adjusting the ahead should do the same thing as directly adjusting the ahead + expect_equal( + fit2$fit$fit$fit$coefficients, + equiv_fit$fit$fit$fit$coefficients + ) + + # should have four predictors, including the intercept + expect_equal(length(fit2$fit$fit$fit$coefficients), 6) +}) + +test_that("epi_adjust_latency correctly locfs", { + r1 <- epi_recipe(x) %>% + step_adjust_latency(has_role("raw"), method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + # directly checking the shifts + baked_x <- r1 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_11_death_rate", max_time + 16, + "lag_6_death_rate", max_time + 11, + "lag_5_case_rate", max_time + 10, + "lag_1_case_rate", max_time + 6, + "case_rate", max_time + 5, + "death_rate", max_time + 5, + "lag_0_death_rate", max_time + 5, + "ahead_7_death_rate", max_time - 2, + ) + ) + # we expect a 5-fold 
repetition of the last values found in the original + # epi_df + last_real <- real_x %>% + group_by(geo_value) %>% + arrange(time_value) %>% + slice_tail() %>% + ungroup() %>% + select(case_rate, death_rate) %>% + tidyr::uncount(5) + # pulling just the region between the last day and the prediction day + filled_values <- + baked_x %>% + filter( + time_value > max(real_x$time_value), + time_value <= attributes(real_x)$metadata$as_of + ) %>% + ungroup() %>% + select(case_rate, death_rate) + expect_equal(last_real, filled_values) +}) + +test_that("epi_adjust_latency extends multiple aheads", { + aheads <- 1:3 + r3 <- epi_recipe(x) %>% + step_adjust_latency(death_rate, method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = aheads) + fitter <- smooth_quantile_reg( + quantile_levels = 0.5, + outcome_locations = aheads, + degree = 1L + ) + epi_wf <- epi_workflow(r3, fitter) + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error( + expect_warning( + fit3 <- fit(epi_wf, data = x), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "simpleError" + ) + # real date example + fit3 <- fit(epi_wf, data = real_x) + expect_equal( + names(fit3$pre$mold$outcomes), + c( + "ahead_6_death_rate", "ahead_7_death_rate", "ahead_8_death_rate" + ) + ) + expect_equal( + names(fit3$pre$mold$predictors), + c( + "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r3, real_x) + pred3 <- predict(fit3, latest) + point_pred <- pred3 %>% + unnest(.pred) %>% + filter(!is.na(distn)) + # max time is still the forecast date + expect_equal( + point_pred$time_value, + rep(as.Date(max_time), 3) + ) + # fit an equivalent forecaster + equivalent <- epi_recipe(x) %>% + 
step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + latency) + equiv_fit <- fit(epi_wf, data = real_x) + # adjusting the ahead should do the same thing as directly adjusting the ahead + expect_equal( + fit3$fit$fit$fit$rqfit, + equiv_fit$fit$fit$fit$rqfit + ) + + # should have four predictors, including the intercept + expect_equal(length(fit3$fit$fit$fit$rqfit$coefficients), 6) +}) + +test_that("epi_adjust_latency fixed_forecast_date works", { + r4 <- epi_recipe(x) %>% + step_adjust_latency(has_role("raw"), method = "extend_lags", fixed_forecast_date = max_time + 14) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + baked_x <- r4 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_25_death_rate", max_time + 25, + "lag_20_death_rate", max_time + 20, + "lag_19_case_rate", max_time + 19, + "lag_15_case_rate", max_time + 15, + "lag_14_death_rate", max_time + 14, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) +}) + +test_that("epi_adjust_latency fixed_latency works", { + r4.1 <- epi_recipe(x) %>% + step_adjust_latency(has_role("raw"), method = "extend_lags", fixed_latency = 2) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + baked_x <- r4.1 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + 
summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_13_death_rate", max_time + 13, + "lag_8_death_rate", max_time + 8, + "lag_7_case_rate", max_time + 7, + "lag_3_case_rate", max_time + 3, + "lag_2_death_rate", max_time + 2, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) +}) + + +# todo test variants on the columns for which this is applied +# todo need to have both on columns 1, and 2 + + + +# test_that("epi_adjust_latency works for other time types", {}) + +test_that("epi_adjust_latency warns there's steps before it", { + expect_warning( + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(death_rate, method = "extend_lags"), + regexp = "extend_lags" + ) + expect_warning( + r5 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(death_rate, method = "extend_ahead"), + regexp = "extend_ahead" + ) +}) + +# TODO check that epi_adjust_latency errors for nonsense `as_of`'s + + +# TODO make sure that `epi_keys_checked` works correctly for extra epi_keys + +test_that("epi_adjust_latency correctly extends the lags when there are different delays per-geo", { + r5 <- epi_recipe(x_lagged) %>% + step_adjust_latency(has_role("raw"), method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + # now trying with the as_of a reasonable distance in the future + fit5 <- slm_fit(r5, data = x_lagged) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_6_death_rate", "lag_12_death_rate", "lag_17_death_rate", + "lag_7_case_rate", "lag_11_case_rate" + ) + ) + latest <- get_test_data(r5, x_lagged) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, 
as.Date(testing_as_of) + 1) + + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) + + # should have four predictors, including the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(6, 12, 17)) %>% + step_epi_lag(case_rate, lag = c(7, 11)) %>% + step_epi_ahead(death_rate, ahead = ahead) + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) + +test_that("epi_adjust_latency correctly extends the ahead when there are different delays per-geo", { + r5 <- epi_recipe(x_lagged) %>% + step_adjust_latency(death_rate, method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + fit5 <- slm_fit(r5, data = x_lagged) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r5, x_lagged) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(max_time)) + joint_latency <- latency + 1 + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead+joint_latency}_death_rate") + ) + actual_solutions <- pred %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, as.Date(max_time)) + + # should have four predictors, including the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = 
c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + joint_latency) + + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) + +test_that("`step_adjust_latency` only uses the columns specified in the `...`", { + r5 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + fit5 <- slm_fit(r5, data = real_x) + expect_equal(names(fit5$fit$fit$fit$coefficients), c("(Intercept)", "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_6_case_rate", "lag_10_case_rate")) + + r51 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + baked_x <- r51 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) %>% + mutate(locf_date = last_date - latency) + # iterate over all columns and make sure the latent time period has the exact same values (so the variance is zero) + for (ii in seq(nrow(last_dates))) { + baked_var <- baked_x %>% + filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% + pull(last_dates[[ii, "name"]]) %>% + var() + if (grepl("case_rate", last_dates[[ii, "name"]])) { + expect_equal(baked_var, 0) + } else { + expect_true(baked_var > 0) + } + } +}) + +test_that("printing step_adjust_latency results in expected output", { + r5 <- epi_recipe(x) %>% + step_adjust_latency(has_role("raw"), method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + 
step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + expect_snapshot(r5) + expect_snapshot(prep(r5, real_x)) + r6 <- epi_recipe(covid_case_death_rates) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_adjust_latency(has_role("raw"), method = "extend_ahead") %>% + step_epi_ahead(death_rate, ahead = 7) + expect_snapshot(r6) + expect_snapshot(prep(r6, covid_case_death_rates)) +}) + +test_that("locf works as intended", { + expect_warning(epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(death_rate, method = "locf")) + + r6 <- epi_recipe(x) %>% + step_adjust_latency(has_role("raw"), method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + # directly checking the shifts + baked_x <- r6 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) %>% + mutate(locf_date = last_date - latency) + # iterate over all columns and make sure the latent time period has the exact same values + for (ii in seq(nrow(last_dates))) { + baked_x %>% + filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% + pull(last_dates[[ii, "name"]]) %>% + var() %>% + expect_equal(0) + } + + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_warning(fit6 <- slm_fit(r6, data = x), + class = "epipredict__prep.step_latency__very_large_latency" + ) + + # now trying with the as_of a reasonable distance in the future + fit6 <- slm_fit(r6, data = real_x) + expect_equal( + names(fit6$pre$mold$predictors), + c( + "lag_0_death_rate", 
"lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r6, real_x) + pred <- predict(fit6, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(max(point_pred$time_value), as.Date(testing_as_of)) + + expect_equal( + names(fit6$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) + latest <- get_test_data(r6, x) + pred1 <- predict(fit6, latest) + actual_solutions <- pred1 %>% filter(!is.na(.pred)) + expect_equal(max(actual_solutions$time_value), testing_as_of) + + # should have four predictors, including the intercept + expect_equal(length(fit6$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + # + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + locf_x <- real_x %>% rbind(tibble( + geo_value = rep("place", latency), + time_value = max_time + 1:latency, + case_rate = rep(real_x$case_rate[nrow(x)], latency), + death_rate = rep(real_x$death_rate[nrow(x)], latency) + )) + fit_hand_adj <- slm_fit(hand_adjusted, data = locf_x) + expect_equal( + fit6$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) diff --git a/tests/testthat/test-step_climate.R b/tests/testthat/test-step_climate.R new file mode 100644 index 000000000..231908b98 --- /dev/null +++ b/tests/testthat/test-step_climate.R @@ -0,0 +1,249 @@ +test_that("yday_leap works", { + # feburary 29th is assigned a negative value + expect_equal(yday_leap(as.Date("2024-02-29")), 999) + # before that is normal + expect_equal(yday_leap(as.Date("2024-02-28")), 31 + 28) + + # after that is decreased by 1 (so matches non leap years) + expect_equal( + yday_leap(as.Date("2024-05-28")), + lubridate::yday(as.Date("2022-05-28")) + ) + # off leap years have the right value + expect_equal(yday_leap(as.Date("2023-05-28")), 31 + 28 + 
31 + 30 + 28) +}) +test_that("roll_modular_multivec works", { + tib <- tibble( + col = c(1, 2, 3, 3.5, 4, 1, -2, 4, 1, 0), + .idx = c(1, 1, 1, 2, 2, 3, 3, 3, 1, 1), + w = rep(1, 10) + ) + modulus <- 3L + + Mean <- function(x, w) weighted.mean(x, w, na.rm = TRUE) + Median <- function(x, w) median(x, na.rm = TRUE) + + # unweighted mean + # window of size 0 + expected_res <- tib |> + mutate(.idx = .idx %% modulus, .idx = .idx + (.idx == 0) * modulus) |> + summarise(climate_pred = weighted.mean(col, w = w), .by = .idx) + expect_equal( + roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 0, modulus), + expected_res + ) + # window of size 1, which includes everything + expected_res <- tibble(.idx = as.double(1:3), climate_pred = mean(tib$col)) + expect_equal( + roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 1L, modulus), + expected_res + ) + + # weighted mean + # window of size 0 + tib$w <- c(1, 2, 3, 1, 2, 1, 1, 2, 2, 1) + expected_res <- tib |> + mutate(.idx = .idx %% modulus, .idx = .idx + (.idx == 0) * modulus) |> + summarise(climate_pred = weighted.mean(col, w = w), .by = .idx) + expect_equal( + roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 0, modulus), + expected_res + ) + # window of size 1 + expected_res <- tibble( + .idx = as.double(1:3), + climate_pred = weighted.mean(tib$col, tib$w) + ) + expect_equal( + roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 1L, modulus), + expected_res + ) + # median + expected_res <- tib |> + mutate(.idx = .idx %% modulus, .idx = .idx + (.idx == 0) * modulus) |> + summarise(climate_pred = median(col), .by = .idx) + expect_equal( + roll_modular_multivec(tib$col, tib$.idx, tib$w, Median, 0, modulus), + expected_res + ) + expected_res <- tibble(.idx = as.double(1:3), climate_pred = median(tib$col)) + expect_equal( + roll_modular_multivec(tib$col, tib$.idx, tib$w, Median, 1L, modulus), + expected_res + ) +}) + +test_that("prep/bake steps create the correct training data", { + single_yr <- 
seq(as.Date("2021-01-01"), as.Date("2021-12-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = rep(c("reg1", "reg2"), each = length(single_yr)), + # shift by 2 days to match the epiweeks of 2021 + y = rep(c(1, 1, rep(c(1:26, 26:2), each = 7), 1, 1, 1, 1, 1, 1), times = 2L) + ) %>% + as_epi_df() + # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 + r <- epi_recipe(x) %>% step_climate(y, time_type = "epiweek") + p <- prep(r, x) + + expected_res <- tibble(.idx = c(1:52, 999), climate_y = c(2, 2:25, 25, 25, 25:2, 2, 2)) + expect_equal(p$steps[[1]]$climate_table, expected_res) + + b <- bake(p, new_data = NULL) + expected_bake <- x %>% + mutate(.idx = epiweek_leap(time_value)) %>% + left_join(expected_res, by = join_by(.idx)) %>% + select(-.idx) + expect_equal(b, expected_bake) +}) + +test_that("prep/bake steps create the correct training data with an incomplete year", { + single_yr <- seq(as.Date("2021-01-01"), as.Date("2021-10-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = rep(c("reg1", "reg2"), each = length(single_yr)), + # shift by 2 days to match the epiweeks of 2021 + y = rep(c(1, 1, rep(c(1:26, 26:2), each = 7), 1, 1, 1, 1, 1, 1)[1:length(single_yr)], times = 2L) + ) %>% + as_epi_df() + # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 + r <- epi_recipe(x) %>% step_climate(y, time_type = "epiweek") + p <- prep(r, x) + + expected_res <- tibble(.idx = c(1:44, 999), climate_y = c(2, 3, 3, 4:25, 25, 25, 25:12, 12, 11, 11, 2)) + expect_equal(p$steps[[1]]$climate_table, expected_res) + + b <- bake(p, new_data = NULL) + expected_bake <- x %>% + mutate(.idx = epiweek_leap(time_value)) %>% + left_join(expected_res, by = join_by(.idx)) %>% + select(-.idx) + expect_equal(b, expected_bake) +}) + +test_that("prep/bake steps create the correct training data for non leapweeks", { + single_yr <- seq(as.Date("2023-01-01"), 
as.Date("2023-12-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = rep(c("reg1", "reg2"), each = length(single_yr)), + # shift by 2 days to match the epiweeks of 2021 + y = rep(c(1, 1, rep(c(1:26, 26:2), each = 7), 1, 1, 1, 1, 1, 1), times = 2L) + ) %>% + as_epi_df() + # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 + r <- epi_recipe(x) %>% step_climate(y, time_type = "epiweek") + p <- prep(r, x) + + expected_res <- tibble(.idx = 1:52, climate_y = c(2, 2:25, 25, 25, 25:2, 2)) + expect_equal(p$steps[[1]]$climate_table, expected_res) + + b <- bake(p, new_data = NULL) + expected_bake <- x %>% + mutate(.idx = lubridate::epiweek(time_value)) %>% + left_join(expected_res, by = join_by(.idx)) %>% + select(-.idx) + expect_equal(b, expected_bake) +}) + +test_that("prep/bake steps create the correct training data months", { + single_yr <- seq(as.Date("2021-01-01"), as.Date("2023-12-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = rep(c("reg1", "reg2"), each = length(single_yr)), + ) %>% + # 1 2 3 4 5 6 6 5 4 3 2 1, assigned based on the month + mutate(y = pmin(13 - month(time_value), month(time_value))) %>% + as_epi_df() + + # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 + r <- epi_recipe(x) %>% step_climate(y, time_type = "month", window_size = 1) + p <- prep(r, x) + + expected_res <- tibble(.idx = 1:12, climate_y = c(1:6, 6:1)) + expect_equal(p$steps[[1]]$climate_table, expected_res) + + b <- bake(p, new_data = NULL) + expected_bake <- x %>% + mutate(.idx = month(time_value)) %>% + left_join(expected_res, by = join_by(.idx)) %>% + select(-.idx) + expect_equal(b, expected_bake) +}) + + +test_that("prep/bake steps create the correct training data for daily data", { + single_yr <- seq(as.Date("2020-01-01"), as.Date("2020-12-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = 
rep(c("reg1", "reg2"), each = length(single_yr)), + y = rep(c(1:183, 184:2), times = 2L) + ) %>% + as_epi_df() + # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 + r <- epi_recipe(x) %>% step_climate(y, time_type = "day") + p <- prep(r, x) + + expected_res <- tibble( + .idx = c(1:365, 999), + climate_y = c(3, 3, 3:(59 - 4), 56.5:63.5, 65:181, rep(182, 5), 181:3, 3, 59) + ) + expect_equal(p$steps[[1]]$climate_table, expected_res) + + b <- bake(p, new_data = NULL) + expected_bake <- x %>% + mutate(.idx = yday_leap(time_value)) %>% + left_join(expected_res, by = join_by(.idx)) %>% + select(-.idx) + expect_equal(b, expected_bake) +}) + + +test_that("leading the climate predictor works as expected", { + single_yr <- seq(as.Date("2021-01-01"), as.Date("2021-12-31"), by = "1 day") + x <- tibble( + time_value = rep(single_yr, times = 2L), + geo_value = rep(c("reg1", "reg2"), each = length(single_yr)), + # shift by 2 days to match the epiweeks of 2021 + y = rep(c(1, 1, rep(c(1:26, 26:2), each = 7), 1, 1, 1, 1, 1, 1), times = 2L) + ) %>% + as_epi_df() + # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 + r <- epi_recipe(x) %>% + step_epi_ahead(y, ahead = 14L) %>% + step_epi_lag(y, lag = c(0, 7L, 14L)) %>% + step_climate(y, forecast_ahead = 2L, time_type = "epiweek") %>% + # matches the response + step_epi_naomit() + p <- prep(r, x) + + expected_res <- tibble(.idx = c(1:52, 999), climate_y = c(2, 2, 3, 4, 4.5, 5.5, 7:25, 25, 25, 25:2, 2, 2)) %>% + mutate( + climate_y = climate_y[c(3:53, 1:2)] + ) %>% + arrange(.idx) + expect_equal(p$steps[[3]]$climate_table, expected_res) + + b <- bake(p, new_data = NULL) + expect_identical(max(b$time_value), as.Date("2021-12-17")) # last date with no NAs + # expected climate predictor should be shifted forward by 2 weeks + expected_climate_pred <- x %>% + mutate( + .idx = lubridate::epiweek(time_value) %% 53, + .idx = dplyr::case_when(.idx == 0 ~ 53, TRUE ~ .idx) + ) %>% + 
left_join(expected_res, by = join_by(.idx)) %>% + arrange(time_value, geo_value) %>% + filter(time_value %in% unique(b$time_value)) %>% + pull(climate_y) + expect_identical(b$climate_y, expected_climate_pred) + + # Check if our test data has the right values + td <- get_test_data(r, x) + expected_test_x <- td %>% + filter(time_value == "2021-12-31") %>% + mutate( + ahead_14_y = NA_real_, lag_0_y = 1, lag_7_y = 2, lag_14_y = 3, + climate_y = 2 + ) + expect_equal(bake(p, td), expected_test_x) +}) diff --git a/tests/testthat/test-step_epi_naomit.R b/tests/testthat/test-step_epi_naomit.R index 0e5e1750f..2f361ec98 100644 --- a/tests/testthat/test-step_epi_naomit.R +++ b/tests/testthat/test-step_epi_naomit.R @@ -1,6 +1,6 @@ -library(dplyr) -library(parsnip) -library(workflows) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(parsnip)) +suppressPackageStartupMessages(library(workflows)) # Random generated dataset x <- tibble( diff --git a/tests/testthat/test-step_epi_shift.R b/tests/testthat/test-step_epi_shift.R index 1f83120b3..2a313b103 100644 --- a/tests/testthat/test-step_epi_shift.R +++ b/tests/testthat/test-step_epi_shift.R @@ -39,7 +39,7 @@ test_that("A negative lag value should should throw an error", { test_that("A nonpositive ahead value should throw an error", { expect_snapshot( - error = TRUE, + error = FALSE, r3 <- epi_recipe(x) %>% step_epi_ahead(death_rate, ahead = -7) %>% step_epi_lag(death_rate, lag = 7) @@ -66,3 +66,8 @@ test_that("Check that epi_lag shifts applies the shift", { # Should have four predictors, including the intercept expect_equal(length(fit5$fit$fit$fit$coefficients), 4) }) + +test_that("Shifting nothing is a no-op", { + expect_no_error(noop <- epi_recipe(x) %>% step_epi_ahead(ahead = 3) %>% prep(x) %>% bake(x)) + expect_equal(noop, x) +}) diff --git a/tests/testthat/test-step_growth_rate.R b/tests/testthat/test-step_growth_rate.R index f2845d812..dc8f04c98 100644 --- 
a/tests/testthat/test-step_growth_rate.R +++ b/tests/testthat/test-step_growth_rate.R @@ -16,8 +16,8 @@ test_that("step_growth_rate validates arguments", { expect_snapshot(error = TRUE, step_growth_rate(r, value, prefix = 1)) expect_snapshot(error = TRUE, step_growth_rate(r, value, id = 1)) expect_snapshot(error = TRUE, step_growth_rate(r, value, log_scale = 1)) + expect_snapshot(error = TRUE, step_growth_rate(r, value, na_rm = 1)) expect_snapshot(error = TRUE, step_growth_rate(r, value, skip = 1)) - expect_snapshot(error = TRUE, step_growth_rate(r, value, additional_gr_args_list = 1:5)) expect_snapshot(error = TRUE, step_growth_rate(r, value, replace_Inf = "c")) expect_snapshot(error = TRUE, step_growth_rate(r, value, replace_Inf = c(1, 2))) expect_silent(step_growth_rate(r, value, replace_Inf = NULL)) diff --git a/tests/testthat/test-step_lag_difference.R b/tests/testthat/test-step_lag_difference.R index 6ff9884a7..d23a3b4fa 100644 --- a/tests/testthat/test-step_lag_difference.R +++ b/tests/testthat/test-step_lag_difference.R @@ -52,7 +52,7 @@ test_that("step_lag_difference works for a single signal", { }) -test_that("step_lag_difference works for a two signals", { +test_that("step_lag_difference works for a two signal epi_df", { df <- data.frame( time_value = 1:5, geo_value = rep("a", 5), diff --git a/tests/testthat/test-target_date_bug.R b/tests/testthat/test-target_date_bug.R index 02a825267..8be266414 100644 --- a/tests/testthat/test-target_date_bug.R +++ b/tests/testthat/test-target_date_bug.R @@ -2,7 +2,7 @@ # https://github.com/cmu-delphi/epipredict/issues/290 library(dplyr) -train <- cases_deaths_subset |> +train <- epidatasets::cases_deaths_subset |> filter(time_value >= as.Date("2021-10-01")) |> select(geo_value, time_value, cr = case_rate_7d_av, dr = death_rate_7d_av) ngeos <- n_distinct(train$geo_value) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R new file mode 100644 index 000000000..2ac32fc9f --- /dev/null 
+++ b/tests/testthat/test-utils_latency.R @@ -0,0 +1,216 @@ +time_values <- as.Date("2021-01-01") + +floor(seq(0, 100, by = .5))[1:200] +as_of <- max(time_values) + 5 +max_time <- max(time_values) +old_data <- tibble( + geo_value = rep(c("place1", "place2"), 100), + time_value = as.Date("2021-01-01") + +floor(seq(0, 100, by = .5))[1:200], + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + # place2 is slightly more recent than place1 + mutate(time_value = case_when( + geo_value == "place2" ~ time_value + 1, + TRUE ~ time_value + )) %>% + as_epi_df(as_of = as_of) +old_data +keys <- c("time_value", "geo_value") +old_data <- old_data %>% + full_join(epi_shift_single( + old_data, "tmp_death_rate", 1, "death_rate", keys + ), by = keys) %>% + select(-tmp_death_rate) +# old data is created so that death rate has a latency of 4, while case_rate has +# a latency of 5 +modified_data <- + old_data %>% + dplyr::full_join( + epi_shift_single(old_data, "case_rate", -4, "case_rate_a", keys), + by = keys + ) %>% + dplyr::full_join( + epi_shift_single(old_data, "case_rate", 3, "case_rate_b", keys), + by = keys + ) %>% + dplyr::full_join( + epi_shift_single(old_data, "death_rate", 7, "death_rate_a", keys), + by = keys + ) %>% + arrange(time_value) +time_range <- as.Date("2021-01-01") + 0:199 +x_adjust_ahead <- tibble( + geo_value = rep("place", 200), + time_value = time_range, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = max(time_range) + 3) + +modified_data %>% arrange(geo_value, desc(time_value)) +modified_data %>% + group_by(geo_value) %>% + filter(!is.na(case_rate)) %>% + summarise(max(time_value)) +as_of + +toy_df <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "ma", as.Date("2015-01-11"), 20, 6, + "ma", as.Date("2015-01-12"), 23, NA, + "ma", as.Date("2015-01-13"), 25, NA, + "ca", 
as.Date("2015-01-11"), 100, 5, + "ca", as.Date("2015-01-12"), 103, 10, +) %>% + as_epi_df(as_of = as.Date("2015-01-14")) +toy_df_src <- tribble( + ~geo_value, ~source, ~time_value, ~a, ~b, + "ma", "new", as.Date("2015-01-11"), 20, 6, + "ma", "new", as.Date("2015-01-12"), 23, NA, + "ma", "new", as.Date("2015-01-13"), 25, NA, + "ca", "new", as.Date("2015-01-11"), 100, 5, + "ca", "new", as.Date("2015-01-12"), 103, 10, + "ma", "old", as.Date("2013-01-01"), 19, 4, + "ma", "old", as.Date("2013-01-02"), 20, 2, + "ca", "old", as.Date("2013-01-03"), 28, 11, + "na", "new", as.Date("2013-01-05"), 28, 11, + "ma", "older", as.Date("2010-01-05"), 28, 11, +) %>% + as_epi_df(as_of = as.Date("2015-01-14"), other_keys = "source") + +test_that("get_latency works", { + expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 5) + expect_equal(get_latency(modified_data, as_of, "case_rate", -1, "geo_value"), -5) + expect_equal(get_latency(modified_data, as_of, "death_rate", 1, "geo_value"), 4) + expect_equal(get_latency(modified_data, as_of, "case_rate_a", 1, "geo_value"), 5 + 4) + expect_equal(get_latency(modified_data, as_of, "case_rate_b", 1, "geo_value"), 5 - 3) + expect_equal(get_latency(modified_data, as_of, "death_rate_a", 1, "geo_value"), 4 - 7) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "a", 1, "geo_value"), 2) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "a", -1, "geo_value"), -2) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "b", 1, "geo_value"), 3) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "b", -1, "geo_value"), -3) +}) + +test_that("get_latency ignores keys it's supposed to", { + keys_to_ignore <- list(geo_value = c("na"), source = c("old", "older")) + expected_df <- tribble( + ~geo_value, ~source, ~time_value, ~a, ~b, + "ma", "new", as.Date("2015-01-11"), 20, 6, + "ma", "new", as.Date("2015-01-12"), 23, NA, + "ma", "new", as.Date("2015-01-13"), 25, NA, + "ca", "new", as.Date("2015-01-11"), 100, 5, 
+ "ca", "new", as.Date("2015-01-12"), 103, 10, + ) + expect_equal( + toy_df_src %>% drop_ignored_keys(keys_to_ignore) %>% as_tibble(), + expected_df + ) + + expect_equal( + get_latency_table(toy_df_src, c("a", "b"), as.Date("2015-01-14"), NULL, -1, c("geo_value", "source"), keys_to_ignore), + tibble(col_name = c("a", "b"), latency = c(-2, -3)) + ) +}) + +test_that("get_latency infers max_time to be the minimum `max time` across grouping the specified keys", { + # place 2 is already 1 day less latent than place 1, so decreasing it's + # latency it should have no effect + place2_delayed_data <- modified_data %>% mutate(time_value = time_value + 3 * (geo_value == "place2")) + expect_equal(get_latency(place2_delayed_data, as_of, "case_rate", 1, "geo_value"), 5) + # decreaseing the latency of place1 more than 1 pushes it past place2, so at most changes the latency by 1 + place1_delayed_data <- modified_data %>% mutate(time_value = time_value + 5 * (geo_value == "place1")) + expect_equal(get_latency(place1_delayed_data, as_of, "case_rate", 1, "geo_value"), 4) +}) + + +test_that("get_forecast_date works", { + info <- tribble( + ~variable, ~type, ~role, ~source, + "time_value", "date", "time_value", "original", + "geo_value", "nominal", "geo_value", "original", + "case_rate", "numeric", "raw", "original", + "death_rate", "numeric", "raw", "original", + "not_real", "numeric", "predictor", "derived" + ) + expect_equal(get_forecast_date(modified_data, info, "geo_value", NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, "", NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, NULL, NULL), as_of) +}) +test_that("get_forecast_date works for multiple key columns", { + info <- tribble( + ~variable, ~type, ~role, ~source, + "time_value", "date", "time_value", "original", + "geo_value", "nominal", "geo_value", "original", + "source", "nominal", "other_key", "original", + "a", "numeric", "raw", "original", + "b", "numeric", "raw", "original", + ) + 
expect_equal(get_forecast_date(toy_df_src, info, c("geo_value", "source"), NULL), attributes(toy_df_src)$metadata$as_of) +}) + +test_that("pad_to_end works correctly", { + single_ex <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "1", as.Date("1066-10-13"), 2, -.6, + # internal NA + "1", as.Date("1066-10-14"), NA, NA, + "1", as.Date("1066-10-15"), 1, -.5, + "2", as.Date("1066-10-13"), 3, .9, + # note these are intentionally out of order + "3", as.Date("1066-10-14"), 2.5, NA, + "3", as.Date("1066-10-13"), 2, -.6, + ) %>% + as_epi_df(as_of = "1066-10-16") + expect_equal( + single_ex %>% pad_to_end("geo_value", as.Date("1066-10-16")), + rbind( + single_ex[-5, ], + tibble(geo_value = "1", time_value = as.Date("1066-10-16"), a = 1, b = -.5), + tibble( + geo_value = "2", + time_value = seq.Date( + from = as.Date("1066-10-14"), + to = as.Date("1066-10-16"), + by = 1 + ), + a = 3, b = .9 + ), + tibble( + geo_value = "3", + time_value = seq.Date( + from = as.Date("1066-10-14"), + to = as.Date("1066-10-16"), + by = 1 + ), + a = 2.5, b = -0.6 + ) + ) %>% arrange(geo_value, time_value) + ) +}) + + +test_that("pad_to_end handles weeks", { + single_ex <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "1", as.Date("1066-10-14"), 2, -.6, + "1", as.Date("1066-10-21"), 1, -.5, + "2", as.Date("1066-10-14"), 3, .9 + ) %>% + as_epi_df(as_of = "1066-10-28") + expect_equal( + single_ex %>% pad_to_end("geo_value", as.Date("1066-10-28")), + rbind( + single_ex, + tibble(geo_value = "1", time_value = as.Date("1066-10-28"), a = 1, b = -.5), + tibble( + geo_value = "2", + time_value = seq.Date( + from = as.Date("1066-10-21"), + to = as.Date("1066-10-28"), + by = 7 + ), + a = 3, b = .9 + ) + ) %>% arrange(geo_value, time_value) + ) +}) +# todo case where somehow columns of different roles are selected diff --git a/tests/testthat/test-wis-dist-quantiles.R b/tests/testthat/test-wis-quantile_pred.R similarity index 65% rename from tests/testthat/test-wis-dist-quantiles.R rename to 
tests/testthat/test-wis-quantile_pred.R index 937793189..a51a67352 100644 --- a/tests/testthat/test-wis-dist-quantiles.R +++ b/tests/testthat/test-wis-quantile_pred.R @@ -8,45 +8,38 @@ test_that("wis dispatches and produces the correct values", { actual <- 5 expected <- c(wis_one_pred(q1, tau, actual), wis_one_pred(q2, tau, actual)) - dstn <- dist_quantiles(list(q1, q2), tau) + dstn <- quantile_pred(rbind(q1, q2), tau) expect_equal(weighted_interval_score(dstn, actual), expected) # works with a single dstn q <- sort(10 * rexp(23)) tau0 <- c(.01, .025, 1:19 / 20, .975, .99) - dst <- dist_quantiles(q, tau0) + dst <- quantile_pred(rbind(q), tau0) expect_equal(weighted_interval_score(dst, 10), wis_one_pred(q, tau0, 10)) # returns NA when expected - dst <- dist_quantiles(rep(NA, 3), c(.2, .5, .95)) + dst <- quantile_pred(rbind(rep(NA, 3)), c(.2, .5, .95)) expect_true(is.na(weighted_interval_score(dst, 10))) expect_equal( weighted_interval_score(dstn, c(NA, actual)), c(NA, wis_one_pred(q2, tau, actual)) ) - # errors for non distributions + # errors for non quantile_pred expect_snapshot(error = TRUE, weighted_interval_score(1:10, 10)) - expect_warning(w <- weighted_interval_score(dist_normal(1), 10)) - expect_true(all(is.na(w))) - expect_warning(w <- weighted_interval_score( - c(dist_normal(), dist_quantiles(1:5, 1:5 / 6)), - 10 - )) - expect_equal(w, c(NA, wis_one_pred(1:5, 1:5 / 6, 10))) # errors if sizes don't match expect_snapshot(error = TRUE, weighted_interval_score( - dist_quantiles(list(1:4, 8:11), 1:4 / 5), # length 2 + quantile_pred(rbind(1:4, 8:11), 1:4 / 5), # length 2 1:3 )) #' # Missing value behaviours - dstn <- dist_quantiles(c(1, 2, NA, 4), 1:4 / 5) + dstn <- quantile_pred(rbind(c(1, 2, NA, 4)), 1:4 / 5) expect_equal(weighted_interval_score(dstn, 2.5), 0.5) expect_equal(weighted_interval_score(dstn, 2.5, c(2, 4, 5, 6, 8) / 10), 0.4) expect_equal( - weighted_interval_score(dist_quantiles(c(1, 2, NA, 4), 1:4 / 5), 3, na_handling = "drop"), + 
weighted_interval_score(dstn, 3, na_handling = "drop"), 2 / 3 ) expect_equal( @@ -56,5 +49,10 @@ test_that("wis dispatches and produces the correct values", { expect_true(is.na( weighted_interval_score(dstn, 2.5, na_handling = "propagate") )) - weighted_interval_score(dist_quantiles(1:4, 1:4 / 5), 2.5, 1:9 / 10, na_handling = "fail") + expect_true(is.na( + weighted_interval_score( + quantile_pred(rbind(1:4), 1:4 / 5), 2.5, 1:9 / 10, + na_handling = "fail" + ) + )) }) diff --git a/vignettes/.gitignore b/vignettes/.gitignore index 324ceaf7e..d255e9687 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -1,3 +1,4 @@ *.html *_cache/ *.R +!_common.R diff --git a/vignettes/_common.R b/vignettes/_common.R new file mode 100644 index 000000000..3bd7c929d --- /dev/null +++ b/vignettes/_common.R @@ -0,0 +1,23 @@ +knitr::opts_chunk$set( + digits = 3, + comment = "#>", + collapse = TRUE, + cache = TRUE, + dev.args = list(bg = "transparent"), + dpi = 300, + cache.lazy = FALSE, + out.width = "90%", + fig.align = "center", + fig.width = 9, + fig.height = 6 +) +ggplot2::theme_set(ggplot2::theme_bw()) +options( + dplyr.print_min = 6, + dplyr.print_max = 6, + pillar.max_footer_lines = 2, + pillar.min_chars = 15, + stringr.view_n = 6, + pillar.bold = TRUE, + width = 77 +) diff --git a/vignettes/articles/all_states_covidcast_signals.rds b/vignettes/articles/all_states_covidcast_signals.rds deleted file mode 100644 index e4ad60153..000000000 Binary files a/vignettes/articles/all_states_covidcast_signals.rds and /dev/null differ diff --git a/vignettes/articles/case_death_rate_archive.rds b/vignettes/articles/case_death_rate_archive.rds deleted file mode 100644 index b5209fb1d..000000000 Binary files a/vignettes/articles/case_death_rate_archive.rds and /dev/null differ diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd deleted file mode 100644 index 1556c4a72..000000000 --- a/vignettes/articles/sliding.Rmd +++ /dev/null @@ -1,460 +0,0 @@ ---- -title: 
"Demonstrations of sliding AR and ARX forecasters" ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - warning = FALSE, - message = FALSE, - cache = TRUE -) -``` - -```{r pkgs} -library(epipredict) -library(epidatr) -library(data.table) -library(dplyr) -library(tidyr) -library(ggplot2) -library(magrittr) -library(purrr) -``` - -# Demonstrations of sliding AR and ARX forecasters - -A key function from the epiprocess package is `epix_slide()` (refer to the -following vignette for the basics of the function: ["Work with archive objects -and data -revisions"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)) -which allows performing version-aware computations. That is, the function only -uses data that would have been available as of time t for that reference time. - -In this vignette, we use `epix_slide()` for backtesting our `arx_forecaster` on -historical COVID-19 case data from the US and from Canada. We first examine the -results from a version-unaware forecaster, comparing two different fitting -engines and then we contrast this with version-aware forecasting. The former -will proceed by constructing an `epi_archive` that erases its version -information and then use `epix_slide()` to forecast the future. The latter will -keep the versioned data and proceed similarly by using `epix_slide()` to -forecast the future. - -## Comparing different forecasting engines - -### Example using CLI and case data from US states - -First, we download the version history (ie. archive) of the percentage of -doctor's visits with CLI (COVID-like illness) computed from medical insurance -claims and the number of new confirmed COVID-19 cases per 100,000 population -(daily) for all 50 states from the COVIDcast API. - -
- -Load a data archive - -We process as before, with the modification that we use `sync = locf` in -`epix_merge()` so that the last version of each observation can be carried -forward to extrapolate unavailable versions for the less up-to-date input -archive. - -```{r grab-epi-data} -theme_set(theme_bw()) - -y <- readRDS("all_states_covidcast_signals.rds") %>% - purrr::map(~ select(.x, geo_value, time_value, version = issue, value)) - -x <- epix_merge( - y[[1]] %>% rename(percent_cli = value) %>% as_epi_archive(compactify = FALSE), - y[[2]] %>% rename(case_rate = value) %>% as_epi_archive(compactify = FALSE), - sync = "locf", - compactify = TRUE -) -rm(y) -``` - -
- -We then obtaining the latest snapshot of the data and proceed to fake the -version information by setting `version = time_value`. This has the effect of -obtaining data that arrives exactly at the day of the time_value. - -```{r arx-kweek-preliminaries, warning = FALSE} -# Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, version = max(x$versions_end)) %>% - mutate(version = time_value) %>% - as_epi_archive() -fc_time_values <- seq( - from = as.Date("2020-08-01"), - to = as.Date("2021-11-01"), - by = "1 month" -) -aheads <- c(7, 14, 21, 28) - -forecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, engine) { - epi_archive %>% - epix_slide( - .f = function(x, gk, rtv) { - arx_forecaster( - x, outcome, predictors, engine, - args_list = arx_args_list(ahead = ahead) - )$predictions %>% - mutate(engine_type = engine$engine) %>% - pivot_quantiles_wider(.pred_distn) - }, - .before = 120, - .versions = fc_time_values - ) -} -``` - -```{r make-arx-kweek} -# Generate the forecasts and bind them together -fc <- bind_rows( - map(aheads, ~ forecast_k_week_ahead( - x_latest, - outcome = "case_rate", - predictors = c("case_rate", "percent_cli"), - ahead = .x, - engine = linear_reg() - )), - map(aheads, ~ forecast_k_week_ahead( - x_latest, - outcome = "case_rate", - predictors = c("case_rate", "percent_cli"), - ahead = .x, - engine = rand_forest(mode = "regression") - )) -) -``` - -Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the -target (respecting time stamps and locations) along with lags of the features -(here, the response and doctors visits), estimates a forecasting model using the -specified engine, creates predictions, and non-parametric confidence bands. - -To see how the predictions compare, we plot them on top of the latest case -rates. 
Note that even though we've fitted the model on all states, we'll just -display the results for two states, California (CA) and Florida (FL), to get a -sense of the model performance while keeping the graphic simple. - -
- -Code for plotting -```{r plot-arx, message = FALSE, warning = FALSE} -fc_cafl <- fc %>% - tibble() %>% - filter(geo_value %in% c("ca", "fl")) -x_latest_cafl <- x_latest$DT %>% - tibble() %>% - filter(geo_value %in% c("ca", "fl")) - -p1 <- ggplot(fc_cafl, aes(target_date, group = forecast_date, fill = engine_type)) + - geom_line( - data = x_latest_cafl, aes(x = time_value, y = case_rate), - inherit.aes = FALSE, color = "gray50" - ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) + - geom_line(aes(y = .pred)) + - geom_point(aes(y = .pred), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(engine_type), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - scale_fill_brewer(palette = "Set1") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` -
- -```{r show-plot1, fig.width = 9, fig.height = 6, echo=FALSE} -p1 -``` - -For the two states of interest, simple linear regression clearly performs better -than random forest in terms of accuracy of the predictions and does not result -in such in overconfident predictions (overly narrow confidence bands). Though, -in general, neither approach produces amazingly accurate forecasts. This could -be because the behaviour is rather different across states and the effects of -other notable factors such as age and public health measures may be important to -account for in such forecasting. Including such factors as well as making -enhancements such as correcting for outliers are some improvements one could -make to this simple model.[^1] - -[^1]: Note that, despite the above caveats, simple models like this tend to -out-perform many far more complicated models in the online Covid forecasting due -to those models high variance predictions. - - -### Example using case data from Canada - -
- -Data and forecasts. Similar to the above. - -By leveraging the flexibility of `epiprocess`, we can apply the same techniques -to data from other sources. Since some collaborators are in British Columbia, -Canada, we'll do essentially the same thing for Canada as we did above. - -The [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects -daily time series data on COVID-19 cases, deaths, recoveries, testing and -vaccinations at the health region and province levels. Data are collected from -publicly available sources such as government datasets and news releases. -Unfortunately, there is no simple versioned source, so we have created our own -from the Github commit history. - -First, we load versioned case rates at the provincial level. After converting -these to 7-day averages (due to highly variable provincial reporting -mismatches), we then convert the data to an `epi_archive` object, and extract -the latest version from it. Finally, we run the same forcasting exercise as for -the American data, but here we compare the forecasts produced from using simple -linear regression with those from using boosted regression trees. 
- -```{r get-can-fc, warning = FALSE} -# source("drafts/canada-case-rates.R) -can <- readRDS(system.file( - "extdata", "can_prov_cases.rds", - package = "epipredict", mustWork = TRUE -)) %>% - group_by(version, geo_value) %>% - arrange(time_value) %>% - mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>% - as_epi_archive(compactify = TRUE) - -can_latest <- epix_as_of(can, version = max(can$DT$version)) %>% - mutate(version = time_value) %>% - as_epi_archive() - -# Generate the forecasts, and bind them together -can_fc <- bind_rows( - map( - aheads, - ~ forecast_k_week_ahead(can_latest, "cr_7dav", "cr_7dav", .x, linear_reg()) - ), - map( - aheads, - ~ forecast_k_week_ahead( - can_latest, "cr_7dav", "cr_7dav", .x, - boost_tree(mode = "regression", trees = 20) - ) - ) -) -``` - -The figures below shows the results for all of the provinces. - -```{r plot-can-fc-lr, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12} -ggplot( - can_fc %>% filter(engine_type == "lm"), - aes(x = target_date, group = forecast_date) -) + - coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + - geom_line( - data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav), - inherit.aes = FALSE, color = "gray50" - ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), - alpha = 0.4 - ) + - geom_line(aes(y = .pred)) + - geom_point(aes(y = .pred), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_wrap(~geo_value, scales = "free_y", ncol = 3) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs( - title = "Using simple linear regression", x = "Date", - y = "Reported COVID-19 case rates" - ) + - theme(legend.position = "none") -``` - -```{r plot-can-fc-boost, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12} -ggplot( - can_fc %>% filter(engine_type == "xgboost"), - aes(x = target_date, group = forecast_date) -) + - coord_cartesian(xlim = 
lubridate::ymd(c("2020-12-01", NA))) + - geom_line( - data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav), - inherit.aes = FALSE, color = "gray50" - ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), - alpha = 0.4 - ) + - geom_line(aes(y = .pred)) + - geom_point(aes(y = .pred), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_wrap(~geo_value, scales = "free_y", ncol = 3) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs( - title = "Using boosted regression trees", x = "Date", - y = "Reported COVID-19 case rates" - ) + - theme(legend.position = "none") -``` - -Both approaches tend to produce quite volatile forecasts (point predictions) -and/or are overly confident (very narrow bands), particularly when boosted -regression trees are used. But as this is meant to be a simple demonstration of -sliding with different engines in `arx_forecaster`, we may devote another -vignette to work on improving the predictive modelling using the suite of tools -available in `{epipredict}`. - -
- -## Version-aware forecasting - -### Example using case data from US states - -We will now employ a forecaster that uses properly-versioned data (that would -have been available in real-time) to forecast the 7 day average of future -COVID-19 case rates from current and past COVID-19 case rates and death rates -for all states. That is, we can make forecasts on the archive, `x`, and compare -those to forecasts on the latest data, `x_latest` using the same general set-up -as above. Note that in this example, we use a geo-pooled approach (using -combined data from all US states and territories) to train our model. - -
- -Download data using `{epidatr}` -```{r load-data, eval=FALSE} -# loading in the data -states <- "*" - -confirmed_incidence_prop <- pub_covidcast( - source = "jhu-csse", - signals = "confirmed_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20200301, 20211231), - geo_values = states, - issues = epirange(20000101, 20211231) -) %>% - select(geo_value, time_value, version = issue, case_rate = value) %>% - arrange(geo_value, time_value) %>% - as_epi_archive(compactify = FALSE) - -deaths_incidence_prop <- pub_covidcast( - source = "jhu-csse", - signals = "deaths_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20200301, 20211231), - geo_values = states, - issues = epirange(20000101, 20211231) -) %>% - select(geo_value, time_value, version = issue, death_rate = value) %>% - arrange(geo_value, time_value) %>% - as_epi_archive(compactify = FALSE) - - -x <- epix_merge(confirmed_incidence_prop, deaths_incidence_prop, sync = "locf") - -x <- x %>% - epix_slide( - .versions = fc_time_values, - function(x, gk, rtv) { - x %>% - group_by(geo_value) %>% - epi_slide_mean(case_rate, .window_size = 7L) %>% - rename(case_rate_7d_av = slide_value_case_rate) %>% - epi_slide_mean(death_rate, ..window_size = 7L) %>% - rename(death_rate_7d_av = slide_value_death_rate) %>% - ungroup() - } - ) %>% - rename(version = time_value) %>% - rename( - time_value = slide_value_time_value, - geo_value = slide_value_geo_value, - case_rate = slide_value_case_rate, - death_rate = slide_value_death_rate, - case_rate_7d_av = slide_value_case_rate_7d_av, - death_rate_7d_av = slide_value_death_rate_7d_av - ) %>% - as_epi_archive(compactify = TRUE) - -saveRDS(x$DT, file = "case_death_rate_archive.rds") -``` - -```{r load-stored-data} -x <- readRDS("case_death_rate_archive.rds") -x <- as_epi_archive(x) -``` -
- -Here we specify the ARX model. - -```{r make-arx-model} -aheads <- c(7, 14, 21) -fc_time_values <- seq( - from = as.Date("2020-09-01"), - to = as.Date("2021-12-31"), - by = "1 month" -) -forecaster <- function(x) { - map(aheads, function(ahead) { - arx_forecaster( - epi_data = x, - outcome = "death_rate_7d_av", - predictors = c("death_rate_7d_av", "case_rate_7d_av"), - trainer = quantile_reg(), - args_list = arx_args_list(lags = c(0, 7, 14, 21), ahead = ahead) - )$predictions - }) %>% - bind_rows() -} -``` - -We can now use our forecaster function that we've created and use it in the -pipeline for forecasting the predictions. We store the predictions into the -`arx_preds` variable and calculate the most up to date version of the data in the -epi archive and store it as `x_latest`. - -```{r running-arx-forecaster} -arx_preds <- x %>% - epix_slide( - ~ forecaster(.x), - .before = 120, .versions = fc_time_values - ) %>% - mutate(engine_type = quantile_reg()$engine) %>% - mutate(ahead_val = target_date - forecast_date) - -x_latest <- epix_as_of(x, version = max(x$versions_end)) -``` - -Now we plot both the actual and predicted 7 day average of the death rate for -the chosen states - -
- -Code for the plot -```{r plot-arx-asof, message = FALSE, warning = FALSE} -states_to_show <- c("ca", "ny", "mi", "az") -fc_states <- arx_preds %>% - filter(geo_value %in% states_to_show) %>% - pivot_quantiles_wider(.pred_distn) - -x_latest_states <- x_latest %>% filter(geo_value %in% states_to_show) - -p2 <- ggplot(fc_states, aes(target_date, group = forecast_date)) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), alpha = 0.4) + - geom_line( - data = x_latest_states, aes(x = time_value, y = death_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = .pred, color = geo_value)) + - geom_point(aes(y = .pred, color = geo_value), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_wrap(~geo_value, scales = "free_y", ncol = 1L) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - scale_fill_brewer(palette = "Set1") + - scale_color_brewer(palette = "Set1") + - labs(x = "Date", y = "7 day average COVID-19 death rates") + - theme(legend.position = "none") -``` -
- -```{r show-plot2, fig.width = 9, fig.height = 6, echo = FALSE} -p2 -``` diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd deleted file mode 100644 index 291d8cb7e..000000000 --- a/vignettes/articles/smooth-qr.Rmd +++ /dev/null @@ -1,544 +0,0 @@ ---- -title: "Smooth quantile regression" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Smooth quantile regression} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = FALSE, - comment = "#>", - warning = FALSE, - message = FALSE, - out.width = "100%" -) -``` - -# Introducing smooth quantile regression - -Whereas other time-series forecasting examples in this package have used -(direct) models for single horizons, in multi-period forecasting, the goal is to -(directly) forecast several horizons simultaneously. This is useful in -epidemiological applications where decisions are based on the trend of a signal. - -The idea underlying smooth quantile regression is that set forecast targets can -be approximated by a smooth curve. This novel approach from -[Tuzhilina et al., 2022](https://arxiv.org/abs/2202.09723) -enforces smoothness across the -horizons and can be applied to point estimation by regression or interval -prediction by quantile regression. Our focus in this vignette is the latter. - -# Built-in function for smooth quantile regression and its parameters - -The built-in smooth quantile regression function, `smooth_quantile_reg()` -provides a model specification for smooth quantile regression that works under -the tidymodels framework. It has the following parameters and default values: - -```{r, eval = FALSE} -smooth_quantile_reg( - mode = "regression", - engine = "smoothqr", - outcome_locations = NULL, - quantile_levels = 0.5, - degree = 3L -) -``` - -For smooth quantile regression, the type of model or `mode` is regression. 
- -The only `engine` that is currently supported is `smooth_qr()` from the -[`smoothqr` package](https://dajmcdon.github.io/smoothqr/). - -The `outcome_locations` indicate the multiple horizon (ie. ahead) values. These -should be specified by the user. - -The `quantile_levels` parameter is a vector of values that indicates the -quantiles to be estimated. The default is the median (0.5 quantile). - -The `degree` parameter indicates the degree of the polynomials used for -smoothing of the response. It should be no more than the number of aheads. If -the degree is precisely equal to the number of aheads, then there is no -smoothing. To better understand this parameter and how it works, we should look -to its origins and how it is used in the model. - -# Model form - -Smooth quantile regression is linear auto-regressive, with the key feature being -a transformation that forces the coefficients to satisfy a smoothing constraint. -The purpose of this is for each model coefficient to be a smooth function of -ahead values, and so each such coefficient is set to be a linear combination of -smooth basis functions (such as a spline or a polynomial). - -The `degree` parameter controls the number of these polynomials used. It should -be no greater than the number of responses. This is a tuning parameter, and so -it can be chosen by performing a grid search with cross-validation. Intuitively, -$d = 1$ corresponds to the constant model, $d = 2$ gives straight line -forecasts, while $d = 3$ gives quadratic forecasts. Since a degree of 3 was -found to work well in the tested applications (see Section 9 of -[Tuzhilina et al., 2022](https://arxiv.org/abs/2202.09723)), -it is the default value. - -# Demonstration of smooth quantile regression - -```{r, message = FALSE} -library(epipredict) -library(dplyr) -library(purrr) -library(ggplot2) -theme_set(theme_bw()) -``` - -We will now apply smooth quantile regression on the real data used for COVID-19 -forecasting. 
The built-in dataset we will use is a subset of JHU daily data on -state cases and deaths. This sample data ranges from Dec. 31, 2020 to -Dec. 31, 2021. - -```{r} -edf <- case_death_rate_subset -``` - -We will set the forecast date to be November 30, 2021 so that we can produce -forecasts for target dates of 1 to 28 days ahead. We construct our test data, -`tedf` from the days beyond this. - -```{r} -fd <- as.Date("2021-11-30") - -tedf <- edf %>% filter(time_value >= fd) -``` - -We will use the most recent 3 months worth of data up to the forecast date for -training. - -```{r} -edf <- edf %>% filter(time_value < fd, time_value >= fd - 90L) -``` - -And for plotting our focus will be on a subset of two states - California and -Utah. - -```{r} -geos <- c("ut", "ca") -``` - -Suppose that our goal with this data is to predict COVID-19 death rates at -several horizons for each state. On day $t$, we want to predict new deaths $y$ -that are $a = 1,\dots, 28$ days ahead at locations $j$ using the death rates -from today, 1 week ago, and 2 weeks ago. So for each location, we'll predict the -median (0.5 quantile) for each of the target dates by using -$$ -\hat{y}_{j}(t+a) = \alpha(a) + \sum_{l = 0}^2 \beta_{l}(a) y_{j}(t - 7l) -$$ -where $\beta_{l}(a) = \sum_{i=1}^d \theta_{il} h_i(a)$ is the smoothing -constraint where ${h_1(a), \dots, h_d(a)}$ are the set of smooth basis functions -and $d$ is a hyperparameter that manages the flexibility of $\beta_{l}(a)$. -Remember that the goal is to have each $\beta_{l}(a)$ to be a smooth function of -the aheads and that is achieved through imposing the smoothing constraint. - -Note that this model is intended to be simple and straightforward. Our only -modification to this model is to add case rates as another predictive feature -(we will leave it to the reader to incorporate additional features beyond this -and the historical response values). 
We can update the basic model to incorporate -the $k = 2$ predictive features of case and death rates for each location $j$, -$x_j(t) = (x_{j1}(t), x_{j2}(t))$ as follows: - -$$ -\hat{y}_{j}(t+a) = \alpha(a) + \sum_{k = 1}^2 \sum_{l = 0}^2 \beta_{kl}(a) x_{jk}(t - 7l) -$$ -where $\beta_{kl}(a) = \sum_{i=1}^d \theta_{ikl} h_i(a)$. - -Now, we will create our own forecaster from scratch by building up an -`epi_workflow` (there is no canned forecaster that is currently available). -Building our own forecaster allows for customization and control over the -pre-processing and post-processing actions we wish to take. - -The pre-processing steps we take in our `epi_recipe` are simply to lag the -predictor (by 0, 7, and 14 days) and lead the response by the multiple aheads -specified by the function user. - -The post-processing layers we add to our `frosting` are nearly as simple. We -first predict, unnest the prediction list-cols, omit NAs from them, and enforce -that they are greater than 0. - -The third component of an `epi_workflow`, the model, is smooth quantile -regression, which has three main arguments - the quantiles, aheads, and degree. - -After creating our `epi_workflow` with these components, we get our test data -based on longest lag period and make the predictions. - -We input our forecaster into a function for ease of use. 
- -```{r} -smooth_fc <- function(x, aheads = 1:28, degree = 3L, quantiles = 0.5, fd) { - rec <- epi_recipe(x) %>% - step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% - step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(death_rate, ahead = aheads) - - f <- frosting() %>% - layer_predict() %>% - layer_unnest(.pred) %>% - layer_naomit(distn) %>% - layer_add_forecast_date() %>% - layer_threshold(distn) - - ee <- smooth_quantile_reg( - quantile_levels = quantiles, - outcome_locations = aheads, - degree = degree - ) - - ewf <- epi_workflow(rec, ee, f) - - the_fit <- ewf %>% fit(x) - - latest <- get_test_data(rec, x, fill_locf = TRUE) - - preds <- predict(the_fit, new_data = latest) %>% - mutate(forecast_date = fd, target_date = fd + ahead) %>% - select(geo_value, target_date, distn, ahead) %>% - pivot_quantiles_wider(distn) - - preds -} -``` - -```{r load-stored-preds, echo=FALSE} -smooth_preds_list <- readRDS("smooth-qr_smooth_preds_list.rds") -baseline_preds <- readRDS("smooth-qr_baseline_preds.rds") -smooth_preds <- smooth_preds_list %>% - filter(degree == 3L) %>% - select(geo_value:ahead, `0.5`) -``` - -Notice that we allow the function user to specify the aheads, degree, and -quantile as they may want to change these parameter values. We also allow for -input of the forecast date as we fixed that at the onset of this demonstration. 
- -We now can produce smooth quantile regression predictions for our problem: - -```{r, eval = FALSE} -smooth_preds <- smooth_fc(edf, fd = fd) -smooth_preds -``` - -```{r, echo=FALSE} -smooth_preds -smooth_preds <- smooth_preds_list %>% - filter(degree == 3L) %>% - select(-degree) -``` - - -Most often, we're not going to want to limit ourselves to just predicting the -median value as there is uncertainty about the predictions, so let's try to -predict several different quantiles in addition to the median: - -```{r, eval = FALSE} -several_quantiles <- c(.1, .25, .5, .75, .9) -smooth_preds <- smooth_fc(edf, quantiles = several_quantiles, fd = fd) -smooth_preds -``` - -```{r, echo = FALSE} -several_quantiles <- c(.1, .25, .5, .75, .9) -smooth_preds -``` - -We can see that we have different columns for the different quantile -predictions. - -Let's visualize these results for the sample of two states. We will create a -simple plotting function, under which the median predictions are an orange line -and the surrounding quantiles are blue bands around this. For comparison, we -will include the actual values over time as a black line. 
- -```{r} -plot_preds <- function(preds, geos_to_plot = NULL, train_test_dat, fd) { - if (!is.null(geos_to_plot)) { - preds <- preds %>% filter(geo_value %in% geos_to_plot) - train_test_dat <- train_test_dat %>% filter(geo_value %in% geos_to_plot) - } - - ggplot(preds) + - geom_ribbon(aes(target_date, ymin = `0.1`, ymax = `0.9`), - fill = "cornflowerblue", alpha = .8 - ) + - geom_ribbon(aes(target_date, ymin = `0.25`, ymax = `0.75`), - fill = "#00488E", alpha = .8 - ) + - geom_line(data = train_test_dat, aes(time_value, death_rate)) + - geom_line(aes(target_date, `0.5`), color = "orange") + - geom_vline(xintercept = fd) + - facet_wrap(~geo_value) + - scale_x_date(name = "", date_labels = "%b %Y", date_breaks = "2 months") + - ylab("Deaths per 100K inhabitants") -} -``` - -Since we would like to plot the actual death rates for these states over time, -we bind the training and testing data together and input this into our plotting -function as follows: - -```{r, warning = FALSE} -plot_preds(smooth_preds, geos, bind_rows(tedf, edf), fd) -``` - -We can see that the predictions are smooth curves for each state, as expected -when using smooth quantile regression. In addition while the curvature of the -forecasts matches that of the truth, the forecasts do not look remarkably -accurate. - -## Varying the degrees parameter - -We can test the impact of different degrees by using the `map()` function. -Noting that this may take some time to run, let's try out all degrees from 1 -to 7: - -```{r, eval = FALSE} -smooth_preds_list <- map(1:7, function(x) { - smooth_fc( - edf, - degree = x, - quantiles = c(.1, .25, .5, .75, .9), - fd = fd - ) %>% - mutate(degree = x) -}) %>% list_rbind() -``` - -One way to quantify the impact of these on the forecasting is to look at the -mean absolute error (MAE) or mean squared error (MSE) over the degrees. We can -select the degree that results in the lowest MAE. 
- -Since the MAE compares the predicted values to the actual values, we will first -join the test data to the predicted data for our comparisons: -```{r, message = FALSE} -tedf_sub <- tedf %>% - rename(target_date = time_value, actual = death_rate) %>% - select(geo_value, target_date, actual) -``` - -And then compute the MAE for each of the degrees: -```{r, message = FALSE} -smooth_preds_df_deg <- smooth_preds_list %>% - left_join(tedf_sub, by = c("geo_value", "target_date")) %>% - group_by(degree) %>% - mutate(error = abs(`0.5` - actual)) %>% - summarise(mean = mean(error)) - -# Arrange the MAE from smallest to largest -smooth_preds_df_deg %>% arrange(mean) -``` - -Instead of just looking at the raw numbers, let's create a simple line plot to -visualize how the MAE changes over degrees for this data: - -```{r} -ggplot(smooth_preds_df_deg, aes(degree, mean)) + - geom_line() + - xlab("Degrees of freedom") + - ylab("Mean MAE") -``` - -We can see that the degree that results in the lowest MAE is 3. Hence, we could -pick this degree for future forecasting work on this data. - -## A brief comparison between smoothing and no smoothing - -Now, we will briefly compare the results from using smooth quantile regression -to those obtained without smoothing. The latter approach amounts to ordinary -quantile regression to get predictions for the intended target date. The main -drawback is that it ignores the fact that the responses all represent the same -signal, just for different ahead values. In contrast, the smooth quantile -regression approach utilizes this information about the data structure - the -fact that the aheads are not independent of each other, but that they are -naturally related over time by a smooth curve. - -To get the basic quantile regression results we can utilize the forecaster that -we've already built. We can simply set the degree to be the number of ahead -values to re-run the code without smoothing. 
- -```{r, eval = FALSE} -baseline_preds <- smooth_fc( - edf, - degree = 28L, quantiles = several_quantiles, fd = fd -) -``` - -And we can produce the corresponding plot to inspect the predictions obtained -under the baseline model: - -```{r, warning = FALSE} -plot_preds(baseline_preds, geos, bind_rows(tedf, edf), fd) -``` - -Unlike for smooth quantile regression, the resulting forecasts are not smooth -curves, but rather jagged and irregular in shape. - -For a more formal comparison between the two approaches, we could compare the -test performance in terms of accuracy through calculating either the MAE or -MSE, where the performance measure of choice can be calculated over all -times and locations for each ahead value - - -```{r, message = FALSE} -baseline_preds_mae_df <- baseline_preds %>% - left_join(tedf_sub, by = c("geo_value", "target_date")) %>% - group_by(ahead) %>% - mutate(error = abs(`0.5` - actual)) %>% - summarise(mean = mean(error)) %>% - mutate(type = "baseline") - -smooth_preds_mae_df <- smooth_preds %>% - left_join(tedf_sub, by = c("geo_value", "target_date")) %>% - group_by(ahead) %>% - mutate(error = abs(`0.5` - actual)) %>% - summarise(mean = mean(error)) %>% - mutate(type = "smooth") - -preds_mae_df <- bind_rows(baseline_preds_mae_df, smooth_preds_mae_df) - -ggplot(preds_mae_df, aes(ahead, mean, color = type)) + - geom_line() + - xlab("Ahead") + - ylab("Mean MAE") + - scale_color_manual(values = c("darkred", "#063970"), name = "") -``` - -or over all aheads, times, and locations for a single numerical summary. - -```{r} -mean(baseline_preds_mae_df$mean) -mean(smooth_preds_mae_df$mean) -``` - -The former shows that forecasts for the immediate future and for the distant -future are more inaccurate for both models under consideration. 
The latter shows -that the smooth quantile regression model and baseline models perform very -similarly overall, with the smooth quantile regression model only slightly -beating the baseline model in terms of overall average MAE. - -One other commonly used metric is the Weighted Interval Score -(WIS, [Bracher et al., 2021](https://arxiv.org/pdf/2005.12881.pdf)), -which is a scoring rule that is based on the population quantiles. The point is to -score the interval, whereas MAE only evaluates the accuracy of the point -forecast. - -Let $F$ be a forecast composed of predicted quantiles $q_{\tau}$ for the set of -quantile levels $\tau$. Then, in terms of the predicted quantiles, the WIS for -target variable $Y$ is represented as follows -([McDonald et al., 2021](https://www.pnas.org/doi/full/10.1073/pnas.2111453118)): - -$$ -WIS(F, Y) = 2 \sum_{\tau} \phi_{\tau} (Y - q_{\tau}) -$$ -where $\phi_{\tau}(x) = \tau |x|$ for $x \geq 0$ -and $\phi_{\tau}(x) = (1 - \tau) |x|$ for $x < 0$. - -This form is general as it can accommodate both symmetric and asymmetric -quantile levels. If the quantile levels are symmetric, then we can alternatively -express the WIS as a collection of central prediction intervals -($\ell_{\alpha}, u_{\alpha}$) parametrized by the exclusion probability -$\alpha$: - -$$ -WIS(F, Y) = \sum_{\alpha} \{ (u_{\alpha} - \ell_{\alpha}) + 2 \cdot \text{dist}(Y, [\ell_{\alpha}, u_{\alpha}]) \} -$$ -where $\text{dist}(a,S)$ is the smallest distance between point $a$ and an -element of set $S$. - -While we implement the former representation, we mention this form because it -shows that the score can be decomposed into the addition of a sharpness -component (first term in the summand) and an under/overprediction component -(second term in the summand). 
This alternative representation is useful because -from it, we more easily see the major limitation to the WIS, which is that the -score tends to prioritize sharpness (how wide the interval is) relative to -coverage (if the interval contains the truth). - -Now, we write a simple function for the first representation of the score that -is compatible with the latest version of `epipredict` (adapted from the -corresponding function in -[smoothmpf-epipredict](https://github.com/dajmcdon/smoothmpf-epipredict)). The -inputs for it are the actual and predicted values and the quantile levels. - -```{r} -wis_dist_quantile <- function(actual, values, quantile_levels) { - 2 * mean(pmax( - quantile_levels * (actual - values), - (1 - quantile_levels) * (values - actual), - na.rm = TRUE - )) -} -``` - -Next, we apply the `wis_dist_quantile` function to get a WIS score for each -state on each target date. We then compute the mean WIS for each ahead value -over all of the states. The results for each of the smooth and baseline -forecasters are shown in a similar style line plot as we chose for MAE: - -```{r} -smooth_preds_wis_df <- smooth_preds %>% - left_join(tedf_sub, by = c("geo_value", "target_date")) %>% - rowwise() %>% - mutate(wis = wis_dist_quantile( - actual, c(`0.1`, `0.25`, `0.5`, `0.75`, `0.9`), - several_quantiles - )) %>% - group_by(ahead) %>% - summarise(mean = mean(wis)) %>% - mutate(type = "smooth") - -baseline_preds_wis_df <- baseline_preds %>% - left_join(tedf_sub, by = c("geo_value", "target_date")) %>% - rowwise() %>% - mutate(wis = wis_dist_quantile( - actual, c(`0.1`, `0.25`, `0.5`, `0.75`, `0.9`), - several_quantiles - )) %>% - group_by(ahead) %>% - summarise(mean = mean(wis)) %>% - mutate(type = "baseline") - -preds_wis_df <- bind_rows(smooth_preds_wis_df, baseline_preds_wis_df) - -ggplot(preds_wis_df, aes(ahead, mean, color = type)) + - geom_line() + - xlab("Ahead") + - ylab("Mean WIS") + - scale_color_manual(values = c("darkred", "#063970"), name = "") 
-``` - -The results are consistent with what we saw for MAE: The forecasts for the near -and distant future tend to be inaccurate for both models. The smooth quantile -regression model only slightly outperforms the baseline model. - -Though averaging the WIS score over location and time tends to be the primary -aggregation scheme used in evaluation and model comparisons (see, for example, -[McDonald et al., 2021](https://www.pnas.org/doi/full/10.1073/pnas.2111453118)), -we can also obtain a single numerical summary by averaging over the aheads, -times, and locations: - -```{r} -mean(baseline_preds_wis_df$mean) -mean(smooth_preds_wis_df$mean) -``` - -Overall, both perspectives agree that the smooth quantile regression model tends -to perform only slightly better than the baseline model in terms of average WIS, -illustrating the difficulty of this forecasting problem. - -# What we've learned in a nutshell - -Smooth quantile regression is used in multi-period forecasting for predicting -several horizons simultaneously with a single smooth curve. It operates under -the key assumption that the future of the response can be approximated well by a -smooth curve. - -# Attribution - -The information presented on smooth quantile regression is from -[Tuzhilina et al., 2022](https://arxiv.org/abs/2202.09723). 
diff --git a/vignettes/arx-classifier.Rmd b/vignettes/arx-classifier.Rmd deleted file mode 100644 index b2a2bbf8e..000000000 --- a/vignettes/arx-classifier.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Auto-regressive classifier" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Auto-regressive classifier} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - echo = TRUE, - collapse = FALSE, - comment = "#>", - warning = FALSE, - message = FALSE, - out.width = "100%" -) -``` - -## Load required packages - -```{r, message = FALSE, warning = FALSE} -library(dplyr) -library(purrr) -library(ggplot2) -library(epipredict) -``` - -## Introducing the ARX classifier - -The `arx_classifier()` is an autoregressive classification model for `epi_df` -data that is used to predict a discrete class for each case under consideration. -It is a direct forecaster in that it estimates the classes at a specific horizon -or ahead value. - -To get a sense of how the `arx_classifier()` works, let's consider a simple -example with minimal inputs. For this, we will use the built-in -`case_death_rate_subset` that contains confirmed COVID-19 cases and deaths from -JHU CSSE for all states over Dec 31, 2020 to Dec 31, 2021. From this, we'll take -a subset of data for five states over June 4, 2021 to December 31, 2021. Our -objective is to predict whether the case rates are increasing when considering -the 0, 7 and 14 day case rates: - -```{r} -jhu <- case_death_rate_subset %>% - filter( - time_value >= "2021-06-04", - time_value <= "2021-12-31", - geo_value %in% c("ca", "fl", "tx", "ny", "nj") - ) - -out <- arx_classifier(jhu, outcome = "case_rate", predictors = "case_rate") - -out$predictions -``` - -The key takeaway from the predictions is that there are two prediction classes: -(-Inf, 0.25] and (0.25, Inf). This is because for our goal of classification -the classes must be discrete. 
The discretization of the real-valued outcome is -controlled by the `breaks` argument, which defaults to 0.25. Such breaks will be -automatically extended to cover the entire real line. For example, the default -break of 0.25 is silently extended to breaks = c(-Inf, .25, Inf) and, therefore, -results in two classes: [-Inf, 0.25] and (0.25, Inf). These two classes are -used to discretize the outcome. The conversion of the outcome to such classes is -handled internally. So if discrete classes already exist for the outcome in the -`epi_df`, then we recommend to code a classifier from scratch using the -`epi_workflow` framework for more control. - -The `trainer` is a `parsnip` model describing the type of estimation such that -`mode = "classification"` is enforced. The two typical trainers that are used -are `parsnip::logistic_reg()` for two classes or `parsnip::multinom_reg()` for -more than two classes. - -```{r} -workflows::extract_spec_parsnip(out$epi_workflow) -``` - -From the parsnip model specification, we can see that the trainer used is -logistic regression, which is expected for our binary outcome. More complicated -trainers like `parsnip::naive_Bayes()` or `parsnip::rand_forest()` may also be -used (however, we will stick to the basics in this gentle introduction to the -classifier). - -If you use the default trainer of logistic regression for binary classification -and you decide against using the default break of 0.25, then you should only -input one break so that there are two classification bins to properly -dichotomize the outcome. For example, let's set a break of 0.5 instead of -relying on the default of 0.25. 
We can do this by passing 0.5 to the `breaks` -argument in `arx_class_args_list()` as follows: - -```{r} -out_break_0.5 <- arx_classifier( - jhu, - outcome = "case_rate", - predictors = "case_rate", - args_list = arx_class_args_list( - breaks = 0.5 - ) -) - -out_break_0.5$predictions -``` -Indeed, we can observe that the two `.pred_class` are now (-Inf, 0.5] and (0.5, -Inf). See `help(arx_class_args_list)` for other available modifications. - -Additional arguments that may be supplied to `arx_class_args_list()` include the -expected `lags` and `ahead` arguments for an autoregressive-type model. These -have default values of 0, 7, and 14 days for the lags of the predictors and 7 -days ahead of the forecast date for predicting the outcome. There is also -`n_training` to indicate the upper bound for the number of training rows per -key. If you would like some practice with using this, then remove the filtering -command to obtain data within "2021-06-04" and "2021-12-31" and instead set -`n_training` to be the number of days between these two dates, inclusive of the -end points. The end results should be the same. In addition to `n_training`, -there are `forecast_date` and `target_date` to specify the date that the -forecast is created and intended, respectively. We will not dwell on such -arguments here as they are not unique to this classifier or absolutely essential -to understanding how it operates. The remaining arguments will be discussed -organically, as they are needed to serve our purposes. For information on any -remaining arguments that are not discussed here, please see the function -documentation for a complete list and their definitions. - -## Example of using the ARX classifier - -Now, to demonstrate the power and utility of this built-in arx classifier, we -will loosely adapt the classification example that was written from scratch in -`vignette("preprocessing-and-models")`. 
However, to keep things simple and not -merely a direct translation, we will only consider two prediction categories and -leave the extension to three as an exercise for the reader. - -To motivate this example, a major use of autoregressive classification models is -to predict upswings or downswings like in hotspot prediction models to -anticipate the direction of the outcome (see [McDonald, Bien, Green, Hu, et al. -(2021)](https://www.pnas.org/doi/full/10.1073/pnas.2111453118) for more on -these). In our case, one simple question that such models can help answer is... -Do we expect that the future will have increased case rates or not relative to -the present? - -To answer this question, we can create a predictive model for upswings and -downswings of case rates rather than the raw case rates themselves. For this -situation, our target is to predict whether there is an increase in case rates -or not. Following -[McDonald, Bien, Green, Hu, et al. (2021)](https://www.pnas.org/doi/full/10.1073/pnas.2111453118), -we look at the -relative change between $Y_{l,t}$ and $Y_{l, t+a}$, where the former is the case -rate at location $l$ at time $t$ and the latter is the rate for that location at -time $t+a$. Using these variables, we define a categorical response variable -with two classes - -$$\begin{align} -Z_{l,t} = \left\{\begin{matrix} -\text{up,} & \text{if } Y_{l,t}^\Delta > 0.25\\ -\text{not up,} & \text{otherwise} -\end{matrix}\right. -\end{align}$$ -where $Y_{l,t}^\Delta = (Y_{l, t} - Y_{l, t-7}) / Y_{l, t-7}$. If $Y_{l,t}^\Delta > 0.25$, meaning that the number of new cases over the week has increased by over 25\%, then $Z_{l,t}$ is up. This is the criterion for location $l$ to be a hotspot at time $t$. On the other hand, if $Y_{l,t}^\Delta \leq 0.25$, then $Z_{l,t}$ is categorized as not up, meaning that there has not been a >25\% increase in the new cases over the past week. 
- -The logistic regression model we use to predict this binary response can be -considered to be a simplification of the multinomial regression model presented -in `vignette("preprocessing-and-models")`: - -$$\begin{align} -\pi_{\text{up}}(x) &= Pr(Z_{l, t} = \text{up}|x) = \frac{e^{g_{\text{up}}(x)}}{1 + e^{g_{\text{up}}(x)}}, \\ -\pi_{\text{not up}}(x)&= Pr(Z_{l, t} = \text{not up}|x) = 1 - Pr(Z_{l, t} = \text{up}|x) = \frac{1}{1 + e^{g_{\text{up}}(x)}} -\end{align}$$ -where - -$$ -g_{\text{up}}(x) = \log\left ( \frac{\Pr(Z_{l, t} = \text{up} \vert x)}{\Pr(Z_{l, t} = \text{not up} \vert x)} \right ) = \beta_{10} + \beta_{11}Y_{l,t}^\Delta + \beta_{12}Y_{l,t-7}^\Delta + \beta_{13}Y_{l,t-14}^\Delta. -$$ - -Now then, we will operate on the same subset of the `case_death_rate_subset` -that we used in our above example. This time, we will use it to investigate -whether the number of newly reported cases over the past 7 days has increased by -at least 25% compared to the preceding week for our sample of states. - -Notice that by using the `arx_classifier()` function we've completely eliminated -the need to manually categorize the response variable and implement -pre-processing steps, which was necessary in -`vignette("preprocessing-and-models")`. - -```{r} -log_res <- arx_classifier( - jhu, - outcome = "case_rate", - predictors = "case_rate", - args_list = arx_class_args_list( - breaks = 0.25 / 7 # division by 7 gives weekly not daily - ) -) - -log_res$epi_workflow -``` - -Comparing the pre-processing steps for this to those in the other vignette, we -can see that they are not precisely the same, but they cover the same essentials -of transforming `case_rate` to the growth rate scale (`step_growth_rate()`), -lagging the predictors (`step_epi_lag()`), leading the response -(`step_epi_ahead()`), which are both constructed from the growth rates, and -constructing the binary classification response variable (`step_mutate()`). 
- -On this topic, it is important to understand that we are not actually concerned -about the case values themselves. Rather we are concerned whether the quantity -of cases in the future is a lot larger than that in the present. For this -reason, the outcome does not remain as cases, but rather it is transformed by -using either growth rates (as the predictors and outcome in our example are) or -lagged differences. While the latter is closer to the requirements for the -[2022-23 CDC Flusight Hospitalization Experimental Target](https://github.com/cdcepi/Flusight-forecast-data/blob/745511c436923e1dc201dea0f4181f21a8217b52/data-experimental/README.md), -and it is conceptually easy to understand because it is simply the change of the -value for the horizon, it is not the default. The default is `growth_rate`. One -reason for this choice is because the growth rate is on a rate scale, not on the -absolute scale, so it fosters comparability across locations without any -conscious effort (on the other hand, when using the `lag_difference` one would -need to take care to operate on rates per 100k and not raw counts). We utilize -`epiprocess::growth_rate()` to create the outcome using some of the additional -arguments. One important argument for the growth rate calculation is the -`method`. Only `rel_change` for relative change should be used as the method -because the test data is the only data that is accessible and the other methods -require access to the training data. - -The other optional arguments for controlling the growth rate calculation (that -can be inputted as `additional_gr_args`) can be found in the documentation for -`epiprocess::growth_rate()` and the related -`vignette("growth_rate", package = "epiprocess")`. - -### Visualizing the results - -To visualize the prediction classes across the states for the target date, we -can plot our results as a heatmap. 
However, if we were to plot the results for -only one target date, like our 7-day ahead predictions, then that would be a -pretty sad heatmap (which would look more like a bar chart than a heatmap)... So -instead of doing that, let's get predictions for several aheads and plot a -heatmap across the target dates. To get the predictions across several ahead -values, we will use the map function in the same way that we did in other -vignettes: - -```{r} -multi_log_res <- map(1:40, ~ arx_classifier( - jhu, - outcome = "case_rate", - predictors = "case_rate", - args_list = arx_class_args_list( - breaks = 0.25 / 7, # division by 7 gives weekly not daily - ahead = .x - ) -)$predictions) %>% list_rbind() -``` - -We can plot a the heatmap of the results over the aheads to see if there's -anything novel or interesting to take away: - -```{r} -ggplot(multi_log_res, aes(target_date, geo_value, fill = .pred_class)) + - geom_tile() + - ylab("State") + - xlab("Target date") + - scale_fill_brewer(palette = "Set1") -``` - -While there is a bit of variability near to the end, we can clearly see that -there are upswings for all states starting from the beginning of January 2022, -which we can recall was when there was a massive spike in cases for many states. -So our results seem to align well with what actually happened at the beginning -of January 2022. - -## A brief reflection - -The most noticeable benefit of using the `arx_classifier()` function is the -simplification and reduction of the manual implementation of the classifier from -about 30 down to 3 lines. However, as we noted before, the trade-off for -simplicity is control over the precise pre-processing, post-processing, and -additional features embedded in the coding of a classifier. So the good thing is -that `epipredict` provides both - a built-in `arx_classifer()` or the means to -implement your own classifier from scratch by using the `epi_workflow` -framework. And which you choose will depend on the circumstances. 
Our advice is -to start with using the built-in classifier for ostensibly simple projects and -begin to implement your own when the modelling project takes a complicated turn. -To get some practice on coding up a classifier by hand, consider translating -this binary classification model example to an `epi_workflow`, akin to that in -`vignette("preprocessing-and-models")`. diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd new file mode 100644 index 000000000..f355f5516 --- /dev/null +++ b/vignettes/backtesting.Rmd @@ -0,0 +1,454 @@ +--- +title: "Accurately backtesting forecasters" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Accurately backtesting forecasters} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +source(here::here("vignettes/_common.R")) +``` + +Backtesting is a crucial step in the development of forecasting models. It +involves testing the model on historical time periods to see how well it generalizes to new +data. + +In the context of +epidemiological forecasting, to do backtesting accurately, we need to account +for the fact that the data available at _the time of the forecast_ would have been +different from the data available at the time of the _backtest_. +This is because +new data is constantly being collected and added to the dataset, and old data potentially revised. +Training and making +predictions only on finalized data can lead to overly optimistic estimates of accuracy +(see, for example, [McDonald et al. +2021](https://www.pnas.org/content/118/51/e2111453118/) and the references +therein). + +In the `{epiprocess}` package, we provide the function `epix_slide()` to help conviently perform version-faithful forecasting by only using the data as +it would have been available at forecast reference time. 
+In this vignette, we will demonstrate how to use `epix_slide()` to backtest an +auto-regressive forecaster constructed using `arx_forecaster()` on historical +COVID-19 case data from the US and Canada. + +# Getting case data from US states into an `epi_archive` + +```{r pkgs, message=FALSE} +# Setup +library(epipredict) +library(epiprocess) +library(epidatr) +library(data.table) +library(dplyr) +library(tidyr) +library(ggplot2) +library(magrittr) +library(purrr) +library(lubridate) +``` + +First, we create an `epi_archive()` to store the version history of the +percentage of doctor's visits with CLI (COVID-like illness) computed from +medical insurance claims and the number of new confirmed COVID-19 cases per +100,000 population (daily) for 4 states + +```{r grab-epi-data} +# Select the `percent_cli` column from the data archive +doctor_visits <- archive_cases_dv_subset$DT |> + select(geo_value, time_value, version, percent_cli) |> + tidyr::drop_na(percent_cli) |> + as_epi_archive(compactify = TRUE) +``` + +The data can also be fetched from the Delphi Epidata API with the following +query: + +```{r, message = FALSE, warning = FALSE, eval = FALSE} +library(epidatr) +doctor_visits <- pub_covidcast( + source = "doctor-visits", + signals = "smoothed_adj_cli", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) |> + # The version date column is called `issue` in the Epidata API. Rename it. + select(version = issue, geo_value, time_value, percent_cli = value) |> + as_epi_archive(compactify = TRUE) +``` + +In the interest of computational speed, we limit the dataset to 4 states and +2020–2021, but the full archive can be used in the same way and has performed +well in the past. + +We choose this dataset in particular partly because it is revision heavy; for +example, here is a plot that compares monthly snapshots of the data. + +
+Code for plotting +```{r plot_revision_example, warn = FALSE, message = FALSE} +geo_choose <- "ca" +forecast_dates <- seq( + from = as.Date("2020-08-01"), + to = as.Date("2021-11-01"), + by = "1 month") +percent_cli_data <- bind_rows( + # Snapshotted data for the version-faithful forecasts + map( + forecast_dates, + ~ doctor_visits |> + epix_as_of(.x) |> + mutate(version = .x) + ) |> + bind_rows() |> + mutate(version_faithful = "Version faithful"), + # Latest data for the version-un-faithful forecasts + doctor_visits |> + epix_as_of(doctor_visits$versions_end) |> + mutate(version_faithful = "Version un-faithful") +) |> as_tibble() +p0 <- autoplot( + archive_cases_dv_subset, percent_cli, + .versions = forecast_dates, + .mark_versions = TRUE, + .facet_filter = (geo_value == "ca") +) + + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + + labs(x = "", y = "% of doctor's visits with\n Covid-like illness") + + scale_color_viridis_c( + option = "viridis", + guide = guide_legend(reverse=TRUE), direction = -1) + + scale_y_continuous(limits = c(0, NA), expand = expansion(c(0, 0.05))) + + theme(legend.position = "none") +``` +
+ +```{r plot_just_revisioning, echo = FALSE, warning = FALSE, warn = FALSE, message = FALSE} +p0 +``` + +The snapshots are taken on the first of each month, with the vertical dashed +line representing the issue date for the time series of the corresponding +color. +For example, the snapshot on March 1st, 2021 is aquamarine, and increases to +slightly over 10. +Every series is necessarily to the left of the snapshot date (since all known +values must happen before the snapshot is taken[^4]). +The black line overlaying the various snapshots represents the "final +value", which is just the snapshot at the last version in the archive (the +`versions_end`). + +Comparing with the black line tells us how much the value at the time of the +snapshot differs with what was eventually reported. +The drop in January 2021 in the snapshot on `2021-02-01` was initially reported +as much steeper than it eventually turned out to be, while in the period after +that the values were initially reported as higher than they actually were. + +Handling data latency is important in both real-time forecasting and retrospective +forecasting. +Looking at the very first snapshot, `2020-08-01` (the purple dotted +vertical line), there is a noticeable gap between the forecast date and the end +of the red time-series to its left. +In fact, if we take a snapshot and get the last `time_value`, + +```{r} +doctor_visits |> + epix_as_of(as.Date("2020-08-01")) |> + pull(time_value) |> + max() +``` + +the last day of data is the 25th, a entire week before `2020-08-01`. +This can require some effort to work around, especially if the latency is +variable; see `step_adjust_latency()` for some methods included in this package. +Much of that functionality is built into `arx_forecaster()` using the parameter +`adjust_ahead`, which we will use below. 
+ + +# Backtesting a simple autoregressive forecaster + +In addition to outlier detection and nowcasting, a common use case of `epiprocess::epi_archive()` object is for accurate model +back-testing. + +To start, let's use a simple autoregressive forecaster to predict `percent_cli`, the percentage +of doctor's hospital visits associated with COVID-like illness, 14 +days in the future. +For increased accuracy we will use quantile regression. + +## Comparing a single day and ahead + +As a sanity check before we backtest the _entire_ dataset, let's +forecast a single day in the middle of the dataset. +We can do this by setting the `.versions` argument in `epix_slide()`: + +```{r single_version, warn = FALSE} +forecast_date <- as.Date("2021-04-06") +forecasts <- doctor_visits |> + epix_slide( + ~ arx_forecaster( + .x, + outcome = "percent_cli", + predictors = "percent_cli", + args_list = arx_args_list() + )$predictions |> + pivot_quantiles_wider(.pred_distn), + .versions = forecast_date + ) +``` + +We need truth data to compare our forecast against. We can construct it by using `epix_as_of()` to snapshot +the archive at the last available date[^1]. + +_Note:_ We always want to compare our forecasts to actual (most recently reported) values because that is the outcome we care about. +`as_of` data is useful for understanding why we're getting the forecasts we're getting, but `as_of` values are only preliminary outcomes. +Therefore, it's not meaningful to use them for evaluating the performance of a forecast. +Unfortunately, it's not uncommon for revisions to cause poor (final) performance of a forecaster that was decent at the time of the forecast. 
+ +```{r compare_single_with_result} +forecasts |> + inner_join( + doctor_visits |> + epix_as_of(doctor_visits$versions_end), + by = c("geo_value", "target_date" = "time_value") + ) |> + select(geo_value, forecast_date, .pred, `0.05`, `0.95`, percent_cli) +``` + +`.pred` corresponds to the point forecast (median), and `0.05` and `0.95` +correspond to the 5th and 95th quantiles. +The `percent_cli` truth data falls within the prediction intervals, so our +implementation passes a simple validation. + +## Comparing version faithful and version un-faithful forecasts + +Now let's compare the behavior of this forecaster, both properly considering data versioning +("version faithful") and ignoring data versions ("version un-faithful"). + +For the version un-faithful approach, we need to do some setup if we want to use `epix_slide` for backtesting. +We want to simulate a data set that receives finalized updates every day, that is, a data set with no revisions. +To do this, we will snapshot the latest version of the data to create a synthetic data set, and convert it into an archive +where `version = time_value`[^2]. + +```{r} +archive_cases_dv_subset_faux <- doctor_visits |> + epix_as_of(doctor_visits$versions_end) |> + mutate(version = time_value) |> + as_epi_archive() +``` + +For the version faithful approach, we will continue using the original `epi_archive` object containing all version updates. + +We will also create the helper function `forecast_wrapper()` to let us easily map across aheads. 
+ +```{r arx-kweek-preliminaries, warning = FALSE} +forecast_wrapper <- function( + epi_data, aheads, outcome, predictors, process_data = identity + ) { + map( + aheads, + \(ahead) { + arx_forecaster( + process_data(epi_data), outcome, predictors, + args_list = arx_args_list( + ahead = ahead, + lags = c(0:7, 14, 21), + adjust_latency = "extend_ahead" + ) + )$predictions |> pivot_quantiles_wider(.pred_distn) + } + ) |> bind_rows() +} +``` + +_Note:_ In the helper function, we're using the parameter `adjust_latency`. +We need to use it because the most recently released data may still be several days old on any given forecast date (lag > 0); +`adjust_latency` will modify the forecaster to compensate[^5]. +See the function `step_adjust_latency()` for more details and examples. + +Now that we're set up, we can generate forecasts for both the version faithful and un-faithful +archives, and bind the results together. + +```{r generate_forecasts, warning = FALSE} +forecast_dates <- seq( + from = as.Date("2020-09-01"), + to = as.Date("2021-11-01"), + by = "1 month" +) +aheads <- c(1, 7, 14, 21, 28) + +version_unfaithful <- archive_cases_dv_subset_faux |> + epix_slide( + ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"), + .before = 120, + .versions = forecast_dates + ) |> + mutate(version_faithful = "Version un-faithful") + +version_faithful <- doctor_visits |> + epix_slide( + ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"), + .before = 120, + .versions = forecast_dates + ) |> + mutate(version_faithful = "Version faithful") + +forecasts <- + bind_rows( + version_unfaithful, + version_faithful + ) +``` + +`arx_forecaster()` does all the heavy lifting. +It creates and lags copies of the features (here, the response and doctors visits), +creates and leads copies of the target while respecting timestamps and locations, fits a +forecasting model using the specified engine, creates predictions, and +creates non-parametric confidence bands. 
+ +To see how the version faithful and un-faithful predictions compare, let's plot them on top of the latest case +rates, using the same versioned plotting method as above. +Note that even though we fit the model on four states (California, Texas, Florida, and +New York), we'll just display the results for two states, California (CA) and Florida +(FL), to get a sense of the model performance while keeping the graphic simpler. + +
+Code for plotting + +```{r plot_ca_forecasts, warning = FALSE} +geo_choose <- "ca" +forecasts_filtered <- forecasts |> + filter(geo_value == geo_choose) |> + mutate(time_value = version) +# we need to add the ground truth data to the version faithful plot as well +plotting_data <- bind_rows( + percent_cli_data, + percent_cli_data |> + filter(version_faithful == "Version un-faithful") |> + mutate(version = max(percent_cli_data$version)) |> + mutate(version_faithful = "Version faithful") +) + +p1 <- ggplot(data = forecasts_filtered, + aes(x = target_date, group = time_value)) + + geom_ribbon( + aes(ymin = `0.05`, ymax = `0.95`, fill = (time_value)), + alpha = 0.4) + + geom_line(aes(y = .pred, color = (time_value)), linetype = 2L) + + geom_point(aes(y = .pred, color = (time_value)), size = 0.75) + + # the forecast date + geom_vline( + data = percent_cli_data |> + filter(geo_value == geo_choose) |> + select(-version_faithful), + aes(color = version, xintercept = version, group = version), + lty = 2 + ) + + # the underlying data + geom_line( + data = plotting_data |> filter(geo_value == geo_choose), + aes(x = time_value, y = percent_cli, color = (version), group = version), + inherit.aes = FALSE, na.rm = TRUE + ) + + facet_grid(version_faithful ~ geo_value, scales = "free") + + scale_x_date(breaks = "2 months", date_labels = "%b %Y") + + scale_y_continuous(expand = expansion(c(0, 0.05))) + + labs(x = "Date", + y = "smoothed, day of week adjusted covid-like doctors visits") + + scale_color_viridis_c(option = "viridis", direction = -1) + + scale_fill_viridis_c(option = "viridis", direction = -1) + + theme(legend.position = "none") +``` + +```{r plot_fl_forecasts, warning = FALSE} +geo_choose <- "fl" +forecasts_filtered <- forecasts |> + filter(geo_value == geo_choose) |> + mutate(time_value = version) + +forecasts_filtered %>% names +p2 <- + ggplot(data = forecasts_filtered, aes(x = target_date, group = time_value)) + + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill 
= (time_value)), alpha = 0.4) + + geom_line(aes(y = .pred, color = (time_value)), linetype = 2L) + + geom_point(aes(y = .pred, color = (time_value)), size = 0.75) + + # the forecast date + geom_vline( + data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful), + aes(color = version, xintercept = version, group = version), + lty = 2 + ) + + # the underlying data + geom_line( + data = plotting_data |> filter(geo_value == geo_choose), + aes(x = time_value, y = percent_cli, color = (version), group = version), + inherit.aes = FALSE, na.rm = TRUE + ) + + facet_grid(version_faithful ~ geo_value, scales = "free") + + scale_x_date(breaks = "2 months", date_labels = "%b %Y") + + scale_y_continuous(expand = expansion(c(0, 0.05))) + + labs(x = "Date", y = "smoothed, day of week adjusted covid-like doctors visits") + + scale_color_viridis_c(option = "viridis", direction = -1) + + scale_fill_viridis_c(option = "viridis", direction = -1) + + theme(legend.position = "none") +p2 +``` +
+ +```{r show-plot1, warning = FALSE, echo=FALSE} +p1 +``` + +There are some weeks when the forecasts are somewhat similar, and others when they are wildly different, although neither approach produces amazingly accurate forecasts. + +In the version faithful case for California, the March 2021 forecast (turquoise) +starts at a value just above 10, which is very well lined up with reported values leading up to that forecast. +The measured and forecasted trends are also concordant (both increasingly moderately fast). + +Because the data for this time period was later adjusted down with a decreasing trend, the March 2021 forecast looks quite bad compared to finalized data. +The equivalent version un-faithful forecast starts at a value of 5, which is in line with the finalized data but would have been out of place compared to the version data. + +The October 2021 forecast for the version faithful case floors out at zero, whereas the un-faithful is much closer to the finalized data. + +```{r show-plot2, warning = FALSE, echo=FALSE} +p2 +``` + +Now let's look at Florida. +In the version faithful case, the three late-2021 forecasts (purples and pinks) starting in September predict very low values, near 0. +The trend leading up to each forecast shows a substantial decrease, so these forecasts seem appropriate and we would expect them to score fairly well on various performance metrics when compared to the versioned data. + +However in hindsight, we know that early versions of the data systematically under-reported COVID-related doctor visits such that these forecasts don't actually perform well compared to _finalized_ data. +In this example, version faithful forecasts predicted values at or near 0 while finalized data shows values in the 5-10 range. +As a result, the version un-faithful forecasts for these same dates are quite a bit higher, and would perform well when scored using the finalized data and poorly with versioned data. 
+ +In general, the longer ago a forecast was made, the worse its performance is compared to finalized data. Finalized data accumulates revisions over time that make it deviate more and more from the non-finalized data a model was trained on. +Forecasts _trained_ on finalized data will of course appear to perform better when _scored_ on finalized data, but will have unknown performance on the non-finalized data we need to use if we want timely predictions. + +Without using data that would have been available on the actual forecast date, +you have little insight into what level of performance you +can expect in practice. + +Good performance of a version un-faithful model is a mirage; it is only achievable if the training data has no revisions. +If a data source has any revisions, version un-faithful-level performance is unachievable when making forecasts in real time. + + +[^1]: For forecasting a single day like this, we could have actually just used + `doctor_visits |> epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing +page](../index.html#motivating-example). + + +[^2]: Generally we advise against this; the only time to consider faking + versioning like this are if you're back-testing data with no versions available + at all, or if you're doing an explicit comparison like this. If you have no + versions you should assume performance is worse than what the test would + otherwise suggest. + +[^4]: Until we have a time machine + +[^5]: In this case by adjusting the length of the ahead so that it is actually + forecasting from the last day of data (e.g. 
for 2 day latent data and a true + ahead of 5, the `extended_ahead` would actually be 7) diff --git a/vignettes/custom_epiworkflows.Rmd b/vignettes/custom_epiworkflows.Rmd new file mode 100644 index 000000000..9d2d35093 --- /dev/null +++ b/vignettes/custom_epiworkflows.Rmd @@ -0,0 +1,624 @@ +--- +title: "Custom Epiworkflows" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Custom Epiworkflows} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +source(here::here("vignettes/_common.R")) +``` + +```{r setup, message=FALSE, include = FALSE} +library(dplyr) +library(parsnip) +library(workflows) +library(recipes) +library(epipredict) +library(epiprocess) +library(ggplot2) +library(rlang) # for %@% +forecast_date <- as.Date("2021-08-01") +used_locations <- c("ca", "ma", "ny", "tx") +library(epidatr) +``` + +If you want to do custom data preprocessing or fit a model that isn't included in the canned workflows, you'll need to write a custom `epi_workflow()`. +An `epi_workflow()` is a sub-class of a `workflows::workflow()` from the +`{workflows}` package designed to handle panel data specifically. + +To understand how to work with custom `epi_workflow()`s, let's recreate and then +modify the `four_week_ahead` example from the [landing +page](../index.html#motivating-example). 
+Let's first remind ourselves how to use a simple canned workflow: + +```{r make-four-forecasts, warning=FALSE} +training_data <- covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations) +four_week_ahead <- arx_forecaster( + training_data, + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + args_list = arx_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) + ) +) +four_week_ahead$epi_workflow +``` + +# Anatomy of an `epi_workflow` + +An `epi_workflow()` is an extension of a `workflows::workflow()` that is specially designed to handle panel +data, and to apply custom post-processing steps to the output of a model. +All `epi_workflows`, including simple and canned workflows, consist of 3 components, a preprocessor, trainer, and postprocessor. + +### Preprocessor + +A preprocessor (in the context of epipredict this means a `{recipe}`) transforms the data before model training and prediction. +Transformations can include converting counts to rates, applying a running average +to columns, or [any of the `step`s found in `{recipes}`](https://recipes.tidymodels.org/reference/index.html). + +All workflows must include a preprocessor. +The most basic preprocessor just assigns roles to columns, telling the model in the next step which to use as predictors or the outcome. + +However, preprocessors can do much more. +You can think of a preprocessor as a more flexible `formula` that you would pass to `lm()`: `y ~ x1 + log(x2) + lag(x1, 5)`. +The simple model above internally runs 6 of these steps, such as creating lagged predictor columns. + +In general, there are 2 broad classes of transformation that `{recipes}` `step`s handle: + +- Operations that are applied to both training and test data without using stored information. 
+ Examples include taking the log of a variable, leading or lagging columns, + filtering out rows, handling dummy variables, calculating growth rates, + etc. +- Operations that rely on stored information (parameters estimated during training) to modify both train and test data. + Examples include centering by the mean, and normalizing the variance to be one (whitening). + +We differentiate between these types of transformations because the second type can result in information leakage if not done properly. +Information leakage or data leakage happens when a system has access to information that would not have been available at prediction time and could change our evaluation of the model's real-world performance. + +In the case of centering, we need to store the mean of the predictor from +the training data and use that value on the prediction data, rather than +using the mean of the test predictor for centering or including test data in the mean calculation. + +A major benefit of `{recipes}` is that it prevents information leakage. +However, the _main_ mechanism we rely on to prevent data leakage in the context +of epidemiological forecasting is proper [backtesting](backtesting.html). + +### Trainer + +A trainer (aso called a model or engine) fits a `{parsnip}` model to training data, and outputs a fitted model object. +Examples include linear regression, quantile regression, or [any `{parsnip}` engine](https://www.tidymodels.org/find/parsnip/). +The `{parsnip}` front-end abstracts away the differences in interface between a wide collection of statistical models. + +All workflows must include a model. + +### Postprocessor + +Generally a postprocessor modifies and formats the prediction after a model has +been fit. +An `{epipredict}` postprocessor is called a `frosting()`; there are alternatives +such as [tailor](https://tailor.tidymodels.org/) which performs calibration. + +The postprocessor is _optional_. 
+It only needs to be included in a workflow if you need to process the model output. + +Each operation within a postprocessor is called a "layer" (functions are named +`layer_*`), and the stack of layers is known as `frosting()`, continuing the +metaphor of baking a cake established in `{recipes}`. +Some example operations include: + +- generating quantiles from purely point-prediction models +- reverting transformations done in prior steps, such as converting from rates back to counts +- thresholding forecasts to remove negative values +- generally adapting the format of the prediction to a downstream use. + +# Recreating `four_week_ahead` in an `epi_workflow()` + +To understand how to create custom workflows, let's first recreate the simple canned `arx_forecaster()` from scratch. + +We'll think through the following sub-steps: + +1. Define the `epi_recipe()`, which contains the preprocessing steps +2. Define the `frosting()` which contains the post-processing layers +3. Combine these with a trainer such as `quantile_reg()` into an + `epi_workflow()`, which we can then fit on the training data +4. `fit()` the workflow on some data +5. Grab the right prediction data using `get_test_data()` and apply the fit data + to generate a prediction + +## Define the `epi_recipe()` + +The steps found in `four_week_ahead` look like: + +```{r inspect_fwa_steps, warning=FALSE} +hardhat::extract_recipe(four_week_ahead$epi_workflow) +``` + +There are 6 steps we will need to recreate. +Note that all steps in the extracted recipe are marked as already having been +`Trained`. For steps such as `recipes::step_BoxCox()` that have parameters that change their behavior, this means that their +parameters have already been calculated based on the training data set. 
+ +Let's create an `epi_recipe()` to hold the 6 steps: + +```{r make_recipe} +filtered_data <- covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations) +four_week_recipe <- epi_recipe( + filtered_data, + reference_date = (filtered_data %@% metadata)$as_of +) +``` + +The data set passed to `epi_recipe` isn't required to be the actual +data set on which you are going to train the model. +However, it should have the same columns and the same metadata (such as `as_of` +and `other_keys`); it is typically easiest just to use the training data itself. + +This means that you can use the same workflow for multiple data sets as long as the format remains the same. +This might be useful if you continue to get updates to a data set over time and you want to train a new instance of the same model. + +Then we can append each `step` using pipes. In principle, the order matters, though for this +recipe only `step_epi_naomit()` and `step_training_window()` depend on the steps +before them. +The other steps can be thought of as setting parameters that help specify later processing and computation. + +```{r make_steps} +four_week_recipe <- four_week_recipe |> + step_epi_lag(case_rate, lag = c(0, 1, 2, 3, 7, 14)) |> + step_epi_lag(death_rate, lag = c(0, 7, 14)) |> + step_epi_ahead(death_rate, ahead = 4 * 7) |> + step_epi_naomit() |> + step_training_window() +``` + +Note we said before that `four_week_ahead` contained 6 steps. +We've only added _5_ top-level steps here because `step_epi_naomit()` is +actually a wrapper around adding two `step_naomit()`s, one for +`all_predictors()` and one for `all_outcomes()`. +The `step_naomit()`s differ in their treatment of the data at predict time. + +`step_epi_lag()` and `step_epi_ahead()` both accept ["tidy" syntax](https://dplyr.tidyverse.org/reference/select.html) so processing can be applied to multiple columns at once. 
+For example, if we wanted to use the same lags for both `case_rate` and `death_rate`, we could +specify them in a single step, like `step_epi_lag(ends_with("rate"), lag = c(0, 7, 14))`. + +In general, `{recipes}` `step`s assign roles (such as `predictor`, or `outcome`, +see the [Roles vignette for +details](https://recipes.tidymodels.org/articles/Roles.html)) to columns either +by adding new columns or adjusting existing +ones. +`step_epi_lag()`, for example, creates a new column for each lag with the name +`lag_x_column_name` and labels them each with the `predictor` role. +`step_epi_ahead()` creates `ahead_x_column_name` columns and labels each with +the `outcome` role. + +In general, to inspect the 'prepared' steps, we can run `prep()`, which fits any +parameters used in the recipe, calculates new columns, and assigns roles[^4]. +For example, we can use `prep()` to make sure that we are training on the +correct columns: + +```{r prep_recipe} +prepped <- four_week_recipe |> prep(training_data) +prepped$term_info |> print(n = 14) +``` + +`bake()` applies a prepared recipe to a (potentially new) dataset to create the dataset as handed to the `epi_workflow()`. +We can inspect newly-created columns by running `bake()` on the recipe so far: + +```{r bake_recipe} +four_week_recipe |> + prep(training_data) |> + bake(training_data) +``` + +This is also useful for debugging malfunctioning pipelines. +You can run `prep()` and `bake()` on a new recipe containing a subset of `step`s -- all `step`s from the beginning up to the one that is misbehaving -- from the full, original recipe. +This will return an evaluation of the `recipe` up to that point so that you can see the data that the misbehaving `step` is being applied to. +It also allows you to see the exact data that a later `{parsnip}` model is trained on. 
+ +## Define the `frosting()` + +The post-processing `frosting` layers[^1] found in `four_week_ahead` look like: + +```{r inspect_fwa_layers, warning=FALSE} +epipredict::extract_frosting(four_week_ahead$epi_workflow) +``` + +_Note_: since `frosting` is unique to this package, we've defined a custom function `extract_frosting()` to inspect these steps. + +Using the detailed information in the output above, +we can recreate the layers similar to how we defined the +`recipe` `step`s[^2]: + +```{r make_frosting} +four_week_layers <- frosting() |> + layer_predict() |> + layer_residual_quantiles(quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9)) |> + layer_add_forecast_date() |> + layer_add_target_date() |> + layer_threshold() +``` + +`layer_predict()` needs to be included in every postprocessor to actually predict on the prediction data. + +Most layers work with any engine or `step`s. +There are a couple of layers, however, that depend on whether the engine predicts quantiles or point estimates. + +The following layers are only supported by point estimate engines, such as +`linear_reg()`: + +- `layer_residual_quantiles()`: for models that don't generate quantiles, the + preferred method of generating quantiles. + This function uses the error residuals of the engine to calculate quantiles. + This will work for most `{parsnip}` engines. +- `layer_predictive_distn()`: alternate method of generating quantiles using + an approximate parametric distribution. This will work for linear regression + specifically. + +On the other hand, the following layers are only supported by engines that +output quantiles, such as `quantile_reg()`: + +- `layer_quantile_distn()`: adds the specified quantiles. + If the user-requested quantile levels differ from the ones actually fit, they will be interpolated and/or + extrapolated. 
+- `layer_point_from_distn()`: this generates a point estimate from a + distribution (either median or mean), and, if used, should be included after + `layer_quantile_distn()`. + +## Fitting an `epi_workflow()` + +Now that we have a recipe and some layers, we can assemble the workflow. +This is as simple as passing the component preprocessor, model, and postprocessor into `epi_workflow()`. + +```{r workflow_building} +four_week_workflow <- epi_workflow( + four_week_recipe, + linear_reg(), + four_week_layers +) +``` + +After fitting it, we will have recreated `four_week_ahead$epi_workflow`. + +```{r workflow_fitting} +fit_workflow <- four_week_workflow |> fit(training_data) +``` + +Running `fit()` calculates all preprocessor-required parameters, and trains the model on the data passed in `fit()`. +However, it does not generate any predictions; predictions need to be created in a separate step. + +## Predicting + +To make a prediction, it helps to narrow the data set down to the relevant observations using `get_test_data()`. +We can still generate predictions without doing this first, but it will predict on _every_ day in the data-set, and not just on the `reference_date`. + +```{r grab_data} +relevant_data <- get_test_data( + four_week_recipe, + training_data +) +``` + +In this example, we're creating `relevant_data` from `training_data`, but the data set we want predictions for could be entirely new data, unrelated to the one we used when building the workflow. + +With a trained workflow and data in hand, we can actually make our predictions: + +```{r workflow_pred} +fit_workflow |> predict(relevant_data) +``` + +Note that if we simply plug the full `training_data` into `predict()` we will still get +predictions: + +```{r workflow_pred_training} +fit_workflow |> predict(training_data) +``` + +The resulting tibble is 800 rows long, however. 
+Passing the non-subsetted data set produces forecasts for not just the requested `reference_date`, but for every +day in the data set that has sufficient data to produce a prediction. +To narrow this down, we could filter to rows where the `time_value` matches the `forecast_date`: + +```{r workflow_pred_training_filter} +fit_workflow |> + predict(training_data) |> + filter(time_value == forecast_date) +``` + +This can be useful as a workaround when `get_test_data()` fails to pull enough +data to produce a forecast. +This is generally a problem when the recipe (preprocessor) is sufficiently complicated, and `get_test_data()` can't determine precisely what data is required. +The forecasts generated with `filter` and `get_test_data` are identical. + +# Extending `four_week_ahead` + +Now that we know how to create `four_week_ahead` from scratch, we can start modifying the workflow to get custom behavior. + +There are many ways we could modify `four_week_ahead`. We might consider: + +- Converting from rates to counts +- Including a growth rate estimate as a predictor +- Including a time component as a predictor --- useful if we +expect there to be a strong seasonal component to the outcome +- Scaling by a factor + +We will demonstrate a couple of these modifications below. + +## Growth rate + +Let's say we're interested in including growth rate as a predictor in our model because +we think it may potentially improve our forecast. +We can easily create a new growth rate column as a step in the `epi_recipe`. + +```{r growth_rate_recipe} +growth_rate_recipe <- epi_recipe( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations) +) |> + # Calculate growth rate from death rate column. 
+ step_growth_rate(death_rate) |> + step_epi_lag(case_rate, lag = c(0, 1, 2, 3, 7, 14)) |> + step_epi_lag(death_rate, lag = c(0, 7, 14)) |> + step_epi_ahead(death_rate, ahead = 4 * 7) |> + step_epi_naomit() |> + step_training_window() +``` + +Inspecting the newly added column: + +```{r growth_rate_print} +growth_rate_recipe |> + prep(training_data) |> + bake(training_data) |> + select( + geo_value, time_value, case_rate, + death_rate, gr_7_rel_change_death_rate + ) |> + arrange(geo_value, time_value) |> + tail() +``` + +And the role: + +```{r growth_rate_roles} +prepped <- growth_rate_recipe |> + prep(training_data) +prepped$term_info |> filter(grepl("gr", variable)) +``` + +Let's say we want to use `quantile_reg()` as the model. +Because `quantile_reg()` outputs quantiles only, we need to change our `frosting` to convert a quantile distribution into quantiles and point predictions. +To do that, we need to switch out `layer_residual_quantiles()` (used for converting point + residuals output, e.g. from `linear_reg()` into quantiles) for `layer_quantile_distn()` and `layer_point_from_distn()`: +```{r layer_and_fit} +growth_rate_layers <- frosting() |> + layer_predict() |> + layer_quantile_distn( + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) + ) |> + layer_point_from_distn() |> + layer_add_forecast_date() |> + layer_add_target_date() |> + layer_threshold() + +growth_rate_workflow <- epi_workflow( + growth_rate_recipe, + quantile_reg(quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9)), + growth_rate_layers +) + +relevant_data <- get_test_data( + growth_rate_recipe, + training_data +) +gr_fit_workflow <- growth_rate_workflow |> fit(training_data) +gr_predictions <- gr_fit_workflow |> + predict(relevant_data) |> + filter(time_value == forecast_date) +``` +
+ Plot + +We'll reuse some code from the landing page to plot the result. + +```{r plotting} +forecast_date_label <- + tibble( + geo_value = rep(used_locations, 2), + .response_name = c(rep("case_rate", 4), rep("death_rate", 4)), + dates = rep(forecast_date - 7 * 2, 2 * length(used_locations)), + heights = c(rep(150, 4), rep(0.30, 4)) + ) + +result_plot <- autoplot( + object = gr_fit_workflow, + predictions = gr_predictions, + observed_response = covid_case_death_rates |> + filter(geo_value %in% used_locations, time_value > "2021-07-01") +) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label |> filter(.response_name == "death_rate"), + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` +
+```{r, echo=FALSE}
+result_plot
+```
+
+## Population scaling
+
+Suppose we want to modify our predictions to return a rate prediction, rather than the count prediction.
+To do that, we can adjust _just_ the `frosting` to perform post-processing on our existing rates forecaster.
+Since rates are calculated as counts per 100,000 people, we will convert back to counts by multiplying rates by the factor $\frac{ \text{regional population} }{100,000}$.
+
+```{r rate_scale}
+count_layers <-
+  frosting() |>
+  layer_predict() |>
+  layer_residual_quantiles(quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9)) |>
+  layer_population_scaling(
+    starts_with(".pred"),
+    # `df` contains scaling values for all regions; in this case it is the state populations
+    df = epidatasets::state_census,
+    df_pop_col = "pop",
+    create_new = FALSE,
+    # `rate_rescaling` gives the denominator of the existing rate predictions
+    rate_rescaling = 1e5,
+    by = c("geo_value" = "abbr")
+  ) |>
+  layer_add_forecast_date() |>
+  layer_add_target_date() |>
+  layer_threshold()
+
+# building the new workflow
+count_workflow <- epi_workflow(
+  four_week_recipe,
+  linear_reg(),
+  count_layers
+)
+count_pred_data <- get_test_data(four_week_recipe, training_data)
+count_predictions <- count_workflow |>
+  fit(training_data) |>
+  predict(count_pred_data)
+
+count_predictions
+```
+
+Note that we've used `tidyselect::starts_with(".pred")` here, which will apply the function to both the `.pred` and `.pred_distn` columns.
+
+# Custom classifier workflow
+
+Let's work through an example of a more complicated kind of pipeline you can build using
+the `epipredict` framework.
+This is a hotspot prediction model, which predicts whether case rates are increasing (`up`), decreasing (`down`) or flat
+(`flat`).
+The model comes from a paper by McDonald, Bien, Green, Hu et al.[^3], and roughly
+serves as an extension of `arx_classifier()`.
+
+First, we need to add a factor version of `geo_value`, so that it can be used as a feature. 
+
+```{r training_factor}
+training_data <-
+  covid_case_death_rates |>
+  filter(time_value <= forecast_date, geo_value %in% used_locations) |>
+  mutate(geo_value_factor = as.factor(geo_value))
+```
+
+Then we put together the recipe, using a combination of base `{recipes}`
+functions such as `add_role()` and `step_dummy()`, and `{epipredict}` functions
+such as `step_growth_rate()`.
+
+```{r class_recipe}
+classifier_recipe <- epi_recipe(training_data) |>
+  # Label `time_value` as predictor and do no other processing
+  add_role(time_value, new_role = "predictor") |>
+  # Use one-hot encoding on `geo_value_factor` and label each resulting column as a predictor
+  step_dummy(geo_value_factor) |>
+  # Create and lag `case_rate` growth rate
+  step_growth_rate(case_rate, role = "none", prefix = "gr_") |>
+  step_epi_lag(starts_with("gr_"), lag = c(0, 7, 14)) |>
+  step_epi_ahead(starts_with("gr_"), ahead = 7) |>
+  # divide growth rate into 3 bins
+  step_cut(ahead_7_gr_7_rel_change_case_rate, breaks = c(-Inf, -0.2, 0.25, Inf) / 7) |>
+  # Drop unused columns based on role assignments. This is not strictly
+  # necessary, as columns with roles unused in the model will be ignored anyway.
+  step_rm(has_role("none"), has_role("raw")) |>
+  step_epi_naomit()
+```
+
+This adds as predictors:
+
+- time value as a continuous variable (via `add_role()`)
+- `geo_value` as a set of indicator variables (via `step_dummy()` and the previous `as.factor()`)
+- growth rate of case rate, both at prediction time (no lag), and lagged by one and two weeks
+
+The outcome variable is created by composing several steps together. 
`step_epi_ahead()`
+creates a column with the growth rate one week into the future, and
+`step_cut()` turns that column into a factor with 3 possible values,
+
+$$
+  Z_{\ell, t}=
+  \begin{cases}
+    \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\
+    \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.20\\
+    \text{flat}, & \text{otherwise}
+  \end{cases}
+$$
+
+where $Y^{\Delta}_{\ell, t}$ is the growth rate at location $\ell$ and time $t$.
+`up` means that the `case_rate` has increased by at least 25%, while `down`
+means it has decreased by at least 20%.
+
+Note that in both `step_growth_rate()` and `step_epi_ahead()` we explicitly assign the role
+`none`. This is because those columns are used as intermediaries to create
+predictor and outcome columns.
+Afterwards, `step_rm()` drops the temporary columns, along with the original `role = "raw"` columns
+`death_rate` and `case_rate`. Both `geo_value_factor` and `time_value` are retained
+because their roles have been reassigned.
+
+
+To fit a 3-class classification model like this, we will need to use a `{parsnip}` model
+that has `mode = "classification"`.
+The simplest example of a `{parsnip}` `classification`-`mode` model is `multinom_reg()`. 
+The needed layers are more or less the same as the `linear_reg()` regression layers, with the addition that we need to remove some `NA` values: + +```{r, warning=FALSE} +frost <- frosting() |> + layer_naomit(starts_with(".pred")) |> + layer_add_forecast_date() |> + layer_add_target_date() +``` + +```{r, warning=FALSE} +wf <- epi_workflow( + classifier_recipe, + multinom_reg(), + frost +) |> + fit(training_data) + +forecast(wf) +``` + +And comparing the result with the actual growth rates at that point in time, +```{r growth_rate_results} +growth_rates <- covid_case_death_rates |> + filter(geo_value %in% used_locations) |> + group_by(geo_value) |> + mutate( + # Multiply by 7 to estimate weekly equivalents + case_gr = growth_rate(x = time_value, y = case_rate) * 7 + ) |> + ungroup() + +growth_rates |> filter(time_value == "2021-08-01") |> select(-death_rate) +``` + +we see that they're all significantly higher than 25% per week (36%-62%), +which matches the classification model's predictions. + + +See the [tooling book](https://cmu-delphi.github.io/delphi-tooling-book/preprocessing-and-models.html) for a more in-depth discussion of this example. + + +[^1]: Think of baking a cake, where adding the frosting is the last step in the + process of actually baking. + +[^2]: Note that the frosting doesn't require any information about the training + data, since the output of the model only depends on the model used. + +[^3]: McDonald, Bien, Green, Hu, et al. “Can auxiliary indicators improve + COVID-19 forecasting and hotspot prediction?.” Proceedings of the National + Academy of Sciences 118.51 (2021): e2111453118. doi:10.1073/pnas.2111453118 + +[^4]: Note that `prep()` and `bake()` are standard `{recipes}` functions, so any discussion of them there applies just as well here. For example in the [guide to creating a new step](https://www.tidymodels.org/learn/develop/recipes/#create-the-prep-method). 
+ diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd index 97bbcee70..3786bc356 100644 --- a/vignettes/epipredict.Rmd +++ b/vignettes/epipredict.Rmd @@ -1,474 +1,630 @@ --- -title: "Get started with epipredict" +title: "Get started with `epipredict`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Get started with epipredict} + %\VignetteIndexEntry{Get started with `epipredict`} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- +# Introduction + ```{r, include = FALSE} -knitr::opts_chunk$set( - echo = TRUE, - collapse = TRUE, - comment = "#>", - out.width = "100%" -) +source(here::here("vignettes/_common.R")) ``` -```{r setup, message=FALSE} -library(dplyr) -library(parsnip) -library(workflows) -library(recipes) -library(epipredict) -``` +At a high level, the goal of `{epipredict}` is to make it easy to run simple machine +learning and statistical forecasters for epidemiological data. +To do this, we have extended the [tidymodels](https://www.tidymodels.org/) +framework to handle the case of panel time-series data. -# Goals for the package +Our hope is that it is easy for users with epidemiological training and some statistical knowledge to +estimate baseline models, while also allowing those with more nuanced statistical +understanding to create complex custom models using the same framework. +Towards that end, `{epipredict}` provides two main classes of tools: -At a high level, our goal with `{epipredict}` is to make running simple Machine -Learning / Statistical forecasters for epidemiology easy. However, this package -is extremely extensible, and that is part of its utility. Our hope is that it is -easy for users with epi training and some statistics to fit baseline models -while still allowing those with more nuanced statistical understanding to create -complicated specializations using the same framework. 
+## Canned forecasters -Serving both populations is the main motivation for our efforts, but at the same -time, we have tried hard to make it useful. +A set of basic, easy-to-use "canned" forecasters that work out of the box. +We currently provide the following basic forecasters: + * `flatline_forecaster()`: predicts as the median the most recently seen value + with increasingly wide quantiles. + * `climatological_forecaster()`: predicts the median and quantiles based on the historical values around the same date in previous years. + * `arx_forecaster()`: an AutoRegressive eXogenous feature forecaster, which + estimates a model (e.g. linear regression) on lagged data to predict quantiles + for continuous values. + * `arx_classifier()`: fits a model (e.g. logistic regression) on lagged data + to predict a binned version of the growth rate. + * `cdc_baseline_forecaster()`: a variant of the flatline forecaster that is + used as a baseline in the CDC's [FluSight forecasting competition](https://www.cdc.gov/flu-forecasting/about/index.html). -## Baseline models +## Forecasting framework -We provide a set of basic, easy-to-use forecasters that work out of the box. You -should be able to do a reasonably limited amount of customization on them. Any -serious customization happens with the framework discussed below). +A framework for creating custom forecasters out of modular components, from +which the canned forecasters were created. There are three types of +components: -For the basic forecasters, we provide: + * _Preprocessor_: transform the data before model training, such as converting + counts to rates, creating smoothed columns, or [any `{recipes}` + `step`](https://recipes.tidymodels.org/reference/index.html) + * _Trainer_: train a model on data, resulting in a fitted model object. + Examples include linear regression, quantile regression, or [any `{parsnip}` + engine](https://www.tidymodels.org/find/parsnip/). 
+ * _Postprocessor_: unique to `{epipredict}`; used to transform the + predictions after the model has been fit, such as + - generating quantiles from purely point-prediction models, + - reverting operations done in the `step`s, such as converting from + rates back to counts + - generally adapting the format of the prediction to its eventual use. -* Baseline flat-line forecaster -* Autoregressive forecaster -* Autoregressive classifier +The rest of this "Get Started" vignette will focus on using and modifying the canned forecasters. +Check out the [Custom Epiworkflows vignette](preprocessing-and-models) for examples of using the forecaster +framework to make more complex, custom forecasters. -All the forcasters we provide are built on our framework. So we will use these -basic models to illustrate its flexibility. +If you are interested in time series in a non-panel data context, you may also +want to look at `{timetk}` and `{modeltime}` for some related techniques. -## Forecasting framework +For a more in-depth treatment with some practical applications, see also the +[Forecasting Book](https://cmu-delphi.github.io/delphi-tooling-book/). -Our framework for creating custom forecasters views the prediction task as a set -of modular components. There are four types of components: - -1. Preprocessor: make transformations to the data before model training -2. Trainer: train a model on data, resulting in a fitted model object -3. Predictor: make predictions, using a fitted model object and processed test data -4. Postprocessor: manipulate or transform the predictions before returning - -Users familiar with [`{tidymodels}`](https://www.tidymodels.org) and especially -the [`{workflows}`](https://workflows.tidymodels.org) package will notice a lot -of overlap. This is by design, and is in fact a feature. The truth is that -`{epipredict}` is a wrapper around much that is contained in these packages. 
-Therefore, if you want something from this -verse, it should "just work" (we -hope). - -The reason for the overlap is that `{workflows}` *already implements* the first -three steps. And it does this very well. However, it is missing the -postprocessing stage and currently has no plans for such an implementation. And -this feature is important. The baseline forecaster we provide *requires* -postprocessing. Anything more complicated needs this as well. - -The second omission from `{tidymodels}` is support for panel data. Besides -epidemiological data, economics, psychology, sociology, and many other areas -frequently deal with data of this type. So the framework of behind -`{epipredict}` implements this. In principle, this has nothing to do with -epidemiology, and one could simply use this package as a solution for the -missing functionality in `{tidymodels}`. Again, this should "just work". - -All of the *panel data* functionality is implemented through the `epi_df` data -type in the companion [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) -package. There is much more to see there, but for the moment, it's enough to -look at a simple one: - -```{r epidf} -jhu <- case_death_rate_subset -jhu -``` +# Panel forecasting basics -This data is built into the package and contains the measured variables -`case_rate` and `death_rate` for COVID-19 at the daily level for each US state -for the year 2021. The "panel" part is because we have repeated measurements -across a number of locations. +This section gives basic usage examples for the package beyond the most basic usage of `arx_forecaster()` for forecasting a single ahead using the default engine. +Before we start actually building forecasters, lets import some relevant libraries -The `epi_df` encodes the time stamp as `time_value` and the `key` as -`geo_value`. While these 2 names are required, the values don't need to actually -represent such objects. 
Additional `key`'s are also supported (like age group, -ethnicity, taxonomy, etc.). +```{r setup, message=FALSE} +library(dplyr) +library(parsnip) +library(workflows) +library(recipes) +library(epidatasets) +library(epipredict) +library(epiprocess) +library(ggplot2) +library(purrr) +library(epidatr) +``` + +And our default forecasting date and selected states (we will use these to limit the data to make discussion easier): -The `epi_df` also contains some metadata that describes the keys as well as the -vintage of the data. It's possible that data collected at different times for -the *same set* of `geo_value`'s and `time_value`'s could actually be different. -For more details, see -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html). +```{r} +forecast_date <- as.Date("2021-08-01") +used_locations <- c("ca", "ma", "ny", "tx") +``` -## Why doesn't this package already exist? +## Example data -As described above: +The forecasting methods in this package are designed to work with panel time +series data in `epi_df` format as made available in the `{epiprocess}` +package. +An `epi_df` is a collection of one or more time-series indexed by one or more +categorical variables. +The [`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/) package makes several +pre-compiled example datasets available. +Let's look at an example `epi_df`: -* Parts actually DO exist. There's a universe called `{tidymodels}`. It handles -preprocessing, training, and prediction, bound together, through a package called -`{workflows}`. We built `{epipredict}` on top of that setup. In this way, you CAN -use almost everything they provide. +```{r data_ex} +covid_case_death_rates +``` -* However, `{workflows}` doesn't do postprocessing. And nothing in the -verse -handles _panel data_. +An `epi_df` always has a `geo_value` and a `time_value` as keys, along with some number of value columns, in this case `case_rate` and `death_rate`. 
+Each of these has an associated `geo_type` (state) and `time_type` (day), for which there are some utilities. +While this `geo_value` and `time_value` are the minimal set of keys, the functions of `{epiprocess}` and `{epipredict}` are designed to accommodate other key values, such as age, ethnicity, or other demographic +information. +For example, `grad_employ_subset` from `{epidatasets}` also has both `age_group` +and `edu_qual` as additional keys: -* The tidy-team doesn't have plans to do either of these things. (We checked). +```{r extra_keys} +grad_employ_subset +``` -* There are two packages that do _time series_ built on `{tidymodels}`, but it's -"basic" time series: 1-step AR models, exponential smoothing, STL decomposition, -etc.[^2] Our group has not prioritized these sorts of models for epidemic -forecasting, but one could also integrate these methods into our framework. +See `{epiprocess}` for [more details on the `epi_df` format](https://cmu-delphi.github.io/epiprocess/articles/epi_df.html). -[^2]: These are [`{timetk}`](https://business-science.github.io/timetk/index.html) -and [`{modeltime}`](https://business-science.github.io/timetk/index.html). There -are *lots* of useful methods there than can be used to do fairly complex machine -learning methodology, though not directly for panel data and not for direct -prediction of future targets. +Panel time series are ubiquitous in epidemiology, but are also common in +economics, psychology, sociology, and many other areas. +While this package was designed with epidemiology in mind, many of the +techniques are more broadly applicable. -# Show me the basics +## Customizing `arx_forecaster()` -We start with the `jhu` data displayed above. One of the "canned" forecasters we -provide is an autoregressive forecaster with (or without) covariates that -*directly* trains on the response. 
This is in contrast to a typical "iterative" -AR model that trains to predict one-step-ahead, and then plugs in the -predictions to "leverage up" to longer horizons. +Let's expand on the basic example presented on the [landing +page](../index.html#motivating-example), starting with adjusting some parameters in +`arx_forecaster()`. -We'll estimate the model jointly across all locations using only the most -recent 30 days. +The `trainer` argument allows us to set the computational engine. We can use either +one of the relevant [parsnip models](https://www.tidymodels.org/find/parsnip/), +or one of the included engines, such as `smooth_quantile_reg()`: -```{r demo-workflow} -jhu <- jhu %>% filter(time_value >= max(time_value) - 30) -out <- arx_forecaster( - jhu, +```{r make-forecasts, warning=FALSE} +two_week_ahead <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), outcome = "death_rate", - predictors = c("case_rate", "death_rate") + trainer = quantile_reg(), + predictors = c("death_rate"), + args_list = arx_args_list( + lags = list(c(0, 7, 14)), + ahead = 14 + ) ) +hardhat::extract_fit_engine(two_week_ahead$epi_workflow) ``` -The `out` object has two components: - - 1. The predictions which is just another `epi_df`. It contains the predictions for -each location along with additional columns. By default, these are a 90% -predictive interval, the `forecast_date` (the date on which the forecast was -putatively made) and the `target_date` (the date for which the forecast is being -made). - ```{r} -out$predictions - ``` - 2. A list object of class `epi_workflow`. This object encapsulates all the -instructions necessary to create the prediction. More details on this below. - ```{r} -out$epi_workflow - ``` - -By default, the forecaster predicts the outcome (`death_rate`) 1-week ahead, -using 3 lags of each predictor (`case_rate` and `death_rate`) at 0 (today), 1 -week back and 2 weeks back. The predictors and outcome can be changed directly. 
-The rest of the defaults are encapsulated into a list of arguments. This list is -produced by `arx_args_list()`. - -## Simple adjustments - -Basic adjustments can be made through the `args_list`. - -```{r kill-warnings, echo=FALSE} -knitr::opts_chunk$set(warning = FALSE, message = FALSE) -``` - -```{r differential-lags} -out2week <- arx_forecaster( - jhu, +The default trainer is `parsnip::linear_reg()`, which generates quantiles after +the fact in the post-processing layers, rather than as part of the model. +While these post-processing layers will produce prediction intervals for an +arbitrary trainer, it is generally preferable to use `quantile_reg()` (or an +alternative that produces statistically justifiable prediction intervals), as +the quantiles generated in post-processing can be poorly behaved. +`quantile_reg()` on the other hand directly estimates a different linear model +for each quantile, reflected in the several different columns for `tau` above. + +Because of the flexibility of `{parsnip}`, there are a whole host of models +available to us[^5]; as an example, we could have just as easily substituted a +non-linear random forest model from `{ranger}`: + +```{r rand_forest_ex, warning=FALSE} +two_week_ahead <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), outcome = "death_rate", - predictors = c("case_rate", "death_rate"), + trainer = rand_forest(mode = "regression"), + predictors = c("death_rate"), args_list = arx_args_list( - lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + lags = list(c(0, 7, 14)), ahead = 14 ) ) ``` -Here, we've used different lags on the `case_rate` and are now predicting 2 -weeks ahead. This example also illustrates a major difficulty with the -"iterative" versions of AR models. 
This model doesn't produce forecasts for -`case_rate`, and so, would not have data to "plug in" for the necessary -lags.[^1] - -[^1]: An obvious fix is to instead use a VAR and predict both, but this would -likely increase the variance of the model, and therefore, may lead to less -accurate forecasts for the variable of interest. - -Another property of the basic model is the predictive interval. We describe this -in more detail in a different vignette, but it is easy to request multiple -quantiles. +Other customization is possible via `args_list = arx_args_list()`; for +example, if we wanted to increase the number of quantiles fit: -```{r differential-levels} -out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), +```{r make-quantile-levels-forecasts, warning=FALSE} +two_week_ahead <- arx_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + trainer = quantile_reg(), + predictors = c("death_rate"), args_list = arx_args_list( - quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99) + lags = list(c(0, 7, 14)), + ahead = 14, + ############ changing quantile_levels ############ + quantile_levels = c(0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95) + ################################################## ) ) +hardhat::extract_fit_engine(two_week_ahead$epi_workflow) ``` -The column `.pred_dstn` in the `predictions` object is actually a "distribution" -here parameterized by its quantiles. For this default forecaster, these are -created using the quantiles of the residuals of the predictive model (possibly -symmetrized). Here, we used 23 quantiles, but one can grab a particular -quantile, - -```{r q1} -round(head(quantile(out_q$predictions$.pred_distn, p = .4)), 3) +See the function documentation for `arx_args_list()` for more examples of the modifications available. 
+If you want to make further modifications, you will need a custom +workflow; see the [Custom Epiworkflows vignette](custom_epiworkflows) for details. + +## Generating multiple aheads + +We often want to generate a trajectory +of forecasts over a range of dates, rather than for a single day. +We can do this with `arx_forecaster()` by looping over aheads. +For example, to predict every day over a 4-week time period: + +```{r aheads-loop} +all_canned_results <- lapply( + seq(0, 28), + \(days_ahead) { + arx_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = days_ahead + ) + ) + } +) +# pull out the workflow and the predictions to be able to +# effectively use autoplot +workflow <- all_canned_results[[1]]$epi_workflow +results <- all_canned_results |> + purrr::map(~ `$`(., "predictions")) |> + list_rbind() +autoplot( + object = workflow, + predictions = results, + observed_response = covid_case_death_rates |> + filter(geo_value %in% used_locations, time_value > "2021-07-01") +) ``` -or extract the entire distribution into a "long" `epi_df` with `quantile_levels` -being the probability and `values` being the value associated to that quantile. - -```{r q2} -out_q$predictions %>% - pivot_quantiles_longer(.pred_distn) +## Other canned forecasters + +This section gives a brief example of each of the canned forecasters. + +### `flatline_forecaster()` + +The simplest model we provide is the `flatline_forecaster()`, which predicts a +flat line (with quantiles generated from the residuals using +`layer_residual_quantiles()`). 
+For example, on the same dataset as above: +```{r make-flatline-forecast, warning=FALSE} +all_flatlines <- lapply( + seq(0, 28), + \(days_ahead) { + flatline_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + args_list = flatline_args_list( + ahead = days_ahead, + ) + ) + } +) +# same plotting code as in the arx multi-ahead case +workflow <- all_flatlines[[1]]$epi_workflow +results <- all_flatlines |> + purrr::map(~ `$`(., "predictions")) |> + list_rbind() +autoplot( + object = workflow, + predictions = results, + observed_response = covid_case_death_rates |> filter(geo_value %in% used_locations, time_value > "2021-07-01") +) ``` -Additional simple adjustments to the basic forecaster can be made using the -function: - -```{r, eval = FALSE} -arx_args_list( - lags = c(0L, 7L, 14L), ahead = 7L, n_training = Inf, - forecast_date = NULL, target_date = NULL, quantile_levels = c(0.05, 0.95), - symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf +### `cdc_baseline_forecaster()` + +This is a different method of generating a flatline forecast, used as a baseline +for [the CDC COVID-19 Forecasting Hub](https://covid19forecasthub.org). 
+ +```{r make-cdc-forecast, warning=FALSE} +all_cdc_flatline <- + cdc_baseline_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + args_list = cdc_baseline_args_list( + aheads = 1:28, + data_frequency = "1 day" + ) + ) +# same plotting code as in the arx multi-ahead case +workflow <- all_cdc_flatline$epi_workflow +results <- all_cdc_flatline$predictions +autoplot( + object = workflow, + predictions = results, + observed_response = covid_case_death_rates |> filter(geo_value %in% used_locations, time_value > "2021-07-01") ) ``` -## Changing the engine +`cdc_baseline_forecaster()` and `flatline_forecaster()` generate medians in the same way, +but `cdc_baseline_forecaster()`'s quantiles are generated using +`layer_cdc_flatline_quantiles()` instead of `layer_residual_quantiles()`. +Both quantile-generating methods use the residuals to compute quantiles, but +`layer_cdc_flatline_quantiles()` extrapolates the quantiles by repeatedly +sampling the initial quantiles to generate the next set. +This results in much smoother quantiles, but ones that only capture the +one-ahead uncertainty. -So far, our forecasts have been produced using simple linear regression. But -this is not the only way to estimate such a model. The `trainer` argument -determines the type of model we want. This takes a -[`{parsnip}`](https://parsnip.tidymodels.org) model. The default is linear -regression, but we could instead use a random forest with the `{ranger}` -package: +### `climatological_forecaster()` -```{r ranger, warning = FALSE} -out_rf <- arx_forecaster( - jhu, - outcome = "death_rate", - predictors = c("case_rate", "death_rate"), - trainer = rand_forest(mode = "regression") -) -``` +The `climatological_forecaster()` is a different kind of baseline. It produces a +point forecast and quantiles based on the historical values for a given time of +year, rather than extrapolating from recent values. 
+Among our forecasters, it is the only one well suited for forecasts at long time horizons. -Or boosted regression trees with `{xgboost}`: +Since it requires multiple years of data and a roughly seasonal signal, the dataset we've been using for demonstrations so far is a poor example for a climate forecast[^8]. +Instead, we'll use the fluview ILI dataset, which is weekly influenza-like illness data for HHS regions, going back to 1997. -```{r xgboost, warning = FALSE} -out_gb <- arx_forecaster( - jhu, - outcome = "death_rate", - predictors = c("case_rate", "death_rate"), - trainer = boost_tree(mode = "regression", trees = 20) + +We'll predict the 2023/24 season using all previous data, including 2020-2022, the two years where there was approximately no seasonal flu, forecasting from the start of the season, `2023-10-08`: + +```{r make-climatological-forecast, warning=FALSE} +fluview_hhs <- pub_fluview( + regions = paste0("hhs", 1:10), + epiweeks = epirange(100001,222201) +) +fluview <- fluview_hhs %>% + select( + geo_value = region, + time_value = epiweek, + issue, + ili) %>% + as_epi_archive() %>% + epix_as_of_current() + +all_climate <- climatological_forecaster( + fluview %>% filter(time_value < "2023-10-08"), + outcome = "ili", + args_list = climate_args_list( + forecast_horizon = seq(0, 28), + time_type = "week", + quantile_by_key = "geo_value", + forecast_date = as.Date("2023-10-08") + ) +) +workflow <- all_climate$epi_workflow +results <- all_climate$predictions +autoplot( + object = workflow, + predictions = results, + observed_response = fluview %>% + filter(time_value >= "2023-10-08", time_value < "2024-05-01") %>% + mutate(geo_value = factor(geo_value, levels = paste0("hhs", 1:10))) ) ``` -Or quantile regression, using our custom forecasting engine `quantile_reg()`: -```{r quantreg, warning = FALSE} -out_qr <- arx_forecaster( - jhu, +One feature of the climatological baseline is that it forecasts multiple aheads +simultaneously; here we do so for the entire 
season of 28 weeks. +This is possible for `arx_forecaster()`, but only using `trainer = +smooth_quantile_reg()`, which is built to handle multiple aheads simultaneously[^9]. + +A pure climatological forecast can be thought of as forecasting a typical year so far. +The 2023/24 season had some regions, such as `hhs10`, which were quite close to the typical year, and some, such as `hhs2`, that were frequently outside even the 90% prediction band (the lightest shown above). + +### `arx_classifier()` + +Unlike the other canned forecasters, `arx_classifier` predicts binned growth rate. +The forecaster converts the raw outcome variable into a growth rate, which it then bins and predicts, using bin thresholds provided by the user. +For example, on the same dataset and `forecast_date` as above, this model outputs: + +```{r discrete-rt} +classifier <- arx_classifier( + covid_case_death_rates |> + filter(geo_value %in% used_locations, time_value < forecast_date), outcome = "death_rate", - predictors = c("case_rate", "death_rate"), - trainer = quantile_reg() + predictors = c("death_rate", "case_rate"), + trainer = multinom_reg(), + args_list = arx_class_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = 2 * 7, + breaks = 0.25 / 7 + ) ) +classifier$predictions ``` -FWIW, this last case (using quantile regression), is not far from what the -Delphi production forecast team used for its Covid forecasts over the past few -years. - -## Inner workings +The number and size of the growth rate categories are controlled by `breaks`, +which define the bin boundaries. + +In this example, the custom `breaks` passed to `arx_class_args_list()` correspond to 2 bins: +`(-∞, 0.0357]` and `(0.0357, ∞)`. +The bins can be interpreted as: `death_rate` is decreasing/growing slowly, + or `death_rate` is growing quickly. + +The returned `predictions` assigns each state to one of the growth rate bins. 
+In this case, the classifier expects the growth rate for all 4 of the states to fall into the same category, +`(-∞, 0.0357]`. + +To see how this model performed, let's compare to the actual growth rates for the `target_date`, as computed using +`{epiprocess}`: + +```{r growth_rate_results} +growth_rates <- covid_case_death_rates |> + filter(geo_value %in% used_locations) |> + group_by(geo_value) |> + mutate( + deaths_gr = growth_rate(x = time_value, y = death_rate) + ) |> + ungroup() +growth_rates |> filter(time_value == "2021-08-14") +``` -Underneath the hood, this forecaster creates (and returns) an `epi_workflow`. -Essentially, this is a big S3 object that wraps up the 4 modular steps -(preprocessing - postprocessing) described above. +The accuracy is 50%, since all 4 states were predicted to be in the interval +`(-Inf, 0.0357]`, while two, `ca` and `ny` actually were. -### Preprocessing -Preprocessing is accomplished through a `recipe` (imagine baking a cake) as -provided in the [`{recipes}`](https://recipes.tidymodels.org) package. -We've made a few modifications (to handle -panel data) as well as added some additional options. The recipe gives a -specification of how to handle training data. Think of it like a fancified -`formula` that you would pass to `lm()`: `y ~ x1 + log(x2)`. In general, -there are 2 extensions to the `formula` that `{recipes}` handles: +## Handling multi-key panel data - 1. Doing transformations of both training and test data that can always be - applied. These are things like taking the log of a variable, leading or - lagging, filtering out rows, handling dummy variables, etc. - 2. Using statistics from the training data to eventually process test data. - This is a major benefit of `{recipes}`. It prevents what the tidy team calls - "data leakage". A simple example is centering a predictor by its mean. 
We - need to store the mean of the predictor from the training data and use that - value on the test data rather than accidentally calculating the mean of - the test predictor for centering. +If multiple keys are set in the `epi_df` as `other_keys`, `arx_forecaster` will +automatically group by those in addition to the required geographic key. +For example, predicting the number of graduates in a subset of the categories in +`grad_employ_subset` from above: -A recipe is processed in 2 steps, first it is "prepped". This calculates and -stores any intermediate statistics necessary for use on the test data. -Then it is "baked" -resulting in training data ready for passing into a statistical model (like `lm`). +```{r multi_key_forecast, warning=FALSE} +edu_quals <- c("Undergraduate degree", "Professional degree") +geo_values <- c("Quebec", "British Columbia") -We have introduced an `epi_recipe`. It's just a `recipe` that knows how to handle -the `time_value`, `geo_value`, and any additional keys so that these are available -when necessary. +grad_employ <- grad_employ_subset |> + filter(edu_qual %in% edu_quals, geo_value %in% geo_values) -The `epi_recipe` from `out_gb` can be extracted from the result: +grad_employ -```{r} -extract_recipe(out_gb$epi_workflow) +grad_forecast <- arx_forecaster( + grad_employ |> + filter(time_value < 2017), + outcome = "num_graduates", + predictors = c("num_graduates"), + args_list = arx_args_list( + lags = list(c(0, 1, 2)), + ahead = 1 + ) +) +# and plotting +autoplot( + grad_forecast$epi_workflow, + grad_forecast$predictions, + observed_response = grad_employ, +) + geom_vline(aes(xintercept = 2016)) ``` -The "Inputs" are the original `epi_df` and the "roles" that these are assigned. -None of these are predictors or outcomes. Those will be created -by the recipe when it is prepped. The "Operations" are the sequence of -instructions to create the cake (baked training data). 
-Here we create lagged predictors, lead the outcome, and then remove `NA`s. -Some models like `lm` internally handle `NA`s, but not everything does, so we -deal with them explicitly. The code to do this (inside the forecaster) is - -```{r} -er <- epi_recipe(jhu) %>% - step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(death_rate, ahead = 7) %>% - step_epi_naomit() +The 8 graphs represent all combinations of the `geo_values` (`"Quebec"` and `"British Columbia"`), `edu_quals` (`"Undergraduate degree"` and `"Professional degree"`), and age brackets (`"15 to 34 years"` and `"35 to 64 years"`). + +## Estimating models without geo-pooling + +The methods shown so far estimate a single model across all geographic regions, treating them as if they are independently and identically distributed (see [Mathematical description] for an explicit model example). +This is called "geo-pooling". +In the context of `{epipredict}`, the simplest way to avoid geo-pooling and use different parameters for each geography is to loop over the `geo_value`s: + +```{r fit_non_geo_pooled, warning=FALSE} +geo_values <- covid_case_death_rates |> + pull(geo_value) |> + unique() + +all_fits <- + purrr::map(geo_values, \(geo) { + covid_case_death_rates |> + filter( + geo_value == geo, + time_value <= forecast_date + ) |> + arx_forecaster( + outcome = "death_rate", + trainer = linear_reg(), + predictors = c("death_rate"), + args_list = arx_args_list( + lags = list(c(0, 7, 14)), + ahead = 14 + ) + ) + }) +all_fits |> + map(~ pluck(., "predictions")) |> + list_rbind() ``` -While `{recipes}` provides a function `step_lag()`, it assumes that the data -have no breaks in the sequence of `time_values`. This is a bit dangerous, so -we avoid that behaviour. Our `lag/ahead` functions also appropriately adjust the -amount of data to avoid accidentally dropping recent predictors from the test -data. 
- -### The model specification - -Users with familiarity with the `{parsnip}` package will have no trouble here. -Basically, `{parsnip}` unifies the function signature across statistical models. -For example, `lm()` "likes" to work with formulas, but `glmnet::glmnet()` uses -`x` and `y` for predictors and response. `{parsnip}` is agnostic. Both of these -do "linear regression". Above we switched from `lm()` to `xgboost()` without -any issue despite the fact that these functions couldn't be more different. - -```{r, eval = FALSE} -lm(formula, data, subset, weights, na.action, - method = "qr", - model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, - contrasts = NULL, offset, ... -) +Estimating separate models for each geography uses far less data for each estimate than geo-pooling and is 56 times slower[^7]. +If a dataset contains relatively few observations for each geography, fitting a geo-pooled model is likely to produce better, more stable results. +However, geo-pooling can only be used if values are comparable in meaning and scale across geographies or can be made comparable, for example by normalization. -xgboost( - data = NULL, label = NULL, missing = NA, weight = NULL, - params = list(), nrounds, verbose = 1, print_every_n = 1L, - early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, - save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), - ... -) -``` +If we wanted to build a geo-aware model, such as a linear regression with a +different intercept for each geography, we would need to build a [custom +workflow](custom_epiworkflows) with geography as a factor. -`{epipredict}` provides a few engines/modules (the flatline forecaster and -quantile regression), but you should be able to use any available models -listed [here](https://www.tidymodels.org/find/parsnip/). +# Anatomy of a canned forecaster -To estimate (fit) a preprocessed model, one calls `fit()` on the `epi_workflow`. 
+This section describes the object returned by `arx_forecaster()`: a fairly minimal description of the mathematical model used, followed by a description of the components of an `arx_fcast` object. -```{r} -ewf <- epi_workflow(er, linear_reg()) %>% fit(jhu) -``` +## Mathematical description -### Postprocessing +Let's look at the mathematical model in more detail, using a minimal version of +`four_week_ahead`: -To stretch the metaphor of preparing a cake to its natural limits, we have -created postprocessing functionality called "frosting". Much like the recipe, -each postprocessing operation is a "layer" and we "slather" these onto our -baked cake. To fix ideas, below is the postprocessing `frosting` for -`arx_forecaster()` +```{r, four_week_again} +four_week_small <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + args_list = arx_args_list( + lags = list(c(0, 7, 14), c(0, 7, 14)), + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) + ) +) +hardhat::extract_fit_engine(four_week_small$epi_workflow) +``` -```{r} -extract_frosting(out_q$epi_workflow) +If $d_{t,j}$ is the death rate on day $t$ at location $j$ and $c_{t,j}$ is the +associated case rate, then the corresponding model is: + +$$ +\begin{aligned} +d_{t+28, j} = & a_0 + a_1 d_{t,j} + a_2 d_{t-7,j} + a_3 d_{t-14, j} +\\ + & a_4 c_{t, j} + a_5 c_{t-7, j} + a_6 c_{t-14, j} + \varepsilon_{t,j}. +\end{aligned} +$$ + +For example, $a_1$ is `lag_0_death_rate` above, with a value of +`r round(hardhat::extract_fit_engine(four_week_small$epi_workflow)$coefficients["lag_0_death_rate"],3)`, +while $a_5$ is +`r round(hardhat::extract_fit_engine(four_week_small$epi_workflow)$coefficients["lag_7_case_rate"],4) `. +Note that unlike $d_{t,j}$ or $c_{t,j}$, these *don't* depend on either the time +$t$ or the location $j$. +This is what makes it a geo-pooled model. 
+ + +The training data for estimating the parameters of this linear model is +constructed within the `arx_forecaster()` function by shifting a series of +columns the appropriate amount -- based on the requested `lags`. +Each row containing no `NA` values in the predictors is used as a training +observation to fit the coefficients $a_0,\ldots, a_6$. + +The equation above is only an accurate description of the model for a linear +engine like `quantile_reg()` or `linear_reg()`; a nonlinear model like +`rand_forest(mode = "regression")` will use the same input variables and +training data, but fit the appropriate model for them. + +## Code object +Let's dissect the forecaster we trained back on the [landing +page](../index.html#motivating-example): + +```{r make-four-forecasts, warning=FALSE} +four_week_ahead <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + args_list = arx_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) + ) +) ``` -Here we have 5 layers of frosting. The first generates the forecasts from the test data. -The second uses quantiles of the residuals to create distributional -forecasts. The next two add columns for the date the forecast was made and the -date for which it is intended to occur. Because we are predicting rates, they -should be non-negative, so the last layer thresholds both predicted values and -intervals at 0. The code to do this (inside the forecaster) is +`four_week_ahead` has three components: an `epi_workflow`, a table of +`predictions`, and a list of `metadata`. 
+The table of predictions is a simple tibble, -```{r} -f <- frosting() %>% - layer_predict() %>% - layer_residual_quantiles( - quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99), - symmetrize = TRUE - ) %>% - layer_add_forecast_date() %>% - layer_add_target_date() %>% - layer_threshold(starts_with(".pred")) +```{r show_predictions} +four_week_ahead$predictions ``` -At predict time, we add this object onto the `epi_workflow` and call `forecast()` +where `.pred` gives the point/median prediction, and `.pred_distn` is a +`hardhat::quantile_pred()` object representing a distribution through various quantile +levels. +The `5` in `<qtls(5)>` refers to the number of quantiles that have been +explicitly created, while the `[0.234]` is the median value[^4]. +By default, `.pred_distn` covers the quantiles `c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)`. + +The `epi_workflow` is a significantly more complicated object, extending a +`workflows::workflow()` to include post-processing steps: -```{r, warning=FALSE} -ewf %>% - add_frosting(f) %>% - forecast() +```{r show_workflow} +four_week_ahead$epi_workflow +``` -The above `get_test_data()` function examines the recipe and ensures that enough -test data is available to create the necessary lags and produce a prediction -for the desired future time point (after the end of the training data). This mimics -what would happen if `jhu` contained the most recent available historical data and -we wanted to actually predict the future. We could have instead used any test data -that contained the necessary predictors. +An `epi_workflow()` consists of 3 parts: +- `preprocessor`: a collection of steps that transform the data to be ready for + modelling. Steps can be custom, as are those included in this package, + or [be defined in `{recipes}`](https://recipes.tidymodels.org/reference/index.html). 
+ `four_week_ahead` has 5 steps; you can inspect them more closely by + running `hardhat::extract_recipe(four_week_ahead$epi_workflow)`.[^6] +- `spec`: a `parsnip::model_spec` which includes both the model parameters and + an engine to fit those parameters to the training data as prepared by + `preprocessor`. `four_week_ahead` uses the default of + `parsnip::linear_reg()`, which is a `{parsnip}` wrapper for several linear + regression engines, by default `stats::lm()`. You can inspect the model more + closely by running + `hardhat::extract_fit_engine(four_week_ahead$epi_workflow)`. +- `postprocessor`: a collection of layers to be applied to the resulting + forecast. Layers are internal to this package. `four_week_ahead` just so happens to have + 5 of these as well. You can inspect the layers more closely by running + `epipredict::extract_layers(four_week_ahead$epi_workflow)`. -## Conclusion +See the [Custom Epiworkflows vignette](custom_epiworkflows) for recreating and then +extending `four_week_ahead` using the custom forecaster framework. -Internally, we provide some simple functions to create reasonable forecasts. -But ideally, a user could create their own forecasters by building up the -components we provide. In other vignettes, we try to walk through some of these -customizations. -To illustrate everything above, here is (roughly) the code for the -`flatline_forecaster()` applied to the `case_rate`. +[^4]: in the case of a `{parsnip}` engine which doesn't explicitly predict + quantiles, these quantiles are created using `layer_residual_quantiles()`, + which infers the quantiles from the residuals of the fit. 
-```{r} -r <- epi_recipe(jhu) %>% - step_epi_ahead(case_rate, ahead = 7, skip = TRUE) %>% - update_role(case_rate, new_role = "predictor") %>% - add_role(all_of(key_colnames(jhu)), new_role = "predictor") - -f <- frosting() %>% - layer_predict() %>% - layer_residual_quantiles() %>% - layer_add_forecast_date() %>% - layer_add_target_date() %>% - layer_threshold(starts_with(".pred")) - -eng <- linear_reg() %>% set_engine("flatline") -wf <- epi_workflow(r, eng, f) %>% fit(jhu) -preds <- forecast(wf) -``` +[^5]: in the case of `arx_forecaster`, this is any model with + `mode="regression"` from [this + list](https://www.tidymodels.org/find/parsnip/). -All that really differs from the `arx_forecaster()` is the `recipe`, the -test data, and the engine. The `frosting` is identical, as is the fitting -and predicting procedure. +[^6]: alternatively, for an unfit version of the preprocessor, you can call + `hardhat::extract_preprocessor(four_week_ahead$epi_workflow)` -```{r} -preds -``` +[^7]: the number of geographies + +[^8]: It has only a year of data, which is barely enough to run the method without errors, let alone get a meaningful prediction. +[^9]: Though not 28 weeks into the future! Such a forecast will likely be absurdly low or high. 
diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd index 0dea322f2..1396ebba7 100644 --- a/vignettes/panel-data.Rmd +++ b/vignettes/panel-data.Rmd @@ -7,14 +7,8 @@ vignette: > %\VignetteEncoding{UTF-8} --- -```{r setup, include=F} -knitr::opts_chunk$set( - echo = TRUE, - collapse = TRUE, - comment = "#>", - out.width = "90%", - fig.align = "center" -) +```{r, include = FALSE} +source(here::here("vignettes/_common.R")) ``` ```{r libraries, warning=FALSE, message=FALSE} @@ -24,30 +18,30 @@ library(parsnip) library(recipes) library(epiprocess) library(epipredict) +library(epidatasets) library(ggplot2) theme_set(theme_bw()) ``` -[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, -contain cross-sectional measurements of subjects over time. The `epipredict` -package is most suitable for running forecasters on epidemiological panel data. -A built-in example of this is the [`case_death_rate_subset`]( - https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html) +[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, +contain cross-sectional measurements of subjects over time. The `epipredict` +package is most suitable for running forecasters on epidemiological panel data. +An example of this is the [`covid_case_death_rates`]( + https://cmu-delphi.github.io/epidatasets/reference/covid_case_death_rates.html) dataset, which contains daily state-wise measures of `case_rate` and `death_rate` for COVID-19 in 2021: ```{r epi-panel-ex, include=T} -head(case_death_rate_subset, 3) +head(covid_case_death_rates, 3) ``` -`epipredict` functions work with data in -[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html) -format. 
Despite the stated goal and name of the package, other panel datasets -are also valid candidates for `epipredict` functionality, as long as they are +`epipredict` functions work with data in +[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html) +format. Despite the stated goal and name of the package, other panel datasets +are also valid candidates for `epipredict` functionality, as long as they are in `epi_df` format. ```{r employ-stats, include=F} -data("grad_employ_subset") year_start <- min(grad_employ_subset$time_value) year_end <- max(grad_employ_subset$time_value) ``` @@ -55,36 +49,36 @@ year_end <- max(grad_employ_subset$time_value) # Example panel data overview In this vignette, we will demonstrate using `epipredict` with employment panel -data from Statistics Canada. We will be using +data from Statistics Canada. We will be using [ - Table 37-10-0115-01: Characteristics and median employment income of - longitudinal cohorts of postsecondary graduates two and five years after - graduation, by educational qualification and field of study (primary + Table 37-10-0115-01: Characteristics and median employment income of + longitudinal cohorts of postsecondary graduates two and five years after + graduation, by educational qualification and field of study (primary groupings) -](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501). - -The full dataset contains yearly median employment income two and five years -after graduation, and number of graduates. The data is stratified by -variables such as geographic region (Canadian province), education, and -age group. The year range of the dataset is `r year_start` to `r year_end`, -inclusive. The full dataset also contains metadata that describes the -quality of data collected. For demonstration purposes, we make the following +](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501). 
+ +The full dataset contains yearly median employment income two and five years +after graduation, and number of graduates. The data is stratified by +variables such as geographic region (Canadian province), education, and +age group. The year range of the dataset is `r year_start` to `r year_end`, +inclusive. The full dataset also contains metadata that describes the +quality of data collected. For demonstration purposes, we make the following modifications to get a subset of the full dataset: -* Only keep provincial-level geographic region (the full data also has +* Only keep provincial-level geographic region (the full data also has "Canada" as a region) * Only keep "good" or better quality data rows, as indicated by the [`STATUS`]( https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column -* Choose a subset of covariates and aggregate across the remaining ones. The +* Choose a subset of covariates and aggregate across the remaining ones. The chosen covariates are age group, and educational qualification. -To use this data with `epipredict`, we need to convert it into `epi_df` format +To use this data with `epipredict`, we need to convert it into `epi_df` format using `epiprocess::as_epi_df()` with additional keys. In our case, the additional keys are `age_group`, -and `edu_qual`. Note that in the above modifications, we encoded `time_value` +and `edu_qual`. Note that in the above modifications, we encoded `time_value` as type `integer`. This lets us set `time_type = "year"`, and ensures that lag and ahead modifications later on are using the correct time units. See the -`epiprocess::epi_df` for +`epiprocess::epi_df` for a list of all the `time_type`s available. ```{r data-dim, include=F} @@ -92,20 +86,20 @@ employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",") employ_colcount <- length(names(grad_employ_subset)) ``` -Now, we are ready to use `grad_employ_subset` with `epipredict`. 
+Now, we are ready to use `grad_employ_subset` with `epipredict`. Our `epi_df` contains `r employ_rowcount` rows and `r employ_colcount` columns. Here is a quick summary of the columns in our `epi_df`: -* `time_value` (time value): year in `date` format +* `time_value` (time value): year in `date` format * `geo_value` (geo value): province in Canada -* `num_graduates` (raw, time series value): number of graduates -* `med_income_2y` (raw, time series value): median employment income 2 years -after graduation -* `med_income_5y` (raw, time series value): median employment income 5 years -after graduation -* `age_group` (key): one of two age groups, either 15 to 34 years, or 35 to 64 -years -* `edu_qual` (key): one of 32 unique educational qualifications, e.g., +* `num_graduates` (raw, time series value): number of graduates +* `med_income_2y` (raw, time series value): median employment income 2 years +after graduation +* `med_income_5y` (raw, time series value): median employment income 5 years +after graduation +* `age_group` (key): one of two age groups, either 15 to 34 years, or 35 to 64 +years +* `edu_qual` (key): one of 32 unique educational qualifications, e.g., "Master's diploma" ```{r preview-data, include=T} @@ -114,18 +108,18 @@ employ <- grad_employ_subset sample_n(employ, 6) ``` -In the following sections, we will go over pre-processing the data in the -`epi_recipe` framework, and fitting a model and making predictions within the +In the following sections, we will go over pre-processing the data in the +`epi_recipe` framework, and estimating a model and making predictions within the `epipredict` framework and using the package's canned forecasters. -# Autoregressive (AR) model to predict number of graduates in a year +# Autoregressive (AR) model to predict number of graduates in a year -## Pre-processing +## Pre-processing -As a simple example, let's work with the `num_graduates` column for now. 
We will -first pre-process by standardizing each numeric column by the total within +As a simple example, let's work with the `num_graduates` column for now. We will +first pre-process by standardizing each numeric column by the total within each group of keys. We do this since those raw numeric values will vary greatly -from province to province since there are large differences in population. +from province to province since there are large differences in population. ```{r employ-small, include=T} employ_small <- employ %>% @@ -141,9 +135,9 @@ employ_small <- employ %>% head(employ_small) ``` -Below is a visualization for a sample of the small data for British Columbia and Ontario. -Note that some groups -do not have any time series information since we filtered out all time series +Below is a visualization for a sample of the small data for British Columbia and Ontario. +Note that some groups +do not have any time series information since we filtered out all time series with incomplete dates. ```{r employ-small-graph, include=T, eval=T, fig.width=9, fig.height=6} @@ -161,32 +155,32 @@ employ_small %>% theme(legend.position = "bottom") ``` -We will predict the standardized number of graduates (a proportion) in the -next year (time $t+1$) using an autoregressive model with three lags (i.e., an -AR(3) model). Such a model is represented algebraically like this: +We will predict the standardized number of graduates (a proportion) in the +next year (time $t+1$) using an autoregressive model with three lags (i.e., an +AR(3) model). Such a model is represented algebraically like this: \[ - y_{t+1,ijk} = + y_{t+1,ijk} = \alpha_0 + \alpha_1 y_{tijk} + \alpha_2 y_{t-1,ijk} + \alpha_3 y_{t-2,ijk} + \epsilon_{tijk} \] where $y_{tij}$ is the proportion of graduates at time $t$ in location $i$ and age group $j$ with education quality $k$. 
-In the pre-processing step, we need to create additional columns in `employ` for -each of $y_{t+1,ijk}$, $y_{tijk}$, $y_{t-1,ijk}$, and $y_{t-2,ijk}$. -We do this via an -`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these -outcome and predictor columns; the recipe just stores the instructions for -adding them. +In the pre-processing step, we need to create additional columns in `employ` for +each of $y_{t+1,ijk}$, $y_{tijk}$, $y_{t-1,ijk}$, and $y_{t-2,ijk}$. +We do this via an +`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these +outcome and predictor columns; the recipe just stores the instructions for +adding them. -Our `epi_recipe` should add one `ahead` column representing $y_{t+1,ijk}$ and +Our `epi_recipe` should add one `ahead` column representing $y_{t+1,ijk}$ and 3 `lag` columns representing $y_{tijk}$, $y_{t-1,ijk}$, and $y_{t-2,ijk}$ (it's more accurate to think of the 0th "lag" as the "current" value with 2 lags, -but that's not quite how the processing works). -Also note that +but that's not quite how the processing works). +Also note that since we specified our `time_type` to be `year`, our `lag` and `lead` -values are both in years. +values are both in years. ```{r make-recipe, include=T, eval=T} r <- epi_recipe(employ_small) %>% @@ -196,7 +190,7 @@ r <- epi_recipe(employ_small) %>% r ``` -Let's apply this recipe using `prep` and `bake` to generate and view the `lag` +Let's apply this recipe using `prep` and `bake` to generate and view the `lag` and `ahead` columns. 
```{r view-preprocessed, include=T} @@ -211,25 +205,25 @@ bake_and_show_sample <- function(recipe, data, n = 5) { r %>% bake_and_show_sample(employ_small) ``` -We can see that the `prep` and `bake` steps created new columns according to +We can see that the `prep` and `bake` steps created new columns according to our `epi_recipe`: - `ahead_1_num_graduates_prop` corresponds to $y_{t+1,ijk}$ -- `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and -`lag_2_num_graduates_prop` correspond to $y_{tijk}$, $y_{t-1,ijk}$, and $y_{t-2,ijk}$ +- `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and +`lag_2_num_graduates_prop` correspond to $y_{tijk}$, $y_{t-1,ijk}$, and $y_{t-2,ijk}$ respectively. -## Model fitting and prediction +## Model estimation and prediction -Since our goal for now is to fit a simple autoregressive model, we can use +Since our goal for now is to estimate a simple autoregressive model, we can use [`parsnip::linear_reg()`]( - https://parsnip.tidymodels.org/reference/linear_reg.html) with the default -engine `lm`, which fits a linear regression using ordinary least squares. + https://parsnip.tidymodels.org/reference/linear_reg.html) with the default +engine `lm`, which fits a linear regression using ordinary least squares. -We will use `epi_workflow` with the `epi_recipe` we defined in the -pre-processing section along with the `parsnip::linear_reg()` model. Note -that `epi_workflow` is a container and doesn't actually do the fitting. We have -to pass the workflow into `fit()` to get our estimated model coefficients +We will use `epi_workflow` with the `epi_recipe` we defined in the +pre-processing section along with the `parsnip::linear_reg()` model. Note +that `epi_workflow` is a container and doesn't actually do the fitting. We have +to pass the workflow into `fit()` to get our estimated model coefficients $\widehat{\alpha}_i,\ i=0,...,3$. 
```{r linearreg-wf, include=T} @@ -238,23 +232,23 @@ wf_linreg <- epi_workflow(r, linear_reg()) %>% summary(extract_fit_engine(wf_linreg)) ``` -This output tells us the coefficients of the fitted model; for instance, -the estimated intercept is $\widehat{\alpha}_0 =$ -`r round(coef(extract_fit_engine(wf_linreg))[1], 3)` and the coefficient for -$y_{tijk}$ is -$\widehat\alpha_1 =$ `r round(coef(extract_fit_engine(wf_linreg))[2], 3)`. -The summary also tells us that all estimated coefficients are significantly -different from zero. Extracting the 95% confidence intervals for the +This output tells us the coefficients of the fitted model; for instance, +the estimated intercept is $\widehat{\alpha}_0 =$ +`r round(coef(hardhat::extract_fit_engine(wf_linreg))[1], 3)` and the coefficient for +$y_{tijk}$ is +$\widehat\alpha_1 =$ `r round(coef(hardhat::extract_fit_engine(wf_linreg))[2], 3)`. +The summary also tells us that all estimated coefficients are significantly +different from zero. Extracting the 95% confidence intervals for the coefficients also leads us to the same conclusion: all the coefficient estimates are significantly different -from 0. +from 0. ```{r} confint(extract_fit_engine(wf_linreg)) ``` -Now that we have our workflow, we can generate predictions from a subset of our -data. For this demo, we will predict the number of graduates using the last 2 +Now that we have our workflow, we can generate predictions from a subset of our +data. For this demo, we will predict the number of graduates using the last 2 years of our dataset. ```{r linearreg-predict, include=T} @@ -264,17 +258,17 @@ preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred)) preds %>% sample_n(5) ``` -We can do this using the `augment` function too. Note that `predict` and -`augment` both still return an `epiprocess::epi_df` with all of the keys that +We can do this using the `augment` function too. 
Note that `predict` and +`augment` both still return an `epiprocess::epi_df` with all of the keys that were present in the original dataset. ```{r linearreg-augment} augment(wf_linreg, latest) %>% sample_n(5) ``` -## Model diagnostics +## Model diagnostics -First, we'll plot the residuals (that is, $y_{tijk} - \widehat{y}_{tijk}$) +First, we'll plot the residuals (that is, $y_{tijk} - \widehat{y}_{tijk}$) against the fitted values ($\widehat{y}_{tijk}$). ```{r lienarreg-resid-plot, include=T, fig.height = 5, fig.width = 5} @@ -282,35 +276,35 @@ par(mfrow = c(2, 2), mar = c(5, 3, 1.2, 0)) plot(extract_fit_engine(wf_linreg)) ``` -The fitted values vs. residuals plot shows us that the residuals are mostly -clustered around zero, but do not form an even band around the zero line, +The fitted values vs. residuals plot shows us that the residuals are mostly +clustered around zero, but do not form an even band around the zero line, indicating that the variance of the residuals is not constant. Additionally, -the fitted values vs. square root of standardized residuals makes this more -obvious - the spread of the square root of standardized residuals varies with -the fitted values. +the fitted values vs. square root of standardized residuals makes this more +obvious - the spread of the square root of standardized residuals varies with +the fitted values. -The Q-Q plot shows us that the residuals have heavier tails than a Normal +The Q-Q plot shows us that the residuals have heavier tails than a Normal distribution. So the normality of residuals assumption doesn't hold either. Finally, the residuals vs. leverage plot shows us that we have a few influential -points based on the Cook's distance (those outside the red dotted line). +points based on the Cook's distance (those outside the red dotted line). 
-Since we appear to be violating the linear model assumptions, we might consider -transforming our data differently, or considering a non-linear model, or +Since we appear to be violating the linear model assumptions, we might consider +transforming our data differently, or considering a non-linear model, or something else. # AR model with exogenous inputs -Now suppose we want to model the 1-step-ahead 5-year employment income using -current and two previous values, while -also incorporating information from the other two time-series in our dataset: -the 2-year employment income and the number of graduates in the previous 2 +Now suppose we want to model the 1-step-ahead 5-year employment income using +current and two previous values, while +also incorporating information from the other two time-series in our dataset: +the 2-year employment income and the number of graduates in the previous 2 years. We would do this using an autoregressive model with exogenous inputs, defined as follows: \[ \begin{aligned} - y_{t+1,ijk} &= + y_{t+1,ijk} &= \alpha_0 + \alpha_1 y_{tijk} + \alpha_2 y_{t-1,ijk} + \alpha_3 y_{t-2,ijk}\\ &\quad + \beta_1 x_{tijk} + \beta_2 x_{t-1,ijk}\\ &\quad + \gamma_2 z_{tijk} + \gamma_2 z_{t-1,ijk} + \epsilon_{tijk} @@ -318,11 +312,11 @@ defined as follows: \] where $y_{tijk}$ is the 5-year median income (proportion) at time $t$ (in -location $i$, age group $j$ with education quality $k$), -$x_{tijk}$ is the 2-year median income (proportion) at time $t$, and -$z_{tijk}$ is the number of graduates (proportion) at time $t$. +location $i$, age group $j$ with education quality $k$), +$x_{tijk}$ is the 2-year median income (proportion) at time $t$, and +$z_{tijk}$ is the number of graduates (proportion) at time $t$. -## Pre-processing +## Pre-processing Again, we construct an `epi_recipe` detailing the pre-processing steps. 
@@ -339,21 +333,21 @@ rx <- epi_recipe(employ_small) %>% bake_and_show_sample(rx, employ_small) ``` -## Model fitting & post-processing +## Model estimation & post-processing -Before fitting our model and making predictions, let's add some post-processing +Before estimating our model and making predictions, let's add some post-processing steps using a few [`frosting`]( - https://cmu-delphi.github.io/epipredict/reference/frosting.html) layers to do + https://cmu-delphi.github.io/epipredict/reference/frosting.html) layers to do a few things: -1. Threshold our predictions to 0. We are predicting proportions, which can't -be negative. And the transformed values back to dollars and people can't be +1. Threshold our predictions to 0. We are predicting proportions, which can't +be negative. And the transformed values back to dollars and people can't be negative either. -1. Generate prediction intervals based on residual quantiles, allowing us to +1. Generate prediction intervals based on residual quantiles, allowing us to quantify the uncertainty associated with future predicted values. -1. Convert our predictions back to income values and number of graduates, -rather than standardized proportions. We do this via the frosting layer -`layer_population_scaling()`. +1. Convert our predictions back to income values and number of graduates, +rather than standardized proportions. We do this via the frosting layer +`layer_population_scaling()`. ```{r custom-arx-post, include=T} @@ -370,7 +364,6 @@ f <- frosting() %>% layer_threshold(.pred, lower = 0) %>% # 90% prediction interval layer_residual_quantiles( - quantile_levels = c(0.1, 0.9), symmetrize = FALSE ) %>% layer_population_scaling( @@ -406,28 +399,28 @@ predsx %>% # Using canned forecasters -We've seen what we can do with non-epidemiological panel data using the -recipes frame, with `epi_recipe` for pre-processing, `epi_workflow` for model -fitting, and `frosting` for post-processing. 
+We've seen what we can do with non-epidemiological panel data using the +recipes framework, with `epi_recipe` for pre-processing, `epi_workflow` for model +fitting, and `frosting` for post-processing. -`epipredict` also comes with canned forecasters that do all of those steps -behind the scenes for some simple models. Even though we aren't working with +`epipredict` also comes with canned forecasters that do all of those steps +behind the scenes for some simple models. Even though we aren't working with epidemiological data, canned forecasters still work as expected, out of the box. We will demonstrate this with the simple [`flatline_forecaster`]( - https://cmu-delphi.github.io/epipredict/reference/flatline_forecaster.html) -and the direct autoregressive (AR) forecaster + https://cmu-delphi.github.io/epipredict/reference/flatline_forecaster.html) +and the direct autoregressive (AR) forecaster [`arx_forecaster`]( https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html). -For both illustrations, we will continue to use the `employ_small` dataset -with the transformed numeric columns that are proportions within each group +For both illustrations, we will continue to use the `employ_small` dataset +with the transformed numeric columns that are proportions within each group by the keys in our `epi_df`. -## Flatline forecaster +## Flatline forecaster In this first example, we'll use `flatline_forecaster` to make a simple -prediction of the 2-year median income for the next year, based on one previous +prediction of the 2-year median income for the next year, based on one previous time point. This model is representated algebraically as: \[y_{t+1,ijk} = y_{tijk} + \epsilon_{tijk}\] where $y_{tijk}$ is the 2-year median income (proportion) at time $t$. 
@@ -442,14 +435,14 @@ out_fl ## Autoregressive forecaster with exogenous inputs -In this second example, we'll use `arx_forecaster` to make a prediction of the +In this second example, we'll use `arx_forecaster` to make a prediction of the 5-year median income based using two lags, _and_ using two lags on two exogenous -variables: 2-year median income and number of graduates. +variables: 2-year median income and number of graduates. -The canned forecaster gives us a simple way of making this forecast since it -defines the recipe, workflow, and post-processing steps behind the scenes. This -is very similar to the model we introduced in the "Autoregressive Linear Model -with Exogenous Inputs" section of this article, but where all inputs have the +The canned forecaster gives us a simple way of making this forecast since it +defines the recipe, workflow, and post-processing steps behind the scenes. This +is very similar to the model we introduced in the "Autoregressive Linear Model +with Exogenous Inputs" section of this article, but where all inputs have the same number of lags. ```{r arx-lr, include=T, warning=F} @@ -463,7 +456,7 @@ out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop", out_arx_lr ``` -Other changes to the direct AR forecaster, like changing the engine, also work +Other changes to the direct AR forecaster, like changing the engine, also work as expected. Below we use a boosted tree model instead of a linear regression. ```{r arx-rf, include=T, warning=F} @@ -479,7 +472,7 @@ out_arx_rf # Conclusion -While the purpose of `{epipredict}` is to allow `{tidymodels}` to operate on +While the purpose of `{epipredict}` is to allow `{tidymodels}` to operate on epidemiology data, it can be easily adapted (both the workflows and the canned forecasters) to work for generic panel data modelling. 
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd deleted file mode 100644 index 63a27bd55..000000000 --- a/vignettes/preprocessing-and-models.Rmd +++ /dev/null @@ -1,601 +0,0 @@ ---- -title: Examples of Preprocessing and Models -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Examples of Preprocessing and Models} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set( - echo = TRUE, - collapse = TRUE, - comment = "#>", - out.width = "100%" -) -``` - -## Introduction - -The `epipredict` package utilizes the `tidymodels` framework, namely -[`{recipes}`](https://recipes.tidymodels.org/) for -[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences -of feature engineering and [`{parsnip}`](https://parsnip.tidymodels.org/) for a -unified interface to a range of models. - -`epipredict` has additional customized feature engineering and preprocessing -steps, such as `step_epi_lag()`, `step_population_scaling()`, -`step_epi_naomit()`. They can be used along with -steps from the `{recipes}` package for more feature engineering. - -In this vignette, we will illustrate some examples of how to use `epipredict` -with `recipes` and `parsnip` for different purposes of epidemiological forecasting. -We will focus on basic autoregressive models, in which COVID cases and -deaths in the near future are predicted using a linear combination of cases and -deaths in the near past. - -The remaining vignette will be split into three sections. The first section, we -will use a Poisson regression to predict death counts. In the second section, -we will use a linear regression to predict death rates. Last but not least, we -will create a classification model for hotspot predictions. 
- -```{r, warning=FALSE, message=FALSE} -library(tidyr) -library(dplyr) -library(epidatr) -library(epipredict) -library(recipes) -library(workflows) -library(poissonreg) -``` - -## Poisson Regression - -During COVID-19, the US Center for Disease Control and Prevention (CDC) collected -models -and forecasts to characterize the state of an outbreak and its course. They use -it to inform public health decision makers on potential consequences of -deploying control measures. - -One of the outcomes that the CDC forecasts is -[death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html). -Although there are many state-of-the-art models, we choose to use Poisson -regression, the textbook example for modeling count data, as an illustration -for using the `epipredict` package with other existing tidymodels packages. - -```{r poisson-reg-data} -x <- pub_covidcast( - source = "jhu-csse", - signals = "confirmed_incidence_num", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" -) %>% - select(geo_value, time_value, cases = value) - -y <- pub_covidcast( - source = "jhu-csse", - signals = "deaths_incidence_num", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" -) %>% - select(geo_value, time_value, deaths = value) - -counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>% - as_epi_df() -``` - -The `counts_subset` dataset comes from the `epidatr` package, and -contains the number of confirmed cases and deaths from June 4, 2021 to -Dec 31, 2021 in some U.S. states. - -We wish to predict the 7-day ahead death counts with lagged cases and deaths. -Furthermore, we will let each state be a dummy variable. Using differential -intercept coefficients, we can allow for an intercept shift between states. 
- -The model takes the form -\begin{aligned} -\log\left( \mu_{t+7} \right) &= \beta_0 + \delta_1 s_{\text{state}_1} + -\delta_2 s_{\text{state}_2} + \cdots + \nonumber \\ -&\quad\beta_1 \text{deaths}_{t} + -\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} + -\beta_4 \text{cases}_{t-7}, -\end{aligned} -where $\mu_{t+7} = \mathbb{E}(y_{t+7})$, and $y_{t+7}$ is assumed to follow a -Poisson distribution with mean $\mu_{t+7}$; $s_{\text{state}}$ are dummy -variables for each state and take values of either 0 or 1. - -Preprocessing steps will be performed to prepare the -data for model fitting. But before diving into them, it will be helpful to -understand what `roles` are in the `recipes` framework. - ---- - -#### Aside on `recipes` - -`recipes` can assign one or more roles to each column in the data. The roles -are not restricted to a predefined set; they can be anything. -For most conventional situations, they are typically “predictor” and/or -"outcome". Additional roles enable targeted `step_*()` operations on specific -variables or groups of variables. - -In our case, the role `predictor` is given to explanatory variables on the -right-hand side of the model (in the equation above). -The role `outcome` is the response variable -that we wish to predict. `geo_value` and `time_value` are predefined roles -that are unique to the `epipredict` package. Since we work with `epi_df` -objects, all datasets should have `geo_value` and `time_value` passed through -automatically with these two roles assigned to the appropriate columns in the data. - -The `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) -in bulk. There are a few handy functions that can be used together to help us -manipulate variable roles easily. - -> `update_role()` alters an existing role in the recipe or assigns an initial role -> to variables that do not yet have a declared role. 
-> -> `add_role()` adds an additional role to variables that already have a role in -> the recipe, without overwriting old roles. -> -> `remove_role()` eliminates a single existing role in the recipe. - -#### End aside - ---- - -Notice in the following preprocessing steps, we used `add_role()` on -`geo_value_factor` since, currently, the default role for it is `raw`, but -we would like to reuse this variable as `predictor`s. - -```{r} -counts_subset <- counts_subset %>% - mutate(geo_value_factor = as.factor(geo_value)) %>% - as_epi_df() - -epi_recipe(counts_subset) - -r <- epi_recipe(counts_subset) %>% - add_role(geo_value_factor, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% - ## Occasionally, data reporting errors / corrections result in negative - ## cases / deaths - step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>% - step_epi_lag(cases, deaths, lag = c(0, 7)) %>% - step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() -``` - -After specifying the preprocessing steps, we will use the `parsnip` package for -modeling and producing the prediction for death count, 7 days after the -latest available date in the dataset. - -```{r} -latest <- get_test_data(r, counts_subset) - -wf <- epi_workflow(r, parsnip::poisson_reg()) %>% - fit(counts_subset) - -predict(wf, latest) %>% filter(!is.na(.pred)) -``` - -Note that the `time_value` corresponds to the last available date in the -training set, **NOT** to the target date of the forecast -(`r max(latest$time_value) + 7`). - -Let's take a look at the fit: - -```{r} -extract_fit_engine(wf) -``` - -Up to now, we've used the Poisson regression to model count data. Poisson -regression can also be used to model rate data, such as case rates or death -rates, by incorporating offset terms in the model. 
- -To model death rates, the Poisson regression would be expressed as: -\begin{aligned} -\log\left( \mu_{t+7} \right) &= \log(\text{population}) + -\beta_0 + \delta_1 s_{\text{state}_1} + -\delta_2 s_{\text{state}_2} + \cdots + \nonumber \\ -&\quad\beta_1 \text{deaths}_{t} + -\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} + -\beta_4 \text{cases}_{t-7} -\end{aligned} -where $\log(\text{population})$ is the log of the state population that was -used to scale the count data on the left-hand side of the equation. This offset -is simply a predictor with coefficient fixed at 1 rather than estimated. - -There are several ways to model rate data given count and population data. -First, in the `parsnip` framework, we could specify the formula in `fit()`. -However, by doing so we lose the ability to use the `recipes` framework to -create new variables since variables that do not exist in the -original dataset (such as, here, the lags and leads) cannot be called directly in `fit()`. - -Alternatively, `step_population_scaling()` and `layer_population_scaling()` -in the `epipredict` package can perform the population scaling if we provide the -population data, which we will illustrate in the next section. - -## Linear Regression - -For COVID-19, the CDC required submission of case and death count predictions. -However, the Delphi Group preferred to train on rate data instead, because it -puts different locations on a similar scale (eliminating the need for location-specific intercepts). -We can use a liner regression to predict the death -rates and use state population data to scale the rates to counts.[^pois] We will do so -using `layer_population_scaling()` from the `epipredict` package. - -[^pois]: We could continue with the Poisson model, but we'll switch to the Gaussian likelihood just for simplicity. - -Additionally, when forecasts are submitted, prediction intervals should be -provided along with the point estimates. 
This can be obtained via postprocessing -using -`layer_residual_quantiles()`. It is worth pointing out, however, that -`layer_residual_quantiles()` should be used before population scaling or else -the transformation will make the results uninterpretable. - -We wish, now, to predict the 7-day ahead death counts with lagged case rates and -death rates, along with some extra behaviourial predictors. Namely, we will use -survey data from -[COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators). - -The survey data provides the estimated percentage of people who wore a mask for -most or all of the time while in public in the past 7 days and the estimated -percentage of respondents who reported that all or most people they encountered -in public in the past 7 days maintained a distance of at least 6 feet. - -State-wise population data from the 2019 U.S. Census is included in this package -and will be used in `layer_population_scaling()`. - -```{r} -behav_ind_mask <- pub_covidcast( - source = "fb-survey", - signals = "smoothed_wwearing_mask_7d", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" -) %>% - select(geo_value, time_value, masking = value) - -behav_ind_distancing <- pub_covidcast( - source = "fb-survey", - signals = "smoothed_wothers_distanced_public", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" -) %>% - select(geo_value, time_value, distancing = value) - -pop_dat <- state_census %>% select(abbr, pop) - -behav_ind <- behav_ind_mask %>% - full_join(behav_ind_distancing, by = c("geo_value", "time_value")) -``` - -Rather than using raw mask-wearing / social-distancing metrics, for the sake -of illustration, we'll convert both into categorical predictors. 
- -```{r, echo=FALSE, message=FALSE,fig.align='center', fig.width=6, fig.height=4} -library(ggplot2) -behav_ind %>% - pivot_longer(masking:distancing) %>% - ggplot(aes(value, fill = geo_value)) + - geom_density(alpha = 0.5) + - scale_fill_brewer(palette = "Set1", name = "") + - theme_bw() + - scale_x_continuous(expand = c(0, 0)) + - scale_y_continuous(expand = expansion(c(0, .05))) + - facet_wrap(~name, scales = "free") + - theme(legend.position = "bottom") -``` - -We will take a subset of death rate and case rate data from the built-in dataset -`case_death_rate_subset`. - -```{r} -jhu <- filter( - case_death_rate_subset, - time_value >= "2021-06-04", - time_value <= "2021-12-31", - geo_value %in% c("ca", "fl", "tx", "ny", "nj") -) -``` - -Preprocessing steps will again rely on functions from the `epipredict` package as well -as the `recipes` package. -There are also many functions in the `recipes` package that allow for -[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), -such as log transformations and data centering. In our case, we will -center the numerical predictors to allow for a more meaningful interpretation of the -intercept. 
- -```{r} -jhu <- jhu %>% - mutate(geo_value_factor = as.factor(geo_value)) %>% - left_join(behav_ind, by = c("geo_value", "time_value")) %>% - as_epi_df() - -r <- epi_recipe(jhu) %>% - add_role(geo_value_factor, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% - step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% - step_mutate( - masking = cut_number(masking, 5), - distancing = cut_number(distancing, 5) - ) %>% - step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% - step_center(contains("lag"), role = "predictor") %>% - step_epi_naomit() -``` - -As a sanity check we can examine the structure of the training data: - -```{r, warning = FALSE} -glimpse(slice_sample(bake(prep(r, jhu), jhu), n = 6)) -``` - -Before directly predicting the results, we need to add postprocessing layers to -obtain the death counts instead of death rates. Note that the rates used so -far are "per 100K people" rather than "per person". We'll also use quantile -regression with the `quantile_reg` engine rather than ordinary least squares -to create median predictions and a 90% prediction interval. - -```{r, warning=FALSE} -f <- frosting() %>% - layer_predict() %>% - layer_add_target_date("2022-01-07") %>% - layer_threshold(.pred, lower = 0) %>% - layer_quantile_distn() %>% - layer_naomit(.pred) %>% - layer_population_scaling( - .pred, .pred_distn, - df = pop_dat, - rate_rescaling = 1e5, - by = c("geo_value" = "abbr"), - df_pop_col = "pop" - ) - -wf <- epi_workflow(r, quantile_reg(quantile_levels = c(.05, .5, .95))) %>% - fit(jhu) %>% - add_frosting(f) - -p <- forecast(wf) -p -``` - -The columns marked `*_scaled` have been rescaled to the correct units, in this -case `deaths` rather than deaths per 100K people (these remain in `.pred`). 
- -To look at the prediction intervals: - -```{r} -p %>% - select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>% - pivot_quantiles_wider(.pred_distn_scaled) -``` - -Last but not least, let's take a look at the regression fit and check the -coefficients: - -```{r, echo =FALSE} -extract_fit_engine(wf) -``` - -## Classification - -Sometimes it is preferable to create a predictive model for surges or upswings -rather than for raw values. In this case, -the target is to predict if the future will have increased case rates (denoted `up`), -decreased case rates (`down`), or flat case rates (`flat`) relative to the current -level. Such models may be -referred to as "hotspot prediction models". We will follow the analysis -in [McDonald, Bien, Green, Hu, et al.](#references) but extend the application -to predict three categories instead of two. - -Hotspot prediction uses a categorical outcome variable defined in terms of the -relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. -Where $Y_{\ell, t}$ denotes the case rates in location $\ell$ at time $t$. -We define the response variables as follows: - -$$ - Z_{\ell, t}= - \begin{cases} - \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ - \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.20\\ - \text{flat}, & \text{otherwise} - \end{cases} -$$ - -where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})\ /\ (Y_{\ell, t-7})$. -We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is -`up`, meaning the number of newly reported cases over the past 7 days has -increased by at least 25% compared to the preceding week. When $Z_{\ell,t}$ -is categorized as `down`, it suggests that there has been at least a 20% -decrease in newly reported cases over the past 7 days (a 20% decrease is the -inverse of a 25% increase). Otherwise, we will -consider the trend to be `flat`. 
- -The expression of the multinomial regression we will use is as follows: - -$$ -\pi_{j}(x) = \text{Pr}(Z_{\ell,t} = j|x) = \frac{e^{g_j(x)}}{1 + \sum_{k=1}^{2}e^{g_k(x)} } -$$ - -where $j$ is either down, flat, or up - -\begin{aligned} -g_{\text{down}}(x) &= 0.\\ -g_{\text{flat}}(x) &= \log\left(\frac{Pr(Z_{\ell,t}=\text{flat}\mid x)}{Pr(Z_{\ell,t}=\text{down}\mid x)}\right) = -\beta_{10} + \beta_{11} t + \delta_{10} s_{\text{state_1}} + -\delta_{11} s_{\text{state_2}} + \cdots \nonumber \\ -&\quad + \beta_{12} Y^{\Delta}_{\ell, t} + -\beta_{13} Y^{\Delta}_{\ell, t-7} + \beta_{14} Y^{\Delta}_{\ell, t-14}\\ -g_{\text{up}}(x) &= \log\left(\frac{Pr(Z_{\ell,t}=\text{up}\mid x)}{Pr(Z_{\ell,t}=\text{down} \mid x)}\right) = -\beta_{20} + \beta_{21}t + \delta_{20} s_{\text{state_1}} + -\delta_{21} s_{\text{state}\_2} + \cdots \nonumber \\ -&\quad + \beta_{22} Y^{\Delta}_{\ell, t} + -\beta_{23} Y^{\Delta}_{\ell, t-7} + \beta_{24} Y^{\Delta}_{\ell, t-14} -\end{aligned} - -Preprocessing steps are similar to the previous models with an additional step -of categorizing the response variables. Again, we will use a subset of death -rate and case rate data from our built-in dataset -`case_death_rate_subset`. 
- -```{r} -jhu <- case_death_rate_subset %>% - dplyr::filter( - time_value >= "2021-06-04", - time_value <= "2021-12-31", - geo_value %in% c("ca", "fl", "tx", "ny", "nj") - ) %>% - mutate(geo_value_factor = as.factor(geo_value)) - -r <- epi_recipe(jhu) %>% - add_role(time_value, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% - step_growth_rate(case_rate, role = "none", prefix = "gr_") %>% - step_epi_lag(starts_with("gr_"), lag = c(0, 7, 14)) %>% - step_epi_ahead(starts_with("gr_"), ahead = 7, role = "none") %>% - # note recipes::step_cut() has a bug in it, or we could use that here - step_mutate( - response = cut( - ahead_7_gr_7_rel_change_case_rate, - breaks = c(-Inf, -0.2, 0.25, Inf) / 7, # division gives weekly not daily - labels = c("down", "flat", "up") - ), - role = "outcome" - ) %>% - step_rm(has_role("none"), has_role("raw")) %>% - step_epi_naomit() -``` - -We will fit the multinomial regression and examine the predictions: - -```{r, warning=FALSE} -wf <- epi_workflow(r, multinom_reg()) %>% - fit(jhu) - -forecast(wf) %>% filter(!is.na(.pred_class)) -``` - -We can also look at the estimated coefficients and model summary information: - -```{r} -extract_fit_engine(wf) -``` - -One could also use a formula in `epi_recipe()` to achieve the same results as -above. However, only one of `add_formula()`, `add_recipe()`, or -`workflow_variables()` can be specified. For the purpose of demonstrating -`add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to -return a `data.frame` that could be used for model fitting. 
- -```{r} -b <- bake(prep(r, jhu), jhu) - -epi_workflow() %>% - add_formula( - response ~ geo_value + time_value + lag_0_gr_7_rel_change_case_rate + - lag_7_gr_7_rel_change_case_rate + lag_14_gr_7_rel_change_case_rate - ) %>% - add_model(parsnip::multinom_reg()) %>% - fit(data = b) -``` - -## Benefits of Lagging and Leading in `epipredict` - -The `step_epi_ahead` and `step_epi_lag` functions in the `epipredict` package -is handy for creating correct lags and leads for future predictions. - -Let's start with a simple dataset and preprocessing: - -```{r} -ex <- filter( - case_death_rate_subset, - time_value >= "2021-12-01", - time_value <= "2021-12-31", - geo_value == "ca" -) - -dim(ex) -``` - -We want to predict death rates on `r max(ex$time_value) + 7`, which is 7 days ahead of the -latest available date in our dataset. - -We will compare two methods of trying to create lags and leads: - -```{r} -p1 <- epi_recipe(ex) %>% - step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% - step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% - step_epi_naomit() %>% - prep(ex) - -b1 <- bake(p1, ex) -b1 - - -p2 <- epi_recipe(ex) %>% - step_mutate( - lag0case_rate = lag(case_rate, 0), - lag7case_rate = lag(case_rate, 7), - lag14case_rate = lag(case_rate, 14), - lag0death_rate = lag(death_rate, 0), - lag7death_rate = lag(death_rate, 7), - lag14death_rate = lag(death_rate, 14), - ahead7death_rate = lead(death_rate, 7) - ) %>% - step_epi_naomit() %>% - prep(ex) - -b2 <- bake(p2, ex) -b2 -``` - -Notice the difference in number of rows `b1` and `b2` returns. This is because -the second version, the one that doesn't use `step_epi_ahead` and `step_epi_lag`, -has omitted dates compared to the one that used the `epipredict` functions. 
- -```{r} -dates_used_in_training1 <- b1 %>% - select(-ahead_7_death_rate) %>% - na.omit() %>% - pull(time_value) -dates_used_in_training1 - -dates_used_in_training2 <- b2 %>% - select(-ahead7death_rate) %>% - na.omit() %>% - pull(time_value) -dates_used_in_training2 -``` - -The model that is trained based on the `{recipes}` functions will predict 7 days ahead from -`r max(dates_used_in_training2)` -instead of 7 days ahead from `r max(dates_used_in_training1)`. - -## References - -McDonald, Bien, Green, Hu, et al. "Can auxiliary indicators improve COVID-19 -forecasting and hotspot prediction?." Proceedings of the National Academy of -Sciences 118.51 (2021): e2111453118. [doi:10.1073/pnas.2111453118](https://doi.org/10.1073/pnas.2111453118) - -## Attribution - -This object contains a modified part of the -[COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) -as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html) - -This data set is licensed under the terms of the -[Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) -by the Johns Hopkins -University on behalf of its Center for Systems Science in Engineering. Copyright -Johns Hopkins University 2020. 
diff --git a/vignettes/update.Rmd b/vignettes/update.Rmd index fcd3653ca..fbda6be37 100644 --- a/vignettes/update.Rmd +++ b/vignettes/update.Rmd @@ -8,17 +8,15 @@ vignette: > --- ```{r, include = FALSE} -knitr::opts_chunk$set( - echo = TRUE, - collapse = TRUE, - comment = "#>", - out.width = "100%" -) +source(here::here("vignettes/_common.R")) ``` ```{r setup, message=FALSE} library(epipredict) library(recipes) +library(dplyr) +library(workflows) +library(parsnip) ``` In this vignette, we will state the main goal of the add/update/remove and @@ -66,14 +64,14 @@ vignette and only briefly go through some examples for a `frosting` object. ## Add/update/remove an `epi_recipe` in an `epi_workflow` -We start with the built-in `case_death_rate_subset` dataset that contains JHU +We start with the built-in `covid_case_death_rates` dataset that contains JHU daily COVID-19 cases and deaths by state and take a subset of it from Nov. 1, 2021 to Dec. 31, 2021 for the four states of Alaska, California, New York, and South Carolina. ```{r} -jhu <- case_death_rate_subset %>% - dplyr::filter(time_value >= as.Date("2021-11-01"), geo_value %in% c("ak", "ca", "ny", "sc")) +jhu <- covid_case_death_rates %>% + filter(time_value >= as.Date("2021-11-01"), geo_value %in% c("ak", "ca", "ny", "sc")) jhu ``` @@ -105,7 +103,7 @@ We may then go on to add the fitted linear model to our `epi_workflow`: ```{r} # Fit a linear model -wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) wf ``` @@ -144,7 +142,7 @@ using `workflows::remove_recipe()` and then inspect the class of `wf`: ```{r} wf %>% class() # class before -workflows::remove_recipe(wf) %>% class() # class after removing recipe using workflows function +remove_recipe(wf) %>% class() # class after removing recipe using workflows function ``` We can observe that `wf` is no longer an `epi_workflow` and a `workflow`. 
It has @@ -172,7 +170,7 @@ fit as before: ```{r} # fit linear model -wf <- Update_model(wf, parsnip::linear_reg()) %>% fit(jhu) +wf <- Update_model(wf, linear_reg()) %>% fit(jhu) wf ``` @@ -248,8 +246,8 @@ number in the order of operations, which can be obtained by inspecting `r2` or the tidy summary of it: ```{r} -workflows::extract_preprocessor(wf) # step_epi_ahead is the third step in r2 -tidy(workflows::extract_preprocessor(wf)) # tidy tibble summary of r2 +extract_preprocessor(wf) # step_epi_ahead is the third step in r2 +tidy(extract_preprocessor(wf)) # tidy tibble summary of r2 wf <- wf %>% adjust_epi_recipe(which_step = 3, ahead = 14) ``` @@ -273,7 +271,7 @@ variable, we would specify the step number of 2 in `which_step`. ```{r} wf <- wf %>% adjust_epi_recipe(which_step = 2, lag = c(0, 1, 7, 14, 21)) -workflows::extract_preprocessor(wf) +extract_preprocessor(wf) ``` We could adjust a recipe directly in the same way as we adjust a recipe in a @@ -294,7 +292,7 @@ with a new `epi_recipe` that has undergone the adjustment ```{r} r2 <- adjust_epi_recipe(r2, which_step = 2, lag = 0:21) -workflows::extract_preprocessor(wf) +extract_preprocessor(wf) ``` ## Adjust a single layer of a `frosting`