diff --git a/DESCRIPTION b/DESCRIPTION index 121f026c..4abf7cc8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,15 +19,18 @@ Description: Provides methods for sampling contact matrices from diary License: MIT + file LICENSE Depends: R (>= 3.5.0) -Imports: +Imports: + checkmate, countrycode, curl, data.table, grDevices, httr, jsonlite, + lifecycle, lubridate, memoise, + purrr, oai, wpp2017, xml2 @@ -35,11 +38,12 @@ Suggests: ggplot2, here, knitr, - purrr, + quarto, reshape2, rmarkdown, roxyglobals (>= 1.0.0), - testthat + testthat, + withr VignetteBuilder: knitr Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index b91e8ad9..2634d8f7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,8 @@ # Generated by roxygen2: do not edit by hand -S3method(check,survey) -S3method(clean,survey) +S3method(check,contact_survey) +S3method(clean,contact_survey) +export(as_contact_survey) export(check) export(clean) export(contact_matrix) @@ -20,8 +21,13 @@ export(wpp_age) export(wpp_countries) import(data.table) import(wpp2017) +importFrom(checkmate,assert_character) +importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_list) +importFrom(checkmate,assert_names) importFrom(countrycode,countrycode) importFrom(curl,curl_download) +importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,dcast) importFrom(data.table,fcase) @@ -47,6 +53,7 @@ importFrom(lubridate,period_to_seconds) importFrom(lubridate,years) importFrom(memoise,memoise) importFrom(oai,list_records) +importFrom(purrr,walk) importFrom(stats,median) importFrom(stats,runif) importFrom(stats,xtabs) diff --git a/R/as_contact_survey.R b/R/as_contact_survey.R new file mode 100644 index 00000000..a8f5cabd --- /dev/null +++ b/R/as_contact_survey.R @@ -0,0 +1,67 @@ +#' @title Check contact survey data +#' +#' @description Checks that a survey fulfills all the requirements to work with the 'contact_matrix' function +#' +#' @param x list containing +#' - an element named 'participants', a data frame containing participant +#' information +#' - an element named 'contacts', a data frame containing contact information +#' - (optionally) an element named 'reference, a list containing information +#' information needed to reference the survey, in particular it can contain$a +#' "title", "bibtype", "author", "doi", "publisher", "note", "year" +#' @param id.column the column in both the `participants` and `contacts` data frames that links contacts to participants +#' @param country.column the column in the `participants` data frame containing the country in which the participant was queried +#' @param year.column the column in the `participants` data frame containing the year in which the participant was queried +#' @importFrom checkmate assert_list assert_names assert_data_frame +#' assert_character +#' @importFrom purrr walk +#' @return invisibly returns a character vector of the relevant columns +#' @examples +#' data(polymod) +#' check(polymod) +#' @export +as_contact_survey <- function(x, id.column = "part_id", + country.column = "country", + year.column = "year") { + ## check arguments + assert_list(x, names = "named") + assert_names(names(x), must.include = c("participants", "contacts")) + assert_data_frame(x$participants) + assert_data_frame(x$contacts) + assert_list(x$reference, names = "named", null.ok = TRUE) + assert_character(id.column) + assert_character(year.column, null.ok = TRUE) + assert_character(country.column, null.ok = TRUE) + assert_names(colnames(x$participants), must.include = id.column) + assert_names(colnames(x$contacts), must.include = id.column) + + setnames(x$participants, id.column, "part_id") + setnames(x$contacts, id.column, "part_id") + + ## check optional columns exist if provided + to_check <- list( + country = country.column, + year = year.column + ) + + walk(names(to_check), \(column) { + if (!is.null(to_check[[column]]) && + !(to_check[[column]] %in% colnames(x$participants))) { + stop( + column, " column '", to_check[[column]], "' does not exist ", + "in the participant data frame" + ) + } else { + setnames(x$participants, to_check[[column]], column) + } + }) + + if (is.null(x$reference)) { + warning("No reference provided") + } + + survey <- new_contact_survey(x$participant, x$contacts, x$reference) + survey <- clean(survey) + + return(survey) +} diff --git a/R/check.R b/R/check.R index 5fc563fa..c10eeba5 100644 --- a/R/check.R +++ b/R/check.R @@ -18,7 +18,15 @@ check <- function(x, ...) UseMethod("check") #' data(polymod) #' check(polymod) #' @export -check.survey <- function(x, id.column = "part_id", participant.age.column = "part_age", country.column = "country", year.column = "year", contact.age.column = "cnt_age", ...) { +check.contact_survey <- function(x, id.column = "part_id", participant.age.column = "part_age", country.column = "country", year.column = "year", contact.age.column = "cnt_age", ...) { + lifecycle::deprecate_warn( + "1.0.0", + "check()", + details = paste( + "Use `as_contact_survey()` instead to construct a ``", + "object. This will perform necessary checks." + ) + ) chkDots(...) if (!is.data.frame(x$participants) || !is.data.frame(x$contacts)) { stop("The 'participants' and 'contacts' elements of 'x' must be data.frames") diff --git a/R/clean.R b/R/clean.R index f92eafcb..618a3c33 100644 --- a/R/clean.R +++ b/R/clean.R @@ -7,7 +7,6 @@ clean <- function(x, ...) UseMethod("clean") #' @description Cleans survey data to work with the 'contact_matrix' function #' #' @param x A [survey()] object -#' @param country.column the name of the country in which the survey participant was interviewed #' @param participant.age.column the column in `x$participants` containing participants' age #' @param ... ignored #' @importFrom data.table fcase @@ -19,14 +18,12 @@ clean <- function(x, ...) UseMethod("clean") #' cleaned <- clean(polymod) # not really necessary as the 'polymod' data set has already been cleaned #' @autoglobal #' @export -clean.survey <- function(x, country.column = "country", participant.age.column = "part_age", ...) { +clean.contact_survey <- function(x, participant.age.column = "part_age", ...) { chkDots(...) - x <- survey(x$participants, x$contacts, x$reference) - ## update country names - if (country.column %in% colnames(x$participants)) { - countries <- x$participants[[country.column]] + if ("country" %in% colnames(x$participants)) { + countries <- x$participants$country origin.code <- fcase( all(nchar(as.character(countries)) == 2), "iso2c", all(nchar(as.character(countries)) == 3), "iso3c", @@ -34,7 +31,7 @@ clean.survey <- function(x, country.column = "country", participant.age.column = ) converted_countries <- suppressWarnings(countrycode(countries, origin.code, "country.name")) converted_countries[is.na(converted_countries)] <- as.character(countries[is.na(converted_countries)]) - x$participants[, paste(country.column) := factor(converted_countries)] + x$participants[, country := factor(converted_countries)] } if (nrow(x$participants) > 0 && diff --git a/R/contact_matrix.R b/R/contact_matrix.R index 7fefb3b8..223e3ffc 100644 --- a/R/contact_matrix.R +++ b/R/contact_matrix.R @@ -28,6 +28,7 @@ #' @return a contact matrix, and the underlying demography of the surveyed population #' @importFrom stats xtabs runif median #' @importFrom utils data globalVariables +#' @importFrom data.table copy #' @importFrom countrycode countrycode #' @import data.table #' @export @@ -40,7 +41,7 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil surveys <- c("participants", "contacts") dot.args <- list(...) - unknown.args <- setdiff(names(dot.args), union(names(formals(check.survey)), names(formals(pop_age)))) + unknown.args <- setdiff(names(dot.args), union(names(formals(check.contact_survey)), names(formals(pop_age)))) if (length(unknown.args) > 0) { stop("Unknown argument(s): ", paste(unknown.args, sep = ", "), ".") } @@ -55,7 +56,9 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil missing.participant.age <- match.arg(missing.participant.age) missing.contact.age <- match.arg(missing.contact.age) - if (!inherits(survey, "survey")) { + survey <- copy(survey) + + if (!inherits(survey, "contact_survey")) { stop( "`survey` must be a survey object (created using `survey()` ", "or `get_survey()`)" @@ -69,13 +72,8 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } } - ## clean the survey - survey <- clean(survey) - ## check and get columns - columns <- suppressMessages(check(survey, ...)) - ## check if specific countries are requested (if a survey contains data from multiple countries) - if (length(countries) > 0 && columns[["country"]] %in% colnames(survey$participants)) { + if (length(countries) > 0 && "country" %in% colnames(survey$participants)) { if (all(nchar(countries) == 2)) { corrected_countries <- suppressWarnings( countrycode(countries, "iso2c", "country.name") @@ -85,113 +83,92 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil countrycode(countries, "country.name", "country.name") ) } - present_countries <- unique(as.character(survey$participants[[columns[["country"]]]])) + present_countries <- unique(as.character(survey$participants$country)) missing_countries <- countries[which(is.na(corrected_countries))] if (length(missing_countries) > 0) { stop("Survey data not found for ", paste(missing_countries, sep = ", "), ".") } countries <- corrected_countries - survey$participants <- survey$participants[get(columns[["country"]]) %in% countries] + survey$participants <- survey$participants[country %in% countries] if (nrow(survey$participants) == 0) { stop("No participants left after selecting countries.") } } - ## check maximum participant age in the data - part_exact.column <- paste(columns[["participant.age"]], "exact", sep = "_") - part_min.column <- paste(columns[["participant.age"]], "est_min", sep = "_") - part_max.column <- paste(columns[["participant.age"]], "est_max", sep = "_") - - if (part_exact.column %in% colnames(survey$participants)) { + if ("part_age_exact" %in% colnames(survey$participants)) { survey$participants[ , - paste(columns[["participant.age"]]) := as.integer(get(part_exact.column)) + part_age := as.integer(part_age_exact) ] - } else if (!(columns[["participant.age"]] %in% colnames(survey$participants))) { - survey$participants[, paste(columns[["participant.age"]]) := NA_integer_] + } else if (!("part_age" %in% colnames(survey$participants))) { + survey$participants[, part_age := NA_integer_] } ## sample estimated participant ages - if (part_min.column %in% colnames(survey$participants) && - part_max.column %in% colnames(survey$participants)) { + if ("part_age_est_min" %in% colnames(survey$participants) && + "part_age_est_max" %in% colnames(survey$participants)) { if (estimated.participant.age == "mean") { survey$participants[ - is.na(get(part_exact.column)) & - !is.na(get(part_min.column)) & !is.na(get(part_max.column)), - paste(columns[["participant.age"]]) := - as.integer(rowMeans(.SD)), - .SDcols = c(part_min.column, part_max.column) + is.na(part_age_exact) & + !is.na(part_age_est_min) & !is.na(part_age_est_max), + part_age := as.integer(rowMeans(.SD)), + .SDcols = c("part_age_est_min", "part_age_est_max") ] } else if (estimated.participant.age == "sample") { survey$participants[ - is.na(get(columns[["participant.age"]])) & - !is.na(get(part_min.column)) & !is.na(get(part_max.column)) & - get(part_min.column) <= get(part_max.column), - paste(columns[["participant.age"]]) := - as.integer(runif( - .N, - get(part_min.column), - get(part_max.column) - )) + is.na(part_age) & + !is.na(part_age_est_min) & !is.na(part_age_est_max) & + part_age_est_min <= part_age_est_max, + part_age := as.integer(runif(.N, part_age_est_min, part_age_est_max)) ] } # note: do nothing when "missing" is specified } - if (part_max.column %in% colnames(survey$participants)) { + if ("part_age_est_max" %in% colnames(survey$participants)) { max.age <- max( c( - survey$participants[, get(part_exact.column)], - survey$participants[, get(part_max.column)] + survey$participants[, part_age_exact], + survey$participants[, part_age_est_max] ), na.rm = TRUE ) + 1 } else { - max.age <- max( - survey$participants[, get(columns[["participant.age"]])], - na.rm = TRUE - ) + 1 + max.age <- max(survey$participants[, part_age], na.rm = TRUE) + 1 } if (missing(age.limits)) { - all.ages <- - unique(as.integer(survey$participants[, get(columns[["participant.age"]])])) + all.ages <- unique(as.integer(survey$participants[, part_age])) all.ages <- all.ages[!is.na(all.ages)] all.ages <- sort(all.ages) age.limits <- union(0, all.ages) } if (missing.participant.age == "remove" && - nrow(survey$participants[is.na(get(columns[["participant.age"]])) | - get(columns[["participant.age"]]) < min(age.limits)]) > 0) { + nrow(survey$participants[ + is.na(part_age) | part_age < min(age.limits) + ]) > 0) { if (!missing.participant.age.set) { message( "Removing participants without age information. ", "To change this behaviour, set the 'missing.participant.age' option" ) } - survey$participants <- - survey$participants[!is.na(get(columns[["participant.age"]])) & - get(columns[["participant.age"]]) >= min(age.limits)] + survey$participants <- survey$participants[ + !is.na(part_age) & part_age >= min(age.limits) + ] } - exact.column <- paste(columns[["contact.age"]], "exact", sep = "_") - min.column <- paste(columns[["contact.age"]], "est_min", sep = "_") - max.column <- paste(columns[["contact.age"]], "est_max", sep = "_") - ## set contact age if it's not in the data - if (exact.column %in% colnames(survey$contacts)) { - survey$contacts[ - , - paste(columns[["contact.age"]]) := as.integer(get(exact.column)) - ] + if ("cnt_age_exact" %in% colnames(survey$contacts)) { + survey$contacts[, cnt_age := as.integer(cnt_age_exact)] } else { - survey$contacts[, paste(columns[["contact.age"]]) := NA_integer_] + survey$contacts[, cnt_age := NA_integer_] } ## convert factors to integers for (age_column in - c(columns[["contact.age"]], min.column, max.column, exact.column)) { + c("cnt_age", "cnt_age_est_min", "cnt_age_est_max", "cnt_age_exact")) { if (age_column %in% colnames(survey$contacts) && is.factor(survey$contacts[[age_column]])) { survey$contacts[, paste(age_column) := @@ -200,61 +177,49 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } ## sample estimated contact ages - if (min.column %in% colnames(survey$contacts) && - max.column %in% colnames(survey$contacts)) { + if ("cnt_age_est_min" %in% colnames(survey$contacts) && + "cnt_age_est_max" %in% colnames(survey$contacts)) { if (estimated.contact.age == "mean") { survey$contacts[ - is.na(get(columns[["contact.age"]])) & - !is.na(get(min.column)) & !is.na(get(max.column)), - paste(columns[["contact.age"]]) := - as.integer(rowMeans(.SD)), - .SDcols = c(min.column, max.column) + is.na(cnt_age) & !is.na(cnt_age_est_min) & !is.na(cnt_age_est_max), + cnt_age := as.integer(rowMeans(.SD)), + .SDcols = c("cnt_age_est_min", "cnt_age_est_max") ] } else if (estimated.contact.age == "sample") { survey$contacts[ - is.na(get(columns[["contact.age"]])) & - !is.na(get(min.column)) & !is.na(get(max.column)) & - get(min.column) <= get(max.column), - paste(columns[["contact.age"]]) := - as.integer(runif( - .N, - get(min.column), - get(max.column) - )) + is.na(cnt_age) & !is.na(cnt_age_est_min) & !is.na(cnt_age_est_max) & + cnt_age_est_min <= cnt_age_est_max, + cnt_age := as.integer(runif(.N, cnt_age_est_min, cnt_age_est_max)) ] } # note: do nothing when "missing" is specified } if (missing.contact.age == "remove" && - nrow(survey$contacts[is.na(get(columns[["contact.age"]])) | - get(columns[["contact.age"]]) < min(age.limits)]) > 0) { + nrow(survey$contacts[is.na(cnt_age) | + cnt_age < min(age.limits)]) > 0) { if (!missing.contact.age.set) { message( "Removing participants that have contacts without age information. ", "To change this behaviour, set the 'missing.contact.age' option" ) } - missing.age.id <- - survey$contacts[ - is.na(get(columns[["contact.age"]])) | - get(columns[["contact.age"]]) < min(age.limits), - get(columns[["id"]]) - ] - survey$participants <- survey$participants[!(get(columns[["id"]]) %in% missing.age.id)] + missing.age.id <- survey$contacts[ + is.na(cnt_age) | cnt_age < min(age.limits), part_id + ] + survey$participants <- survey$participants[!(part_id %in% missing.age.id)] } if (missing.contact.age == "ignore" && - nrow(survey$contacts[is.na(get(columns[["contact.age"]])) | - get(columns[["contact.age"]]) < min(age.limits)]) > 0) { + nrow(survey$contacts[is.na(cnt_age) | cnt_age < min(age.limits)]) > 0) { if (!missing.contact.age.set) { message( "Ignore contacts without age information. ", "To change this behaviour, set the 'missing.contact.age' option" ) } - survey$contacts <- survey$contacts[!is.na(get(columns[["contact.age"]])) & - get(columns[["contact.age"]]) >= min(age.limits), ] + survey$contacts <- survey$contacts[!is.na(cnt_age) & + cnt_age >= min(age.limits), ] } ## check if any filters have been requested @@ -280,13 +245,12 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil # adjust age.group.brakes to the lower and upper ages in the survey survey$participants[, lower.age.limit := reduce_agegroups( - get(columns[["participant.age"]]), - age.limits[age.limits < max.age] + part_age, age.limits[age.limits < max.age] )] part.age.group.breaks <- c(age.limits[age.limits < max.age], max.age) part.age.group.present <- age.limits[age.limits < max.age] survey$participants[, age.group := - cut(survey$participants[, get(columns[["participant.age"]])], + cut(survey$participants[, part_age], breaks = part.age.group.breaks, right = FALSE )] @@ -321,11 +285,11 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } else { ## neither survey population nor country names given - try to ## guess country or countries surveyed from participant data - if (columns[["country"]] %in% colnames(survey$participants)) { - survey.countries <- unique(survey$participants[, get(columns[["country"]])]) + if ("country" %in% colnames(survey$participants)) { + survey.countries <- unique(survey$participants[, country]) } else { warning( - "No 'survey.pop' or 'countries' given, and no '", columns[["country"]], + "No 'survey.pop' or 'countries' given, and no '", "country", "' column found in the data. ", "I don't know which population this is from. ", "Assuming the survey is representative" @@ -344,13 +308,12 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil ## check if survey data are from a specific year - in that case ## use demographic data from that year, otherwise latest - if (columns[["year"]] %in% colnames(survey$participants)) { - survey.year <- - survey$participants[, median(get(columns[["year"]]), na.rm = TRUE)] + if ("year" %in% colnames(survey$participants)) { + survey.year <- survey$participants[, median(year, na.rm = TRUE)] } else { survey.year <- country.pop[, max(year, na.rm = TRUE)] warning( - "No '", columns[["year"]], "' column found in the data. Will use ", + "No information on year found in the data. Will use ", survey.year, " population data." ) } @@ -376,16 +339,14 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } if (survey.representative) { survey.pop <- - survey$participants[, lower.age.limit := - reduce_agegroups( - get(columns[["participant.age"]]), - age.limits - )] + survey$participants[ + , + lower.age.limit := reduce_agegroups(part_age, age.limits) + ] survey.pop <- survey.pop[, list(population = .N), by = lower.age.limit] survey.pop <- survey.pop[!is.na(lower.age.limit)] - if (columns[["year"]] %in% colnames(survey$participants)) { - survey.year <- - survey$participants[, median(get(columns[["year"]]), na.rm = TRUE)] + if ("year" %in% colnames(survey$participants)) { + survey.year <- survey$participants[, median(year, na.rm = TRUE)] } } } else { @@ -409,10 +370,7 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil survey.pop <- survey.pop[!is.na(population)] survey.pop$upper.age.limit <- unlist(c( survey.pop[-1, "lower.age.limit"], - 1 + max( - survey.pop$lower.age.limit, - part.age.group.present - ) + 1 + max(survey.pop$lower.age.limit, part.age.group.present) )) if (weigh.age) { @@ -470,17 +428,17 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil ## assign weights to participants, to account for age variation if (weigh.age) { # get number and proportion of participants by age - survey$participants[, age.count := .N, by = eval(columns[["participant.age"]])] + survey$participants[, age.count := .N, by = part_age] survey$participants[, age.proportion := age.count / .N] # get reference population by age (absolute and proportional) - part.age.all <- range(unique(survey$participants[, get(columns[["participant.age"]])])) + part.age.all <- range(unique(survey$participants[, part_age])) survey.pop.detail <- data.table(pop_age(survey.pop.full, seq(part.age.all[1], part.age.all[2] + 1))) - names(survey.pop.detail) <- c(columns[["participant.age"]], "population.count") + names(survey.pop.detail) <- c("part_age", "population.count") survey.pop.detail[, population.proportion := population.count / sum(population.count)] # merge reference and survey population data - survey$participants <- merge(survey$participants, survey.pop.detail, by = eval(columns[["participant.age"]])) + survey$participants <- merge(survey$participants, survey.pop.detail, by = "part_age") # calculate age-specific weights survey$participants[, weight.age := population.proportion / age.proportion] @@ -521,35 +479,35 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } ## merge participants and contacts into a single data table - setkeyv(survey$participants, columns[["id"]]) - participant_ids <- unique(survey$participants[[columns[["id"]]]]) + setkey(survey$participants, part_id) + participant_ids <- unique(survey$participants$part_id) survey$contacts <- merge(survey$contacts, survey$participants, - by = columns[["id"]], all = FALSE, + by = "part_id", all = FALSE, allow.cartesian = TRUE, suffixes = c(".cont", ".part") ) - setkeyv(survey$contacts, columns[["id"]]) + setkey(survey$contacts, part_id) ## sample contacts if (missing.contact.age == "sample" && - nrow(survey$contacts[is.na(get(columns[["contact.age"]]))]) > 0) { + nrow(survey$contacts[is.na(cnt_age)]) > 0) { for (this.age.group in - unique(survey$contacts[is.na(get(columns[["contact.age"]])), age.group])) { + unique(survey$contacts[is.na(cnt_age), age.group])) { ## first, deal with missing age - if (nrow(survey$contacts[!is.na(get(columns[["contact.age"]])) & + if (nrow(survey$contacts[!is.na(cnt_age) & age.group == this.age.group]) > 0) { ## some contacts in the age group have an age, sample from these survey$contacts[ - is.na(get(columns[["contact.age"]])) & + is.na(cnt_age) & age.group == this.age.group, - paste(columns[["contact.age"]]) := + cnt_age := sample( survey$contacts[ - !is.na(get(columns[["contact.age"]])) & + !is.na(cnt_age) & age.group == this.age.group, - get(columns[["contact.age"]]) + cnt_age ], size = .N, replace = TRUE @@ -558,13 +516,13 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } else { ## no contacts in the age group have an age, sample uniformly between limits min.contact.age <- - survey$contacts[, min(get(columns[["contact.age"]]), na.rm = TRUE)] + survey$contacts[, min(cnt_age, na.rm = TRUE)] max.contact.age <- - survey$contacts[, max(get(columns[["contact.age"]]), na.rm = TRUE)] + survey$contacts[, max(cnt_age, na.rm = TRUE)] survey$contacts[ - is.na(get(columns[["contact.age"]])) & + is.na(cnt_age) & age.group == this.age.group, - paste(columns[["contact.age"]]) := + cnt_age := as.integer(floor(runif(.N, min = min.contact.age, max = max.contact.age + 1 @@ -575,15 +533,14 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil } ## set contact age groups - max.contact.age <- - survey$contacts[, max(get(columns[["contact.age"]]), na.rm = TRUE) + 1] + max.contact.age <- survey$contacts[, max(cnt_age, na.rm = TRUE) + 1] contact.age.group.breaks <- part.age.group.breaks if (max.contact.age > max(contact.age.group.breaks)) { contact.age.group.breaks[length(contact.age.group.breaks)] <- max.contact.age } survey$contacts[, contact.age.group := - cut(get(columns[["contact.age"]]), + cut(cnt_age, breaks = contact.age.group.breaks, labels = age.groups, right = FALSE @@ -597,7 +554,7 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil part.sample <- sample(participant_ids, replace = TRUE) part.age.limits <- unique(survey$participants[ - get(columns[["id"]]) %in% part.sample, + part_id %in% part.sample, lower.age.limit ]) good.sample <- !sample.all.age.groups || @@ -607,8 +564,8 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil data.table(id = part.sample, weight = 1) sample.table <- sample.table[, list(bootstrap.weight = sum(weight)), by = id] - setnames(sample.table, "id", columns[["id"]]) - setkeyv(sample.table, columns[["id"]]) + setnames(sample.table, "id", "part_id") + setkey(sample.table, part_id) sampled.contacts <- merge(survey$contacts, sample.table) sampled.contacts[, sampled.weight := weight * bootstrap.weight] @@ -816,9 +773,9 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil # add age and/or dayofweek info if (weigh.age && weigh.dayofweek) { - part.weights <- survey$participants[, .N, by = list(age.group, participant.age = get(columns[["participant.age"]]), is.weekday, weight)] + part.weights <- survey$participants[, .N, by = list(age.group, participant.age = part_age, is.weekday, weight)] } else if (weigh.age) { - part.weights <- survey$participants[, .N, by = list(age.group, participant.age = get(columns[["participant.age"]]), weight)] + part.weights <- survey$participants[, .N, by = list(age.group, participant.age = part_age, weight)] } else if (weigh.dayofweek) { part.weights <- survey$participants[, .N, by = list(age.group, is.weekday, weight)] } diff --git a/R/get_survey.R b/R/get_survey.R index 0b023eb4..543a6df7 100644 --- a/R/get_survey.R +++ b/R/get_survey.R @@ -26,27 +26,19 @@ get_survey <- function(survey, clear_cache = FALSE, ...) { #' @autoglobal #' @param survey a DOI or url to get the survey from, or a [survey()] object (in which case only cleaning is done). #' @param ... options for [clean()], which is called at the end of this +#' @importFrom data.table copy #' @keywords internal .get_survey <- function(survey, ...) { - if (inherits(survey, "survey")) { - new_survey <- survey + if (inherits(survey, "contact_survey")) { + new_survey <- copy(survey) } else { if (is.character(survey)) { files <- download_survey(survey) new_survey <- load_survey(files) } else { - stop("'survey' must be an 'survey' object or character") + stop("'survey' must be an 'contact_survey' object or character") } } - new_survey <- clean(new_survey, ...) - - if (!is.null(new_survey$reference)) { - message( - "Using ", new_survey$reference$title, - ". To cite this in a publication, use the 'get_citation()' function" - ) - } - return(new_survey) } diff --git a/R/globals.R b/R/globals.R index 549da047..5b1a1b4c 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,9 +1,20 @@ # Generated by roxyglobals: do not edit by hand utils::globalVariables(c( - "..age.unit", # - "..high", # - "..low", # + "country", # + "..age.unit", # + "..high", # + "..low", # + "country", # + "part_age", # + "part_age_exact", # + "part_age_est_min", # + "part_age_est_max", # + "cnt_age", # + "cnt_age_exact", # + "cnt_age_est_min", # + "cnt_age_est_max", # + "part_id", # "lower.age.limit", # "age.group", # "population", # diff --git a/R/load_survey.R b/R/load_survey.R index 0017a80a..6a902615 100644 --- a/R/load_survey.R +++ b/R/load_survey.R @@ -146,14 +146,14 @@ load_survey <- function(files, ...) { } } - new_survey <- survey( - participants = main_surveys[["participant"]], - contacts = main_surveys[["contact"]], - reference = reference + new_survey <- as_contact_survey( + list( + participants = main_surveys[["participant"]], + contacts = main_surveys[["contact"]], + reference = reference + ) ) - new_survey <- clean(new_survey, ...) - if (!is.null(new_survey$reference)) { message( "Using ", new_survey$reference$title, diff --git a/R/new_contact_survey.R b/R/new_contact_survey.R new file mode 100644 index 00000000..05c43f52 --- /dev/null +++ b/R/new_contact_survey.R @@ -0,0 +1,21 @@ +#' @title Contact survey +#' @description Deprecated. A `survey` object contains the results of a contact survey. In particular, it contains two data frames called `participants` and `contacts` that are linked by a column specified as `id.column` +#' @param participants a `data.frame` containing information on participants +#' @param contacts a `data.frame` containing information on contacts +#' @param reference a `list` containing information needed to reference the survey, in particular it can contain$a "title", "bibtype", "author", "doi", "publisher", "note", "year" +#' @return a new survey object +#' @author Sebastian Funk +#' @keywords internal +new_contact_survey <- function(participants, contacts, reference = NULL) { + new_obj <- + structure( + list( + participants = data.table(participants), + contacts = data.table(contacts), + reference = reference + ), + class = "contact_survey" + ) + + return(new_obj) +} diff --git a/R/survey.R b/R/survey.R index d470f32f..d1a65b9e 100644 --- a/R/survey.R +++ b/R/survey.R @@ -1,15 +1,17 @@ #' @title Contact survey -#' @description A `survey` object contains the results of a contact survey. In particular, it contains two data frames called `participants` and `contacts` that are linked by a column specified as `id.column` +#' @description Deprecated. Use `as_survey` instead. #' @param participants a `data.frame` containing information on participants #' @param contacts a `data.frame` containing information on contacts #' @param reference a `list` containing information needed to reference the survey, in particular it can contain$a "title", "bibtype", "author", "doi", "publisher", "note", "year" #' @return a new survey object #' @author Sebastian Funk #' @export -#' @examples -#' data(polymod) -#' new_survey <- survey(polymod$participants, polymod$contacts) survey <- function(participants, contacts, reference = NULL) { + lifecycle::deprecate_warn( + "1.0.0", + "survey()", + details = "Use `as_contact_survey()` instead." + ) new_obj <- structure( list( @@ -17,7 +19,7 @@ survey <- function(participants, contacts, reference = NULL) { contacts = data.table(contacts), reference = reference ), - class = "survey" + class = "contact_survey" ) return(new_obj) diff --git a/data/polymod.rda b/data/polymod.rda index 789dd8df..01ea716f 100644 Binary files a/data/polymod.rda and b/data/polymod.rda differ diff --git a/man/as_contact_survey.Rd b/man/as_contact_survey.Rd new file mode 100644 index 00000000..817e86cb --- /dev/null +++ b/man/as_contact_survey.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_contact_survey.R +\name{as_contact_survey} +\alias{as_contact_survey} +\title{Check contact survey data} +\usage{ +as_contact_survey( + x, + id.column = "part_id", + country.column = "country", + year.column = "year" +) +} +\arguments{ +\item{x}{list containing +\itemize{ +\item an element named 'participants', a data frame containing participant +information +\item an element named 'contacts', a data frame containing contact information +\item (optionally) an element named 'reference, a list containing information +information needed to reference the survey, in particular it can contain$a +"title", "bibtype", "author", "doi", "publisher", "note", "year" +}} + +\item{id.column}{the column in both the \code{participants} and \code{contacts} data frames that links contacts to participants} + +\item{country.column}{the column in the \code{participants} data frame containing the country in which the participant was queried} + +\item{year.column}{the column in the \code{participants} data frame containing the year in which the participant was queried} +} +\value{ +invisibly returns a character vector of the relevant columns +} +\description{ +Checks that a survey fulfills all the requirements to work with the 'contact_matrix' function +} +\examples{ +data(polymod) +check(polymod) +} diff --git a/man/check.Rd b/man/check.Rd index b19d35a9..170df64a 100644 --- a/man/check.Rd +++ b/man/check.Rd @@ -2,10 +2,10 @@ % Please edit documentation in R/check.R \name{check} \alias{check} -\alias{check.survey} +\alias{check.contact_survey} \title{Check contact survey data} \usage{ -\method{check}{survey}( +\method{check}{contact_survey}( x, id.column = "part_id", participant.age.column = "part_age", diff --git a/man/clean.Rd b/man/clean.Rd index b0881887..4f110642 100644 --- a/man/clean.Rd +++ b/man/clean.Rd @@ -2,16 +2,14 @@ % Please edit documentation in R/clean.R \name{clean} \alias{clean} -\alias{clean.survey} +\alias{clean.contact_survey} \title{Clean contact survey data} \usage{ -\method{clean}{survey}(x, country.column = "country", participant.age.column = "part_age", ...) +\method{clean}{contact_survey}(x, participant.age.column = "part_age", ...) } \arguments{ \item{x}{A \code{\link[=survey]{survey()}} object} -\item{country.column}{the name of the country in which the survey participant was interviewed} - \item{participant.age.column}{the column in \code{x$participants} containing participants' age} \item{...}{ignored} diff --git a/man/new_contact_survey.Rd b/man/new_contact_survey.Rd new file mode 100644 index 00000000..f03ee8c5 --- /dev/null +++ b/man/new_contact_survey.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/new_contact_survey.R +\name{new_contact_survey} +\alias{new_contact_survey} +\title{Contact survey} +\usage{ +new_contact_survey(participants, contacts, reference = NULL) +} +\arguments{ +\item{participants}{a \code{data.frame} containing information on participants} + +\item{contacts}{a \code{data.frame} containing information on contacts} + +\item{reference}{a \code{list} containing information needed to reference the survey, in particular it can contain$a "title", "bibtype", "author", "doi", "publisher", "note", "year"} +} +\value{ +a new survey object +} +\description{ +Deprecated. A \code{survey} object contains the results of a contact survey. In particular, it contains two data frames called \code{participants} and \code{contacts} that are linked by a column specified as \code{id.column} +} +\author{ +Sebastian Funk +} +\keyword{internal} diff --git a/man/survey.Rd b/man/survey.Rd index c5b2805e..1c34117b 100644 --- a/man/survey.Rd +++ b/man/survey.Rd @@ -17,11 +17,7 @@ survey(participants, contacts, reference = NULL) a new survey object } \description{ -A \code{survey} object contains the results of a contact survey. In particular, it contains two data frames called \code{participants} and \code{contacts} that are linked by a column specified as \code{id.column} -} -\examples{ -data(polymod) -new_survey <- survey(polymod$participants, polymod$contacts) +Deprecated. Use \code{as_survey} instead. } \author{ Sebastian Funk diff --git a/tests/testthat/test-as_contact_survey.r b/tests/testthat/test-as_contact_survey.r new file mode 100644 index 00000000..cdbcbb9b --- /dev/null +++ b/tests/testthat/test-as_contact_survey.r @@ -0,0 +1,28 @@ +library("data.table") + +erroneous_survey <- + new_contact_survey(polymod$participants, polymod$contacts, polymod$reference) + +erroneous_type1 <- copy(erroneous_survey) +erroneous_type1$participants <- "test" +erroneous_type2 <- copy(erroneous_survey) +erroneous_type2$participants <- 17 + +test_that("error is thrown if survey contains false data types", { + expect_error( + as_contact_survey(erroneous_type1), "Must be of type 'data.frame'" + ) + expect_error( + as_contact_survey(erroneous_type2), "Must be of type 'data.frame'" + ) +}) + +erroneous_structure1 <- copy(erroneous_survey) +erroneous_structure1$participants$part_id <- NULL + +test_that("incorrect structure of data frames is correctly identified", { + expect_error( + as_contact_survey(erroneous_structure1), + "Names must include the elements \\{'part_id'\\}" ## nolint: nonportable_path_linter + ) +}) diff --git a/tests/testthat/test-checks.r b/tests/testthat/test-checks.r index f83ed729..e39417dd 100644 --- a/tests/testthat/test-checks.r +++ b/tests/testthat/test-checks.r @@ -1,6 +1,6 @@ library(data.table) -erroneous_survey <- survey(polymod$participants, polymod$contacts, polymod$reference) +erroneous_survey <- as_contact_survey(polymod) erroneous_type1 <- copy(erroneous_survey) erroneous_type1$participants <- "test" @@ -8,6 +8,7 @@ erroneous_type2 <- copy(erroneous_survey) erroneous_type2$participants <- 17 test_that("error is thrown if survey contains false data types", { + withr::local_options(lifecycle_verbosity = "quiet") expect_error(check(erroneous_type1), "must be data.frames") expect_error(check(erroneous_type2), "must be data.frames") }) @@ -15,12 +16,13 @@ test_that("error is thrown if survey contains false data types", { erroneous_structure1 <- copy(erroneous_survey) erroneous_structure1$participants$part_id <- NULL erroneous_structure2 <- copy(erroneous_survey) -erroneous_structure2$participants$part_age <- NULL +erroneous_structure2$participants$part_age_exact <- NULL erroneous_structure3 <- copy(erroneous_survey) erroneous_structure3$contacts$cnt_age_exact <- NULL erroneous_structure3$contacts$cnt_age_est_min <- NULL test_that("incorrect structure of data frames is correctly identified", { + withr::local_options(lifecycle_verbosity = "quiet") expect_warning(check(erroneous_structure1), "does not exist") expect_warning(check(erroneous_structure2), "do not exist") expect_warning(check(erroneous_structure3), "do not exist") diff --git a/tests/testthat/test-matrix.r b/tests/testthat/test-matrix.r index f181d283..8ac90e68 100644 --- a/tests/testthat/test-matrix.r +++ b/tests/testthat/test-matrix.r @@ -13,7 +13,7 @@ polymod11 <- get_survey(polymod) polymod2$participants$added_weight <- 0.5 polymod2$contacts$cnt_age_exact <- factor(polymod2$contacts$cnt_age_exact) -polymod2$participants$part_age[1] <- "3-5" +polymod2$participants$part_age_exact[1] <- NA_real_ polymod3$participants$dayofweek <- NULL polymod3$participants$year <- NULL polymod4$participants$country <- NULL @@ -107,7 +107,8 @@ test_that("warning is thrown if filter column is not found", { test_that("warning is thrown if missing data exist", { expect_warning(contact_matrix(survey = polymod, missing.contact.age = "keep", symmetric = TRUE, age.limits = c(0, 5, 15)), "missing.contact.age") - expect_warning(contact_matrix(survey = polymod, split = TRUE), "age groups") + warning <- capture_warnings(contact_matrix(survey = polymod, missing.participant.age = "keep", split = TRUE)) + expect_match(warning[2], "missing.participant.age") }) test_that("error is thrown if an unknown argument is passed", { @@ -123,7 +124,7 @@ test_that("error is thrown if there are no participants after selection the coun }) test_that("warning is thrown if population needed but no 'year' column present", { - expect_warning(contact_matrix(survey = polymod3, symmetric = TRUE, age.limits = c(0, 5, 15)), "No 'year' column") + expect_warning(contact_matrix(survey = polymod3, symmetric = TRUE, age.limits = c(0, 5, 15)), "No information on year") }) test_that("warning is thrown if day of week is asked to be weighed but not present", { @@ -135,10 +136,12 @@ test_that("warning is thrown if country has no survey population", { }) test_that("warning is thrown if contact survey has no age information", { + withr::local_options(lifecycle_verbosity = "quiet") expect_warning(check(x = polymod6), "do not exist") }) test_that("warning is thrown if participant data has no country", { + withr::local_options(lifecycle_verbosity = "quiet") expect_warning(check(x = polymod4), "does not exist") }) @@ -147,6 +150,7 @@ test_that("user is informed about removing missing data", { }) test_that("check result is reported back", { + withr::local_options(lifecycle_verbosity = "quiet") expect_message(check(x = polymod2), "Check") }) @@ -163,10 +167,7 @@ test_that("nonsensical operations are warned about", { }) test_that("warning is thrown if it is assumed that the survey is representative", { - warning <- capture_warnings( - contact_matrix(survey = polymod4, symmetric = TRUE, age.limits = c(0, 5, 15)) - ) - expect_match(warning[2], "Assuming the survey is representative") + expect_warning(contact_matrix(survey = polymod4, symmetric = TRUE, age.limits = c(0, 5, 15)), "Assuming the survey is representative") }) diff --git a/tests/testthat/test-surveys.r b/tests/testthat/test-surveys.r index 0c9bf1e4..b884df87 100644 --- a/tests/testthat/test-surveys.r +++ b/tests/testthat/test-surveys.r @@ -11,7 +11,7 @@ test_that("surveys can be downloaded", { s <- suppressMessages(suppressWarnings(get_survey("10.5281/zenodo.1095664"))) # nolint - expect_s3_class(s, "survey") + expect_s3_class(s, "contact_survey") expect_named( s$reference, c("title", "bibtype", "author", "year", "note", "doi") diff --git a/vignettes/intro.qmd b/vignettes/intro.qmd new file mode 100644 index 00000000..7ee2dbd7 --- /dev/null +++ b/vignettes/intro.qmd @@ -0,0 +1,478 @@ +--- +title: "The socialmixr pakge" +author: "Sebastian Funk" +institute: https://epiforecasts.io +date: "`r Sys.Date()`" +-format: + revealjs: + slide-level: 3 + template-partials: + - title-slide.html + theme: sbfnk.scss + embed-resources: true + preview-links: auto + center: true + smaller: true +--- + +```{r setup, include = FALSE} +library("knitr") +library("socialmixr") +library("ggplot2") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +data.table::setDTthreads(1) +``` + +[socialmixr](https://github.com/epiforecasts/socialmixr) is an `R` package to derive social mixing matrices from survey data. These are particularly useful for age-structured [infectious disease models](https://en.wikipedia.org/wiki/Mathematical_modelling_of_infectious_disease). For background on age-specific mixing matrices and what data inform them, see, for example, the paper on POLYMOD by [@mossong_social_2008]. + +# Usage + +At the heart of the `socialmixr` package is the `contact_matrix()` function. This extracts a contact matrix from survey data. You can use the `R` help to find out about usage of the `contact_matrix()` function, including a list of examples: + +```{r eval=FALSE} +?contact_matrix +``` + +The POLYMOD data are included with the package and can be loaded using + +```{r} +data(polymod) +``` + +An example use would be + +```{r} +contact_matrix(polymod, countries = "United Kingdom", age.limits = c(0, 1, 5, 15)) +``` + +This generates a contact matrix from the UK part of the POLYMOD study, with age groups 0-1, 1-5, 5-15 and 15+ years. It contains the mean number of contacts that each member of an age group (row) has reported with members of the same or another age group (column). + +# Surveys + +The key argument to the `contact_matrix()` function is the `survey` that it supposed to use. The `socialmixr` package includes the POLYMOD survey, which will be used if not survey is specified. It also provides access to all surveys in the [Social contact data](https://zenodo.org/communities/social_contact_data) community on [Zenodo](https://zenodo.org). The available surveys can be listed (if an internet connection is available) with + +```{r eval=FALSE} +list_surveys() +``` + +A survey can be downloaded using the `get_survey()` command. This will get the relevant data of a survey given its Zenodo DOI (as returned by `list_surveys()`). All other relevant commands in the `socialmixr` package accept a DOI, but if a survey is to be used repeatedly it is worth downloading it and storing it locally to avoid the need for a network connection and speed up processing. + +```{r eval=FALSE} +peru_survey <- get_survey("https://doi.org/10.5281/zenodo.1095664") +saveRDS(peru_survey, "peru.rds") +``` + +This way, the `peru` data set can be loaded in the future without the need for an internet connection using + +```{r eval=FALSE} +peru_survey <- readRDS("peru.rds") +``` + +Some surveys may contain data from multiple countries. To check this, use the `survey_countries` function + +```{r} +survey_countries(polymod) +``` + +If one wishes to get a contact matrix for one or more specific countries, a `countries` argument can be passed to `contact_matrix()`. If this is not done, the different surveys contained in a dataset are combined as if they were one single sample (i.e., not applying any population-weighting by country or other correction). + + +By default, socialmixr uses the POLYMOD survey. A reference for any given survey can be obtained using `get_citation()`, e.g. + +```{r} +get_citation(polymod) +``` + +# Bootstrapping + +To get an idea of uncertainty of the contact matrices, a bootstrap can be used using the `sample.participants` argument of `contact_matrix()`. If this argument is set to TRUE, participants are sampled (with replacement, to get the same number of participants of the original study) every time the `contact_matrix()` function is called, and thus a different matrix returned every time. From these matrices, derived quantities can be obtained, for example the mean: + +```{r} +m <- replicate( + n = 5, + contact_matrix( + polymod, + countries = "United Kingdom", age.limits = c(0, 1, 5, 15), + sample.participants = TRUE + ) +) +mr <- Reduce("+", lapply(m["matrix", ], function(x) x / ncol(m))) +mr +``` + +# Demography + +Obtaining symmetric contact matrices, splitting out their components (see below) and age-specific participant weights require information about the underlying demographic composition of the survey population. This can be passed to `contact_matrix()` as the `survey.pop` argument, a `data.frame` with two columns, `lower.age.limit` (denoting the lower end of the age groups) and `population` (denoting the number of people in each age group). If no `survey.pop` is not given, `contact_matrix()` will try to obtain the age structure of the population (as per the `countries` argument) from the [World Population Prospects](https://population.un.org/wpp/) of the United Nations, using estimates from the year that closest matches the year in which the contact survey was conducted. + +If demographic information is used, this is returned by `contact_matrix()` as the `demography` field in the results list. It is possible to enforce or prevent the function to return demography data by using the `return.demography` option. + +```{r, warning=FALSE, message=FALSE} +contact_matrix(polymod, + countries = "United Kingdom", age.limits = c(0, 20), + return.demography = TRUE +)$demography +``` + + +# Symmetric contact matrices + +Conceivably, contact matrices should be symmetric: the total number of contacts made by members of one age group with those of another should be the same as vice versa. Mathematically, if $m_{ij}$ is the mean number of contacts made by members of age group $i$ with members of age group $j$, and the total number of people in age group $i$ is $N_i$, then + +$$m_{ij} N_i = m_{ji}N_j$$ + +Because of variation in the sample from which the contact matrix is obtained, this relationship is usually not fulfilled exactly. In order to obtain a symmetric contact matrix that fulfills it, one can use + +$$m'_{ij} = \frac{1}{2N_i} (m_{ij} N_i + m_{ji} N_j)$$ + +To get this version of the contact matrix, use `symmetric = TRUE` when calling the `contact_matrix()` function. + +```{r} +contact_matrix(polymod, countries = "United Kingdom", age.limits = c(0, 1, 5, 15), symmetric = TRUE) +``` + +# Contact rates per capita + +The contact matrix per capita $c_{ij}$ contains the social contact rates of one individual of age $i$ with one individual of age $j$, given the population details. For example, $c_{ij}$ is used in infectious disease modelling to calculate the force of infection, which is based on the likelihood that one susceptible individual of age $i$ will be in contact with one infectious individual of age $j$. The contact rates per capita are calculated as follows: + +$$c_{ij} = \tfrac{m_{ij}}{N_{j}}$$ + +To get the per capita contact matrix, use `per.capita = TRUE` when calling the `contact_matrix()` function. Please note that if the option `symmetric = TRUE` is specified, the contact matrix $m_{ij}$ can show asymmetry if the sub-population sizes are different, but the contact matrix per capita will be fully symmetric: + +$$c'_{ij} = \frac{m_{ij} N_i + m_{ji} N_j}{2N_iN_j} = c'_{ji}$$ + + +```{r message=FALSE, warning=FALSE} +contact_matrix(survey = polymod, countries = "Germany", age.limits = c(0, 60), symmetric = TRUE, per.capita = TRUE) +``` + +# Splitting contact matrices + +The `contact_matrix()` contains a simple model for the elements of the contact matrix, by which it is split into a _global_ component, as well as three components representing _contacts_, _assortativity_ and _demography_. In other words, the elements $m_{ij}$ of the contact matrix are modeled as + +$$ m_{ij} = c q d_i a_{ij} n_j $$ + +where $c$ is the mean number of contacts across the whole population, $c q d_i$ is the number of contacts that a member of group $i$ makes across age groups, $n_j$ is the proportion of the surveyed population in age group $j$. The constant $q$ is set so that $c q$ is equal to the value of the largest eigenvalue of $m_{ij}$; if used in an infectious disease model and assumed that every contact leads to infection, $c q$ can be replaced by the basic reproduction number $R_0$. + +To model the contact matrix in this way with the `contact_matrix()` function, set `split = TRUE`. The components of the matrix are returned as elements `mean.contacts` ($c$), `normalisation` ($q$), `contacts` ($d_i$), `matrix` ($a_{ij}$) and `demography` ($n_j$) of the resulting list. + +```{r} +contact_matrix(polymod, countries = "United Kingdom", age.limits = c(0, 1, 5, 15), split = TRUE) +``` + +# Filtering + +The `filter` argument to `contact_matrix()` can be used to select particular participants or contacts. For example, in the `polymod` dataset, the indicators `cnt_home`, `cnt_work`, `cnt_school`, `cnt_transport`, `cnt_leisure` and `cnt_otherplace` take value 0 or 1 depending on where a contact occurred. Any filter can be applied to the data, if given as a list of the form (column=filter_value). As such, only contacts that have 'filter_value' in 'column' will be considered for the generated contact matrix: + +```{r, warning=FALSE, message=FALSE} +# contact matrix for school-related contacts +contact_matrix(polymod, age.limits = c(0, 20, 60), filter = list(cnt_school = 1))$matrix + +# contact matrix for work-related contacts involving physical contact +contact_matrix(polymod, age.limits = c(0, 20, 60), filter = list(cnt_work = 1, phys_contact = 1))$matrix + +# contact matrix for daily contacts at home with males +contact_matrix(polymod, age.limits = c(0, 20, 60), filter = list(cnt_home = 1, cnt_gender = "M", duration_multi = 5))$matrix +``` + +# Participant weights + +## Temporal aspects and demography + +Participant weights are commonly used to align sample and population characteristics in terms of temporal aspects and the age distribution. For example, the day of the week has been reported as a driving factor for social contact behavior, hence to obtain a weekly average, the survey data should represent the weekly 2/5 distribution of weekend/week days. To align the survey data to this distribution, one can obtain participant weights in the form of: +$$w_{\textrm{day.of.week}} = \tfrac{5/7}{N_{\textrm{weekday}}/N} \text{ OR } \tfrac{2/7}{N_{\textrm{weekend}}/N}$$ +with sample size $N$, and $N_{weekday}$ and $N_{weekend}$ the number of participants that were surveyed during weekdays and weekend days, respectively. It is possible to remove the constant values (e.g. $w = 5/N_{weekday}$), which results in the same standardized weights. However, we opt to use the relative proportions to calculate weights to enable truncation with a generic threshold (see below). + +Another driver of social contact patterns is age. To improve the representativeness of survey data, age-specific weights can be calculated as: +$$w_{age} = \tfrac{P_{a}\ /\ P}{N_{a}\ /\ N}$$ +with $P$ the population size, $P_a$ the population fraction of age $a$, $N$ the survey sample size and $N_a$ the survey fraction of age $a$. The combination of age-specific and temporal weights for participant $i$ of age $a$ can be constructed as: +$$w_{i} = w_{\textrm{age}} * w_{\textrm{day.of.week}} $$ +Finally, the weights can to be standardized as follows: +$$\tilde{w}_{i} = \tfrac{w_{i}}{\sum_{}^{} w_{}} * N $$ + +If the social contact analysis is based on stratification by splitting the population into non-overlapping groups, it requires the weights to be standardized so that the weighted totals within mutually exclusive cells equal the known population totals [@kolenikov_post-stratification_2016]. The post-stratification cells need to be mutually exclusive and cover the whole population. The post-stratified (PS) weight for participant $i$ of \mbox{group $g$} is: +$$\tilde{w}^{PS}_{i} = \tfrac{w_{i}}{\sum_{\text{j}}^{\text{group g}} w_{j}} * N_g$$ + +Temporal weights are activated in `contact_matrix()` by `weigh.dayofweek = TRUE` and age-specific weights by `weight.age = TRUE`. The post-stratification weights are calculated by default. It is possible to obtain the participant weights via the option `return.part.weights = TRUE`. + +```{r message=FALSE, warning=FALSE} +contact_matrix( + survey = polymod, age.limits = c(0, 18, 60), weigh.dayofweek = TRUE, + weigh.age = TRUE, return.part.weights = TRUE +) +``` + +## User-defined participant weights + +The `contact_matrix()` allows to specify and use your own participant weights. Therefore, provide the names of the columns of the participant data you want to use to weight the reported contacts via the `weights` argument. + +```{r message=FALSE, warning=FALSE} +# e.g. use household size as (dummy) weight to provide more importance to participant data from large households +contact_matrix(survey = polymod, age.limits = c(0, 18, 60), weights = "hh_size") +``` + + +## Weight threshold + +If the survey population differs extensively from the demography, some participants can end up with relatively high weights and as such, an excessive contribution to the population average. This warrants the limitation of single participant influences by a truncation of the weights. To enable this in `contact_matrix()`, you need to provide a numeric `weight.threshold`. This truncation is applied on the standardized weights, followed by another standardization to make sure that the sum of the weights still equals the sample size. The latter can lead to final weights of which some little exceed the given threshold value. + +```{r message=FALSE, warning=FALSE} +contact_matrix( + survey = polymod, age.limits = c(0, 18, 60), weigh.dayofweek = TRUE, + weigh.age = TRUE, return.part.weights = TRUE, weight.threshold = 3 +) +``` + + +## Numerical example + +With these numeric examples, we show the importance of post-stratification weights in contrast to using the crude weights directly within age-groups. We will apply the weights by age and day of week separately in these examples, though the combination is straightforward via multiplication. + +### Get survey data +We start from a survey including 6 participants of 1, 2 and 3 years of age. The ages are not equally represented in the sample, though we assume they are equally present in the reference population. We will calculate the weighted average number of contacts by age and by age group, using {1,2} and {3} years of age. The following table shows the reported number of contacts per participant $i$, represented by $m_i$: + +```{r, echo=FALSE} +survey_data <- data.frame( + age = c(1, 1, 2, 2, 2, 3), + day.of.week = as.factor(c("weekend", "weekend", "weekend", "week", "week", "week")), + age.group = NA, + m_i = c(3, 2, 9, 10, 8, 15) +) + +# age groups 1-2 and 3 +survey_data$age.group <- 1 - (survey_data$age < 3) + 1 +survey_data$age.group <- as.factor(c("A", "B"))[survey_data$age.group] + +kable(survey_data) +``` + +The summary statistics for the sample (N) and reference population (P) are as follows + +```{r} +N <- 6 +N_age <- c(2, 3, 1) +N_age.group <- c(5, 1) +N_day.of.week <- c(3, 3) + +P <- 3000 +P_age <- c(1000, 1000, 1000) +P_age.group <- c(2000, 1000) + +P_day.of.week <- c(5 / 7, 2 / 7) * 3000 +``` + +This survey data results in an unweighted average number of contacts: + +```{r, echo=FALSE} +print(paste("unweighted average number of contacts:", round(mean(survey_data$m_i), digits = 2))) +``` + +and age-specific unweighted averages on the number of contacts: +```{r, echo=FALSE} +kable(aggregate(m_i ~ age + age.group, data = survey_data, mean)) +``` + +### Weight by day of week + +The following table contains the participants weights based on the survey day with and without the population and sample size constants ($w$ and $w'$, respectively). Note that the standardized weights $\tilde{w}$ and $\tilde{w'}$ are the same: + +```{r, echo=FALSE} +# including population constants +survey_data$w <- NA +for (i in seq_len(nrow(survey_data))) { + day_i <- survey_data$day.of.week[i] + survey_data$w[i] <- (P_day.of.week[day_i] / P) / (N_day.of.week[day_i] / N) +} +survey_data$w_tilde <- survey_data$w / sum(survey_data$w) * N + +# without population constants +survey_data$w_dot <- NA +for (i in seq_len(nrow(survey_data))) { + day_i <- survey_data$day.of.week[i] + survey_data$w_dot[i] <- (P_day.of.week[day_i]) / (N_day.of.week[day_i]) +} +survey_data$w_dot_tilde <- survey_data$w_dot / sum(survey_data$w_dot) * N + +# round +survey_data[, -(1:4)] <- round(survey_data[, -(1:4)], digits = 2) + +# print +kable(survey_data) + +# remove the 'dot' weights +survey_data$w_dot <- NULL +survey_data$w_dot_tilde <- NULL +``` + +Note the different scale of $w$ and $w'$, and the more straightforward interpretation of the numerical value of $w$ in terms of relative differences to apply truncation. +Using the standardized weights, we are able to calculate the weighted number of contacts: + +```{r, echo=FALSE} +# add weighted number of contacts +survey_data["m_i * w_tilde"] <- survey_data$m_i * survey_data$w_tilde + +# show table +kable(survey_data) + +# remove the weighted number of contacts +survey_data$`m_i * w_tilde` <- NULL + +print(paste("weighted average number of contacts:", round(mean(survey_data$m_i * survey_data$w_tilde), digits = 2))) +``` + +If the population-based weights are directly used in age-specific groups, the contact behavior of the 3 year-old participant, which participated during week day, is inflated due to the under-representation of week days in the survey sample. In addition, the number of contacts for 1 year-old participants is decreased because of the over-representation of weekend days in the survey. Using the population-weights within the two aggregated age groups, we obtain a more intuitive weighting for age group A, but it is still skewed for individuals in age group B. As such, this weighted average for age group B has no meaning in terms of social contact behavior: + +```{r, echo=FALSE} +kable(list( + aggregate(m_i * w_tilde ~ age, data = survey_data, mean), + aggregate(m_i * w_tilde ~ age.group, data = survey_data, mean) +)) +``` + +If we subdivide the population, we need to use post-stratification weights ("w_PS") in which the weighted totals within mutually exclusive cells equal the sample size. For the age groups, this goes as follows: + +```{r, echo=FALSE} +survey_data$w_PS <- NA +for (i in seq_len(nrow(survey_data))) { + k_i <- survey_data$age.group[i] + flag_k <- survey_data$age.group == k_i + survey_data$w_PS[i] <- survey_data$w[i] / sum(survey_data$w[flag_k]) * N_age.group[k_i] +} + +# round +survey_data[, -(1:4)] <- round(survey_data[, -(1:4)], digits = 2) + +kable(survey_data) +``` + +The weighted means equal: + +```{r, echo=FALSE} +kable(aggregate(m_i * w_PS ~ age.group, data = survey_data, mean)) +``` + +### Weight by age + +We repeated the example by calculating age-specific participant weights on the population and age-group level: + +```{r, echo=FALSE} +survey_data$w <- NA +survey_data$w_tilde <- NA +survey_data$w_PS <- NULL +for (i in seq_len(nrow(survey_data))) { + age_i <- survey_data$age[i] + survey_data$w[i] <- (P_age[age_i] / P) / (N_age[age_i] / N) +} +survey_data$w_tilde <- survey_data$w / sum(survey_data$w) * N + +survey_data$w_PS <- NA +for (i in seq_len(nrow(survey_data))) { + k_i <- survey_data$age.group[i] + flag_k <- survey_data$age.group == k_i + survey_data$w_PS[i] <- survey_data$w[i] / sum(survey_data$w[flag_k]) * N_age.group[k_i] +} + +# round +survey_data[, -(1:4)] <- round(survey_data[, -(1:4)], digits = 2) + +# print +kable(survey_data) + +print(paste("weighted average number of contacts:", round(mean(survey_data$m_i * survey_data$w_tilde), digits = 2))) +``` + +If the age-specific weights are directly used within the age groups, the contact behavior for age group B is inflated to unrealistic levels and the number of contacts for age group A is artificially low: + +```{r, echo=FALSE} +kable(list( + aggregate(m_i * w_tilde ~ age, data = survey_data, mean), + aggregate(m_i * w_tilde ~ age.group, data = survey_data, mean) +)) +``` + +Using the post-stratification weights, we end up with: + +```{r, echo=FALSE} +kable(aggregate(m_i * w_PS ~ age.group, data = survey_data, mean)) +``` + + +### Apply threshold + +We start with survey data of 14 participants of 1, 2 and 3 years of age, sampled from a population in which all ages are equally present. Given the high representation of participants aged 1 year, the age-specific proportions are skewed in comparison with the reference population. If we calculate the age-specific weights and (un)weighted average number of contacts, we end up with: + +```{r, echo=FALSE} +survey_data <- survey_data[c(rep(1:2, 5), 3:6), ] +survey_data$m_i[survey_data$age == 3] <- 30 +rownames(survey_data) <- NULL + +survey_data <- survey_data[order(survey_data$age), ] +survey_data$w <- NULL +survey_data$w_tilde <- NULL +survey_data$w_PS <- NULL +``` + + +```{r, echo=FALSE} +N <- nrow(survey_data) +N_age <- table(survey_data$age) +N_age.group <- table(survey_data$age.group) +N_day.of.week <- table(survey_data$day.of.week) +``` + +```{r, echo=FALSE} +survey_data$w <- NA +survey_data$w_tilde <- NA +survey_data$w_PS <- NULL +for (i in seq_len(nrow(survey_data))) { + age_i <- survey_data$age[i] + survey_data$w[i] <- (P_age[age_i] / P) / (N_age[age_i] / N) +} +survey_data$w_tilde <- survey_data$w / sum(survey_data$w) * N + +# round +survey_data[, -(1:4)] <- round(survey_data[, -(1:4)], digits = 2) +kable(survey_data) + +print(paste("unweighted average number of contacts:", round(mean(survey_data$m_i), digits = 2))) +print(paste("weighted average number of contacts:", round(mean(survey_data$m_i * survey_data$w_tilde), digits = 2))) +``` + +The single participant of 3 years of age has a very large influence on the weighted population average. As such, we propose to truncate the relative age-specific weights $w$ at 3. As such, the weighted population average equals: + +```{r, echo=FALSE} +survey_data$w_tilde[survey_data$w_tilde > 3] <- 3 +``` + +```{r, echo=FALSE} +survey_data$w_tilde[survey_data$w_tilde > 3] <- 3 + +print(paste("weighted average number of contacts after truncation:", round(mean(survey_data$m_i * survey_data$w_tilde), digits = 2))) +``` + +# Plotting + +## Using ggplot2 + +The contact matrices can be plotted by using the `geom_tile()` function of the `ggplot2` package. + +```{r fig.width=4, fig.height=4} +df <- reshape2::melt(mr, varnames = c("age.group", "age.group.contact"), value.name = "contacts") +ggplot(df, aes(x = age.group, y = age.group.contact, fill = contacts)) + + theme(legend.position = "bottom") + + geom_tile() +``` + +## Using R base + +The contact matrices can also be plotted with the `matrix_plot()` function as a grid of colored rectangles with the numeric values in the cells. Heat colors are used by default, though this can be changed. + +```{r fig.width=4, fig.height=4} +matrix_plot(mr) +matrix_plot(mr, color.palette = gray.colors) +``` + +# References +