diff --git a/DESCRIPTION b/DESCRIPTION index 7681c60..b0e0fed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -37,7 +37,7 @@ License: Apache License (>= 2) BugReports: https://github.com/pachadotdev/capybara/issues URL: https://pacha.dev/capybara/, https://github.com/pachadotdev/capybara LazyData: true -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Encoding: UTF-8 NeedsCompilation: yes LinkingTo: cpp11, cpp11armadillo diff --git a/R/apes.R b/R/apes.R index 3fb7112..e74695b 100644 --- a/R/apes.R +++ b/R/apes.R @@ -1,3 +1,15 @@ +#' srr_stats (tests) +#' @srrstats {G1.0} Statistical Software should list at least one primary +#' reference from published academic literature. +#' @srrstats {G2.3} For univariate character input: +#' @srrstats {G2.3a} Use `match.arg()` or equivalent where applicable to only +#' permit expected values. +#' @srrstats {G2.3b} Either: use `tolower()` or equivalent to ensure input of +#' character parameters is not case dependent; or explicitly document that +#' parameters are strictly case-sensitive. +#' @noRd +NULL + #' @title Compute average partial effects after fitting binary choice models #' with a 1,2,3-way error component #' @@ -10,10 +22,6 @@ #' #' \strong{Remark:} The routine currently does not allow to compute average #' partial effects based on functional forms like interactions and polynomials. -#' -#' @srrstats {G2.3} *For univariate character input:* -#' @srrstats {G2.3a} *Use `match.arg()` or equivalent where applicable to only permit expected values.* -#' @srrstats {G2.3b} *Either: use `tolower()` or equivalent to ensure input of character parameters is not case dependent; or explicitly document that parameters are strictly case-sensitive.* #' #' @param object an object of class \code{"bias_corr"} or \code{"feglm"}; #' currently restricted to \code{\link[stats]{binomial}}. @@ -46,8 +54,6 @@ #' #' @return The function \code{\link{apes}} returns a named list of class #' \code{"apes"}. -#' -#' @srrstats {G1.0} *Statistical Software should list at least one primary reference from published academic literature.* #' #' @references Cruz-Gonzalez, M., I. Fernández-Val, and M. Weidner (2017). "Bias #' corrections for probit and logit models with two-way fixed effects". The diff --git a/R/apes_bias_helpers.R b/R/apes_bias_helpers.R index acec9cb..f9d95c5 100644 --- a/R/apes_bias_helpers.R +++ b/R/apes_bias_helpers.R @@ -1,8 +1,15 @@ +#' srr_stats (tests) +#' @srrstats {G1.4a} All internal (non-exported) functions should also be +#' documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, +#' along with a final `@noRd` tag to suppress automatic generation of `.Rd` +#' files. +#' @noRd +NULL + #' @title Checks if the object is an `feglm` object #' @description Internal check #' @param object Object to check #' @param fun Function name (e.g., "apes") -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd apes_bias_check_object_ <- function(object, fun) { if (is.null(object)) { diff --git a/R/bias_corr.R b/R/bias_corr.R index ff98199..5fe5b08 100644 --- a/R/bias_corr.R +++ b/R/bias_corr.R @@ -1,3 +1,15 @@ +#' srr_stats (tests) +#' @srrstats {G1.0} Statistical Software should list at least one primary +#' reference from published academic literature. 
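# --- Editor's note (illustrative, not package code) --------------------------
# A minimal sketch of the validation pattern the {G2.3a}/{G2.3b} tags above
# describe: lower-case the character argument so matching is not case
# dependent, then restrict it to the permitted values via match.arg(). The
# helper name is hypothetical; "classic" mirrors the default documented for
# bias_corr() below, and "network" is assumed as the alternative value.
check_choice_example_ <- function(choice) {
  match.arg(tolower(choice), c("classic", "network"))
}
check_choice_example_("Classic")   # returns "classic"
# check_choice_example_("other")   # errors: 'arg' should be one of ...
# ------------------------------------------------------------------------------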
+#' @srrstats {G2.3} For univariate character input: +#' @srrstats {G2.3a} Use `match.arg()` or equivalent where applicable to only +#' permit expected values. +#' @srrstats {G2.3b} Either: use `tolower()` or equivalent to ensure input of +#' character parameters is not case dependent; or explicitly document that +#' parameters are strictly case-sensitive. +#' @noRd +NULL + #' @title Asymptotic bias correction after fitting binary choice models with a #' 1,2,3-way error component #' @@ -7,10 +19,6 @@ #' obtain bias-corrected estimates of the structural parameters and is #' currently restricted to \code{\link[stats]{binomial}} with 1,2,3-way fixed #' effects. -#' -#' @srrstats {G2.3} *For univariate character input:* -#' @srrstats {G2.3a} *Use `match.arg()` or equivalent where applicable to only permit expected values.* -#' @srrstats {G2.3b} *Either: use `tolower()` or equivalent to ensure input of character parameters is not case dependent; or explicitly document that parameters are strictly case-sensitive.* #' #' @param object an object of class \code{"feglm"}. #' @param l unsigned integer indicating a bandwidth for the estimation of @@ -28,8 +36,6 @@ #' observed for several time periods. Default is \code{"classic"}. #' #' @return A named list of classes \code{"bias_corr"} and \code{"feglm"}. -#' -#' @srrstats {G1.0} *Statistical Software should list at least one primary reference from published academic literature.* #' #' @references Czarnowske, D. and A. Stammann (2020). "Fixed Effects Binary #' Choice Models: Estimation and Inference with Long Panels". ArXiv e-prints. diff --git a/R/capybara-package.R b/R/capybara-package.R index 035d143..5709715 100644 --- a/R/capybara-package.R +++ b/R/capybara-package.R @@ -1,9 +1,15 @@ +#' srr_stats (tests) +#' @srrstats {G1.1} The algorithm is a full refactor with memory and speed +#' improvements for a previous R implementation (Stammann, 2018). +#' @srrstats {G1.2} Describes the current and anticipated future states of +#' development. +#' @srrstats {G1.4} The package uses [`roxygen2`](https://roxygen2.r-lib.org/) +#' to document all functions (see ./DESCRIPTION). +#' @noRd +NULL + #' @title Generalized Linear Models (GLMs) with high-dimensional k-way fixed #' effects -#' -#' @srrstats {G1.1} *Statistical Software should document whether the algorithm(s) it implements are:* - *The first implementation of a novel algorithm*; or - *The first implementation within **R** of an algorithm which has previously been implemented in other languages or contexts*; or - *An improvement on other implementations of similar algorithms in **R***. -#' @srrstats {G1.2} *Statistical Software should include a* Life Cycle Statement *describing current and anticipated future states of development.* -#' @srrstats {G1.4} *Software should use [`roxygen2`](https://roxygen2.r-lib.org/) to document all functions.* #' #' @description #' Provides a routine to partial out factors with many levels during the @@ -34,6 +40,11 @@ #' @useDynLib capybara, .registration = TRUE "_PACKAGE" +#' srr_stats (tests) +#' @srrstats {G5.1} The panel is exported and used in the package examples. +#' @noRd +NULL + #' Trade Panel 1986-2006 #' #' Aggregated exports at origin-destination-year level for 1986-2006. 
@@ -54,5 +65,6 @@ #' \item{exp_year}{Exporter ISO country code and year} #' \item{imp_year}{Importer ISO country code and year} #' } +#' #' @source Advanced Guide to Trade Policy Analysis (ISBN: 978-92-870-4367-2) "trade_panel" diff --git a/R/feglm.R b/R/feglm.R index 5af8aae..cd16930 100644 --- a/R/feglm.R +++ b/R/feglm.R @@ -1,3 +1,55 @@ +#' srr_stats (tests) +#' @srrstats {G1.0} Statistical Software should list at least one primary +#' reference from published academic literature. +#' @srrstats {G1.3} All statistical terminology should be clarified and +#' unambiguously defined. +#' @srrstats {G2.3} For univariate character input: +#' @srrstats {G2.3a} Use `match.arg()` or equivalent where applicable to only +#' permit expected values. +#' @srrstats {G2.3b} Either: use `tolower()` or equivalent to ensure input of +#' character parameters is not case dependent; or explicitly document that +#' parameters are strictly case-sensitive. +#' @srrstats {RE4.4} The specification of the model, generally as a formula +#' (via `formula()`) +#' @srrstats {RE1.0} Regression Software should enable models to be specified +#' via a formula interface, unless reasons for not doing so are explicitly +#' documented. +#' @srrstats {RE1.1} Regression Software should document how formula interfaces +#' are converted to matrix representations of input data. +#' @srrstats {RE1.2} Regression Software should document expected format (types +#' or classes) for inputting predictor variables, including descriptions of +#' types or classes which are not accepted. +#' @srrstats {RE1.3} Regression Software which passes or otherwise transforms +#' aspects of input data onto output structures should ensure that those output +#' structures retain all relevant aspects of input data, notably including row +#' and column names, and potentially information from other `attributes()`. +#' @srrstats {RE1.3a} Where otherwise relevant information is not transferred, +#' this should be explicitly documented. +#' @srrstats {RE1.4} Regression Software should document any assumptions made +#' with regard to input data; for example distributional assumptions, or +#' assumptions that predictor data have mean values of zero. Implications of +#' violations of these assumptions should be both documented and tested. +#' @srrstats {RE2.3} Where applicable, Regression Software should enable data to +#' be centred (for example, through converting to zero-mean equivalent values; +#' or to z-scores) or offset (for example, to zero-intercept equivalent values) +#' via additional parameters, with the effects of any such parameters clearly +#' documented and tested. +#' @srrstats {RE3.0} Issue appropriate warnings or other diagnostic messages for +#' models which fail to converge. +#' @srrstats {RE3.1} Enable such messages to be optionally suppressed, yet +#' should ensure that the resultant model object nevertheless includes +#' sufficient data to identify lack of convergence. +#' @srrstats {RE3.2} Ensure that convergence thresholds have sensible default +#' values, demonstrated through explicit documentation. +#' @srrstats {RE3.3} Allow explicit setting of convergence thresholds, unless +#' reasons against doing so are explicitly documented. +#' @srrstats {RE4.0} Regression Software should return some form of "model" +#' object, generally through using or modifying existing class structures for +#' model objects (such as `lm`, `glm`, or model objects from other packages), +#' or creating a new class of model objects. 
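# --- Editor's note (illustrative, not package code) --------------------------
# A short sketch of the `y ~ x | k` two-part formula interface referenced by
# {RE1.0}/{RE1.1} above: the Formula package (used by update_formula_() in
# R/feglm_helpers.R) splits the specification into a regressor part and a
# fixed-effects part before the data are turned into a design matrix.
# Variable names here are made up.
library(Formula)
f <- Formula(y ~ x1 + x2 | firm + year)
formula(f, lhs = 1, rhs = 1)  # y ~ x1 + x2      -> regressors
formula(f, lhs = 0, rhs = 2)  # ~ firm + year    -> fixed effects to absorb
# ------------------------------------------------------------------------------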
+#' @noRd +NULL + #' @title GLM fitting with high-dimensional k-way fixed effects #' #' @description \code{\link{feglm}} can be used to fit generalized linear models @@ -8,22 +60,6 @@ #' \strong{Remark:} The term fixed effect is used in econometrician's sense of #' having intercepts for each level in each category. #' -#' @srrstats {G2.3} *For univariate character input:* -#' @srrstats {G2.3a} *Use `match.arg()` or equivalent where applicable to only permit expected values.* -#' @srrstats {G2.3b} *Either: use `tolower()` or equivalent to ensure input of character parameters is not case dependent; or explicitly document that parameters are strictly case-sensitive.* -#' @srrstats {RE4.4} *The specification of the model, generally as a formula (via `formula()`)* -#' @srrstats {RE1.0} *Regression Software should enable models to be specified via a formula interface, unless reasons for not doing so are explicitly documented.* -#' @srrstats {RE1.1} *Regression Software should document how formula interfaces are converted to matrix representations of input data.* -#' @srrstats {RE1.2} *Regression Software should document expected format (types or classes) for inputting predictor variables, including descriptions of types or classes which are not accepted.* -#' @srrstats {RE1.3} *Regression Software which passes or otherwise transforms aspects of input data onto output structures should ensure that those output structures retain all relevant aspects of input data, notably including row and column names, and potentially information from other `attributes()`.* -#' @srrstats {RE1.3a} *Where otherwise relevant information is not transferred, this should be explicitly documented.* -#' @srrstats {RE1.4} *Regression Software should document any assumptions made with regard to input data; for example distributional assumptions, or assumptions that predictor data have mean values of zero. Implications of violations of these assumptions should be both documented and tested.* -#' @srrstats {RE2.3} *Where applicable, Regression Software should enable data to be centred (for example, through converting to zero-mean equivalent values; or to z-scores) or offset (for example, to zero-intercept equivalent values) via additional parameters, with the effects of any such parameters clearly documented and tested.* -#' @srrstats {RE3.0} *Issue appropriate warnings or other diagnostic messages for models which fail to converge.* -#' @srrstats {RE3.1} *Enable such messages to be optionally suppressed, yet should ensure that the resultant model object nevertheless includes sufficient data to identify lack of convergence.* -#' @srrstats {RE3.2} *Ensure that convergence thresholds have sensible default values, demonstrated through explicit documentation.* -#' @srrstats {RE3.3} *Allow explicit setting of convergence thresholds, unless reasons against doing so are explicitly documented.* -#' #' @param formula an object of class \code{"formula"}: a symbolic description of #' the model to be fitted. \code{formula} must be of type \code{y ~ x | k}, #' where the second part of the formula refers to factors to be concentrated @@ -51,9 +87,6 @@ #' linear dependence between one or more regressors and a fixed effects #' category. In this case, you should carefully inspect your model #' specification. 
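# --- Editor's note (illustrative, not package code) --------------------------
# The linear-dependence failure described above is what the updated
# check_linear_dependence_(cbind(y, x), p + 1L) call further down in this diff
# screens for: one QR rank test on the response bound to the regressors detects
# perfect collinearity among predictors ({RE2.4a}) as well as between the
# response and the predictors ({RE2.4b}). Toy data only:
set.seed(123)
x1 <- rnorm(100); x2 <- rnorm(100)
x  <- cbind(x1, x2, x3 = x1 + x2)     # x3 is an exact linear combination
qr(x)$rank < ncol(x)                  # TRUE -> collinear predictors
y  <- 2 * x1                          # response is a multiple of x1
qr(cbind(y, x1, x2))$rank < 3L        # TRUE -> response collinear with x
# ------------------------------------------------------------------------------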
-#' -#' @srrstats {G1.3} *All statistical terminology should be clarified and unambiguously defined.* -#' @srrstats {RE4.0} *Regression Software should return some form of "model" object, generally through using or modifying existing class structures for model objects (such as `lm`, `glm`, or model objects from other packages), or creating a new class of model objects.* #' #' @return A named list of class \code{"feglm"}. The list contains the following #' fifteen elements: @@ -75,8 +108,6 @@ #' observations} #' \item{family}{the family used in the model} #' \item{control}{the control list used in the model} -#' -#' @srrstats {G1.0} *Statistical Software should list at least one primary reference from published academic literature.* #' #' @references Gaure, S. (2013). "OLS with Multiple High Dimensional Category #' Variables". Computational Statistics and Data Analysis, 66. @@ -163,8 +194,9 @@ feglm <- function( p <- NA model_response_(data, formula) - # Check for linear dependence in 'x' ---- - check_linear_dependence_(x, p) + # Check for linear dependence ---- + # check_linear_dependence_(x, p) + check_linear_dependence_(cbind(y,x), p + 1L) # Extract weights if required ---- if (is.null(weights)) { diff --git a/R/feglm_helpers.R b/R/feglm_helpers.R index 28dab0c..b927695 100644 --- a/R/feglm_helpers.R +++ b/R/feglm_helpers.R @@ -1,10 +1,22 @@ +#' srr_stats (tests) +#' @srrstats {G1.4a} All internal (non-exported) functions should also be +#' documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, +#' along with a final `@noRd` tag to suppress automatic generation of `.Rd` +#' files. +#' @noRd +NULL + +#' srr_stats (tests) +#' @srrstats {G2.4} Provide appropriate mechanisms to convert between different +#' data types, potentially including: +#' @srrstats {G2.4d} explicit conversion to factor via `as.factor()` +#' @srrstats {G2.4e} explicit conversion from factor via `as...()` functions +#' @noRd +NULL + #' @title Transform factor #' @description Checks if variable is a factor and transforms if necessary #' @param x Variable to be checked -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* -#' @srrstats {G2.4} *Provide appropriate mechanisms to convert between different data types, potentially including:* -#' @srrstats {G2.4d} *explicit conversion to factor via `as.factor()`* -#' @srrstats {G2.4e} *explicit conversion from factor via `as...()` functions* #' @noRd check_factor_ <- function(x) { if (is.factor(x)) { @@ -19,7 +31,6 @@ check_factor_ <- function(x) { #' @param eta Eta value #' @param mu_eta Mu.eta value #' @param family Family object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd second_order_derivative_ <- function(eta, mu_eta, family) { link <- family[["link"]] @@ -41,7 +52,6 @@ second_order_derivative_ <- function(eta, mu_eta, family) { #' @param eta Eta value #' @param mu_eta Mu.eta value #' @param family Family object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd third_order_derivative_ <- function(eta, 
mu_eta, family) { link <- family[["link"]] @@ -64,7 +74,6 @@ third_order_derivative_ <- function(eta, mu_eta, family) { #' @param eta Linear predictor #' @param family Family object #' @param order Order of the derivative (2 or 3) -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd partial_mu_eta_ <- function(eta, family, order) { # Safeguard eta if necessary @@ -84,7 +93,6 @@ partial_mu_eta_ <- function(eta, family, order) { #' @title Temporary variable #' @description Generates a temporary variable name #' @param data Data frame -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd temp_var_ <- function(data) { repeat { @@ -102,7 +110,6 @@ temp_var_ <- function(data) { #' @title Check formula #' @description Checks if formula for GLM/NegBin models #' @param formula Formula object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd check_formula_ <- function(formula) { if (is.null(formula)) { @@ -112,11 +119,16 @@ check_formula_ <- function(formula) { } } +#' srr_stats (tests) +#' @srrstats {G2.0} Implement assertions on lengths of inputs, particularly +#' through asserting that inputs expected to be single- or multi-valued are +#' indeed so. +#' @noRd +NULL + #' @title Check data #' @description Checks data for GLM/NegBin models #' @param data Data frame -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* -#' @srrstats {G2.0} *Implement assertions on lengths of inputs, particularly through asserting that inputs expected to be single- or multi-valued are indeed so.* #' @noRd check_data_ <- function(data) { if (is.null(data)) { @@ -131,7 +143,6 @@ check_data_ <- function(data) { #' @title Check control #' @description Checks control for GLM/NegBin models #' @param control Control list -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd check_control_ <- function(control) { if (is.null(control)) { @@ -146,7 +157,6 @@ check_control_ <- function(control) { #' @title Check family #' @description Checks family for GLM/NegBin models #' @param family Family object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd check_family_ <- function(family) { if (!inherits(family, "family")) { @@ -171,7 +181,6 @@ check_family_ <- function(family) { #' @title Update formula #' @description Updates formula for GLM/NegBin models #' @param formula Formula object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with 
a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd update_formula_ <- function(formula) { formula <- Formula(formula) @@ -186,15 +195,24 @@ update_formula_ <- function(formula) { formula } +#' srr_stats (tests) +#' @srrstats {RE2.0} Regression Software should document any transformations +#' applied to input data, for example conversion of label-values to `factor`, +#' and should provide ways to explicitly avoid any default transformations +#' (with error or warning conditions where appropriate). +#' @srrstats {RE2.1} Regression Software should implement explicit parameters +#' controlling the processing of missing values, ideally distinguishing `NA` or +#' `NaN` values from `Inf` values (for example, through use of `na.omit()` and +#' related functions from the `stats` package). +#' @srrstats {RE4.5} Numbers of observations submitted to model (via `nobs()`) +#' @noRd +NULL + #' @title Model frame #' @description Creates model frame for GLM/NegBin models #' @param data Data frame #' @param formula Formula object #' @param weights Weights -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* -#' @srrstats {RE2.0} *Regression Software should document any transformations applied to input data, for example conversion of label-values to `factor`, and should provide ways to explicitly avoid any default transformations (with error or warning conditions where appropriate).* -#' @srrstats {RE2.1} *Regression Software should implement explicit parameters controlling the processing of missing values, ideally distinguishing `NA` or `NaN` values from `Inf` values (for example, through use of `na.omit()` and related functions from the `stats` package).* -#' @srrstats {RE4.5} *Numbers of observations submitted to model (via `nobs()`)* #' @noRd model_frame_ <- function(data, formula, weights) { data <- select(ungroup(data), all_of(c(all.vars(formula), weights))) @@ -219,7 +237,6 @@ model_frame_ <- function(data, formula, weights) { #' @param data Data frame #' @param lhs Left-hand side of the formula #' @param family Family object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd check_response_ <- function(data, lhs, family) { if (family[["family"]] == "binomial") { @@ -270,7 +287,6 @@ check_response_ <- function(data, lhs, family) { #' @param tmp_var Temporary variable #' @param k_vars Fixed effects #' @param control Control list -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd drop_by_link_type_ <- function(data, lhs, family, tmp_var, k_vars, control) { if (family[["family"]] %in% c("binomial", "poisson")) { @@ -307,7 +323,6 @@ drop_by_link_type_ <- function(data, lhs, family, tmp_var, k_vars, control) { #' @param data Data frame #' @param formula Formula object #' @param k_vars Fixed effects -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd 
transform_fe_ <- function(data, formula, k_vars) { data <- mutate(data, across(all_of(k_vars), check_factor_)) @@ -325,7 +340,6 @@ transform_fe_ <- function(data, formula, k_vars) { #' @param nobs_full Number of observations in the full data set #' @param nobs_na Number of observations with missing values #' @param nt Number of observations after dropping -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd nobs_ <- function(nobs_full, nobs_na, nt) { c( @@ -340,7 +354,6 @@ nobs_ <- function(nobs_full, nobs_na, nt) { #' @description Computes the model response #' @param data Data frame #' @param formula Formula object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd model_response_ <- function(data, formula) { y <- data[[1L]] @@ -355,15 +368,23 @@ model_response_ <- function(data, formula) { assign("p", p, envir = parent.frame()) } +#' srr_stats (tests) +#' @srrstats {G2.0} Implement assertions on lengths of inputs, particularly +#' through asserting that inputs expected to be single- or multi-valued are +#' indeed so. +#' @srrstats {RE2.4} Regression Software should implement pre-processing +#' routines to identify whether aspects of input data are perfectly collinear, +#' notably including: +#' @srrstats {RE2.4a} Perfect collinearity among predictor variables +#' @srrstats {RE2.4b} Perfect collinearity between independent and dependent +#' variables +#' @noRd +NULL + #' @title Check weights #' @description Checks if weights are valid #' @param x Regressor matrix #' @param p Number of parameters -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* -#' @srrstats {G2.0} *Implement assertions on lengths of inputs, particularly through asserting that inputs expected to be single- or multi-valued are indeed so.* -#' @srrstats {RE2.4} *Regression Software should implement pre-processing routines to identify whether aspects of input data are perfectly collinear, notably including:* -#' @srrstats {RE2.4a} *Perfect collinearity among predictor variables* -#' @srrstats {RE2.4b} *Perfect collinearity between independent and dependent variables* #' @noRd check_linear_dependence_ <- function(x, p) { if (qr(x)$rank < p) { @@ -371,11 +392,16 @@ check_linear_dependence_ <- function(x, p) { } } +#' srr_stats (tests) +#' @srrstats {G2.0} Implement assertions on lengths of inputs, particularly +#' through asserting that inputs expected to be single- or multi-valued are +#' indeed so. 
+#' @noRd +NULL + #' @title Check weights #' @description Checks if weights are valid #' @param wt Weights -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* -#' @srrstats {G2.0} *Implement assertions on lengths of inputs, particularly through asserting that inputs expected to be single- or multi-valued are indeed so.* #' @noRd check_weights_ <- function(wt) { if (!is.numeric(wt)) { @@ -390,7 +416,6 @@ check_weights_ <- function(wt) { #' @description Checks if starting theta is valid for NegBin models #' @param init_theta Initial theta value #' @param link Link function -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd init_theta_ <- function(init_theta, link) { if (is.null(init_theta)) { @@ -419,7 +444,6 @@ init_theta_ <- function(init_theta, link) { #' @param wt Weights #' @param p Number parameters #' @param family Family object -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd start_guesses_ <- function( beta_start, eta_start, y, x, beta, nt, wt, p, family) { @@ -485,7 +509,6 @@ start_guesses_ <- function( #' effects #' @param k_vars Fixed effects #' @param data Data frame -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd get_index_list_ <- function(k_vars, data) { indexes <- seq.int(0L, nrow(data) - 1L) @@ -497,7 +520,6 @@ get_index_list_ <- function(k_vars, data) { #' @title Get score matrix #' @description Computes the score matrix #' @param object Result list -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd get_score_matrix_ <- function(object) { # Extract required quantities from result list @@ -547,7 +569,6 @@ get_score_matrix_ <- function(object) { #' @param ppsi Psi matrix #' @param v Vector of weights #' @param nt Number of observations -#' @srrstats {G1.4a} *All internal (non-exported) functions should also be documented in standard [`roxygen2`](https://roxygen2.r-lib.org/) format, along with a final `@noRd` tag to suppress automatic generation of `.Rd` files.* #' @noRd gamma_ <- function(mx, h, j, ppsi, v, nt) { inv_nt <- 1.0 / nt diff --git a/R/felm.R b/R/felm.R index 890e80b..2cbd463 100644 --- a/R/felm.R +++ b/R/felm.R @@ -1,12 +1,21 @@ +#' srr_stats (tests) +#' @srrstats {G1.0} Statistical Software should list at least one primary +#' reference from published academic literature. +#' @srrstats {G1.3} All statistical terminology should be clarified and +#' unambiguously defined. +#' @srrstats {RE4.0} Regression Software should return some form of "model" +#' object, generally through using or modifying existing class structures for +#' model objects (such as `lm`, `glm`, or model objects from other packages), +#' or creating a new class of model objects. 
+#' @noRd +NULL + #' @title LM fitting with high-dimensional k-way fixed effects #' #' @description A wrapper for \code{\link{feglm}} with #' \code{family = gaussian()}. #' #' @inheritParams feglm -#' -#' @srrstats {G1.3} *All statistical terminology should be clarified and unambiguously defined.* -#' @srrstats {RE4.0} *Regression Software should return some form of "model" object, generally through using or modifying existing class structures for model objects (such as `lm`, `glm`, or model objects from other packages), or creating a new class of model objects.* #' #' @return A named list of class \code{"felm"}. The list contains the following #' eleven elements: @@ -24,8 +33,6 @@ #' \item{data}{the data used in the model after dropping non-contributing #' observations} #' \item{control}{the control list used in the model} -#' -#' @srrstats {G1.0} *Statistical Software should list at least one primary reference from published academic literature.* #' #' @references Gaure, S. (2013). "OLS with Multiple High Dimensional Category #' Variables". Computational Statistics and Data Analysis, 66. diff --git a/R/fenegbin.R b/R/fenegbin.R index 8002535..3b3086a 100644 --- a/R/fenegbin.R +++ b/R/fenegbin.R @@ -1,3 +1,13 @@ +#' srr_stats (tests) +#' @srrstats {G1.3} All statistical terminology should be clarified and +#' unambiguously defined. +#' @srrstats {RE4.0} Regression Software should return some form of "model" +#' object, generally through using or modifying existing class structures for +#' model objects (such as `lm`, `glm`, or model objects from other packages), +#' or creating a new class of model objects. +#' @noRd +NULL + #' @title Negative Binomial model fitting with high-dimensional k-way fixed #' effects #' @@ -24,9 +34,6 @@ #' ) #' #' summary(mod) -#' -#' @srrstats {G1.3} *All statistical terminology should be clarified and unambiguously defined.* -#' @srrstats {RE4.0} *Regression Software should return some form of "model" object, generally through using or modifying existing class structures for model objects (such as `lm`, `glm`, or model objects from other packages), or creating a new class of model objects.* #' #' @return A named list of class \code{"feglm"}. The list contains the following #' eighteen elements: @@ -109,7 +116,7 @@ fenegbin <- function( model_response_(data, formula) # Check for linear dependence in 'x' ---- - check_linear_dependence_(x, p) + check_linear_dependence_(cbind(y,x), p + 1L) # Extract weights if required ---- if (is.null(weights)) { diff --git a/R/fepoisson.R b/R/fepoisson.R index 3238eb0..a1a93ba 100644 --- a/R/fepoisson.R +++ b/R/fepoisson.R @@ -1,3 +1,13 @@ +#' srr_stats (tests) +#' @srrstats {G1.3} All statistical terminology should be clarified and +#' unambiguously defined. +#' @srrstats {RE4.0} Regression Software should return some form of "model" +#' object, generally through using or modifying existing class structures for +#' model objects (such as `lm`, `glm`, or model objects from other packages), +#' or creating a new class of model objects. +#' @noRd +NULL + #' @title Poisson model fitting high-dimensional with k-way fixed effects #' #' @description A wrapper for \code{\link{feglm}} with diff --git a/R/fixed_effects.R b/R/fixed_effects.R index 4ae6624..9a3cdab 100644 --- a/R/fixed_effects.R +++ b/R/fixed_effects.R @@ -1,3 +1,11 @@ +#' srr_stats (tests) +#' @srrstats {G1.3} All statistical terminology should be clarified and +#' unambiguously defined. 
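# --- Editor's note (illustrative, not package code) --------------------------
# A self-contained usage sketch for the felm()/fepoisson() wrappers documented
# above, run on a built-in dataset so it works without the trade panel; the
# formulas are made up and only meant to show the `y ~ x | k` interface.
library(capybara)
m_lm  <- felm(mpg ~ wt + hp | cyl, data = mtcars)        # gaussian feglm
m_poi <- fepoisson(carb ~ wt + hp | cyl, data = mtcars)  # poisson counts
coef(m_lm)
summary(m_poi)
# ------------------------------------------------------------------------------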
+#' @srrstats {G1.0} Statistical Software should list at least one primary +#' reference from published academic literature. +#' @noRd +NULL + #' @title Recover the estimates of the fixed effects after fitting (G)LMs #' #' @description The system might not have a unique solution since we do not take @@ -10,12 +18,8 @@ #' stopped at iteration \eqn{i} if \eqn{||\boldsymbol{\alpha}_{i} - #' \boldsymbol{\alpha}_{i - 1}||_{2} < tol ||\boldsymbol{\alpha}_{i - 1}|| #' {2}}{||\Delta \alpha|| < tol ||\alpha_old||}. Default is \code{1.0e-08}. -#' -#' @srrstats {G1.3} *All statistical terminology should be clarified and unambiguously defined.* #' #' @return A named list containing named vectors of estimated fixed effects. -#' -#' @srrstats {G1.0} *Statistical Software should list at least one primary reference from published academic literature.* #' #' @references Stammann, A. (2018). "Fast and Feasible Estimation of Generalized #' Linear Models with High-Dimensional k-way Fixed Effects". ArXiv e-prints. diff --git a/R/generics_augment.R b/R/generics_augment.R index b6a6439..17fae64 100644 --- a/R/generics_augment.R +++ b/R/generics_augment.R @@ -1,10 +1,15 @@ +#' srr_stats (tests) +#' @srrstats {RE4.10} Model Residuals, including sufficient documentation to +#' enable interpretation of residuals, and to enable users to submit residuals +#' to their own tests. +#' @noRd +NULL + #' @importFrom generics augment #' @export generics::augment #' @title Broom Integration -#' -#' @srrstats {RE4.10} *Model Residuals, including sufficient documentation to enable interpretation of residuals, and to enable users to submit residuals to their own tests.* #' #' @description The provided `broom` methods do the following: #' 1. `augment`: Takes the input data and adds additional columns with the diff --git a/R/generics_coef.R b/R/generics_coef.R index 84ad581..edad0ad 100644 --- a/R/generics_coef.R +++ b/R/generics_coef.R @@ -1,40 +1,39 @@ +#' srr_stats (tests) +#' @srrstats {RE4.2} Model coefficients (via `coef()` / `coefficients()`) +#' @noRd +NULL + #' @export -#' @srrstats {RE4.2} *Model coefficients (via `coef()` / `coefficients()`)* #' @noRd coef.apes <- function(object, ...) { object[["delta"]] } #' @export -#' @srrstats {RE4.2} *Model coefficients (via `coef()` / `coefficients()`)* #' @noRd coef.feglm <- function(object, ...) { object[["coefficients"]] } #' @export -#' @srrstats {RE4.2} *Model coefficients (via `coef()` / `coefficients()`)* #' @noRd coef.felm <- function(object, ...) { object[["coefficients"]] } #' @export -#' @srrstats {RE4.2} *Model coefficients (via `coef()` / `coefficients()`)* #' @noRd coef.summary.apes <- function(object, ...) { object[["cm"]] } #' @export -#' @srrstats {RE4.2} *Model coefficients (via `coef()` / `coefficients()`)* #' @noRd coef.summary.feglm <- function(object, ...) { object[["cm"]] } #' @export -#' @srrstats {RE4.2} *Model coefficients (via `coef()` / `coefficients()`)* #' @noRd coef.summary.felm <- function(object, ...) { object[["cm"]] diff --git a/R/generics_confint.R b/R/generics_confint.R index 2b16005..74b11c9 100644 --- a/R/generics_confint.R +++ b/R/generics_confint.R @@ -1,5 +1,10 @@ +#' srr_stats (tests) +#' @srrstats {RE4.3} Confidence intervals on those coefficients +#' (via `confint()`) +#' @noRd +NULL + #' @export -#' @srrstats {RE4.3} *Confidence intervals on those coefficients (via `confint()`)* #' @noRd confint.feglm <- function(object, parm, level = 0.95, ...) 
{ # Extract the summary of the feglm object @@ -26,7 +31,6 @@ confint.feglm <- function(object, parm, level = 0.95, ...) { } #' @export -#' @srrstats {RE4.3} *Confidence intervals on those coefficients (via `confint()`)* #' @noRd confint.felm <- function(object, parm, level = 0.95, ...) { confint.feglm(object, parm, level, ...) diff --git a/R/generics_predict.R b/R/generics_predict.R index a335acd..5a96102 100644 --- a/R/generics_predict.R +++ b/R/generics_predict.R @@ -1,12 +1,26 @@ +#' srr_stats (tests) +#' @srrstats {G2.3} For univariate character input: +#' @srrstats {G2.3a} Use `match.arg()` or equivalent where applicable to only +#' permit expected values. +#' @srrstats {G2.3b} Either: use `tolower()` or equivalent to ensure input of +#' character parameters is not case dependent; or explicitly document that +#' parameters are strictly case-sensitive. +#' @srrstats {RE4.9} Modelled values of response variables. +#' @srrstats {RE4.12} Where appropriate, functions used to transform input data, +#' and associated inverse transform functions. +#' @srrstats {RE4.13} Predictor variables, and associated "metadata" where +#' applicable. (via `confint()`) +#' @srrstats {RE4.18} Regression Software may also implement `summary` methods +#' for model objects, and in particular should implement distinct `summary` +#' methods for any cases in which calculation of summary statistics is +#' computationally non-trivial (for example, for bootstrapped estimates of +#' confidence intervals). +#' @noRd +NULL + #' @title Predict method for 'feglm' objects #' @description Similar to the 'predict' method for 'glm' objects #' @export -#' @srrstats {G2.3} *For univariate character input:* -#' @srrstats {G2.3a} *Use `match.arg()` or equivalent where applicable to only permit expected values.* -#' @srrstats {G2.3b} *Either: use `tolower()` or equivalent to ensure input of character parameters is not case dependent; or explicitly document that parameters are strictly case-sensitive.* -#' @srrstats {RE4.9} *Modelled values of response variables.* -#' @srrstats {RE4.12} *Where appropriate, functions used to transform input data, and associated inverse transform functions.* -#' @srrstats {RE4.13} *Predictor variables, and associated "metadata" where applicable.* #' @noRd predict.feglm <- function(object, type = c("link", "response"), ...) { # Check validity of 'type' @@ -25,9 +39,6 @@ predict.feglm <- function(object, type = c("link", "response"), ...) { #' @title Predict method for 'felm' objects #' @description Similar to the 'predict' method for 'lm' objects #' @export -#' @srrstats {RE4.9} *Modelled values of response variables.* -#' @srrstats {RE4.12} *Where appropriate, functions used to transform input data, and associated inverse transform functions.* -#' @srrstats {RE4.13} *Predictor variables, and associated "metadata" where applicable.* #' @noRd predict.felm <- function(object, ...) { object[["fitted.values"]] diff --git a/R/generics_summary.R b/R/generics_summary.R index 4a3c295..c343e6c 100644 --- a/R/generics_summary.R +++ b/R/generics_summary.R @@ -1,11 +1,21 @@ +#' srr_stats (tests) +#' @srrstats {RE4.5} Numbers of observations submitted to model (via `nobs()`) +#' @srrstats {RE4.6} The variance-covariance matrix of the model parameters +#' (via `vcov()`) +#' @srrstats {RE4.7} Where appropriate, convergence statistics +#' @srrstats {RE4.11} Goodness-of-fit and other statistics associated such as +#' effect sizes with model coefficients. 
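# --- Editor's note (illustrative, not package code) --------------------------
# How the `type` argument of predict.feglm() above behaves once matched against
# c("link", "response"): link-scale versus response-scale fitted values. The
# model below is only a toy fit on a built-in dataset.
library(capybara)
m <- fepoisson(carb ~ wt | cyl, data = mtcars)
head(predict(m, type = "link"))      # linear predictor eta
head(predict(m, type = "response"))  # exp(eta) for the poisson/log link
# ------------------------------------------------------------------------------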
+#' @srrstats {RE4.18} Regression Software may also implement `summary` methods +#' for model objects, and in particular should implement distinct `summary` +#' methods for any cases in which calculation of summary statistics is +#' computationally non-trivial (for example, for bootstrapped estimates of +#' confidence intervals). +#' @noRd +NULL + #' @title Summary method for fixed effects APEs #' @inherit vcov.apes #' @export -#' @srrstats {RE4.6} *The variance-covariance matrix of the model parameters (via `vcov()`)* -#' @srrstats {RE4.5} *Numbers of observations submitted to model (via `nobs()`)* -#' @srrstats {RE4.7} *Where appropriate, convergence statistics* -#' @srrstats {RE4.11} *Goodness-of-fit and other statistics associated such as effect sizes with model coefficients.* -#' @srrstats {RE4.18} *Regression Software may also implement `summary` methods for model objects, and in particular should implement distinct `summary` methods for any cases in which calculation of summary statistics is computationally non-trivial (for example, for bootstrapped estimates of confidence intervals).* #' @noRd summary.apes <- function(object, ...) { # Compute coefficent matrix @@ -24,10 +34,6 @@ summary.apes <- function(object, ...) { #' @title Summary method for fixed effects GLMs #' @inherit vcov.feglm #' @export -#' @srrstats {RE4.6} *The variance-covariance matrix of the model parameters (via `vcov()`)* -#' @srrstats {RE4.5} *Numbers of observations submitted to model (via `nobs()`)* -#' @srrstats {RE4.7} *Where appropriate, convergence statistics* -#' @srrstats {RE4.11} *Goodness-of-fit and other statistics associated such as effect sizes with model coefficients.* #' @noRd summary.feglm <- function( object, @@ -76,11 +82,6 @@ summary.feglm <- function( #' @title Summary method for fixed effects LMs #' @inherit vcov.felm #' @export -#' @srrstats {RE4.6} *The variance-covariance matrix of the model parameters (via `vcov()`)* -#' @srrstats {RE4.5} *Numbers of observations submitted to model (via `nobs()`)* -#' @srrstats {RE4.7} *Where appropriate, convergence statistics* -#' @srrstats {RE4.11} *Goodness-of-fit and other statistics associated such as effect sizes with model coefficients.* -#' @srrstats {RE4.18} *Regression Software may also implement `summary` methods for model objects, and in particular should implement distinct `summary` methods for any cases in which calculation of summary statistics is computationally non-trivial (for example, for bootstrapped estimates of confidence intervals).* #' @noRd summary.felm <- function( object, diff --git a/R/generics_vcov.R b/R/generics_vcov.R index 1a9340f..44a24a0 100644 --- a/R/generics_vcov.R +++ b/R/generics_vcov.R @@ -1,9 +1,13 @@ +#' srr_stats (tests) +#' @srrstats {RE4.6} The variance-covariance matrix of the model parameters +#' (via `vcov()`) +#' @noRd +NULL + #' @title Covariance matrix for APEs #' #' @description Covariance matrix for the estimator of the #' average partial effects from objects returned by \code{\link{apes}}. -#' -#' @srrstats {RE4.6} *The variance-covariance matrix of the model parameters (via `vcov()`)* #' #' @param object an object of class \code{"apes"}. #' @param ... additional arguments. @@ -24,8 +28,6 @@ vcov.apes <- function(object, ...) { #' @description Covariance matrix for the estimator of the structural parameters #' from objects returned by \code{\link{feglm}}. The covariance is computed #' from the hessian, the scores, or a combination of both after convergence. 
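# --- Editor's note (illustrative, not package code) --------------------------
# The vcov() methods documented here ({RE4.6}) expose the covariance choices
# described above; only "hessian" is visible in this hunk, so the sketch sticks
# to it and to the default. Toy fit on a built-in dataset.
library(capybara)
m <- fepoisson(carb ~ wt | cyl, data = mtcars)
vcov(m)                    # default covariance of the structural parameters
vcov(m, type = "hessian")  # explicitly request the hessian-based estimate
sqrt(diag(vcov(m)))        # standard errors, as used by summary()/confint()
# ------------------------------------------------------------------------------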
-#' -#' @srrstats {RE4.6} *The variance-covariance matrix of the model parameters (via `vcov()`)* #' #' @param object an object of class \code{"feglm"}. #' @param type the type of covariance estimate required. \code{"hessian"} refers @@ -222,8 +224,6 @@ vcov_feglm_clustered_cov_ <- function(g, cl_vars, sp_vars, p) { #' from objects returned by \code{\link{felm}}. The covariance is computed #' from the hessian, the scores, or a combination of both after convergence. #' -#' @srrstats {RE4.6} *The variance-covariance matrix of the model parameters (via `vcov()`)* -#' #' @param object an object of class \code{"felm"}. #' #' @inherit vcov.feglm diff --git a/R/srr-stats-standards.R b/R/srr-stats-standards.R index 76c30d3..8d38fd2 100644 --- a/R/srr-stats-standards.R +++ b/R/srr-stats-standards.R @@ -2,39 +2,39 @@ #' #' @srrstatsVerbose TRUE #' -#' @srrstats {G1.5} *Software should include all code necessary to reproduce results which form the basis of performance claims made in associated publications.* -#' @srrstats {G1.6} *Software should include code necessary to compare performance claims with alternative implementations in other R packages.* +#' @srrstats {G1.5} Software should include all code necessary to reproduce results which form the basis of performance claims made in associated publications. +#' @srrstats {G1.6} Software should include code necessary to compare performance claims with alternative implementations in other R packages. #' @srrstats {G2.0a} Provide explicit secondary documentation of any expectations on lengths of inputs -#' @srrstats {G2.1} *Implement assertions on types of inputs (see the initial point on nomenclature above).* -#' @srrstats {G2.1a} *Provide explicit secondary documentation of expectations on data types of all vector inputs.* -#' @srrstats {G2.2} *Appropriately prohibit or restrict submission of multivariate input to parameters expected to be univariate.* -#' @srrstats {G2.4} *Provide appropriate mechanisms to convert between different data types, potentially including:* -#' @srrstats {G2.4a} *explicit conversion to `integer` via `as.integer()`* -#' @srrstats {G2.4b} *explicit conversion to continuous via `as.numeric()`* -#' @srrstats {G2.4c} *explicit conversion to character via `as.character()` (and not `paste` or `paste0`)* -#' @srrstats {G2.4d} *explicit conversion to factor via `as.factor()`* -#' @srrstats {G2.4e} *explicit conversion from factor via `as...()` functions* -#' @srrstats {G2.5} *Where inputs are expected to be of `factor` type, secondary documentation should explicitly state whether these should be `ordered` or not, and those inputs should provide appropriate error or other routines to ensure inputs follow these expectations.* -#' @srrstats {G2.6} *Software which accepts one-dimensional input should ensure values are appropriately pre-processed regardless of class structures.* -#' @srrstats {G2.7} *Software should accept as input as many of the above standard tabular forms as possible, including extension to domain-specific forms.* -#' @srrstats {G2.8} *Software should provide appropriate conversion or dispatch routines as part of initial pre-processing to ensure that all other sub-functions of a package receive inputs of a single defined class or type.* -#' @srrstats {G2.9} *Software should issue diagnostic messages for type conversion in which information is lost (such as conversion of variables from factor to character; standardisation of variable names; or removal of meta-data such as those associated with 
[`sf`-format](https://r-spatial.github.io/sf/) data) or added (such as insertion of variable or column names where none were provided).* -#' @srrstats {G2.10} *Software should ensure that extraction or filtering of single columns from tabular inputs should not presume any particular default behaviour, and should ensure all column-extraction operations behave consistently regardless of the class of tabular data used as input.* -#' @srrstats {G2.11} *Software should ensure that `data.frame`-like tabular objects which have columns which do not themselves have standard class attributes (typically, `vector`) are appropriately processed, and do not error without reason. This behaviour should be tested. Again, columns created by the [`units` package](https://github.com/r-quantities/units/) provide a good test case.* -#' @srrstats {G2.12} *Software should ensure that `data.frame`-like tabular objects which have list columns should ensure that those columns are appropriately pre-processed either through being removed, converted to equivalent vector columns where appropriate, or some other appropriate treatment such as an informative error. This behaviour should be tested.* -#' @srrstats {G2.13} *Statistical Software should implement appropriate checks for missing data as part of initial pre-processing prior to passing data to analytic algorithms.* -#' @srrstats {G2.14} *Where possible, all functions should provide options for users to specify how to handle missing (`NA`) data, with options minimally including:* -#' @srrstats {G2.14a} *error on missing data* -#' @srrstats {G2.14b} *ignore missing data with default warnings or messages issued* -#' @srrstats {G2.15} *Functions should never assume non-missingness, and should never pass data with potential missing values to any base routines with default `na.rm = FALSE`-type parameters (such as [`mean()`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/mean.html), [`sd()`](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/sd.html) or [`cor()`](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/cor.html)).* -#' @srrstats {G3.0} *Statistical software should never compare floating point numbers for equality. All numeric equality comparisons should either ensure that they are made between integers, or use appropriate tolerances for approximate equality.* -#' @srrstats {G3.1} *Statistical software which relies on covariance calculations should enable users to choose between different algorithms for calculating covariances, and should not rely solely on covariances from the `stats::cov` function.* -#' @srrstats {G3.1a} *The ability to use arbitrarily specified covariance methods should be documented (typically in examples or vignettes).* -#' @srrstats {G5.2a} *Every message produced within R code by `stop()`, `warning()`, `message()`, or equivalent should be unique* -#' @srrstats {G5.4a} *For new methods, it can be difficult to separate out correctness of the method from the correctness of the implementation, as there may not be reference for comparison. 
In this case, testing may be implemented against simple, trivial cases or against multiple implementations such as an initial R implementation compared with results from a C/C++ implementation.* -#' @srrstats {G5.4c} *Where applicable, stored values may be drawn from published paper outputs when applicable and where code from original implementations is not available* -#' @srrstats {RE4.8} *Response variables, and associated "metadata" where applicable.* -#' @srrstats {RE5.0} *Scaling relationships between sizes of input data (numbers of observations, with potential extension to numbers of variables/columns) and speed of algorithm.* +#' @srrstats {G2.1} Implement assertions on types of inputs (see the initial point on nomenclature above). +#' @srrstats {G2.1a} Provide explicit secondary documentation of expectations on data types of all vector inputs. +#' @srrstats {G2.2} Appropriately prohibit or restrict submission of multivariate input to parameters expected to be univariate. +#' @srrstats {G2.4} Provide appropriate mechanisms to convert between different data types, potentially including: +#' @srrstats {G2.4a} explicit conversion to `integer` via `as.integer()` +#' @srrstats {G2.4b} explicit conversion to continuous via `as.numeric()` +#' @srrstats {G2.4c} explicit conversion to character via `as.character()` (and not `paste` or `paste0`) +#' @srrstats {G2.4d} explicit conversion to factor via `as.factor()` +#' @srrstats {G2.4e} explicit conversion from factor via `as...()` functions +#' @srrstats {G2.5} Where inputs are expected to be of `factor` type, secondary documentation should explicitly state whether these should be `ordered` or not, and those inputs should provide appropriate error or other routines to ensure inputs follow these expectations. +#' @srrstats {G2.6} Software which accepts one-dimensional input should ensure values are appropriately pre-processed regardless of class structures. +#' @srrstats {G2.7} Software should accept as input as many of the above standard tabular forms as possible, including extension to domain-specific forms. +#' @srrstats {G2.8} Software should provide appropriate conversion or dispatch routines as part of initial pre-processing to ensure that all other sub-functions of a package receive inputs of a single defined class or type. +#' @srrstats {G2.9} Software should issue diagnostic messages for type conversion in which information is lost (such as conversion of variables from factor to character; standardisation of variable names; or removal of meta-data such as those associated with [`sf`-format](https://r-spatial.github.io/sf/) data) or added (such as insertion of variable or column names where none were provided). +#' @srrstats {G2.10} Software should ensure that extraction or filtering of single columns from tabular inputs should not presume any particular default behaviour, and should ensure all column-extraction operations behave consistently regardless of the class of tabular data used as input. +#' @srrstats {G2.11} Software should ensure that `data.frame`-like tabular objects which have columns which do not themselves have standard class attributes (typically, `vector`) are appropriately processed, and do not error without reason. This behaviour should be tested. Again, columns created by the [`units` package](https://github.com/r-quantities/units/) provide a good test case. 
+#' @srrstats {G2.12} Software should ensure that `data.frame`-like tabular objects which have list columns should ensure that those columns are appropriately pre-processed either through being removed, converted to equivalent vector columns where appropriate, or some other appropriate treatment such as an informative error. This behaviour should be tested. +#' @srrstats {G2.13} Statistical Software should implement appropriate checks for missing data as part of initial pre-processing prior to passing data to analytic algorithms. +#' @srrstats {G2.14} Where possible, all functions should provide options for users to specify how to handle missing (`NA`) data, with options minimally including: +#' @srrstats {G2.14a} error on missing data +#' @srrstats {G2.14b} ignore missing data with default warnings or messages issued +#' @srrstats {G2.15} Functions should never assume non-missingness, and should never pass data with potential missing values to any base routines with default `na.rm = FALSE`-type parameters (such as [`mean()`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/mean.html), [`sd()`](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/sd.html) or [`cor()`](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/cor.html)). +#' @srrstats {G3.0} Statistical software should never compare floating point numbers for equality. All numeric equality comparisons should either ensure that they are made between integers, or use appropriate tolerances for approximate equality. +#' @srrstats {G3.1} Statistical software which relies on covariance calculations should enable users to choose between different algorithms for calculating covariances, and should not rely solely on covariances from the `stats::cov` function. +#' @srrstats {G3.1a} The ability to use arbitrarily specified covariance methods should be documented (typically in examples or vignettes). +#' @srrstats {G5.2a} Every message produced within R code by `stop()`, `warning()`, `message()`, or equivalent should be unique +#' @srrstats {G5.4a} For new methods, it can be difficult to separate out correctness of the method from the correctness of the implementation, as there may not be reference for comparison. In this case, testing may be implemented against simple, trivial cases or against multiple implementations such as an initial R implementation compared with results from a C/C++ implementation. +#' @srrstats {G5.4c} Where applicable, stored values may be drawn from published paper outputs when applicable and where code from original implementations is not available +#' @srrstats {RE4.8} Response variables, and associated "metadata" where applicable. +#' @srrstats {RE5.0} Scaling relationships between sizes of input data (numbers of observations, with potential extension to numbers of variables/columns) and speed of algorithm. #' @noRd NULL @@ -44,23 +44,25 @@ NULL #' to `@srrstatsNA`, and placed together in this block, along with explanations #' for why each of these standards have been deemed not applicable. #' (These comments may also be deleted at any time.) 
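# --- Editor's note (illustrative, not package code) --------------------------
# A one-line reminder of the tolerance-based comparison that {G3.0} above asks
# for in place of exact equality of doubles.
0.1 + 0.2 == 0.3                                     # FALSE
isTRUE(all.equal(0.1 + 0.2, 0.3))                    # TRUE
abs((0.1 + 0.2) - 0.3) < sqrt(.Machine$double.eps)   # TRUE
# ------------------------------------------------------------------------------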
-#' @srrstatsNA {G2.14c} *replace missing data with appropriately imputed values*
-#' @srrstatsNA {G2.16} *All functions should also provide options to handle undefined values (e.g., `NaN`, `Inf` and `-Inf`), including potentially ignoring or removing such values.*
-#' @srrstatsNA {G4.0} *Statistical Software which enables outputs to be written to local files should parse parameters specifying file names to ensure appropriate file suffices are automatically generated where not provided.*
-#' @srrstatsNA {G5.3} *For functions which are expected to return objects containing no missing (`NA`) or undefined (`NaN`, `Inf`) values, the absence of any such values in return objects should be explicitly tested.*
-#' @srrstatsNA {G5.4} **Correctness tests** *to test that statistical algorithms produce expected results to some fixed test data sets (potentially through comparisons using binding frameworks such as [RStata](https://github.com/lbraglia/RStata)).*
-#' @srrstatsNA {G5.5} *Correctness tests should be run with a fixed random seed*
-#' @srrstatsNA {G5.9b} *Running under different random seeds or initial conditions does not meaningfully change results*
-#' @srrstatsNA {G5.11a} *When any downloads of additional data necessary for extended tests fail, the tests themselves should not fail, rather be skipped and implicitly succeed with an appropriate diagnostic message.*
-#' @srrstatsNA {RE2.2} *Regression Software should provide different options for processing missing values in predictor and response data. For example, it should be possible to fit a model with no missing predictor data in order to generate values for all associated response points, even where submitted response values may be missing.*
-#' @srrstatsNA {RE4.1} *Regression Software may enable an ability to generate a model object without actually fitting values. This may be useful for controlling batch processing of computationally intensive fitting algorithms.*
-#' @srrstatsNA {RE4.16} *Regression Software which models distinct responses for different categorical groups should include the ability to submit new groups to `predict()` methods.*
-#' @srrstatsNA {RE4.14} *Where possible, values should also be provided for extrapolation or forecast *errors*.*
-#' @srrstatsNA {RE4.15} *Sufficient documentation and/or testing should be provided to demonstrate that forecast errors, confidence intervals, or equivalent values increase with forecast horizons.*
-#' @srrstatsNA {RE4.17} *Model objects returned by Regression Software should implement or appropriately extend a default `print` method which provides an on-screen summary of model (input) parameters and (output) coefficients.*
-#' @srrstatsNA {RE6.0} *Model objects returned by Regression Software (see* **RE4***) should have default `plot` methods, either through explicit implementation, extension of methods for existing model objects, or through ensuring default methods work appropriately.*
-#' @srrstatsNA {RE6.1} *Where the default `plot` method is **NOT** a generic `plot` method dispatched on the class of return objects (that is, through an S3-type `plot.` function or equivalent), that method dispatch (or equivalent) should nevertheless exist in order to explicitly direct users to the appropriate function.*
-#' @srrstatsNA {RE6.2} *The default `plot` method should produce a plot of the `fitted` values of the model, with optional visualisation of confidence intervals or equivalent.*
-#' @srrstatsNA {RE6.3} *Where a model object is used to generate a forecast (for example, through a `predict()` method), the default `plot` method should provide clear visual distinction between modelled (interpolated) and forecast (extrapolated) values.*
+#' @srrstatsNA {G2.14c} replace missing data with appropriately imputed values
+#' @srrstatsNA {G2.16} All functions should also provide options to handle undefined values (e.g., `NaN`, `Inf` and `-Inf`), including potentially ignoring or removing such values.
+#' @srrstatsNA {G4.0} Statistical Software which enables outputs to be written to local files should parse parameters specifying file names to ensure appropriate file suffices are automatically generated where not provided.
+#' @srrstatsNA {G5.3} For functions which are expected to return objects containing no missing (`NA`) or undefined (`NaN`, `Inf`) values, the absence of any such values in return objects should be explicitly tested.
+#' @srrstatsNA {G5.4} Correctness tests to test that statistical algorithms produce expected results to some fixed test data sets (potentially through comparisons using binding frameworks such as [RStata](https://github.com/lbraglia/RStata)).
+#' @srrstatsNA {G5.5} Correctness tests should be run with a fixed random seed
+#' @srrstatsNA {G5.9b} Running under different random seeds or initial conditions does not meaningfully change results
+#' @srrstatsNA {G5.10} Extended tests should included and run under a common framework with other tests but be switched on by flags such as a `_EXTENDED_TESTS="true"` environment variable. The extended tests can then be run automatically by GitHub Actions, for example by adding the flag to the `env` section of the workflow.
+#' @srrstatsNA {G5.11} Where extended tests require large data sets or other assets, these should be provided for downloading and fetched as part of the testing workflow.
+#' @srrstatsNA {G5.11a} When any downloads of additional data necessary for extended tests fail, the tests themselves should not fail, rather be skipped and implicitly succeed with an appropriate diagnostic message.
+#' @srrstatsNA {RE2.2} Regression Software should provide different options for processing missing values in predictor and response data. For example, it should be possible to fit a model with no missing predictor data in order to generate values for all associated response points, even where submitted response values may be missing.
+#' @srrstatsNA {RE4.1} Regression Software may enable an ability to generate a model object without actually fitting values. This may be useful for controlling batch processing of computationally intensive fitting algorithms.
+#' @srrstatsNA {RE4.16} Regression Software which models distinct responses for different categorical groups should include the ability to submit new groups to `predict()` methods.
+#' @srrstatsNA {RE4.14} Where possible, values should also be provided for extrapolation or forecast errors.
+#' @srrstatsNA {RE4.15} Sufficient documentation and/or testing should be provided to demonstrate that forecast errors, confidence intervals, or equivalent values increase with forecast horizons.
+#' @srrstatsNA {RE4.17} Model objects returned by Regression Software should implement or appropriately extend a default `print` method which provides an on-screen summary of model (input) parameters and (output) coefficients.
+#' @srrstatsNA {RE6.0} Model objects returned by Regression Software (see RE4) should have default `plot` methods, either through explicit implementation, extension of methods for existing model objects, or through ensuring default methods work appropriately.
+#' @srrstatsNA {RE6.1} Where the default `plot` method is NOT a generic `plot` method dispatched on the class of return objects (that is, through an S3-type `plot.` function or equivalent), that method dispatch (or equivalent) should nevertheless exist in order to explicitly direct users to the appropriate function.
+#' @srrstatsNA {RE6.2} The default `plot` method should produce a plot of the `fitted` values of the model, with optional visualisation of confidence intervals or equivalent.
+#' @srrstatsNA {RE6.3} Where a model object is used to generate a forecast (for example, through a `predict()` method), the default `plot` method should provide clear visual distinction between modelled (interpolated) and forecast (extrapolated) values.
 #' @noRd
 NULL
diff --git a/tests/testthat/test-deterministic.R b/tests/testthat/test-deterministic.R
new file mode 100644
index 0000000..12aed32
--- /dev/null
+++ b/tests/testthat/test-deterministic.R
@@ -0,0 +1,25 @@
+#' srr_stats (tests)
+#' @srrstatsVerbose TRUE
+#' @srrstats {RE7.0} Test exact relationships between predictors.
+#' @srrstats {RE7.0a} Reject perfectly noiseless input data.
+#' @srrstats {RE7.1} Tests exact relationships between predictor and response.
+#' @noRd
NULL
+
+test_that("deterministic relations", {
+  set.seed(123)
+  d <- data.frame(
+    y = rnorm(100),
+    f = 1
+  )
+
+  d$x <- 2 * d$y
+  d$x2 <- 2 * d$y
+
+  # with x = 2 * y the relationship is perfectly noiseless (the exact solution
+  # is beta = 0.5), so the fit must be rejected rather than solved
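+  # note: x2 is an exact copy of x, so y ~ x + x2 below also has a
+  # rank-deficient design matrix; both calls are expected to stop with the
+  # same "Linear dependent terms" error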
+  expect_error(coef(feglm(y ~ x | f, d)), "Linear dependent terms")
+
+  # error because we check linear dependency in the data
+  expect_error(feglm(y ~ x + x2 | f, d), "Linear dependent terms")
+})
diff --git a/tests/testthat/test-errors.R b/tests/testthat/test-errors.R
index 91a961f..921f5f0 100644
--- a/tests/testthat/test-errors.R
+++ b/tests/testthat/test-errors.R
@@ -1,4 +1,19 @@
-test_that("error conditions", {
+#' srr_stats (tests)
+#' @srrstatsVerbose TRUE
+#' @srrstats {G5.2b} Here we cover error conditions as broadly as possible,
+#' and we write intentionally bad examples to compare the result with expected
+#' values and inputs.
+#' @srrstats {G5.8a} We test for errors created when passing zero-length data
+#' frames or NULL as the data argument.
+#' @srrstats {G5.8c} We test dependency in the data by checking the range of
+#' the data and the number of observations (i.e., determining if the data is
+#' linearly dependent).
+#' @srrstats {G5.8d} Data with more columns than rows (i.e., linearly dependent)
+#' produces an error.
+#' @noRd
+NULL
+
+test_that("error conditions in APEs", {
   trade_panel_2002 <- trade_panel[trade_panel$year == 2002, ]
   trade_panel_2002$trade_100 <- ifelse(trade_panel_2002$trade >= 100, 1, 0)
   trade_panel_2002$trade_200_100 <- as.factor(ifelse(trade_panel_2002$trade >=
@@ -7,11 +22,8 @@ test_that("error conditions", {
     -1
   )
-
-  # APEs ----
-
-  # TODO: test n.pop argument and the rest of apes()
-
+  # no model
+
   expect_error(apes(), "specified")

   expect_error(
@@ -19,11 +31,15 @@ test_that("error conditions", {
     "non-'feglm'"
   )

+  # using APEs with Poisson
+
   expect_error(
     apes(fepoisson(trade ~ log_dist | rta, data = trade_panel_2002)),
     "binary choice"
   )

+  # not using two-way fixed effects
+
   expect_error(
     apes(
       feglm(
@@ -35,6 +51,8 @@ test_that("error conditions", {
     ),
     "two-way"
   )

+  # not using three-way fixed effects
+
   expect_error(
     apes(
       feglm(
@@ -46,7 +64,33 @@ test_that("error conditions", {
     ),
     "three-way"
   )

-  # GLMs ----
+  # wrong population size
+
+  trade_panel_2002$tradebin <- ifelse(trade_panel_2002$trade > 100, 1L, 0L)
+
+  expect_error(
+    apes(
+      feglm(
+        tradebin ~ lang | year,
+        data = trade_panel_2002,
+        family = binomial()
+      ),
+      # n_pop = 4692
+      n_pop = NA
+    ), "missing value"
+  )
+})
+
+test_that("error conditions in GLMs", {
+  trade_panel_2002 <- trade_panel[trade_panel$year == 2002, ]
+  trade_panel_2002$trade_100 <- ifelse(trade_panel_2002$trade >= 100, 1, 0)
+  trade_panel_2002$trade_200_100 <- as.factor(ifelse(trade_panel_2002$trade >=
+    200, 1, ifelse(trade_panel_2002$trade >= 100, 0.5, 0)))
+  trade_panel_2002$trade_1_minus1 <- ifelse(trade_panel_2002$trade >= 100, 1,
+    -1
+  )
+
+  # 0 rows in the data

   expect_error(
     fepoisson(
@@ -56,6 +100,8 @@ test_that("error conditions", {
     "zero observations"
   )

+  # incorrect deviance tolerance
+
   expect_error(
     fepoisson(
       trade ~ log_dist | rta,
@@ -65,6 +111,8 @@ test_that("error conditions", {
     "greater than zero"
   )

+  # bad number of iterations
+
   expect_error(
     fepoisson(
       trade ~ log_dist | rta,
@@ -74,6 +122,8 @@ test_that("error conditions", {
     "at least one"
   )

+  # bad number of iterations
+
   expect_error(
     fepoisson(
       trade ~ log_dist | rta,
@@ -82,20 +132,25 @@ test_that("error conditions", {
     ),
     "at least one"
   )
+})

-  # Helpers ----
+test_that("error conditions in helpers", {
+  trade_panel_2002 <- trade_panel[trade_panel$year == 2002, ]
+  trade_panel_2002$trade_100 <- ifelse(trade_panel_2002$trade >= 100, 1, 0)
+  trade_panel_2002$trade_200_100 <- as.factor(ifelse(trade_panel_2002$trade >=
+    200, 1, ifelse(trade_panel_2002$trade >= 100, 0.5, 0)))
+  trade_panel_2002$trade_1_minus1 <- ifelse(trade_panel_2002$trade >= 100, 1,
+    -1
+  )

-  # TODO:
-  # weights
-  # linear dependence
-  # init.theta
-  # beta.start
-  # eta.start
+  # no formula

   expect_error(
     feglm(data = trade_panel_2002),
     "'formula' has to be specified"
   )

+  # incorrect formula
+
   expect_error(
     feglm(
       formula = "a ~ b",
@@ -104,6 +159,8 @@ test_that("error conditions", {
     "'formula' has to be of class 'formula'"
   )

+  # null data
+
   expect_error(
     fepoisson(
       trade ~ log_dist | rta,
@@ -112,6 +169,8 @@ test_that("error conditions", {
     "'data' has to be specified"
   )

+  # empty data
+
   expect_error(
     fepoisson(
       trade ~ log_dist | rta,
@@ -120,6 +179,8 @@ test_that("error conditions", {
     "length zero"
   )

+  # incorrect control
+
   expect_error(
     fepoisson(
       trade ~ log_dist | rta,
@@ -129,6 +190,8 @@ test_that("error conditions", {
     "'control' has to be a list"
   )

+  # incorrect family
+
   expect_error(
     feglm(
       trade ~ log_dist | rta,
@@ -138,6 +201,8 @@ test_that("error conditions", {
     "'family' has to be of class family"
   )

+  # we have the cluster estimator to do the same as quasi-Poisson
+
   expect_error(
     feglm(
       trade ~ log_dist | rta,
@@ -147,6 +212,8 @@ test_that("error conditions", {
     "Quasi-variants of 'family' are not supported"
   )

+  # fitting a negative binomial model with the GLM function
+
   expect_error(
     feglm(
       trade ~ log_dist | rta,
@@ -156,6 +223,8 @@ test_that("error conditions", {
     "use 'fenegbin' instead"
   )

+  # not adding fixed effects
+
   expect_error(
     fepoisson(
       trade ~ log_dist,
@@ -164,6 +233,8 @@ test_that("error conditions", {
     "'formula' incorrectly specified"
   )

+  # incorrect data + link = bad response
+
   expect_error(
     feglm(
       trade ~ log_dist | rta,
@@ -173,6 +244,8 @@ test_that("error conditions", {
     "response has to be within the unit interval"
   )

+  # incorrect data + link = bad response
+
   expect_error(
     feglm(
       trade_200_100 ~ log_dist | rta,
@@ -182,6 +255,8 @@ test_that("error conditions", {
     "response has to be binary"
   )

+  # incorrect data + link = bad response
+
   expect_error(
     feglm(
       trade_1_minus1 ~ log_dist | rta,
@@ -191,6 +266,8 @@ test_that("error conditions", {
     "response has to be strictly positive"
   )

+  # incorrect data + link = bad response
+
   expect_error(
     feglm(
       trade_1_minus1 ~ log_dist | rta,
@@ -199,4 +276,50 @@ test_that("error conditions", {
     ),
     "response has to be strictly positive"
   )
+
+  # incorrect beta
+
+  expect_error(
+    feglm(
+      trade ~ log_dist | rta,
+      data = trade_panel_2002,
+      beta_start = NA # not allowed
+    ),
+    "Invalid input type"
+  )
+
+  # incorrect eta
+
+  expect_error(
+    feglm(
+      trade ~ log_dist | rta,
+      data = trade_panel_2002,
+      eta_start = rep(NA, nrow(trade_panel_2002))
+    ),
+    "Invalid input type"
+  )
+
+  # incorrect theta
+
+  expect_error(
+    fenegbin(
+      trade ~ log_dist | rta,
+      data = trade_panel_2002,
+      init_theta = -1 # not allowed
+    ),
+    "has to be strictly positive"
+  )
+
+  # intentionally break the data with unusable weights
+
+  trade_panel_2002$bad_weights <- NA
+
+  expect_error(
+    feglm(
+      trade ~ log_dist | rta,
+      data = trade_panel_2002,
+      weights = "bad_weights"
+    ),
+    "Linear dependent terms detected"
+  )
 })
diff --git a/tests/testthat/test-feglm.R b/tests/testthat/test-feglm.R
index 2abda0c..7b76506 100644
--- a/tests/testthat/test-feglm.R
+++ b/tests/testthat/test-feglm.R
@@ -2,24 +2,14 @@
 #'
 #' @srrstatsVerbose TRUE
 #'
-#' @srrstats {G5.0} *Where applicable or practicable, tests should use standard data sets with known properties (for example, the [NIST Standard Reference Datasets](https://www.itl.nist.gov/div898/strd/), or data sets provided by other widely-used R packages).*
-#' @srrstats {G5.1} *Data sets created within, and used to test, a package should be exported (or otherwise made generally available) so that users can confirm tests and run examples.*
-#' @srrstats {G5.2} *Appropriate error and warning behaviour of all functions should be explicitly demonstrated through tests. In particular,*
-#' @srrstats {G5.2b} *Explicit tests should demonstrate conditions which trigger every one of those messages, and should compare the result with expected values.*
-#' @srrstats {G5.4b} *For new implementations of existing methods, correctness tests should include tests against previous implementations. Such testing may explicitly call those implementations in testing, preferably from fixed-versions of other software, or use stored outputs from those where that is not possible.*
+#' @srrstats {G5.0} The tests use the widely known mtcars data set. It has few
+#' observations, and it is easy to compare the results with the base R
+#' functions.
+#' @srrstats {G5.4b} We determine correctness for GLMs by comparison, checking
+#' the estimates versus base R and hardcoded values obtained with Alpaca
+#' (Stammann, 2018).
 #' @srrstats {G5.8} **Edge condition tests** *to test that these conditions produce expected behaviour such as clear warnings or errors when confronted with data with extreme properties including but not limited to:*
-#' @srrstats {G5.8a} *Zero-length data*
 #' @srrstats {G5.8b} *Data of unsupported types (e.g., character or complex numbers in for functions designed only for numeric data)*
-#' @srrstats {G5.8c} *Data with all-`NA` fields or columns or all identical fields or columns*
-#' @srrstats {G5.8d} *Data outside the scope of the algorithm (for example, data with more fields (columns) than observations (rows) for some regression algorithms)*
-#' @srrstats {G5.9} **Noise susceptibility tests** *Packages should test for expected stochastic behaviour, such as through the following conditions:*
-#' @srrstats {G5.9a} *Adding trivial noise (for example, at the scale of `.Machine$double.eps`) to data does not meaningfully change results*
-#' @srrstats {G5.10} *Extended tests should included and run under a common framework with other tests but be switched on by flags such as as a `_EXTENDED_TESTS="true"` environment variable.* - The extended tests can be then run automatically by GitHub Actions for example by adding the following to the `env` section of the workflow:
-#' @srrstats {G5.11} *Where extended tests require large data sets or other assets, these should be provided for downloading and fetched as part of the testing workflow.*
-#' @srrstats {RE7.0} *Tests with noiseless, exact relationships between predictor (independent) data.*
-#' @srrstats {RE7.0a} In particular, these tests should confirm ability to reject perfectly noiseless input data.
-#' @srrstats {RE7.1} *Tests with noiseless, exact relationships between predictor (independent) and response (dependent) data.*
-#' @srrstats {RE7.1a} *In particular, these tests should confirm that model fitting is at least as fast or (preferably) faster than testing with equivalent noisy data (see RE2.4b).*
 #' @srrstats {RE7.2} Demonstrate that output objects retain aspects of input data such as row or case names (see **RE1.3**).
 #' @srrstats {RE7.3} Demonstrate and test expected behaviour when objects returned from regression software are submitted to the accessor methods of **RE4.2**--**RE4.7**.
 #' @srrstats {RE7.4} Extending directly from **RE4.15**, where appropriate, tests should demonstrate and confirm that forecast errors, confidence intervals, or equivalent values increase with forecast horizons.
@@ -30,7 +20,7 @@ NULL

 test_that("feglm is similar to glm", {
   # Gaussian ----

-  # see felm
+  # see test-felm.R

   # Poisson
diff --git a/tests/testthat/test-felm.R b/tests/testthat/test-felm.R
index 8ac83d9..e216082 100644
--- a/tests/testthat/test-felm.R
+++ b/tests/testthat/test-felm.R
@@ -1,30 +1,6 @@
 #' srr_stats (tests)
-#'
 #' @srrstatsVerbose TRUE
-#'
-#' @srrstats {G5.0} *Where applicable or practicable, tests should use standard data sets with known properties (for example, the [NIST Standard Reference Datasets](https://www.itl.nist.gov/div898/strd/), or data sets provided by other widely-used R packages).*
-#' @srrstats {G5.1} *Data sets created within, and used to test, a package should be exported (or otherwise made generally available) so that users can confirm tests and run examples.*
-#' @srrstats {G5.2} *Appropriate error and warning behaviour of all functions should be explicitly demonstrated through tests. In particular,*
-#' @srrstats {G5.2b} *Explicit tests should demonstrate conditions which trigger every one of those messages, and should compare the result with expected values.*
-#' @srrstats {G5.4b} *For new implementations of existing methods, correctness tests should include tests against previous implementations. Such testing may explicitly call those implementations in testing, preferably from fixed-versions of other software, or use stored outputs from those where that is not possible.*
-#' @srrstats {G5.7} **Algorithm performance tests** *to test that implementation performs as expected as properties of data change. For instance, a test may show that parameters approach correct estimates within tolerance as data size increases, or that convergence times decrease for higher convergence thresholds.*
-#' @srrstats {G5.8} **Edge condition tests** *to test that these conditions produce expected behaviour such as clear warnings or errors when confronted with data with extreme properties including but not limited to:*
-#' @srrstats {G5.8a} *Zero-length data*
-#' @srrstats {G5.8b} *Data of unsupported types (e.g., character or complex numbers in for functions designed only for numeric data)*
-#' @srrstats {G5.8c} *Data with all-`NA` fields or columns or all identical fields or columns*
-#' @srrstats {G5.8d} *Data outside the scope of the algorithm (for example, data with more fields (columns) than observations (rows) for some regression algorithms)*
-#' @srrstats {G5.9} **Noise susceptibility tests** *Packages should test for expected stochastic behaviour, such as through the following conditions:*
-#' @srrstats {G5.9a} *Adding trivial noise (for example, at the scale of `.Machine$double.eps`) to data does not meaningfully change results*
-#' @srrstats {G5.10} *Extended tests should included and run under a common framework with other tests but be switched on by flags such as as a `_EXTENDED_TESTS="true"` environment variable.* - The extended tests can be then run automatically by GitHub Actions for example by adding the following to the `env` section of the workflow:
-#' @srrstats {G5.11} *Where extended tests require large data sets or other assets, these should be provided for downloading and fetched as part of the testing workflow.*
-#' @srrstats {RE7.0} *Tests with noiseless, exact relationships between predictor (independent) data.*
-#' @srrstats {RE7.0a} In particular, these tests should confirm ability to reject perfectly noiseless input data.
-#' @srrstats {RE7.1} *Tests with noiseless, exact relationships between predictor (independent) and response (dependent) data.*
-#' @srrstats {RE7.1a} *In particular, these tests should confirm that model fitting is at least as fast or (preferably) faster than testing with equivalent noisy data (see RE2.4b).*
-#' @srrstats {RE7.2} Demonstrate that output objects retain aspects of input data such as row or case names (see **RE1.3**).
-#' @srrstats {RE7.3} Demonstrate and test expected behaviour when objects returned from regression software are submitted to the accessor methods of **RE4.2**--**RE4.7**.
-#' @srrstats {RE7.4} Extending directly from **RE4.15**, where appropriate, tests should demonstrate and confirm that forecast errors, confidence intervals, or equivalent values increase with forecast horizons.
-#'
+#' @srrstats {G5.4b} See test-feglm.R
 #' @noRd
 NULL
@@ -75,21 +51,6 @@ test_that("felm works", {
   expect_equal(round(coef(m1), 2), round(coef(m2)[c(2, 3)], 2))
 })

-test_that("felm works with perfect relationships", {
-  set.seed(200100)
-  d <- data.frame(
-    y = rnorm(100),
-    f = factor(sample(1:2, 1000, replace = TRUE))
-  )
-  d$x <- 2 * d$y
-
-  fit <- felm(y ~ x | f, data = d)
-  s1 <- summary(fit)
-  expect_equal(s1$r.squared, 1)
-  expect_equal(s1$adj.r.squared, 1)
-  expect_equal(s1$cm[4], 0)
-})
-
 test_that("felm time is the same adding noise to the data", {
   mtcars2 <- mtcars[, c("mpg", "wt", "cyl")]
   set.seed(200100)
diff --git a/tests/testthat/test-fenegbin.R b/tests/testthat/test-fenegbin.R
index cd5976c..9cdcc5b 100644
--- a/tests/testthat/test-fenegbin.R
+++ b/tests/testthat/test-fenegbin.R
@@ -1,76 +1,21 @@
 #' srr_stats (tests)
-#'
 #' @srrstatsVerbose TRUE
-#'
-#' @srrstats {G5.12} *Any conditions necessary to run extended tests such as platform requirements, memory, expected runtime, and artefacts produced that may need manual inspection, should be described in developer documentation such as a `CONTRIBUTING.md` or `tests/README.md` file.*
-#'
+#' @srrstats {G5.4b} See test-feglm.R
 #' @noRd
 NULL

 test_that("fenegbin is similar to fixest", {
-  # use one year or otherwise devtools::check() gives a warning about the time
-  # it takes
-  trade_panel_2006 <- trade_panel[trade_panel$year == 2006, ]
+  mod <- fenegbin(mpg ~ wt | cyl, mtcars)

-  mod <- fenegbin(
-    trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-    trade_panel_2006
+  mod_base <- glm(
+    mpg ~ wt + as.factor(cyl),
+    mtcars,
+    family = quasipoisson(link = "log")
   )

-  # the vector comes from:
-  # mod_fixest <- fixest::fenegbin(
-  #   trade ~ log_dist + lang + cntg + clny | exp_year + imp_year,
-  #   trade_panel_2006,
-  #   cluster = ~pair
-  # )
+  coef_dist_base <- coef(mod_base)[2]

-  summary_mod <- summary(mod, type = "clustered")
+  dist_variation <- abs((coef(mod)[1] - coef_dist_base) / coef(mod)[1])

-  # the vector comes from:
-  # summary_mod_fixest <- summary(mod_fixest)
-  # summary_mod_fixest$coeftable[,2][1:4]
-  summary_mod_fixest <- c(0.03234993, 0.07188846, 0.14751949, 0.12471723)
-
-  expect_equal(
-    unname(round(summary_mod$cm[, 2] - summary_mod_fixest, 1)),
-    rep(0, 4)
-  )
+  expect_lt(dist_variation, 0.05)
 })
-
-# test_that("fenegbin time is the same adding noise to the data", {
-#   trade_panel2 <- trade_panel
-#   set.seed(200100)
-#   trade_panel2$trade2 <- trade_panel$trade + rbinom(nrow(trade_panel2), 1, 0.5) *
-#     .Machine$double.eps
-#   m1 <- fenegbin(
-#     trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#     trade_panel2
-#   )
-#   m2 <- fenegbin(
-#     trade2 ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#     trade_panel2
-#   )
-#   expect_equal(coef(m1), coef(m2))
-#   expect_equal(fixed_effects(m1), fixed_effects(m2))
-
-#   t1 <- rep(NA, 10)
-#   t2 <- rep(NA, 10)
-#   for (i in 1:10) {
-#     a <- Sys.time()
-#     m1 <- fenegbin(
-#       trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#       trade_panel2
-#     )
-#     b <- Sys.time()
-#     t1[i] <- b - a
-
-#     a <- Sys.time()
-#     m2 <- fenegbin(
-#       trade2 ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#       trade_panel2
-#     )
-#     b <- Sys.time()
-#     t2[i] <- b - a
-#   }
-#   expect_lte(abs(median(t1) - median(t2)), 0.05)
-# })
diff --git a/tests/testthat/test-fepoisson.R b/tests/testthat/test-fepoisson.R
index e67ec29..a8055e4 100644
--- a/tests/testthat/test-fepoisson.R
+++ b/tests/testthat/test-fepoisson.R
@@ -1,39 +1,23 @@
 #' srr_stats (tests)
-#'
 #' @srrstatsVerbose TRUE
-#'
-#' @srrstats {G5.12} *Any conditions necessary to run extended tests such as platform requirements, memory, expected runtime, and artefacts produced that may need manual inspection, should be described in developer documentation such as a `CONTRIBUTING.md` or `tests/README.md` file.*
-#'
+#' @srrstats {G5.4b} See test-feglm.R
 #' @noRd
 NULL

 test_that("fepoisson is similar to fixest", {
-  mod <- fepoisson(
-    trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-    trade_panel
-  )
-
-  # mod_fixest <- fixest::fepois(
-  #   trade ~ log_dist + lang + cntg + clny | exp_year + imp_year,
-  #   trade_panel,
-  #   cluster = ~pair
-  # )
+  mod <- fepoisson(mpg ~ wt | cyl | am, mtcars)

-  coef_mod_fixest <- c(-0.8409273, 0.2474765, 0.4374432, -0.2224899)
-
-  expect_equal(unname(round(coef(mod) - coef_mod_fixest, 5)), rep(0, 4))
+  mod_base <- glm(
+    mpg ~ wt + as.factor(cyl),
+    mtcars,
+    family = quasipoisson(link = "log")
+  )

-  summary_mod <- summary(mod, type = "clustered")
+  coef_dist_base <- coef(mod_base)[2]

-  # the vector comes from:
-  # summary_mod_fixest <- summary(mod_fixest);
-  # summary_mod_fixest$coeftable[,2]
-  summary_mod_fixest <- c(0.02656441, 0.06322979, 0.06825364, 0.09380935)
+  dist_variation <- abs((coef(mod)[1] - coef_dist_base) / coef(mod)[1])

-  expect_equal(
-    unname(round(summary_mod$cm[, 2] - summary_mod_fixest, 2)),
-    rep(0, 4)
-  )
+  expect_lt(dist_variation, 0.05)

   expect_output(print(mod))

@@ -41,88 +25,71 @@ test_that("fepoisson is similar to fixest", {
   fes <- fixed_effects(mod)
   n <- unname(mod[["nobs"]]["nobs"])

-  expect_equal(length(fes), 2)
+  expect_equal(length(fes), 1)
   expect_equal(length(fitted(mod)), n)
   expect_equal(length(predict(mod)), n)
-  expect_equal(length(coef(mod)), 4)
-  expect_equal(length(fes), 2)
-  expect_equal(round(fes[["exp_year"]][1:3], 3), c(10.195, 11.081, 11.260))
-  expect_equal(round(fes[["imp_year"]][1:3], 3), c(0.226, -0.254, 1.115))
+  expect_equal(length(coef(mod)), 1)
+  expect_equal(length(fes), 1)
+
+  expect_equal(
+    round(fes[["cyl"]][1], 2),
+    unname(round(coef(glm(mpg ~ wt + as.factor(cyl), mtcars, family = quasipoisson(link = "log")))[1], 2))
+  )

   smod <- summary(mod)

-  expect_equal(length(coef(smod)[, 1]), 4)
+  expect_equal(length(coef(smod)[, 1]), 1)

   expect_output(summary_formula_(smod))
   expect_output(summary_family_(smod))
   expect_output(summary_estimates_(smod, 3))
   expect_output(summary_r2_(smod, 3))
   expect_output(summary_nobs_(smod))
   expect_output(summary_fisher_(smod))
+})

-  trade_panel_2 <- trade_panel[trade_panel$year %in% c(2002, 2006), ]
-
-  if (identical(Sys.info()[["user"]], "pacha")) {
-    t_fepoisson <- rep(0, 10)
-
-    t1 <- Sys.time()
-    fit <- fepoisson(
-      trade ~ log_dist + lang + cntg + clny | exp_year + imp_year,
-      trade_panel_2
-    )
-    t2 <- Sys.time()
-    t_fepoisson <- t2 - t1
-
-    t_glm <- rep(0, 10)
-
-    t1 <- Sys.time()
-    fit <- suppressWarnings(glm(
-      trade ~ log_dist + lang + cntg + clny + as.factor(exp_year) +
-        as.factor(imp_year),
-      trade_panel_2,
-      family = poisson(link = "log")
-    ))
-    t2 <- Sys.time()
+#' srr_stats (tests)
+#'
+#' @srrstatsVerbose TRUE
+#'
+#' @srrstats {G5.9a} Here we add censored white noise (i.e., y cannot be < 0
+#' in a Poisson model). The noise is rnorm * .Machine$double.eps to check that
+#' the slopes do not change. See test-feglm.R.
+#' @srrstats {RE7.1a} Model fitting is at least as fast or (preferably) faster
+#' than testing with equivalent noisy data (see RE2.4b).
+#'
+#' @noRd
+NULL

-    t_glm <- t2 - t1
+test_that("fepoisson time is the same adding noise to the data", {
+  set.seed(123)
+  d <- data.frame(
+    x = rnorm(1000),
+    y = rpois(1000, 1),
+    f = factor(rep(1:10, 100))
+  )

-    expect_lte(t_fepoisson, t_glm)
+  set.seed(123)
+  d$y2 <- d$y + pmax(rnorm(nrow(d)), 0) * .Machine$double.eps
+
+  m1 <- fepoisson(y ~ x | f, d)
+  m2 <- fepoisson(y2 ~ x | f, d)
+  expect_equal(coef(m1), coef(m2))
+  expect_equal(fixed_effects(m1), fixed_effects(m2))
+
+  t1 <- rep(NA, 10)
+  t2 <- rep(NA, 10)
+  for (i in 1:10) {
+    a <- Sys.time()
+    m1 <- fepoisson(y ~ x | f, d)
+    b <- Sys.time()
+    t1[i] <- b - a
+
+    a <- Sys.time()
+    m2 <- fepoisson(y2 ~ x | f, d)
+    b <- Sys.time()
+    t2[i] <- b - a
   }
+  expect_gt(abs(median(t1) / median(t2)), 0.9)
+  expect_lt(abs(median(t1) / median(t2)), 1)
+  expect_lt(median(t1), median(t2))
 })
-
-# test_that("fepoisson time is the same adding noise to the data", {
-#   trade_panel2 <- trade_panel
-#   set.seed(200100)
-#   trade_panel2$trade2 <- trade_panel$trade + rbinom(nrow(trade_panel2), 1, 0.5) *
-#     .Machine$double.eps
-#   m1 <- fepoisson(
-#     trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#     trade_panel2
-#   )
-#   m2 <- fepoisson(
-#     trade2 ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#     trade_panel2
-#   )
-#   expect_equal(coef(m1), coef(m2))
-#   expect_equal(fixed_effects(m1), fixed_effects(m2))
-
-#   t1 <- rep(NA, 10)
-#   t2 <- rep(NA, 10)
-#   for (i in 1:10) {
-#     a <- Sys.time()
-#     m1 <- fepoisson(
-#       trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#       trade_panel2
-#     )
-#     b <- Sys.time()
-#     t1[i] <- b - a
-
-#     a <- Sys.time()
-#     m2 <- fepoisson(
-#       trade2 ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-#       trade_panel2
-#     )
-#     b <- Sys.time()
-#     t2[i] <- b - a
-#   }
-#   expect_lte(abs(median(t1) - median(t2)), 0.05)
-# })
diff --git a/tests/testthat/test-vcov.R b/tests/testthat/test-vcov.R
index d0cb7e1..e368096 100644
--- a/tests/testthat/test-vcov.R
+++ b/tests/testthat/test-vcov.R
@@ -1,20 +1,14 @@
 test_that("vcov works", {
-  m1 <- fepoisson(
-    trade ~ log_dist + lang + cntg + clny | exp_year + imp_year,
-    trade_panel
-  )
+  m1 <- fepoisson(mpg ~ wt + disp | cyl, mtcars)

-  m2 <- fepoisson(
-    trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | pair,
-    trade_panel
-  )
+  m2 <- fepoisson(mpg ~ wt + disp | cyl | carb, mtcars)

   v1 <- vcov(m1)
   v2 <- vcov(m2, type = "clustered")
-  v3 <- vcov(m1, type = "sandwich")
-  v4 <- vcov(m1, type = "outer.product")
+  v3 <- vcov(m2, type = "sandwich")
+  v4 <- vcov(m2, type = "outer.product")

-  expect_gt(norm(v2), norm(v1))
-  expect_gt(norm(v3), norm(v1))
-  expect_gt(norm(v4), norm(v1))
+  expect_gt(sum(diag(v1)), sum(diag(v2)))
+  expect_gt(sum(diag(v1)), sum(diag(v3)))
+  expect_gt(sum(diag(v1)), sum(diag(v4)))
 })
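Illustrative sketch (editorial addition, not part of the patch): the comparison pattern used throughout the rewritten tests -- a capybara fixed-effects fit checked against base R's glm() with explicit factor dummies -- can be reproduced interactively as below. It assumes capybara is installed and that the first coefficient corresponds to the wt slope, as in the tests above; the 5% tolerance mirrors their expect_lt() checks.

library(capybara) # assumed to be installed

# capybara: Poisson-type fit of mpg on wt with a cyl fixed effect
m_fe <- fepoisson(mpg ~ wt | cyl, mtcars)

# base R: the same model with the fixed effect spelled out as factor dummies
m_glm <- glm(mpg ~ wt + as.factor(cyl), mtcars, family = quasipoisson(link = "log"))

# the slope on wt should agree to within a few percent (5% in the tests above)
abs((coef(m_fe)[1] - coef(m_glm)["wt"]) / coef(m_fe)[1]) < 0.05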