diff --git a/.Rbuildignore b/.Rbuildignore index aa09cc1d..cbf22e53 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,6 @@ ^appveyor\.yml$ ^doc$ ^Meta$ +^\.github$ +^\.lintr +^docs$ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46523f94..1c1ff79a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -50,7 +50,6 @@ jobs: with: extra-packages: | any::devtools - any::covr - name: Document run: devtools::document() @@ -62,14 +61,10 @@ jobs: git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git add man/\* NAMESPACE - git commit -m "Update documentation" || echo "No changes to commit" + git commit -m "Update documentation (bot)" || echo "No changes to commit" git pull --ff-only git push origin - - name: Check package + - name: Check package with devtools run: devtools::check() shell: Rscript {0} - - - name: Check coverage - run: covr::codecov(quiet = FALSE) - shell: Rscript {0} diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml new file mode 100644 index 00000000..8a29ff60 --- /dev/null +++ b/.github/workflows/quality.yaml @@ -0,0 +1,40 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: Quality checks + +jobs: + lint-project: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: | + any::covr + any::lintr + any::styler + + - name: Styler + run: styler::style_pkg(filetype = c('R', 'Rprofile', 'Rmd'), dry='fail') + shell: Rscript {0} + + - name: Lintr + run: lintr::lint_package() + shell: Rscript {0} + + - name: Coverage + run: covr::codecov(quiet = FALSE) + shell: Rscript {0} diff --git a/.lintr b/.lintr new file mode 100644 index 00000000..2201ebb3 --- /dev/null +++ b/.lintr @@ -0,0 +1,7 @@ +error_on_lint: TRUE +linters: with_defaults( + open_curly_linter = NULL, + closed_curly_linter = NULL, + spaces_left_parentheses_linter = NULL, + seq_linter = NULL + ) diff --git a/NAMESPACE b/NAMESPACE index 68af50e8..06751647 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,18 +49,20 @@ export(min_emd_exhaustive) export(min_emd_optimise) export(min_emd_optimise_fast) export(netdis) -export(netdis.plot) export(netdis_centred_graphlet_counts) export(netdis_expected_counts) export(netdis_many_to_many) export(netdis_one_to_many) export(netdis_one_to_one) +export(netdis_plot) export(netdis_subtract_exp_counts) export(netdis_uptok) -export(netemd.plot) export(netemd_many_to_many) export(netemd_one_to_one) +export(netemd_plot) export(netemd_single_pair) +export(netemd_smooth) +export(netemd_speed_test_smooth) export(normalise_dhist_mass) export(normalise_dhist_variance) export(orbit_key) diff --git a/R/PlottingFunctions.R b/R/PlottingFunctions.R index e525dabc..60f41e6c 100644 --- a/R/PlottingFunctions.R +++ b/R/PlottingFunctions.R @@ -1,28 +1,55 @@ #' Heatmap of Netdis many-to-many comparisons #' -#' Provides a 
heatmap and dendrogram for the network comparisons via \code{pheatmap}. -#' +#' Provides a heatmap and dendrogram for the network comparisons via +#' \code{pheatmap}. +#' #' @param netdislist Default output of \code{netdis_many_to_many}. #' -#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. +#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} +#' to be used for plotting. #' -#' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). +#' @param clustering_method Clustering method as allowed in the \code{pheatmap} +#' function from the \code{pheatmap} package. The dendrogram will appear if +#' \code{docluster} is TRUE (default). #' #' @param main Title of the plot. -#' -#' @param docluster controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. -#' -#' @return Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' +#' @param docluster controls the order of the rows and columns. If TRUE +#' (default) the rows and columns will be reordered to create the dendrogram. If +#' FALSE, then only the heatmap is drawn. +#' +#' @return Provides a heatmap and dendrogram for the network comparisons via +#' \code{pheatmap}. 
#' @export -netdis.plot <- function(netdislist,whatrow=c(1,2)[2],clustering_method="ward.D",main="Nedis",docluster=TRUE){ - adjmat <- cross_comp_to_matrix(measure = netdislist$netdis[whatrow,], cross_comparison_spec = netdislist$comp_spec) +netdis_plot <- function(netdislist, + whatrow = c(1, 2)[2], + clustering_method = "ward.D", + main = "Nedis", + docluster = TRUE) { + adjmat <- cross_comp_to_matrix( + measure = netdislist$netdis[whatrow, ], + cross_comparison_spec = netdislist$comp_spec + ) vnames <- rownames(adjmat) - - legend1 <- seq(min(adjmat),max(adjmat),length.out = 5) - levels1 <- round(legend1,digits = 2) - pheatmap::pheatmap(mat = as.dist(adjmat),cluster_rows = docluster,cluster_cols = docluster,clustering_method = clustering_method,angle_col=45,main = main,treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) + + legend1 <- seq(min(adjmat), max(adjmat), length.out = 5) + levels1 <- round(legend1, digits = 2) + pheatmap::pheatmap( + mat = stats::as.dist(adjmat), + cluster_rows = docluster, + cluster_cols = docluster, + clustering_method = clustering_method, + angle_col = 45, + main = main, + treeheight_row = 80, + labels_row = vnames, + labels_col = vnames, + display_numbers = TRUE, + legend_breaks = legend1, + legend_labels = levels1 + ) } @@ -30,27 +57,52 @@ netdis.plot <- function(netdislist,whatrow=c(1,2)[2],clustering_method="ward.D", #' Heatmap of NetEmd many-to-many comparisons #' -#' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. -#' +#' Provides a heatmap and dendrogram for the network comparisons via +#' \code{pheatmap}. +#' #' @param netdislist Default output of \code{netdis_many_to_many}. #' -#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. +#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} +#' to be used for plotting. 
#' -#' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). +#' @param clustering_method Clustering method as allowed in the \code{pheatmap} +#' function from the \code{pheatmap} package. The dendrogram will appear if +#' \code{docluster} is TRUE (default). #' #' @param main Title of the plot. -#' -#' @param docluster controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. -#' -#' @return Provides a heat map and dendrogram for the network comparisons via \code{pheatmap}. +#' +#' @param docluster controls the order of the rows and columns. If TRUE +#' (default) the rows and columns will be reordered to create the dendrogram. If +#' FALSE, then only the heatmap is drawn. +#' +#' @return Provides a heat map and dendrogram for the network comparisons via +#' \code{pheatmap}. 
#' @export -netemd.plot <- function(netemdlist,clustering_method="ward.D",main="NetEmd",docluster=TRUE){ - adjmat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) +netemd_plot <- function(netemdlist, + clustering_method = "ward.D", + main = "NetEmd", + docluster = TRUE) { + adjmat <- cross_comp_to_matrix( + measure = netemdlist$netemds, + cross_comparison_spec = netemdlist$comp_spec + ) vnames <- rownames(adjmat) - - legend1 <- seq(min(adjmat),max(adjmat),length.out = 5) - levels1 <- round(legend1,digits = 2) - pheatmap::pheatmap(mat = as.dist(adjmat),cluster_rows = docluster,cluster_cols = docluster,clustering_method = clustering_method,angle_col=45,main = main,treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) - + + legend1 <- seq(min(adjmat), max(adjmat), length.out = 5) + levels1 <- round(legend1, digits = 2) + pheatmap::pheatmap( + mat = stats::as.dist(adjmat), + cluster_rows = docluster, + cluster_cols = docluster, + clustering_method = clustering_method, + angle_col = 45, + main = main, + treeheight_row = 80, + labels_row = vnames, + labels_col = vnames, + display_numbers = TRUE, + legend_breaks = legend1, + legend_labels = levels1 + ) } diff --git a/R/RcppExports.R b/R/RcppExports.R index b7d0c2dc..39a5aee8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -25,3 +25,13 @@ emd_fast_no_smoothing <- function(locations1, values1, locations2, values2) { .Call(`_netdist_emd_fast_no_smoothing`, locations1, values1, locations2, values2) } +#' @title +#' Compute EMD +NULL + +#' +#' @export +netemd_smooth <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { + .Call(`_netdist_netemd_smooth`, loc1, val1, binWidth1, loc2, val2, binWidth2) +} + diff --git a/R/data.R b/R/data.R index cf5c5dff..688cee76 100644 --- a/R/data.R +++ b/R/data.R @@ -36,9 +36,17 @@ #' } #' #' @format A list of \code{igraph} objects. 
-#' @source \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. -#' @source \strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, Parkinson J (2009) The Modular Organization of Protein Interactions in Escherichia coli. PLoS Comput Biol 5(10): e1000523. \url{https://doi.org/10.1371/journal.pcbi.1000523} -#' @source \strong{Taxonomy ground truth:} NCBI taxonomy database. \url{https://www.ncbi.nlm.nih.gov/taxonomy} +#' @source \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, +#' Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily +#' Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): +#' e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table +#' S2 in the supporting information. +#' @source \strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, +#' Parkinson J (2009) The Modular Organization of Protein Interactions in +#' Escherichia coli. PLoS Comput Biol 5(10): e1000523. +#' \url{https://doi.org/10.1371/journal.pcbi.1000523} +#' @source \strong{Taxonomy ground truth:} NCBI taxonomy database. +#' \url{https://www.ncbi.nlm.nih.gov/taxonomy} #' @encoding UTF-8 "virusppi" @@ -47,18 +55,29 @@ #' World trade networks from 1985–2014 -#' -#' The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the United Nations division COMTRADE [Division, 2015] for the years 2001-2014. 
+#' +#' The world trade data set consists of a small sample of world trade networks +#' for the years 2001-2014, and pre-computed subgraph counts of a larger set of +#' world trade networks (1985–2014). The world trade networks are based on the +#' data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the +#' United Nations division COMTRADE [Division, 2015] for the years 2001-2014. #' #' \itemize{ -#' \item wtnets: List of \code{igraph} objects providing the world trade networks from 2001–2014. -#' \item Counts: Pre-computed graphlet counts for the world trade networks in the years 1985-2014. +#' \item wtnets: List of \code{igraph} objects providing the world trade +#' networks from 2001–2014. +#' \item Counts: Pre-computed graphlet counts for the world trade networks in +#' the years 1985-2014. #' } -#' -#' @format A list of two elements. The first element, 'wtnets', is a list of \code{igraph} objects providing a small sample of world trade networks from 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph counts of world trade networks in the years 1985-2014. -#' @source \strong{World trade networks:}. United nations commodity trade statistics database (UN comtrade). http://comtrade.un.org/, 2015. -#' @source \strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). -#' +#' +#' @format A list of two elements. The first element, 'wtnets', is a list of +#' \code{igraph} objects providing a small sample of world trade networks from +#' 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph +#' counts of world trade networks in the years 1985-2014. +#' @source \strong{World trade networks:}. United nations commodity trade +#' statistics database (UN comtrade). http://comtrade.un.org/, 2015. 
+#' @source \strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and +#' Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau +#' of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). +#' #' @encoding UTF-8 "worldtradesub" - diff --git a/R/dhist.R b/R/dhist.R index a5261f76..dea6dda7 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -9,23 +9,27 @@ #' location #' @param smoothing_window_width If greater than 0, the discrete histogram will #' be treated as having the mass at each location "smoothed" uniformly across -#' a bin centred on the location and having width = \code{smoothing_window_width} -#' (default = \code{0} - no smoothing) +#' a bin centred on the location and having +#' width = \code{smoothing_window_width} (default = \code{0} - no smoothing) #' @param sorted Whether or not to return a discrete histogram with locations #' and masses sorted by ascending mass (default = \code{TRUE}) #' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: #' \itemize{ #' \item \code{locations}: A 1D numeric vector of discrete locations -#' \item \code{masses}: A 1D numeric vector of the mass present at each location +#' \item \code{masses}: A 1D numeric vector of the mass present at each +#' location #' } #' Note that locations where no mass is present are not included in the returned #' \code{dhist} object. Mass in these discrete histograms is treated as being -#' present precisely at the specified location. Discrete histograms should not be used -#' for data where observations have been grouped into bins representing ranges -#' of observation values. +#' present precisely at the specified location. Discrete histograms should not +#' be used for data where observations have been grouped into bins representing +#' ranges of observation values. 
#' @export -dhist <- function(locations, masses, smoothing_window_width = 0, sorted = TRUE) { +dhist <- function(locations, + masses, + smoothing_window_width = 0, + sorted = TRUE) { if (!is_numeric_vector_1d(locations)) { stop("Bin locations must be provided as a 1D numeric vector") } @@ -76,9 +80,11 @@ update_dhist <- #' @param dhist A discrete histogram as a \code{dhist} object #' @param smoothing_window_width If greater than 0, the discrete histogram will #' be treated as having the mass at each location "smoothed" uniformly across -#' a bin centred on the location and having width = \code{smoothing_window_width} -#' @return A copy of a \code{dhist} object with its \code{smoothing_window_width} -#' attribute set to the value provided \code{smoothing_window_width} parameter. +#' a bin centred on the location and having +#' width = \code{smoothing_window_width} +#' @return A copy of a \code{dhist} object with its +#' \code{smoothing_window_width} attribute set to the value provided +#' \code{smoothing_window_width} parameter. #' @export as_smoothed_dhist <- function(dhist, smoothing_window_width) { dhist <- update_dhist(dhist, smoothing_window_width = smoothing_window_width) @@ -90,8 +96,8 @@ as_smoothed_dhist <- function(dhist, smoothing_window_width) { #' Returns an "unsmoothed" copy of a \code{dhist} object with its #' \code{smoothing_window_width} attribute set to 0. #' @param dhist A discrete histogram as a \code{dhist} object -#' @return A copy of a \code{dhist} object with its \code{smoothing_window_width} -#' attribute set to 0. +#' @return A copy of a \code{dhist} object with its +#' \code{smoothing_window_width} attribute set to 0. #' @export as_unsmoothed_dhist <- function(dhist) { dhist <- update_dhist(dhist, smoothing_window_width = 0) @@ -101,8 +107,8 @@ as_unsmoothed_dhist <- function(dhist) { #' Check if an object is a \code{dhist} discrete histogram #' #' Checks if the input object is of class \code{dhist}. 
If \code{fast_check} is -#' \code{TRUE} then the only check is whether the object has a class attribute of -#' \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks +#' \code{TRUE} then the only check is whether the object has a class attribute +#' of \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks #' are also made to ensure that the object has the structure required of a #' \code{dhist} object. #' @param x An arbitrary object @@ -110,9 +116,9 @@ as_unsmoothed_dhist <- function(dhist) { #' superficial fast check limited to checking the object's class attribute #' is set to \code{dhist} (default = \code{TRUE}) #' @export -is_dhist <- function(x, fast_check = TRUE) { - # Quick check that relies on user not to construct variables with "dhist" class - # that do not have the required elements +is_dhist <- function(x, fast_check = TRUE) { # nolint: cyclocomp_linter. + # Quick check that relies on user not to construct variables with "dhist" + # class that do not have the required elements has_class_attr <- (class(x) == "dhist") if (fast_check) { # Early return if fast check requested @@ -123,23 +129,25 @@ is_dhist <- function(x, fast_check = TRUE) { has_masses <- purrr::has_element(attr(x, "name"), "masses") # Require list with correct class and presence of 1D numeric vector named # elements "locations" and "masses" - return(has_class_attr - && purrr::is_list(x) - && has_locations - && has_masses - && is_numeric_vector_1d(x$locations) - && is_numeric_vector_1d(x$masses)) + return(has_class_attr && + purrr::is_list(x) && + has_locations && + has_masses && + is_numeric_vector_1d(x$locations) && + is_numeric_vector_1d(x$masses)) } #' Discrete histogram from observations (Pure R slow version) #' -#' Generate a sparse discrete histogram from a set of discrete numeric observations +#' Generate a sparse discrete histogram from a set of discrete numeric +#' observations #' @param observations A vector of discrete numeric observations 
#' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: #' \itemize{ #' \item \code{locations}: A 1D numeric vector of discrete locations -#' \item \code{masses}: A 1D numeric vector of the mass present at each location +#' \item \code{masses}: A 1D numeric vector of the mass present at each +#' location #' } #' @export dhist_from_obs_slow <- function(observations) { @@ -162,13 +170,15 @@ dhist_from_obs_slow <- function(observations) { #' Discrete histogram from observations #' -#' Generate a sparse discrete histogram from a set of discrete numeric observations +#' Generate a sparse discrete histogram from a set of discrete numeric +#' observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: #' \itemize{ #' \item \code{locations}: A 1D numeric vector of discrete locations -#' \item \code{masses}: A 1D numeric vector of the mass present at each location +#' \item \code{masses}: A 1D numeric vector of the mass present at each +#' location #' } #' @export dhist_from_obs <- function(observations) { @@ -299,7 +309,8 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { stop("ECMFs must have the same type") } ecmf_type <- ecmf_type1 - # Determine all possible locations where either ECMF changes gradient ("knots") + # Determine all possible locations where either ECMF changes gradient + # ("knots") x1 <- ecmf_knots(dhist_ecmf1) x2 <- ecmf_knots(dhist_ecmf2) x <- sort(union(x1, x2)) @@ -375,7 +386,8 @@ segment_area_trapezium <- function(x_diff, y_diff_lower, y_diff_upper) { height_trapezium <- abs(x_diff) base_trapezium <- abs(y_diff_lower) top_trapezium <- abs(y_diff_upper) - segment_area <- 0.5 * height_trapezium * (base_trapezium + top_trapezium) + + 0.5 * height_trapezium * (base_trapezium + 
top_trapezium) } segment_area_bowtie <- function(x_diff, y_diff_lower, y_diff_upper) { @@ -387,23 +399,6 @@ segment_area_bowtie <- function(x_diff, y_diff_lower, y_diff_upper) { (abs(y_diff_lower) + abs(y_diff_upper)) } -#' Area between two offset Empirical Cumulative Mass Functions (ECMFs) -#' -#' @param ecmf1 An Empirical Cululative Mass Function (ECMF) object of class -#' \code{dhist_ecmf} -#' @param ecmf2 An Empirical Cululative Mass Function (ECMF) object of class -#' \code{dhist_ecmf} -#' @param offset An offset to add to all locations of the first ECMF. Postive -#' offsets will shift the ECMF to the right and negative ones to the left. -#' @return area The area between the two ECMFs, calculated as the integral of -#' the absolute difference between the two ECMFs -area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { - # Construct ECMFs for each normalised histogram - ecmf1 <- dhist_ecmf(shift_dhist(dhist1_norm, offset)) - ecmf2 <- dhist_ecmf(dhist2_norm) - area_between_dhist_ecmfs(ecmf1, ecmf2) -} - #' Sort discrete histogram #' #' Sort a discrete histogram so that locations are in increasing (default) or @@ -413,7 +408,11 @@ area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { #' increasing (default) or decreasing order of location #' @export sort_dhist <- function(dhist, decreasing = FALSE) { - sorted_indexes <- sort(dhist$locations, decreasing = decreasing, index.return = TRUE)$ix + sorted_indexes <- sort( + dhist$locations, + decreasing = decreasing, + index.return = TRUE + )$ix dhist$masses <- dhist$masses[sorted_indexes] dhist$locations <- dhist$locations[sorted_indexes] return(dhist) @@ -463,18 +462,21 @@ dhist_variance <- function(dhist) { # For unsmoothed discrete histograms, the mass associated with each location # is located precisely at the lcoation. Therefore cariance (i.e. 
E[X^2]) # is the mass-weighted sum of the mean-centred locations - variance <- sum(dhist$masses * (mean_centred_locations)^2) / sum(dhist$masses) + variance <- sum(dhist$masses * (mean_centred_locations)^2) / + sum(dhist$masses) } else { - # For smoothed histograms, the mass associated with each location is "smoothed" - # uniformly across a bin centred on the location with width = smoothing_window_width - # Variance (i.e. E[X^2]) is therefore the mass-weighted sum of the integrals - # of x^2 over the mean-centred bins at each location. + # For smoothed histograms, the mass associated with each location is + # "smoothed" uniformly across a bin centred on the location with + # width = smoothing_window_width Variance (i.e. E[X^2]) is therefore the + # mass-weighted sum of the integrals of x^2 over the mean-centred bins at + # each location. hw <- dhist$smoothing_window_width / 2 bin_lowers <- mean_centred_locations - hw bin_uppers <- mean_centred_locations + hw # See comment in issue #21 on Github repository for verification that E[X^2] # is calculated as below for a uniform bin - bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + bin_lowers * bin_uppers) / 3 + bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + + bin_lowers * bin_uppers) / 3 variance <- sum(dhist$masses * bin_x2_integrals) / sum(dhist$masses) } return(variance) @@ -540,13 +542,18 @@ normalise_dhist_variance <- function(dhist) { std_dev <- dhist_std(dhist) centred_locations <- (dhist$locations - dhist_mean_location(dhist)) normalised_centred_locations <- centred_locations / std_dev - normalised_locations <- normalised_centred_locations + dhist_mean_location(dhist) + normalised_locations <- normalised_centred_locations + + dhist_mean_location(dhist) dhist <- update_dhist(dhist, locations = normalised_locations) # If smoothing_window_width not zero, then update it to reflect the variance # normalisation if (dhist$smoothing_window_width != 0) { - normalised_smoothing_window_width <- 
dhist$smoothing_window_width / std_dev - dhist <- update_dhist(dhist, smoothing_window_width = normalised_smoothing_window_width) + norm_smoothing_window_width <- + dhist$smoothing_window_width / std_dev + dhist <- update_dhist( + dhist, + smoothing_window_width = norm_smoothing_window_width + ) } } return(dhist) @@ -584,7 +591,8 @@ harmonise_dhist_locations <- function(dhist1, dhist2) { #' #' Check if a variable is a 1D numeric vector by checking that: #' \itemize{ -#' \item \code{is_numeric(input)}: Input is vector, matrix, array or list of numbers +#' \item \code{is_numeric(input)}: Input is vector, matrix, array or list of +#' numbers #' \item \code{is_null(dim(input))}: Input is not a matrix or array #' } #' @param input Arbitrary object diff --git a/R/emd.R b/R/emd.R index 86575521..a12e3cec 100644 --- a/R/emd.R +++ b/R/emd.R @@ -9,8 +9,8 @@ #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @return Earth Mover's Distance between the two discrete histograms #' @export @@ -43,7 +43,8 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' @export min_emd_optimise_fast <- function(dhist1, dhist2) { # Can we run the C++ fast implementation (only works with no smoothing)? 
- if ((dhist1$smoothing_window_width == 0) && (dhist2$smoothing_window_width == 0)) { + if ((dhist1$smoothing_window_width == 0) && + (dhist2$smoothing_window_width == 0)) { # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) @@ -66,7 +67,6 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { val2 <- val2 / val2[length(val2)] loc1 <- dhist1$locations loc2 <- dhist2$locations - count <- 0 emd_offset <- function(offset) { temp1 <- emd_fast_no_smoothing(loc1 + offset, val1, loc2, val2) temp1 @@ -80,10 +80,54 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { min_emd <- soln$objective min_offset <- soln$minimum return(list(min_emd = min_emd, min_offset = min_offset)) - } - else { - # Fall back on other version if either dhist is smoothed - return(min_emd_optimise(dhist1, dhist2)) + } else { + val1 <- cumsum(dhist1$masses) + val2 <- cumsum(dhist2$masses) + val1 <- val1 / val1[length(val1)] + val2 <- val2 / val2[length(val2)] + loc1 <- dhist1$locations + loc2 <- dhist2$locations + bin_width_1 <- dhist1$smoothing_window_width + bin_width_2 <- dhist2$smoothing_window_width + # Offset the histograms to make the alignments work + loc1_mod <- loc1 - bin_width_1 / 2 + loc2_mod <- loc2 - bin_width_2 / 2 + # Determine minimum and maximum offset of range in which histograms overlap + # (based on sliding histogram 1) + min_offset <- min(loc2_mod) - max(loc1_mod) - max(bin_width_1, bin_width_2) + max_offset <- max(loc2_mod) - min(loc1_mod) + max(bin_width_1, bin_width_2) + # Set lower and upper range for optimise algorithm to be somewhat wider than + # range defined by the minimum and maximum offset. 
This guards against a + # couple of issues that arise if the optimise range is exactly min_offset + # to max_offset + # 1) If lower and upper are equal, the optimise method will throw and error + # 2) It seems that optimise is not guaranteed to explore its lower and upper + # bounds, even in the case where one of them is the offset with minimum + # EMD + buffer <- 0.1 + min_offset <- min_offset - buffer + max_offset <- max_offset + buffer + # Define a single parameter function to minimise emd as a function of offset + emd_offset <- function(offset) { + temp1 <- netemd_smooth( + loc1_mod + offset, + val1, + bin_width_1, + loc2_mod, + val2, + bin_width_2 + ) + temp1 + } + # Get solution from optimiser + soln <- stats::optimise(emd_offset, + lower = min_offset, upper = max_offset, + tol = .Machine$double.eps * 1000 + ) + # Return mnimum EMD and associated offset + min_emd <- soln$objective + min_offset <- soln$minimum + return(list(min_emd = min_emd, min_offset = min_offset)) } } @@ -148,10 +192,10 @@ min_emd_optimise <- function(dhist1, dhist2) { #' to ensure that the offset with the global minimum EMD is found. #' #' This is because of the piece-wise linear nature of the two ECMFs. Between any -#' two offsets where knots from the two ECMFs align, EMD will be either constant, -#' or uniformly increasing or decreasing. Therefore, there the EMD between two -#' sets of aligned knots cannot be smaller than the EMD at one or other of the -#' bounding offsets. +#' two offsets where knots from the two ECMFs align, EMD will be either +#' constant, or uniformly increasing or decreasing. Therefore, there the EMD +#' between two sets of aligned knots cannot be smaller than the EMD at one or +#' other of the bounding offsets. 
#' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object #' @return Earth Mover's Distance between the two discrete histograms @@ -241,15 +285,16 @@ emd <- function(dhist1, dhist2) { #' Distance between the two histograms by summing the absolute difference #' between the two cumulative histograms. #' @references -#' Calculation of the Wasserstein Distance Between Probability Distributions on the Line -#' S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, 784-786 -#' \url{http://dx.doi.org/10.1137/1118101} +#' Calculation of the Wasserstein Distance Between Probability Distributions on +#' the Line S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, +#' 784-786 \url{http://dx.doi.org/10.1137/1118101} #' @param dhist1 A discrete histogram as a \code{dhist} object #' @param dhist2 A discrete histogram as a \code{dhist} object #' @return Earth Mover's Distance between the two input histograms #' @export emd_cs <- function(dhist1, dhist2) { - # Generate Empirical Cumulative Mass Functions (ECMFs) for each discrete histogram + # Generate Empirical Cumulative Mass Functions (ECMFs) for each discrete + # histogram ecmf1 <- dhist_ecmf(dhist1) ecmf2 <- dhist_ecmf(dhist2) # Calculate the area between the two ECMFs @@ -279,10 +324,16 @@ emd_lp <- function(bin_masses1, bin_masses2, bin_centres1, bin_centres2) { # the bin_mass and bin_centre vectors for each histogram must have the same # length. 
if (length(bin_centres1) != num_bins1) { - stop("Number of bin masses and bin centres provided for histogram 1 must be equal") + stop( + "Number of bin masses and bin centres provided for histogram 1 must ", + "be equal" + ) } if (length(bin_centres2) != num_bins2) { - stop("Number of bin masses and bin centres provided for histogram 2 must be equal") + stop( + "Number of bin masses and bin centres provided for histogram 2 must ", + "be equal" + ) } # Generate cost matrix @@ -291,10 +342,16 @@ emd_lp <- function(bin_masses1, bin_masses2, bin_centres1, bin_centres2) { # Linear Programming solver requires all bin masses and transportation costs # to be integers to generate correct answer if (!isTRUE(all.equal(bin_masses1, floor(bin_masses1)))) { - stop("All bin masses for histogram 1 must be integers for accurate Linear Programming solution") + stop( + "All bin masses for histogram 1 must be integers for accurate Linear ", + "Programming solution" + ) } if (!isTRUE(all.equal(bin_masses2, floor(bin_masses2)))) { - stop("All bin masses for histogram 2 must be integers for accurate Linear Programming solution") + stop( + "All bin masses for histogram 2 must be integers for accurate ", + "Linear Programming solution" + ) } if (!isTRUE(all.equal(cost_mat, floor(cost_mat)))) { stop("All costs must be integers for accurate Linear Programming solution") @@ -320,8 +377,18 @@ cost_matrix <- function(bin_centres1, bin_centres2) { # Calculate distances between all bins in network 1 and all bins in network 2 num_bins1 <- length(bin_centres1) num_bins2 <- length(bin_centres2) - loc_mat1 <- matrix(bin_centres1, nrow = num_bins1, ncol = num_bins2, byrow = FALSE) - loc_mat2 <- matrix(bin_centres2, nrow = num_bins1, ncol = num_bins2, byrow = TRUE) + loc_mat1 <- matrix( + bin_centres1, + nrow = num_bins1, + ncol = num_bins2, + byrow = FALSE + ) + loc_mat2 <- matrix( + bin_centres2, + nrow = num_bins1, + ncol = num_bins2, + byrow = TRUE + ) cost_mat <- abs(loc_mat1 - loc_mat2) 
return(cost_mat) } diff --git a/R/graph_binning.R b/R/graph_binning.R index 3f8aa21b..e75da79b 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -10,7 +10,9 @@ binned_densities_adaptive <- function(densities, min_counts_per_interval, num_intervals) { - if( length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + if (length(densities) < min_counts_per_interval) { + min_counts_per_interval <- length(densities) + } breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals diff --git a/R/measures_net_dis.R b/R/measures_netdis.R similarity index 77% rename from R/measures_net_dis.R rename to R/measures_netdis.R index d1120d03..00bb4232 100644 --- a/R/measures_net_dis.R +++ b/R/measures_netdis.R @@ -1,22 +1,34 @@ #' Netdis between two graphs #' -#' Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. The variants currently supported are Netdis using a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). -#' -#' -#' @param graph_1 A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered. +#' Calculates the different variants of the network dissimilarity statistic +#' Netdis between two graphs. The variants currently supported are Netdis using +#' a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), +#' and Netdis using a Geometric Poisson approximation for the expectation +#' (\code{ref_graph = NULL}). +#' +#' +#' @param graph_1 A simple graph object from the \code{igraph} package. +#' \code{graph_1} can be set to \code{NULL} (default) if +#' \code{graphlet_counts_1} is provided. 
If both \code{graph_1} and +#' \code{graphlet_counts_1} are not \code{NULL}, then only +#' \code{graphlet_counts_1} will be considered. +#' +#' @param graph_2 A simple graph object from the \code{igraph} package. +#' \code{graph_2} can be set to \code{NULL} (default) if +#' \code{graphlet_counts_2} is provided. If both \code{graph_2} and +#' \code{graphlet_counts_2} are not \code{NULL}, then only +#' \code{graphlet_counts_2} will be considered. #' -#' @param graph_2 A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered. -#' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. Matrix containing counts of each graphlet (columns) for #' each ego-network (rows) in the input graph. Columns are labelled with #' graphlet IDs and rows are labelled with the ID of the central node in each #' ego-network. As well as graphlet counts, each matrix must contain an #' additional column labelled "N" including the node count for -#' each ego network. (default: NULL). -#' If the \code{graphlet_counts_1} argument is defined then -#' \code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}. -#' +#' each ego network. If the \code{graphlet_counts_1} argument is defined then +#' \code{graph_1} will not be used. These counts can be obtained with +#' \code{count_graphlets_ego}. (default: NULL). +#' #' #' @param graphlet_counts_2 Pre-generated graphlet counts for the second query #' graph. Matrix containing counts of each graphlet (columns) for @@ -26,27 +38,29 @@ #' additional column labelled "N" including the node count for #' each ego network. (default: NULL). #' If the \code{graphlet_counts_2} argument is defined then -#' \code{graph_2} will not be used. 
These counts can be obtained with \code{count_graphlets_ego}. +#' \code{graph_2} will not be used. These counts can be obtained with +#' \code{count_graphlets_ego}. #' #' @param ref_graph Controls how expected counts are calculated. Either: #' 1) A numeric value - used as a constant expected counts value for all query #' graphs . #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the -#' query graphs themselves. (Geometric-Poisson approximation). -#' +#' 3) NULL (Default) - Used for Netdis-GP, where the expected counts will be +#' calculated based on the properties of the query graphs themselves +#' (Geometric-Poisson approximation). +#' #' @param graphlet_counts_ref Pre-generated reference graphlet counts. #' Matrix containing counts of each graphlet (columns) for #' each ego-network (rows) in the reference graph. Columns are labelled with #' graphlet IDs and rows are labelled with the ID of the central node in each #' ego-network. As well as graphlet counts, each matrix must contain an #' additional column labelled "N" including the node count for -#' each ego network. (default: NULL). -#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -#' be used. +#' each ego network. If the \code{graphlet_counts_ref} argument is defined then +#' \code{ref_graph} will not be used. (default: NULL). #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only +#' 4 (default) and 5 are supported. #' #' @param neighbourhood_size Ego network neighborhood size (default: 2). #' @@ -56,69 +70,115 @@ #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges (default: 1). 
#' -#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} -#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -#' (Default: NULL). +#' @param binning_fn Function used to bin ego network densities. Takes edge +#' \code{densities} as its single argument, and returns a named list including, +#' the input \code{densities}, the resulting bin \code{breaks} (vector of +#' density bin limits), and the vector \code{interval_indexes} which states to +#' what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method +#' \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} +#' and \code{num_intervals = 100} is used (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' (bin indexes) and \code{max_graphlet_size} as arguments. If +#' \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +#' approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and +#' \code{graphlet_counts_ref}. 
#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. #' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. -#' +#' either the approach from the original Netdis paper, or the respective +#' Geometric-Poisson approximation; depending on the values of \code{ref_graph} +#' and \code{graphlet_counts_ref}. +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size. #' #' @examples #' require(netdist) #' require(igraph) -#' #Set source directory for Virus PPI graph edge files stored in the netdist package. -#' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +#' # Set source directory for Virus PPI graph edge files stored in the +#' # netdist package. +#' source_dir <- system.file( +#' file.path("extdata", "VRPINS"), +#' package = "netdist" +#' ) #' # Load query graphs as igraph objects -#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -#' graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") -#' -#' #Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) #This option will focus on detecting more general and global discrepancies between the ego-network structures. -#' -#' #Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. 
-#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) -#' -#' # Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. -#' # Two lattice networks of different sizes are used for this example. -#' goldstd_1 <- graph.lattice(c(8,8)) #A reference net -#' goldstd_2 <- graph.lattice(c(44,44)) #A reference net -#' -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) -#' -#' -#' #Providing pre-calculated subgraph counts. -#' -#' props_1 <- count_graphlets_ego(graph = graph_1) -#' props_2 <- count_graphlets_ego(graph = graph_2) -#' props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) -#' props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) -#' -#' #Netdis Geometric-Poisson. -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) -#' -#' #Netdis Zero Expectation. -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) -#' -#' #Netdis using gold-standard network -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +#' graph_1 <- read_simple_graph( +#' file.path(source_dir, "EBV.txt"), +#' format = "ncol" +#' ) +#' graph_2 <- read_simple_graph( +#' file.path(source_dir, "ECL.txt"), +#' format = "ncol" +#' ) +#' +#' # Netdis variant using the Geometric Poisson approximation to remove the +#' # background expectation of each network. This option will focus on detecting +#' # more general and global discrepancies between the ego-network structures. 
+#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) +#' +#' # Comparing the networks via their observed ego counts without centering them +#' # (equivalent to using expectation equal to zero). This option, will focus on +#' # detecting small discrepancies. +#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) +#' +#' # Example of the use of netdis with a reference graph.This option will focus +#' # on detecting discrepancies between the networks relative to the ego-network +#' # structure of the reference network / gold-standard. +#' # Two lattice networks of different sizes are used for this example. +#' goldstd_1 <- graph.lattice(c(8, 8)) # A reference net +#' goldstd_2 <- graph.lattice(c(44, 44)) # A reference net +#' +#' netdis_one_to_one( +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' ref_graph = goldstd_1 +#' ) +#' netdis_one_to_one( +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' ref_graph = goldstd_2 +#' ) +#' +#' +#' # Providing pre-calculated subgraph counts. +#' +#' props_1 <- count_graphlets_ego(graph = graph_1) +#' props_2 <- count_graphlets_ego(graph = graph_2) +#' props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +#' props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) +#' +#' # Netdis Geometric-Poisson. +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' ref_graph = NULL +#' ) +#' +#' # Netdis Zero Expectation. 
+#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' ref_graph = 0 +#' ) +#' +#' # Netdis using gold-standard network +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' graphlet_counts_ref = props_goldstd_1 +#' ) +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' graphlet_counts_ref = props_goldstd_2 +#' ) #' @export netdis_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, @@ -132,8 +192,8 @@ netdis_one_to_one <- function(graph_1 = NULL, exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_2 = NULL, - graphlet_counts_ref= NULL) { - + graphlet_counts_ref = NULL) { + ## ------------------------------------------------------------------------ # Check arguments if (is.null(graph_1) && is.null(graphlet_counts_1)) { @@ -145,7 +205,7 @@ netdis_one_to_one <- function(graph_1 = NULL, ## ------------------------------------------------------------------------ # Generate graphlet counts and bundle them into named list with format needed # for netdis_many_to_many. 
- + if (is.null(graphlet_counts_1)) { graphlet_counts_1 <- count_graphlets_ego( graph_1, @@ -157,7 +217,7 @@ netdis_one_to_one <- function(graph_1 = NULL, ) } rm(graph_1) - + if (is.null(graphlet_counts_2)) { graphlet_counts_2 <- count_graphlets_ego( graph_2, @@ -169,13 +229,13 @@ netdis_one_to_one <- function(graph_1 = NULL, ) } rm(graph_2) - + graphlet_counts <- list( graph_1 = graphlet_counts_1, graph_2 = graphlet_counts_2 ) - - if(!is.null(ref_graph)){ + + if (!is.null(ref_graph)) { if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { graphlet_counts_ref <- count_graphlets_ego( ref_graph, @@ -203,7 +263,7 @@ netdis_one_to_one <- function(graph_1 = NULL, graphlet_counts = graphlet_counts, graphlet_counts_ref = graphlet_counts_ref ) - + ## ------------------------------------------------------------------------ # extract netdis statistics from list returned by netdis_many_to_many result$netdis[, 1] @@ -226,7 +286,8 @@ netdis_one_to_one <- function(graph_1 = NULL, #' 3) NULL - Expected counts will be calculated based on the properties of the #' query graphs themselves. #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 +#' and 5 are supported. #' #' @param neighbourhood_size Ego network neighbourhood size. #' @@ -236,26 +297,32 @@ netdis_one_to_one <- function(graph_1 = NULL, #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges. #' -#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} -#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -#' ego network). 
If \code{NULL}, then the method \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -#' (Default: NULL). +#' @param binning_fn Function used to bin ego network densities. Takes edge +#' \code{densities} as its single argument, and returns a named list including, +#' the input \code{densities}, the resulting bin \code{breaks} (vector of +#' density bin limits), and the vector \code{interval_indexes} which states to +#' what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method +#' \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +#' \code{num_intervals = 100} is used (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), -#' it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson -#' approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' (bin indexes) and \code{max_graphlet_size} as arguments. If +#' \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +#' approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and +#' \code{graphlet_counts_ref}. #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
#' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. -#' +#' either the approach from the original Netdis paper, or the respective +#' Geometric-Poisson approximation; depending on the values of \code{ref_graph} +#' and \code{graphlet_counts_ref}. +#' #' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. If the \code{graphlet_counts_1} argument is defined then @@ -268,7 +335,7 @@ netdis_one_to_one <- function(graph_1 = NULL, #' @param graphlet_counts_ref Pre-generated reference graphlet counts. If the #' \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not #' be used. -#' +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export @@ -284,7 +351,7 @@ netdis_one_to_many <- function(graph_1 = NULL, exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_compare = NULL, - graphlet_counts_ref= NULL) { + graphlet_counts_ref = NULL) { ## ------------------------------------------------------------------------ # Check arguments if (is.null(graph_1) && is.null(graphlet_counts_1)) { @@ -293,11 +360,11 @@ netdis_one_to_many <- function(graph_1 = NULL, if (is.null(graphs_compare) && is.null(graphlet_counts_compare)) { stop("One of graph_2 and graphlet_counts_2 must be supplied.") } - + ## ------------------------------------------------------------------------ # Generate graphlet counts and bundle them into named list with format needed # for netdis_many_to_many. 
- + if (is.null(graphlet_counts_1)) { graphlet_counts_1 <- count_graphlets_ego( graph_1, @@ -309,7 +376,7 @@ netdis_one_to_many <- function(graph_1 = NULL, ) } rm(graph_1) - + if (is.null(graphlet_counts_compare)) { graphlet_counts_compare <- purrr::map( graphs_compare, @@ -322,13 +389,13 @@ netdis_one_to_many <- function(graph_1 = NULL, ) } rm(graphs_compare) - + graphlet_counts <- append(graphlet_counts_compare, - list(graph_1 = graphlet_counts_1), - after = 0 + list(graph_1 = graphlet_counts_1), + after = 0 ) - - if(!is.null(ref_graph)){ + + if (!is.null(ref_graph)) { if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { graphlet_counts_ref <- count_graphlets_ego( ref_graph, @@ -341,7 +408,7 @@ netdis_one_to_many <- function(graph_1 = NULL, ref_graph <- NULL } } - + ## ------------------------------------------------------------------------ # calculate netdis result <- netdis_many_to_many( @@ -358,7 +425,7 @@ netdis_one_to_many <- function(graph_1 = NULL, graphlet_counts = graphlet_counts, graphlet_counts_ref = graphlet_counts_ref ) - + ## ------------------------------------------------------------------------ # restructure netdis_many_to_many output colnames(result$netdis) <- result$comp_spec$name_b @@ -377,14 +444,15 @@ netdis_one_to_many <- function(graph_1 = NULL, #' graphs. #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL (default) - Expected counts will be calculated based on the properties of the -#' query graphs themselves. (Geometric-Poisson approximation). +#' 3) NULL (default) - Expected counts will be calculated based on the +#' properties of the query graphs themselves. (Geometric-Poisson approximation). #' #' @param comparisons Which comparisons to perform between graphs. #' Can be "many-to-many" (all pairwise combinations) or "one-to-many" #' (compare first graph in graphs to all other graphs.) 
#' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 +#' (default) and 5 are supported. #' #' @param neighbourhood_size Ego network neighbourhood size (default 2). #' @@ -393,25 +461,32 @@ netdis_one_to_many <- function(graph_1 = NULL, #' #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges (default 1). -#' -#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} -#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used (default: NULL). +#' +#' @param binning_fn Function used to bin ego network densities. Takes edge +#' \code{densities} as its single argument, and returns a named list including, +#' the input \code{densities}, the resulting bin \code{breaks} (vector of +#' density bin limits), and the vector \code{interval_indexes} which states to +#' what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method +#' \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +#' \code{num_intervals = 100} is used (default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. 
If \code{bin_counts_fn} is \code{NULL}, (default), -#' it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson -#' approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' (bin indexes) and \code{max_graphlet_size} as arguments. +#' If \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +#' approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and +#' \code{graphlet_counts_ref}. #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. #' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' either the approach from the original Netdis paper, or the respective +#' Geometric-Poisson approximation; depending on the values of \code{ref_graph} +#' and \code{graphlet_counts_ref}. #' #' @param graphlet_counts Pre-generated graphlet counts (default: NULL). If the #' \code{graphlet_counts} argument is defined then \code{graphs} will not be @@ -423,20 +498,20 @@ netdis_one_to_many <- function(graph_1 = NULL, #' additional column labelled "N" including the node count for #' each ego network. #' -#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: NULL). Matrix containing counts -#' of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with -#' graphlet IDs and rows are labelled with the ID of the central node in each -#' ego-network. 
As well as graphlet counts, each matrix must contain an -#' additional column labelled "N" including the node count for -#' each ego network. -#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -#' be used. -#' +#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: +#' NULL). Matrix containing counts of each graphlet (columns) for each +#' ego-network (rows) in the input graph. Columns are labelled with graphlet IDs +#' and rows are labelled with the ID of the central node in each ego-network. As +#' well as graphlet counts, each matrix must contain an additional column +#' labelled "N" including the node count for each ego network. +#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} +#' will not be used. +#' #' @return Netdis statistics between query graphs for graphlet sizes #' up to and including max_graphlet_size. #' #' @export -netdis_many_to_many <- function(graphs = NULL, +netdis_many_to_many <- function(graphs = NULL, # nolint: cyclocomp_linter. ref_graph = NULL, comparisons = "many-to-many", max_graphlet_size = 4, @@ -448,14 +523,14 @@ netdis_many_to_many <- function(graphs = NULL, exp_counts_fn = NULL, graphlet_counts = NULL, graphlet_counts_ref = NULL) { - + ## ------------------------------------------------------------------------ # Check arguments and set functions appropriately if (is.null(graphs) && is.null(graphlet_counts)) { stop("One of graphs and graphlet_counts must be supplied.") } - - + + # Set default binning_fn if none supplied if (is.null(binning_fn)) { binning_fn <- purrr::partial( @@ -464,7 +539,7 @@ netdis_many_to_many <- function(graphs = NULL, num_intervals = 100 ) } - + # If no ref_graph supplied, default to geometric poisson unless user-defined # functions have been provided. 
if (is.null(ref_graph) && is.null(graphlet_counts_ref)) { @@ -476,9 +551,9 @@ netdis_many_to_many <- function(graphs = NULL, netdis_expected_counts, scale_fn = NULL ) - } - # If a ref_graph value supplied (including a constant), default to approach - # from original netdis paper, unless user-defined functions provided. + } + # If a ref_graph value supplied (including a constant), default to approach + # from original netdis paper, unless user-defined functions provided. } else { if (is.null(bin_counts_fn)) { bin_counts_fn <- purrr::partial( @@ -492,9 +567,9 @@ netdis_many_to_many <- function(graphs = NULL, netdis_expected_counts, scale_fn = count_graphlet_tuples ) - } + } } - + ## ------------------------------------------------------------------------ # Generate ego networks and count graphlets for query graphs. # But if graphlet counts have already been provided we can skip this step. @@ -510,12 +585,15 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(graphs) - + ## ------------------------------------------------------------------------ # Centred counts - # If there are no graphlet_counts_ref, and a number has been passed as ref_graph, treat it as a constant expected - # counts value (e.g. if ref_graph = 0 then no centring of counts). - if (is.numeric(ref_graph) && length(ref_graph) == 1 && is.null(graphlet_counts_ref)) { + # If there are no graphlet_counts_ref, and a number has been passed as + # ref_graph, treat it as a constant expected counts value (e.g. if + # ref_graph = 0 then no centring of counts). 
+ if (is.numeric(ref_graph) && + length(ref_graph) == 1 && + is.null(graphlet_counts_ref)) { centred_graphlet_counts <- purrr::map( graphlet_counts, netdis_centred_graphlet_counts, @@ -526,12 +604,12 @@ netdis_many_to_many <- function(graphs = NULL, exp_counts_fn = NULL, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ - # If there are no graphlet_counts_ref, and If a reference graph passed, use it to calculate expected counts for all - # query graphs. + # If there are no graphlet_counts_ref, and If a reference graph passed, use + # it to calculate expected counts for all query graphs. } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { - + # Generate ego networks and calculate graphlet counts # But if graphlet_counts_ref provided can skip this step if (is.null(graphlet_counts_ref)) { @@ -545,22 +623,22 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(ref_graph) - + # Get ego-network densities densities_ref <- ego_network_density(graphlet_counts_ref) - + # bin ref ego-network densities binned_densities <- binning_fn(densities_ref) - + ref_ego_density_bins <- binned_densities$breaks - + # Average ref graphlet counts across density bins ref_binned_graphlet_counts <- bin_counts_fn( graphlet_counts_ref, binned_densities$interval_indexes, max_graphlet_size = max_graphlet_size ) - + # Calculate centred counts using ref graph centred_graphlet_counts <- purrr::map( graphlet_counts, @@ -572,10 +650,11 @@ netdis_many_to_many <- function(graphs = NULL, exp_counts_fn = exp_counts_fn, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # If no reference passed, calculate expected counts using query networks - # themselves. Geometric-Poisson GP #This is the function that creates an error for a graph with three connected nodes. + # themselves. 
Geometric-Poisson GP #This is the function that creates an + # error for a graph with three connected nodes. } else { centred_graphlet_counts <- purrr::map( graphlet_counts, @@ -589,17 +668,17 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(graphlet_counts) - + ## ------------------------------------------------------------------------ # Sum centred graphlet counts across all ego networks sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) - + rm(centred_graphlet_counts) - + ## ------------------------------------------------------------------------ # Generate pairwise comparisons comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = comparisons) - + ## ------------------------------------------------------------------------ # Calculate netdis statistics results <- parallel::mcmapply( @@ -614,7 +693,7 @@ netdis_many_to_many <- function(graphs = NULL, comp_spec$index_b, SIMPLIFY = TRUE ) - + list(netdis = results, comp_spec = comp_spec) } @@ -623,29 +702,32 @@ netdis_many_to_many <- function(graphs = NULL, #' Calculate Netdis statistic between two graphs from their Centred Graphlet #' Counts (generated using \code{netdis_centred_graphlet_counts}) for graphlets #' of size \code{graphlet_size}. -#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for graph 1 -#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 +#' @param centred_graphlet_counts_1 Centred Graphlet Counts vector for +#' graph 1 +#' @param centred_graphlet_counts_2 Centred Graphlet Counts vector for +#' graph 2 #' @param graphlet_size The size of graphlets to use for the Netdis calculation #' (only counts for graphlets of the specified size will be used). The size of #' a graphlet is the number of nodes it contains. 
#' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vector_2, +netdis <- function(centred_graphlet_counts_1, + centred_graphlet_counts_2, graphlet_size) { # Select subset of centred counts corresponding to graphlets of the # specified size ids <- graphlet_ids_for_size(graphlet_size) - counts1 <- centred_graphlet_count_vector_1[ids] - counts2 <- centred_graphlet_count_vector_2[ids] - + counts1 <- centred_graphlet_counts_1[ids] + counts2 <- centred_graphlet_counts_2[ids] + # Calculate normalising constant norm_const <- sum(counts1^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) * sum(counts2^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate intermediate "netD" statistic that falls within range -1..1 netds2 <- (1 / sqrt(norm_const)) * sum((counts1 * counts2) / - sqrt(counts1^2 + counts2^2), na.rm = TRUE) + sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate corresponding "netd" Netdis statistic that falls within range 0..1 0.5 * (1 - netds2) } @@ -655,28 +737,32 @@ netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vecto #' Calculate Netdis statistic between two graphs from their Centred Graphlet #' Counts (generated using \code{netdis_centred_graphlet_counts}) for all #' graphlet sizes up to \code{max_graphlet_size}. -#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for graph 1 -#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 +#' @param centred_graphlet_counts_1 Centred Graphlet Counts vector for +#' graph 1 +#' @param centred_graphlet_counts_2 Centred Graphlet Counts vector for +#' graph 2 #' @param max_graphlet_size max graphlet size to calculate Netdis for. #' The size of a graphlet is the number of nodes it contains. Netdis is -#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported. 
+#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently +#' only 4 and 5 are supported. #' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vector_2, +netdis_uptok <- function(centred_graphlet_counts_1, + centred_graphlet_counts_2, max_graphlet_size) { if ((max_graphlet_size > 5) | (max_graphlet_size < 3)) { stop("max_graphlet_size must be 3, 4 or 5.") } - + netdis_statistics <- purrr::map(3:max_graphlet_size, - netdis, - centred_graphlet_count_vector_1 = centred_graphlet_count_vector_1, - centred_graphlet_count_vector_2 = centred_graphlet_count_vector_2 + netdis, + centred_graphlet_counts_1 = centred_graphlet_counts_1, + centred_graphlet_counts_2 = centred_graphlet_counts_2 ) - + netdis_statistics <- simplify2array(netdis_statistics) - + names(netdis_statistics) <- sapply( "netdis", @@ -684,7 +770,7 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count 3:max_graphlet_size, sep = "" ) - + netdis_statistics } @@ -709,8 +795,8 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count #' @param binning_fn Function used to bin ego network densities. Only needed if #' \code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are #' \code{NULL}. Takes densities as its single argument, and returns a named list -#' including keys \code{breaks} (vector of bin edges) and \code{interval_indexes} -#' (density bin index for each ego network). +#' including keys \code{breaks} (vector of bin edges) and +#' \code{interval_indexes} (density bin index for each ego network). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. 
Only needed if \code{ref_ego_density_bins} and @@ -723,36 +809,36 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. #' -#' @param max_graphlet_size max graphlet size to calculate centred counts for. Currently only size 4 and 5 are supported. +#' @param max_graphlet_size max graphlet size to calculate centred counts for. +#' Currently only size 4 and 5 are supported. #' #' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size #' max_graphlet_size. #' @export -netdis_centred_graphlet_counts <- function( - graphlet_counts, - ref_ego_density_bins, - ref_binned_graphlet_counts, - binning_fn, - bin_counts_fn, - exp_counts_fn, - max_graphlet_size) { - +netdis_centred_graphlet_counts <- function(graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + binning_fn, + bin_counts_fn, + exp_counts_fn, + max_graphlet_size) { + ## ------------------------------------------------------------------------ # If a number has been passed as ref_binned_graphlet_counts, treat it as a # constant expected counts value (e.g. if ref_binned_graphlet_counts = 0 # then no centring of counts). 
if (is.numeric(ref_binned_graphlet_counts) && - length(ref_binned_graphlet_counts) == 1) { + length(ref_binned_graphlet_counts) == 1) { exp_graphlet_counts <- netdis_const_expected_counts( graphlet_counts, const = ref_binned_graphlet_counts ) - + ## ------------------------------------------------------------------------ # If reference bins and counts passed, use them to calculate # expected counts } else if (!is.null(ref_ego_density_bins) && - !is.null(ref_binned_graphlet_counts)) { + !is.null(ref_binned_graphlet_counts)) { # Calculate expected graphlet counts (using ref # graph ego network density bins) exp_graphlet_counts <- exp_counts_fn( @@ -761,29 +847,29 @@ netdis_centred_graphlet_counts <- function( ref_binned_graphlet_counts, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # If NULL passed as ref bins and counts, calculate expected counts using # query network itself. This should be GP. } else if (is.null(ref_ego_density_bins) && - is.null(ref_binned_graphlet_counts)) { + is.null(ref_binned_graphlet_counts)) { # Get ego-network densities densities <- ego_network_density(graphlet_counts) - + # bin ref ego-network densities - binned_densities <- binning_fn(densities) - + binned_densities <- binning_fn(densities) + # extract bin breaks and indexes from binning results ego_density_bin_breaks <- binned_densities$breaks ego_density_bin_indexes <- binned_densities$interval_indexes - + # Calculate expected counts in each bin binned_graphlet_counts <- bin_counts_fn( graphlet_counts, ego_density_bin_indexes, max_graphlet_size = max_graphlet_size ) - + # Calculate expected graphlet counts for each ego network exp_graphlet_counts <- exp_counts_fn( graphlet_counts, @@ -791,19 +877,19 @@ netdis_centred_graphlet_counts <- function( binned_graphlet_counts, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # Invalid combination of 
ref_ego_density_bins and ref_binned_graphlet_counts } else { stop("Invalid combination of ref_ego_density_bins and ref_binned_graphlet_counts. Options are: - Both NULL: calculate expected counts using query network. - - Vector of bin edges and matrix of binned counts: Reference graph values - for calculating expected counts. + - Vector of bin edges and matrix of binned counts: Reference graph + values for calculating expected counts. - Constant numeric ref_binned_graphlet_counts: Use as constant expected counts value.") } - + ## ------------------------------------------------------------------------ # Subtract expected counts from actual graphlet counts netdis_subtract_exp_counts( @@ -822,18 +908,18 @@ netdis_centred_graphlet_counts <- function( #' nummber of ego networks (rows). #' @param exp_graphlet_counts Matrix of expected graphlet counts (columns) for a #' nummber of ego networks (rows). -#' @param max_graphlet_size Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported. +#' @param max_graphlet_size Do the subtraction for graphlets up to this size. +#' Currently only size 4 and 5 are supported. #' @export -netdis_subtract_exp_counts <- function( - graphlet_counts, - exp_graphlet_counts, - max_graphlet_size) { - +netdis_subtract_exp_counts <- function(graphlet_counts, + exp_graphlet_counts, + max_graphlet_size) { + # select columns for graphlets up to size max_graphlet_size id <- graphlet_key(max_graphlet_size)$id graphlet_counts <- graphlet_counts[, id] exp_graphlet_counts <- exp_graphlet_counts[, id] - + # Subtract expected counts from actual graphlet counts graphlet_counts - exp_graphlet_counts } @@ -846,34 +932,34 @@ netdis_subtract_exp_counts <- function( #' @param graphlet_counts Matrix of graphlet and node counts (columns) for a #' nummber of ego networks (rows). #' @param density_breaks Density values defining bin edges. 
-#' @param density_binned_reference_counts Reference network graphlet counts for +#' @param density_binned_ref_counts Reference network graphlet counts for #' each density bin. #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' Currently only size 4 and 5 are supported. #' @param scale_fn Optional function to scale calculated expected counts, taking #' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, #' and returning a scale factor that the looked up -#' \code{density_binned_reference_counts} values will be multiplied by. +#' \code{density_binned_ref_counts} values will be multiplied by. #' #' @export -netdis_expected_counts <- function( - graphlet_counts, - density_breaks, - density_binned_reference_counts, - max_graphlet_size, - scale_fn = NULL) { - - +netdis_expected_counts <- function(graphlet_counts, + density_breaks, + density_binned_ref_counts, + max_graphlet_size, + scale_fn = NULL) { + + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. expected_graphlet_counts <- t(apply( graphlet_counts, 1, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts, + density_binned_ref_counts = density_binned_ref_counts, scale_fn = scale_fn )) - + expected_graphlet_counts } @@ -885,29 +971,30 @@ netdis_expected_counts <- function( #' #' @param graphlet_counts Node and graphlet counts for an ego network. #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported. 
+#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' Currently only size 4 and 5 are supported. #' @param density_breaks Density values defining bin edges. -#' @param density_binned_reference_counts Reference network graphlet counts for +#' @param density_binned_ref_counts Reference network graphlet counts for #' each density bin. #' @param scale_fn Optional function to scale calculated expected counts, taking #' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and #' returning a scale factor that the looked up -#' \code{density_binned_reference_counts} values will be multiplied by. +#' \code{density_binned_ref_counts} values will be multiplied by. #' netdis_expected_counts_ego <- function(graphlet_counts, max_graphlet_size, density_breaks, - density_binned_reference_counts, + density_binned_ref_counts, scale_fn = NULL) { - + # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- density_from_counts(graphlet_counts) matched_density_index <- interval_index(query_density, density_breaks) - + matched_reference_counts <- - density_binned_reference_counts[matched_density_index, ] - + density_binned_ref_counts[matched_density_index, ] + if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the # reference count for each graphlet by the number @@ -931,7 +1018,7 @@ netdis_expected_counts_ego <- function(graphlet_counts, #' (default \code{agg_fn = mean}). #' #' @export -mean_density_binned_graphlet_counts <- function(graphlet_counts, +mean_density_binned_graphlet_counts <- function(graphlet_counts, # nolint: object_length_linter. 
density_interval_indexes, agg_fn = mean) { # The ego network graphlet counts are an E x G matrix with rows (E) @@ -939,22 +1026,22 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, # to calculate the mean count for each graphlet / density bin combination, # so we will use tapply to average counts for each graphlet across density # bins, using apply to map this operation over graphlets - mean_density_binned_graphlet_counts <- + mean_density_binned_counts <- apply(graphlet_counts, MARGIN = 2, function(gc) { tapply(gc, INDEX = density_interval_indexes, FUN = agg_fn) }) - + # if only 1 bin (i.e. no binning) will be left with a 1D list. # convert it into a 2D list. - if (is.null(dim(mean_density_binned_graphlet_counts))) { - dim(mean_density_binned_graphlet_counts) <- - c(1, length(mean_density_binned_graphlet_counts)) - - colnames(mean_density_binned_graphlet_counts) <- + if (is.null(dim(mean_density_binned_counts))) { + dim(mean_density_binned_counts) <- + c(1, length(mean_density_binned_counts)) + + colnames(mean_density_binned_counts) <- colnames(graphlet_counts) } - - mean_density_binned_graphlet_counts + + mean_density_binned_counts } #' For case where don't want to use binning, return a single bin which covers @@ -996,13 +1083,13 @@ density_binned_counts <- function(graphlet_counts, # by dividing by total number of k-tuples in # ego-network (where k is graphlet size) graphlet_counts <- scale_fn(graphlet_counts, - max_graphlet_size = max_graphlet_size + max_graphlet_size = max_graphlet_size ) } - + mean_density_binned_graphlet_counts(graphlet_counts, - density_interval_indexes, - agg_fn = agg_fn + density_interval_indexes, + agg_fn = agg_fn ) } @@ -1014,53 +1101,52 @@ density_binned_counts <- function(graphlet_counts, #' @param graphlet_counts Graphlet counts for a number of ego_networks. #' @param density_interval_indexes Density bin indexes for each ego network in #' \code{graphlet_counts}. 
-#' @param max_graphlet_size Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -#' included in graphlet_counts. +#' @param max_graphlet_size Determines the maximum size of graphlets. Currently +#' only size 4 and 5 are supported. included in graphlet_counts. exp_counts_bin_gp <- function(bin_idx, graphlet_counts, density_interval_indexes, max_graphlet_size) { # extract ego networks belonging to input density bin index counts <- graphlet_counts[density_interval_indexes == bin_idx, ] - + # mean graphlet counts in this density bin means <- colMeans(counts) - + # subtract mean graphlet counts from actual graphlet counts mean_sub_counts <- sweep(counts, 2, means) - + # variance in graphlet counts across ego networks in this density bin - Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) - + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) # nolint: object_name_linter. + # Dealing with zero variance HERE ind_zerovar <- (Vd_sq < .00000001) - if(sum(ind_zerovar) > 0) Vd_sq[ind_zerovar] <- 0.1 - + if (sum(ind_zerovar) > 0) Vd_sq[ind_zerovar] <- 0.1 + # GP theta parameter for each graphlet id in this density bin theta_d <- 2 * means / (Vd_sq + means) - + exp_counts_dk <- vector() for (k in 2:max_graphlet_size) { graphlet_idx <- graphlet_ids_for_size(k) - + # GP lambda parameter for graphlet size k in this density bin lambda_dk <- mean(2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]), - na.rm = TRUE + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE ) - + # Expected counts for graphlet size k in this density bin exp_counts_dk <- append( exp_counts_dk, lambda_dk / theta_d[graphlet_idx] ) - } - + # Dealing with divisions by zero. 
ind <- is.na(exp_counts_dk) ind <- ind | is.infinite(exp_counts_dk) - if(sum(ind) > 0) exp_counts_dk[ind & ind_zerovar[-1]] <- 0 - + if (sum(ind) > 0) exp_counts_dk[ind & ind_zerovar[-1]] <- 0 + exp_counts_dk } @@ -1070,13 +1156,13 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, #' @param graphlet_counts Graphlet counts for a number of ego_networks. #' @param density_interval_indexes Density bin index for #' each ego network. -#' @param max_graphlet_size Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -#' included in graphlet_counts. +#' @param max_graphlet_size Determines the maximum size of graphlets. Currently +#' only size 4 and 5 are supported. included in graphlet_counts. #' @export density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes, max_graphlet_size) { - + # calculate expected counts for each density bin index nbins <- length(unique(density_interval_indexes)) expected_counts_bin <- t(sapply( @@ -1086,10 +1172,10 @@ density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes = density_interval_indexes, max_graphlet_size = max_graphlet_size )) - + # remove NAs caused by bins with zero counts for a graphlet expected_counts_bin[is.nan(expected_counts_bin)] <- 0 - + expected_counts_bin } @@ -1137,10 +1223,10 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { count_graphlet_tuples_ego <- function(graphlet_counts, max_graphlet_size) { graphlet_tuple_counts <- t(apply(graphlet_counts, 1, - count_graphlet_tuples, - max_graphlet_size = max_graphlet_size + count_graphlet_tuples, + max_graphlet_size = max_graphlet_size )) - + graphlet_tuple_counts } @@ -1177,12 +1263,12 @@ scale_graphlet_counts_ego <- function(graphlet_counts, graphlet_counts, max_graphlet_size = max_graphlet_size ) - + scaled_graphlet_counts <- scale_graphlet_count( graphlet_counts, ego_graphlet_tuples ) - + return(scaled_graphlet_counts) } @@ -1196,13 +1282,13 @@ 
scale_graphlet_counts_ego <- function(graphlet_counts, #' @export count_graphlet_tuples <- function(graph_graphlet_counts, max_graphlet_size) { # extract node counts from graph_graphlet_counts - N <- graph_graphlet_counts["N"] - + N <- graph_graphlet_counts["N"] # nolint: object_name_linter. + graphlet_key <- graphlet_key(max_graphlet_size) graphlet_node_counts <- graphlet_key$node_count - + graphlet_tuple_counts <- choose(N, graphlet_node_counts) - + graphlet_tuple_counts <- stats::setNames( graphlet_tuple_counts, graphlet_key$id diff --git a/R/measures_net_emd.R b/R/measures_netemd.R similarity index 50% rename from R/measures_net_emd.R rename to R/measures_netemd.R index 8246dfeb..acdff938 100755 --- a/R/measures_net_emd.R +++ b/R/measures_netemd.R @@ -1,23 +1,37 @@ #' NetEMD Network Earth Mover's Distance between a pair of networks. #' -#' Calculates the network Earth Mover's Distance (EMD) between -#' two sets of network features. This is done by individually normalising the distribution -#' of each feature so that they have unit mass and unit variance. Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. +#' Calculates the network Earth Mover's Distance (EMD) between +#' two sets of network features. This is done by individually normalising the +#' distribution of each feature so that they have unit mass and unit variance. +#' Then the minimun EMD between the same pair of features (one for each +#' corresponding graph) is calculated by considering all possible translations +#' of the feature distributions. Finally the average over all features is +#' reported. #' This is calculated as follows: #' 1. Normalise each feature histogram to have unit mass and unit variance. -#' 2. 
For each feature, find the minimum EMD between each pair of histograms considering all possible histogram translations. +#' 2. For each feature, find the minimum EMD between each pair of histograms +#' considering all possible histogram translations. #' 3. Take the average minimum EMD across all features. -#' @param graph_1 A network/graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is provided. -#' @param graph_2 A network/graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is provided. -#' @param dhists_1 Either, a \code{dhist} discrete histogram object, or list of such objects, or a matrix of network features (each column representing a feature). \code{dhists_1} can be set to \code{NULL} (default) if \code{graph_1} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}. +#' @param graph_1 A network/graph object from the \code{igraph} package. +#' \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is +#' provided. +#' @param graph_2 A network/graph object from the \code{igraph} package. +#' \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is +#' provided. +#' @param dhists_1 Either, a \code{dhist} discrete histogram object, or list of +#' such objects, or a matrix of network features (each column representing a +#' feature). \code{dhists_1} can be set to \code{NULL} (default) if +#' \code{graph_1} is provided. A \code{dhist} object can be obtained from +#' \code{graph_features_to_histograms}. #' @param dhists_2 Same as \code{dhists_1}. -#' @param method The method to be used to find the minimum EMD across all potential -#' offsets for each pair of histograms. Default is "optimise" to use +#' @param method The method to be used to find the minimum EMD across all +#' potential offsets for each pair of histograms. 
Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the -#' histograms at all offsets that are candidates for the minimal EMD at the cost of computational time. +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the +#' histograms at all offsets that are candidates for the minimal EMD at the cost +#' of computational time. #' @param return_details Logical indicating whether to return the individual #' minimal EMDs and associated offsets for all pairs of histograms. #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to @@ -26,55 +40,103 @@ #' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice). #' @param feature_type Type of graphlet-based feature to count: "graphlet" -#' counts the number of graphlets each node participates in; "orbit" (default) calculates -#' the number of graphlet orbits each node participates in. +#' counts the number of graphlets each node participates in; "orbit" (default) +#' calculates the number of graphlet orbits each node participates in. #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Possible values are 4, and 5 (default). #' @param ego_neighbourhood_size The number of steps from the source node to -#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. -#' @return NetEMD measure for the two sets of discrete histograms (or graphs). 
If -#' (\code{return_details = FALSE}) then a list with the following named elements is returned -#' \code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: -#' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated -#' offsets giving the minimal EMD for each pair of histograms +#' include nodes for each ego-network. NetEmd was proposed for individual nodes +#' alone, hence the default value is 0. +#' @return NetEMD measure for the two sets of discrete histograms (or graphs). +#' If (\code{return_details = FALSE}) then a list with the following named +#' elements is returned \code{netemd}: the NetEMD for the set of histogram +#' pairs (or graphs), \code{min_emds}: the minimal EMD for each pair of +#' histograms, \code{min_offsets}: the associated offsets giving the minimal EMD +#' for each pair of histograms #' @examples -#' require(igraph) -#' graph_1 <- graph.lattice(c(8,8)) -#' graph_2 <- graph.lattice(c(44,44)) -#' netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) -#' -#' #Providing a matrix of network features -#' props_a= count_orbits_per_node(graph = graph_1,max_graphlet_size = 5) -#' props_b= count_orbits_per_node(graph = graph_2,max_graphlet_size = 5) -#' -#' netemd_one_to_one(dhists_1=props_a, dhists_2=props_b,smoothing_window_width = 1) -#' -#' #Providing the network features as lists of dhist objects -#' dhists_1<- graph_features_to_histograms(props_a) -#' dhists_2<- graph_features_to_histograms(props_b) -#' -#' netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) -#' -#' -#' # A variation of NetEmd: Using the Laplacian spectrum -#' #Laplacian -#' Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) -#' Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) -#' -#' #Normalized Laplacian -#' NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) -#' NLapg_2 <- 
igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) -#' -#' #Spectra (This may take a couple of minutes). -#' props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -#' props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) -#' -#' netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. -#' +#' require(igraph) +#' graph_1 <- graph.lattice(c(8, 8)) +#' graph_2 <- graph.lattice(c(44, 44)) +#' netemd_one_to_one( +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' feature_type = "orbit", +#' max_graphlet_size = 5 +#' ) +#' +#' # Providing a matrix of network features +#' props_a <- count_orbits_per_node(graph = graph_1, max_graphlet_size = 5) +#' props_b <- count_orbits_per_node(graph = graph_2, max_graphlet_size = 5) +#' +#' netemd_one_to_one( +#' dhists_1 = props_a, +#' dhists_2 = props_b, +#' smoothing_window_width = 1 +#' ) +#' +#' # Providing the network features as lists of dhist objects +#' dhists_1 <- graph_features_to_histograms(props_a) +#' dhists_2 <- graph_features_to_histograms(props_b) +#' +#' netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2) +#' +#' +#' # A variation of NetEmd: Using the Laplacian spectrum +#' # Laplacian +#' Lapg_1 <- igraph::laplacian_matrix( +#' graph = graph_1, +#' normalized = FALSE, +#' sparse = FALSE +#' ) +#' Lapg_2 <- igraph::laplacian_matrix( +#' graph = graph_2, +#' normalized = FALSE, +#' sparse = FALSE +#' ) +#' +#' # Normalized Laplacian +#' NLapg_1 <- igraph::laplacian_matrix( +#' graph = graph_1, +#' normalized = TRUE, +#' sparse = FALSE +#' ) +#' NLapg_2 <- igraph::laplacian_matrix( +#' graph = graph_2, +#' normalized = TRUE, +#' sparse = FALSE +#' ) +#' +#' # Spectra (This may take a couple of minutes). 
+#' props_1 <- cbind( +#' L.Spectra = eigen(Lapg_1)$values, +#' NL.Spectra = eigen(NLapg_1)$values +#' ) +#' props_2 <- cbind( +#' L.Spectra = eigen(Lapg_2)$values, +#' NL.Spectra = eigen(NLapg_2)$values +#' ) +#' +#' # Use of smoothing window 1 is given for discrete integer distributions. If +#' # the network features are considered continuous variables +#' # smoothing_window_width equal to zero is recommended. +#' netemd_one_to_one( +#' dhists_1 = props_1, +#' dhists_2 = props_2, +#' smoothing_window_width = 0 +#' ) +#' #' @export -netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2=NULL, method = "optimise", - return_details = FALSE, smoothing_window_width = 0,feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { +netemd_one_to_one <- function(graph_1 = NULL, + graph_2 = NULL, + dhists_1 = NULL, + dhists_2 = NULL, + method = "optimise", + return_details = FALSE, + smoothing_window_width = 0, + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0) { ## ------------------------------------------------------------------------ # Check arguments 1 if (!igraph::is.igraph(graph_1) & is.null(dhists_1)) { @@ -85,45 +147,55 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= } ## ------------------------------------------------------------------------ # Check arguments 2 - # If dhists_1 is a matrix of network features then transform them to dhist objects. - if(is.matrix(dhists_1)){ + # If dhists_1 is a matrix of network features then transform them to dhist + # objects. 
+ if (is.matrix(dhists_1)) { dhists_1 <- graph_features_to_histograms(dhists_1) } - if(is.matrix(dhists_2)){ + if (is.matrix(dhists_2)) { dhists_2 <- graph_features_to_histograms(dhists_2) } ## ------------------------------------------------------------------------ # Check arguments 3 - #If input is graph then get graphlet counts - if(igraph::is.igraph(graph_1)){ - if(!is.null(dhists_1)){warning("dhists_1 will be calculated from graph_1.")} - dhists_1 <- gdd(graph = graph_1, feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size + # If input is graph then get graphlet counts + if (igraph::is.igraph(graph_1)) { + if (!is.null(dhists_1)) { + warning("dhists_1 will be calculated from graph_1.") + } + dhists_1 <- gdd( + graph = graph_1, feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size ) } - if(igraph::is.igraph(graph_2)){ - if(!is.null(dhists_2)){warning("dhists_2 will be calculated from graph_2.")} - dhists_2 <- gdd(graph = graph_2, feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size + if (igraph::is.igraph(graph_2)) { + if (!is.null(dhists_2)) { + warning("dhists_2 will be calculated from graph_2.") + } + dhists_2 <- gdd( + graph = graph_2, feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size ) } - - rm(graph_1,graph_2) + + rm(graph_1, graph_2) ## ------------------------------------------------------------------------ # Require either a pair of "dhist" discrete histograms or two lists of "dhist" # discrete histograms - pair_of_dhist_lists <- all(purrr::map_lgl(dhists_1, is_dhist)) && all(purrr::map_lgl(dhists_2, is_dhist)) - + pair_of_dhist_lists <- all( + purrr::map_lgl(dhists_1, is_dhist) + ) && + all(purrr::map_lgl(dhists_2, is_dhist)) + # If input is two lists of "dhist" discrete histograms, 
determine the minimum # EMD and associated offset for pairs of histograms taken from the same # position in each list if (pair_of_dhist_lists) { details <- purrr::map2(dhists_1, dhists_2, function(dhist1, dhist2) { netemd_single_pair(dhist1, dhist2, - method = method, - smoothing_window_width = smoothing_window_width + method = method, + smoothing_window_width = smoothing_window_width ) }) # Collect the minimum EMDs and associated offsets for all histogram pairs @@ -133,24 +205,29 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= # The NetEMD is the arithmetic mean of the minimum EMDs for each pair of # histograms arithmetic_mean <- sum(min_emds) / length(min_emds) - net_emd <- arithmetic_mean + netemd <- arithmetic_mean # Return just the NetEMD or a list including the NetEMD plus the details of # the minumum EMD and associated offsets for the individual histograms # Note that the offsets represent shifts after the histograms have been # scaled to unit variance if (return_details) { - return(list(net_emd = net_emd, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std)) + return(list( + netemd = netemd, + min_emds = min_emds, + min_offsets = min_offsets, + min_offsets_std = min_offsets_std + )) } else { return(arithmetic_mean) } - } - else { + } else { # Wrap each member of a single pair of histograms is a list and recursively - # call this net_emd function. This ensures they are treated the same. - return(netemd_one_to_one(dhists_1 = list(dhists_1), dhists_2 = list(dhists_2), - method = method, - return_details = return_details, - smoothing_window_width = smoothing_window_width + # call this netemd function. This ensures they are treated the same. 
+ return(netemd_one_to_one( + dhists_1 = list(dhists_1), dhists_2 = list(dhists_2), + method = method, + return_details = return_details, + smoothing_window_width = smoothing_window_width )) } } @@ -158,14 +235,20 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= #' NetEMDs between all graph pairs using provided Graphlet-based Degree #' Distributions -#' @param graphs A list of network/graph objects from the \code{igraph} package. \code{graphs} can be set to \code{NULL} (default) if \code{dhists} is provided. -#' @param dhists A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}. +#' @param graphs A list of network/graph objects from the \code{igraph} package. +#' \code{graphs} can be set to \code{NULL} (default) if \code{dhists} is +#' provided. +#' @param dhists A list whose elements contain either: A list of \code{dhist} +#' discrete histogram objects for each graph, or a list a matrix of network +#' features (each column representing a feature). \code{dhists} can be set to +#' \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object +#' can be obtained from \code{graph_features_to_histograms}. #' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' minimum if multiple local minima EMDs exist. 
You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @param return_details Logical indicating whether to return the individual #' minimal EMDs and associated offsets for all pairs of histograms @@ -177,28 +260,43 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. #' @param feature_type Type of graphlet-based feature to count: "graphlet" -#' counts the number of graphlets each node participates in; "orbit" (default) calculates -#' the number of graphlet orbits each node participates in. +#' counts the number of graphlets each node participates in; "orbit" (default) +#' calculates the number of graphlet orbits each node participates in. #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Possible values are 4, and 5 (default). #' @param ego_neighbourhood_size The number of steps from the source node to -#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. +#' include nodes for each ego-network. NetEmd was proposed for individual nodes +#' alone, hence the default value is 0. #' @return NetEMD measures between all pairs of graphs for which features #' were provided. Format of returned data depends on the \code{return_details} #' parameter. If set to FALSE, a list is returned with the following named -#' elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, +#' elements:\code{netemd}: a vector of NetEMDs for each pair of graphs, #' \code{comp_spec}: a comparison specification table containing the graph names #' and indices within the input GDD list for each pair of graphs compared. 
-#' If \code{return_details} is set to FALSE, the list also contains the following -#' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD -#' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving -#' the minimal EMD for each GDD +#' If \code{return_details} is set to TRUE, the list also contains the +#' following matrices for each graph pair: \code{min_emds}: the minimal EMD for +#' each GDD used to compute the NetEMD, \code{min_offsets}: the associated +#' offsets giving the minimal EMD for each GDD #' @export -netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smoothing_window_width = 0, - return_details = FALSE, mc.cores = getOption("mc.cores", 2L),feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { - if(max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores," cores. Depending on the density and size of the graphs, this may lead to a large compsumption of RAM.")) - +netemd_many_to_many <- function(graphs = NULL, + dhists = NULL, + method = "optimise", + smoothing_window_width = 0, + return_details = FALSE, + mc.cores = getOption("mc.cores", 2L), # nolint: object_name_linter. + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0) { + if (max_graphlet_size > 4 & mc.cores > 1) { + print(paste( + "This function will compute orbits of graphlets up to size 5 using ", + mc.cores, + " cores. Depending on the density and size of the graphs, this may lead ", + "to a large consumption of RAM." + )) + } + + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. 
# For now, we just revert to single threaded operation on Windows @@ -206,97 +304,145 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo if (.Platform$OS.type != "unix") { # Force cores to 1 if system is not unix-like as it will not support # forking - mc.cores <- 1 + mc.cores <- 1 # nolint: object_name_linter. } ## ------------------------------------------------------------------------ # Check arguments 1 which_imput_type <- NULL - if(!is.null(graphs) & is.null(dhists)){ - if ( !all(( unlist(sapply(X = graphs, FUN = igraph::is.igraph)) ) ) ) { - stop("Graphs need to be igraph graph objects, or a list of dhists network features should be supplied.") + if (!is.null(graphs) & is.null(dhists)) { + if (!all((unlist(sapply(X = graphs, FUN = igraph::is.igraph))))) { + stop( + "Graphs need to be igraph graph objects, or a list of dhists network ", + "features should be supplied." + ) } which_imput_type <- "Graphs" } - if (!is.null(dhists) ) { - if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) { + if (!is.null(dhists)) { + if (all((unlist(sapply(X = dhists, FUN = is.matrix))))) { which_imput_type <- "Matrix" - } - if ( all(( unlist(sapply(X = dhists, FUN = - function(l){ all(( unlist(sapply(X = l, FUN = is_dhist)) ) ) } - )) ) ) ) { + } + if (all((unlist(sapply( + X = dhists, FUN = + function(l) { + all((unlist(sapply(X = l, FUN = is_dhist)))) + } + ))))) { which_imput_type <- "dhist" } - if(is.null(which_imput_type)){ - warning("dhists does not conform to a Matrix or dhist class for all elmenents/netwroks in the list.") + if (is.null(which_imput_type)) { + warning( + "dhists does not conform to a Matrix or dhist class for all ", + "elements/networks in the list." + ) } } ## ------------------------------------------------------------------------ # Check arguments 2 - # If dhists is a list of matrices of network features then transform them to dhist objects. 
- if(which_imput_type == "Matrix"){ - dhists <- sapply(X = dhists,FUN = graph_features_to_histograms, simplify = FALSE ) + # If dhists is a list of matrices of network features then transform them to + # dhist objects. + if (which_imput_type == "Matrix") { + dhists <- sapply( + X = dhists, + FUN = graph_features_to_histograms, + simplify = FALSE + ) } ## ------------------------------------------------------------------------ # Check arguments 3 - #If input is graph then get graphlet counts - if(which_imput_type == "Graphs"){ + # If input is graph then get graphlet counts + if (which_imput_type == "Graphs") { dhists <- parallel::mcmapply(gdd, graphs, - MoreArgs = - list( - feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size - ), - SIMPLIFY = FALSE, mc.cores = mc.cores + MoreArgs = + list( + feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ), + SIMPLIFY = FALSE, mc.cores = mc.cores ) } rm(graphs) ## ------------------------------------------------------------------------ # Check arguments 4 - #cross_comparison_spec was coded to require names! - if(is.null(names(dhists))){ - names(dhists) <- paste("Net",1:length(dhists),sep = "") + # cross_comparison_spec was coded to require names! 
+ if (is.null(names(dhists))) { + names(dhists) <- paste("Net", 1:length(dhists), sep = "") } ## ------------------------------------------------------------------------ comp_spec <- cross_comparison_spec(dhists) num_features <- length(dhists[[1]]) - out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { - netemd_one_to_one(dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], - method = method, return_details = return_details, - smoothing_window_width = smoothing_window_width + out <- purrr::simplify( + parallel::mcmapply(function(index_a, index_b) { + netemd_one_to_one( + dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], + method = method, return_details = return_details, + smoothing_window_width = smoothing_window_width + ) + }, + comp_spec$index_a, comp_spec$index_b, + SIMPLIFY = FALSE, mc.cores = mc.cores ) - }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) + ) if (return_details) { - net_emds <- purrr::simplify(purrr::map(out, ~ .$net_emd)) - min_emds <- matrix(purrr::simplify(purrr::map(out, ~ .$min_emds)), ncol = num_features, byrow = TRUE) - colnames(min_emds) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinEMD_O", . - 1, sep = ""))) - min_offsets <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets)), ncol = num_features, byrow = TRUE) - colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsets_O", . - 1, sep = ""))) - min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), ncol = num_features, byrow = TRUE) - colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . 
- 1, sep = ""))) - ret <- list(netemds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std) + netemds <- purrr::simplify(purrr::map(out, ~ .$netemd)) + min_emds <- matrix( + purrr::simplify(purrr::map(out, ~ .$min_emds)), + ncol = num_features, + byrow = TRUE + ) + colnames(min_emds) <- purrr::simplify( + purrr::map(1:num_features, ~ paste("MinEMD_O", . - 1, sep = "")) + ) + min_offsets <- matrix( + purrr::simplify(purrr::map(out, ~ .$min_offsets)), + ncol = num_features, + byrow = TRUE + ) + colnames(min_offsets) <- purrr::simplify( + purrr::map(1:num_features, ~ paste("MinOffsets_O", . - 1, sep = "")) + ) + min_offsets_std <- matrix( + purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), + ncol = num_features, + byrow = TRUE + ) + colnames(min_offsets_std) <- purrr::simplify( + purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . - 1, sep = "")) + ) + ret <- list( + netemds = netemds, + comp_spec = comp_spec, + min_emds = min_emds, + min_offsets = min_offsets, + min_offsets_std = min_offsets_std + ) } else { - net_emds <- out - ret <- list(netemds = net_emds, comp_spec = comp_spec) + netemds <- out + ret <- list(netemds = netemds, comp_spec = comp_spec) } return(ret) } -#' Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms +#' Internal function to compute the minimum Earth Mover's Distance between +#' standarized and translated histograms #' #' Calculates the minimum Earth Mover's Distance (EMD) between two -#' discrete histograms after normalising each histogram to unit mass and variance. +#' discrete histograms after normalising each histogram to unit mass and +#' variance. #' This is calculated as follows: #' 1. Normalise each histogram to have unit mass and unit variance #' 2. 
Find the minimum EMD between the histograms -#' @param dhists_1 A \code{dhist} discrete histogram object or a list of such objects -#' @param dhists_2 A \code{dhist} discrete histogram object or a list of such objects +#' @param dhists_1 A \code{dhist} discrete histogram object or a list of such +#' objects +#' @param dhists_2 A \code{dhist} discrete histogram object or a list of such +#' objects #' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to #' "smear" point masses across a finite width in the real domain. Default is 0, @@ -304,18 +450,25 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo #' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice) #' @return A list with the following named elements -#' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated -#' offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. 
-#' @examples -#' require(igraph) -#' goldstd_1 <- graph.lattice(c(8,8)) -#' goldstd_2 <- graph.lattice(c(44,44)) -#' props_1 <- count_orbits_per_node(graph = goldstd_1,max_graphlet_size = 5) -#' props_2 <- count_orbits_per_node(graph = goldstd_2,max_graphlet_size = 5) -#' dhists_1<- graph_features_to_histograms(props_1) -#' dhists_2<- graph_features_to_histograms(props_2) -#' # Obtain the minimum NetEMD_edges between the histograms -#' netemd_single_pair(dhists_1[[1]],dhists_2[[1]],method = "optimise",smoothing_window_width = 0) +#' \code{netemd}: the NetEMD for the set of histogram pairs, +#' \code{min_offsets}: the associated offsets giving the minimal EMD for each +#' pair of histograms and \code{min_offset_std}: Offset used in the standardised +#' histograms. +#' @examples +#' require(igraph) +#' goldstd_1 <- graph.lattice(c(8, 8)) +#' goldstd_2 <- graph.lattice(c(44, 44)) +#' props_1 <- count_orbits_per_node(graph = goldstd_1, max_graphlet_size = 5) +#' props_2 <- count_orbits_per_node(graph = goldstd_2, max_graphlet_size = 5) +#' dhists_1 <- graph_features_to_histograms(props_1) +#' dhists_2 <- graph_features_to_histograms(props_2) +#' # Obtain the minimum NetEMD_edges between the histograms +#' netemd_single_pair( +#' dhists_1[[1]], +#' dhists_2[[1]], +#' method = "optimise", +#' smoothing_window_width = 0 +#' ) #' @export netemd_single_pair <- function(dhist1, dhist2, method = "optimise", smoothing_window_width = 0) { @@ -335,14 +488,11 @@ netemd_single_pair <- function(dhist1, dhist2, method = "optimise", dhist1 <- as_smoothed_dhist(dhist1, smoothing_window_width) dhist2 <- as_smoothed_dhist(dhist2, smoothing_window_width) } - + # Store means and variances to calculate offset later mean1 <- dhist_mean_location(dhist1) mean2 <- dhist_mean_location(dhist2) - - var1 <- dhist_variance(dhist1) - var2 <- dhist_variance(dhist2) - + # Mean centre histograms. 
This helps with numerical stability as, after # variance normalisation, the differences between locations are often small. # We want to avoid calculating small differences between large numbers as @@ -351,11 +501,11 @@ netemd_single_pair <- function(dhist1, dhist2, method = "optimise", # clustered around zero, rather than some potentially large mean location. dhist1 <- mean_centre_dhist(dhist1) dhist2 <- mean_centre_dhist(dhist2) - + # Normalise histogram to unit mass and unit variance dhist1_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist1)) dhist2_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist2)) - + # Calculate minimal EMD result <- min_emd(dhist1_norm, dhist2_norm, method = method) # As we mean-centred the histograms prior to passing to min_emd(), the offset diff --git a/R/net_emd_speed_benchmark.R b/R/net_emd_speed_benchmark.R index 62c97611..51b544a2 100644 --- a/R/net_emd_speed_benchmark.R +++ b/R/net_emd_speed_benchmark.R @@ -1,17 +1,18 @@ -netEMDSpeedTest <- function() { +netemd_speed_test <- function() { ## load the data source_dir <- system.file(file.path("extdata", "random"), package = "netdist") print(source_dir) edge_format <- "ncol" file_pattern <- "" - # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - # edge_format = "ncol" - # file_pattern = ".txt" - graphs <- read_simple_graphs(source_dir = source_dir, format = edge_format, pattern = file_pattern) + graphs <- read_simple_graphs( + source_dir = source_dir, + format = edge_format, + pattern = file_pattern + ) n1 <- names(graphs) lab1 <- c() - gddBuildTime <- c() - netEMDtime <- c() + gdd_build_time <- c() + netemd_time <- c() for (i in 1:length(graphs)) { for (j in 1:(i)) @@ -20,15 +21,58 @@ netEMDSpeedTest <- function() { g2 <- graphs[[j]] lab1 <- append(lab1, paste(n1[i], n1[j], sep = ",")) print(paste(n1[i], n1[j], sep = ",")) - fulltimeStart <- Sys.time() + fulltime_start <- Sys.time() gdd1 <- gdd(g1) gdd2 <- gdd(g2) - netEMDStart <- 
Sys.time() - net_emd(gdd1, gdd2) - endTime <- Sys.time() - gddBuildTime <- append(gddBuildTime, as.double(netEMDStart - fulltimeStart)) - netEMDtime <- append(netEMDtime, as.double(endTime - netEMDStart)) + netemd_start <- Sys.time() + netemd_single_pair(gdd1, gdd2) + end_time <- Sys.time() + gdd_build_time <- append( + gdd_build_time, + as.double(netemd_start - fulltime_start) + ) + netemd_time <- append(netemd_time, as.double(end_time - netemd_start)) } } - list(gddBuildTime = gddBuildTime, netEMDtime = netEMDtime) + list(gdd_build_time = gdd_build_time, netemd_time = netemd_time) +} + +#' @export +netemd_speed_test_smooth <- function() { + ## load the data + source_dir <- system.file(file.path("extdata", "random"), package = "netdist") + print(source_dir) + edge_format <- "ncol" + file_pattern <- "" + graphs <- read_simple_graphs( + source_dir = source_dir, + format = edge_format, + pattern = file_pattern + ) + n1 <- names(graphs) + lab1 <- c() + gdd_build_time <- c() + netemd_time <- c() + for (i in 1:length(graphs)) + { + for (j in 1:(i)) + { + g1 <- graphs[[i]] + g2 <- graphs[[j]] + lab1 <- append(lab1, paste(n1[i], n1[j], sep = ",")) + print(paste(n1[i], n1[j], sep = ",")) + fulltime_start <- Sys.time() + gdd1 <- gdd(g1) + gdd2 <- gdd(g2) + netemd_start <- Sys.time() + netemd_single_pair(gdd1, gdd2, smoothing_window_width = 1) + end_time <- Sys.time() + gdd_build_time <- append( + gdd_build_time, + as.double(netemd_start - fulltime_start) + ) + netemd_time <- append(netemd_time, as.double(end_time - netemd_start)) + } + } + list(gdd_build_time = gdd_build_time, netemd_time = netemd_time) } diff --git a/R/orca_interface.R b/R/orca_interface.R index 130595a9..82e0555c 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -67,12 +67,19 @@ indexed_edges_to_graph <- function(indexed_edges) { #' previous alterations have been made #' @return A named list of simplified igraph graph object, with the name of each #' graph set to the name of the file it was read 
from. -#' @examples -#' # Set source directory for Virus protein-protein interaction edge files stored in the netdist package. -#' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -#' print(source_dir) +#' @examples +#' # Set source directory for Virus protein-protein interaction edge files +#' # stored in the netdist package. +#' source_dir <- system.file( +#' file.path("extdata", "VRPINS"), +#' package = "netdist" +#' ) +#' print(source_dir) #' # Load query graphs as igraph objects -#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +#' graph_1 <- read_simple_graph( +#' file.path(source_dir, "EBV.txt"), +#' format = "ncol" +#' ) #' graph_1 #' @export read_simple_graphs <- function(source_dir, @@ -208,24 +215,22 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, #' #' Converts a matrix of node level features (e.g. for example counts #' of multiple graphlets or orbits at each node) to -#' a set of histogram like objects (observed frequency distribution of each feature/column) -#' @param features_matrix A matrix whose rows represent nodes and whose columns represent different node level features. This means that entry ij provides the value of feature j for node i. -#' @return Feature histograms: List of "discrete histograms" for each -#' feature +#' a set of histogram like objects (observed frequency distribution of each +#' feature/column) +#' @param features_matrix A matrix whose rows represent nodes and whose columns +#' represent different node level features. This means that entry ij provides +#' the value of feature j for node i. 
+#' @return Feature histograms: List of "discrete histograms" for each feature #' @export graph_features_to_histograms <- function(features_matrix) { apply(features_matrix, 2, dhist_from_obs) } - -graph_features_to_histogramsSLOW <- function(features_matrix) { - apply(features_matrix, 2, dhist_from_obsSLOW) -} - #' Graphlet-based degree distributions (GDDs) #' -#' Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object -#' using the ORCA fast graphlet orbit counting package. +#' Short-cut function to create graphlet-based degree distributions from +#' \code{igraph} graph object using the ORCA fast graphlet orbit counting +#' package. #' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param feature_type Type of graphlet-based feature to count: "graphlet" #' counts the number of graphlets each node participates in; "orbit" calculates @@ -233,7 +238,8 @@ graph_features_to_histogramsSLOW <- function(features_matrix) { #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Currently only size 4 and 5 are supported. -#' @param ego_neighbourhood_size The number of steps from the source node used to select the +#' @param ego_neighbourhood_size The number of steps from the source node used +#' to select the #' neighboring nodes to be included in the source node ego-network. #' @return List of graphlet-based degree distributions, with each distribution #' represented as a \code{dhist} discrete histogram object. 
@@ -256,8 +262,7 @@ gdd <- function(graph, feature_type = "orbit", max_graphlet_size = 4, out <- count_graphlets_per_node(graph, max_graphlet_size = max_graphlet_size ) - } - else { + } else { stop("gdd: unrecognised feature_type") } graph_features_to_histograms(out) @@ -342,7 +347,7 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { total_counts <- total_counts / nodes_per_graphlet # add overall graph node count to total_counts - N <- igraph::vcount(graph) + N <- igraph::vcount(graph) # nolint: object_name_linter. total_counts <- c(N = N, total_counts) total_counts } @@ -354,8 +359,9 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Currently only size 4 (default) and 5 are supported. -#' @param neighbourhood_size The number of steps from the source node used to select the -#' neighboring nodes to be included in the source node ego-network. (Default 2). +#' @param neighbourhood_size The number of steps from the source node used to +#' select the neighboring nodes to be included in the source node ego-network. +#' (Default 2). #' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} #' nodes are returned. (Default 3). #' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} @@ -518,7 +524,8 @@ orbit_to_graphlet_counts <- function(orbit_counts) { #' Graphlet key #' #' Metdata about graphlet groups. -#' @param max_graphlet_size Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported. +#' @param max_graphlet_size Maximum number of nodes graphlets can contain. +#' Currently only size 2 to 5 are supported. 
#' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain @@ -554,7 +561,8 @@ graphlet_key <- function(max_graphlet_size) { #' Orbit key #' #' Metdata about orbit groups. -#' @param max_graphlet_size Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported. +#' @param max_graphlet_size Maximum number of nodes graphlets can contain. +#' Currently only size 2 to 5 are supported. #' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain @@ -612,11 +620,11 @@ graphlet_ids_for_size <- function(graphlet_size) { #' @param feature_type Type of graphlet-based degree distributions. Can be #' \code{graphlet} to count graphlets or \code{orbit} to count orbits. #' @return A named list where each element contains a set of GDDs for a single -#' @param max_graphlet_size Maximum size of graphlets to use when generating GDD. -#' Currently only size 4 and 5 are supported. -#' @param ego_neighbourhood_size The number of steps from the source node used to select the -#' neighboring nodes to be included in the source node ego-network. If set to 0, ego-networks will not be -#' used. +#' @param max_graphlet_size Maximum size of graphlets to use when generating +#' GDD. Currently only size 4 and 5 are supported. +#' @param ego_neighbourhood_size The number of steps from the source node used +#' to select the neighboring nodes to be included in the source node +#' ego-network. If set to 0, ego-networks will not be used. #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. 
#' @return A named list where each element contains a set of GDDs for a single @@ -629,7 +637,7 @@ gdd_for_all_graphs <- function(source_dir, feature_type = "orbit", max_graphlet_size = 4, ego_neighbourhood_size = 0, - mc.cores = getOption("mc.cores", 2L)) { + mc.cores = getOption("mc.cores", 2L)) { # nolint: object_name_linter. # Create function to read graph from file and generate GDD graphs <- read_simple_graphs( source_dir = source_dir, format = format, pattern = pattern @@ -642,7 +650,7 @@ gdd_for_all_graphs <- function(source_dir, if (.Platform$OS.type != "unix") { # Force cores to 1 if system is not unix-like as it will not support # forking - mc.cores <- 1 + mc.cores <- 1 # nolint: object_name_linter. } parallel::mcmapply(gdd, graphs, MoreArgs = diff --git a/README.md b/README.md index 5b26bd35..819b7b17 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Network Comparison -An R package implementing the Netdis and NetEMD alignment-free network comparison measures. +An R package implementing the Netdis and NetEMD alignment-free network comparison measures. + ### :warning: BETA: Package under construction (pre-release) :warning: Until this package hits release 1.0 anything can change with no notice. @@ -8,6 +9,7 @@ Until this package hits release 1.0 anything can change with no notice. 
[![GitHub release](https://img.shields.io/github/release/alan-turing-institute/network-comparison.svg)](https://github.com/alan-turing-institute/network-comparison/releases/latest) [![Build](https://github.com/alan-turing-institute/network-comparison/actions/workflows/build.yml/badge.svg)](https://github.com/alan-turing-institute/network-comparison/actions/workflows/build.yml) +[![Quality checks](https://github.com/alan-turing-institute/network-comparison/actions/workflows/quality.yaml/badge.svg)](https://github.com/alan-turing-institute/network-comparison/actions/workflows/quality.yaml) [![Codecov](https://img.shields.io/codecov/c/github/alan-turing-institute/network-comparison/master.svg)](https://codecov.io/gh/alan-turing-institute/network-comparison?branch=master) [![license](https://img.shields.io/github/license/alan-turing-institute/network-comparison.svg)](https://github.com/alan-turing-institute/network-comparison/edit/master/LICENSE) [![Github All Releases](https://img.shields.io/github/downloads/alan-turing-institute/network-comparison/total.svg)](https://github.com/alan-turing-institute/network-comparison/releases/latest) @@ -55,7 +57,7 @@ using `browseVignettes(package = "netdist")`. You can list the functions available in the package with `library(help = "netdist")` and get more detailed help on individual functions using `?function_name` (e.g. -`?net_emd`). In RStudio, typing `?netdist::` should also provide a drop down list +`?netemd`). In RStudio, typing `?netdist::` should also provide a drop down list of functions you can select to load the more detailed help. 
## References diff --git a/man/area_between_offset_ecmfs.Rd b/man/area_between_offset_ecmfs.Rd deleted file mode 100644 index 9dc8149d..00000000 --- a/man/area_between_offset_ecmfs.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dhist.R -\name{area_between_offset_ecmfs} -\alias{area_between_offset_ecmfs} -\title{Area between two offset Empirical Cumulative Mass Functions (ECMFs)} -\usage{ -area_between_offset_ecmfs(ecmf1, ecmf2, offset) -} -\arguments{ -\item{ecmf1}{An Empirical Cululative Mass Function (ECMF) object of class -\code{dhist_ecmf}} - -\item{ecmf2}{An Empirical Cululative Mass Function (ECMF) object of class -\code{dhist_ecmf}} - -\item{offset}{An offset to add to all locations of the first ECMF. Postive -offsets will shift the ECMF to the right and negative ones to the left.} -} -\value{ -area The area between the two ECMFs, calculated as the integral of -the absolute difference between the two ECMFs -} -\description{ -Area between two offset Empirical Cumulative Mass Functions (ECMFs) -} diff --git a/man/as_smoothed_dhist.Rd b/man/as_smoothed_dhist.Rd index 1dfc0c14..f81847dd 100644 --- a/man/as_smoothed_dhist.Rd +++ b/man/as_smoothed_dhist.Rd @@ -11,11 +11,13 @@ as_smoothed_dhist(dhist, smoothing_window_width) \item{smoothing_window_width}{If greater than 0, the discrete histogram will be treated as having the mass at each location "smoothed" uniformly across -a bin centred on the location and having width = \code{smoothing_window_width}} +a bin centred on the location and having +width = \code{smoothing_window_width}} } \value{ -A copy of a \code{dhist} object with its \code{smoothing_window_width} -attribute set to the value provided \code{smoothing_window_width} parameter. +A copy of a \code{dhist} object with its +\code{smoothing_window_width} attribute set to the value provided +\code{smoothing_window_width} parameter. 
} \description{ Returns a "smoothed" copy of a \code{dhist} object with its diff --git a/man/as_unsmoothed_dhist.Rd b/man/as_unsmoothed_dhist.Rd index d4304810..4d2c250c 100644 --- a/man/as_unsmoothed_dhist.Rd +++ b/man/as_unsmoothed_dhist.Rd @@ -10,8 +10,8 @@ as_unsmoothed_dhist(dhist) \item{dhist}{A discrete histogram as a \code{dhist} object} } \value{ -A copy of a \code{dhist} object with its \code{smoothing_window_width} -attribute set to 0. +A copy of a \code{dhist} object with its +\code{smoothing_window_width} attribute set to 0. } \description{ Returns an "unsmoothed" copy of a \code{dhist} object with its diff --git a/man/count_graphlet_tuples.Rd b/man/count_graphlet_tuples.Rd index 4109039a..6bf797a8 100644 --- a/man/count_graphlet_tuples.Rd +++ b/man/count_graphlet_tuples.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{count_graphlet_tuples} \alias{count_graphlet_tuples} \title{For each graphlet calculate the number of possible sets of k nodes in the diff --git a/man/count_graphlet_tuples_ego.Rd b/man/count_graphlet_tuples_ego.Rd index 1bd70670..5a34711a 100644 --- a/man/count_graphlet_tuples_ego.Rd +++ b/man/count_graphlet_tuples_ego.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{count_graphlet_tuples_ego} \alias{count_graphlet_tuples_ego} \title{Run count_graphlet_tuples across pre-computed ego networks.} diff --git a/man/count_graphlets_ego.Rd b/man/count_graphlets_ego.Rd index 5ae422c8..35afac8d 100644 --- a/man/count_graphlets_ego.Rd +++ b/man/count_graphlets_ego.Rd @@ -20,8 +20,9 @@ count_graphlets_ego( Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. 
Currently only size 4 (default) and 5 are supported.} -\item{neighbourhood_size}{The number of steps from the source node used to select the -neighboring nodes to be included in the source node ego-network. (Default 2).} +\item{neighbourhood_size}{The number of steps from the source node used to +select the neighboring nodes to be included in the source node ego-network. +(Default 2).} \item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} nodes are returned. (Default 3).} diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd index 91bf7eca..90adc699 100644 --- a/man/density_binned_counts.Rd +++ b/man/density_binned_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{density_binned_counts} \alias{density_binned_counts} \title{Used to calculate aggregated graphlet counts for each density bin.} diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd index 0dcd867e..90bbe06c 100644 --- a/man/density_binned_counts_gp.Rd +++ b/man/density_binned_counts_gp.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{density_binned_counts_gp} \alias{density_binned_counts_gp} \title{Calculate expected counts in density bins using the @@ -17,8 +17,8 @@ density_binned_counts_gp( \item{density_interval_indexes}{Density bin index for each ego network.} -\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -included in graphlet_counts.} +\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently +only size 4 and 5 are supported. 
included in graphlet_counts.} } \description{ Calculate expected counts in density bins using the diff --git a/man/density_from_counts.Rd b/man/density_from_counts.Rd index 6436ca4e..a5331548 100644 --- a/man/density_from_counts.Rd +++ b/man/density_from_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{density_from_counts} \alias{density_from_counts} \title{Calculate edge density for a single graph.} diff --git a/man/dhist.Rd b/man/dhist.Rd index 4566df12..701f3385 100644 --- a/man/dhist.Rd +++ b/man/dhist.Rd @@ -15,8 +15,8 @@ location} \item{smoothing_window_width}{If greater than 0, the discrete histogram will be treated as having the mass at each location "smoothed" uniformly across -a bin centred on the location and having width = \code{smoothing_window_width} -(default = \code{0} - no smoothing)} +a bin centred on the location and having +width = \code{smoothing_window_width} (default = \code{0} - no smoothing)} \item{sorted}{Whether or not to return a discrete histogram with locations and masses sorted by ascending mass (default = \code{TRUE})} @@ -26,13 +26,14 @@ A sparse discrete histogram. Format is a \code{dhist} object, which is a list of class \code{dhist} with the following named elements: \itemize{ \item \code{locations}: A 1D numeric vector of discrete locations - \item \code{masses}: A 1D numeric vector of the mass present at each location + \item \code{masses}: A 1D numeric vector of the mass present at each + location } Note that locations where no mass is present are not included in the returned \code{dhist} object. Mass in these discrete histograms is treated as being -present precisely at the specified location. Discrete histograms should not be used -for data where observations have been grouped into bins representing ranges -of observation values. +present precisely at the specified location. 
Discrete histograms should not +be used for data where observations have been grouped into bins representing +ranges of observation values. } \description{ Creates a discrete histogram object of class \code{dhist}, with bin diff --git a/man/dhist_from_obs.Rd b/man/dhist_from_obs.Rd index 584da0b8..f2df2ffd 100644 --- a/man/dhist_from_obs.Rd +++ b/man/dhist_from_obs.Rd @@ -14,9 +14,11 @@ A sparse discrete histogram. Format is a \code{dhist} object, which is a list of class \code{dhist} with the following named elements: \itemize{ \item \code{locations}: A 1D numeric vector of discrete locations - \item \code{masses}: A 1D numeric vector of the mass present at each location + \item \code{masses}: A 1D numeric vector of the mass present at each + location } } \description{ -Generate a sparse discrete histogram from a set of discrete numeric observations +Generate a sparse discrete histogram from a set of discrete numeric +observations } diff --git a/man/dhist_from_obs_slow.Rd b/man/dhist_from_obs_slow.Rd index 39afc81e..e40b78fb 100644 --- a/man/dhist_from_obs_slow.Rd +++ b/man/dhist_from_obs_slow.Rd @@ -14,9 +14,11 @@ A sparse discrete histogram. 
Format is a \code{dhist} object, which is a list of class \code{dhist} with the following named elements: \itemize{ \item \code{locations}: A 1D numeric vector of discrete locations - \item \code{masses}: A 1D numeric vector of the mass present at each location + \item \code{masses}: A 1D numeric vector of the mass present at each + location } } \description{ -Generate a sparse discrete histogram from a set of discrete numeric observations +Generate a sparse discrete histogram from a set of discrete numeric +observations } diff --git a/man/ego_network_density.Rd b/man/ego_network_density.Rd index 43114243..f1bcb65a 100644 --- a/man/ego_network_density.Rd +++ b/man/ego_network_density.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{ego_network_density} \alias{ego_network_density} \title{Calculate ego network edge densities.} diff --git a/man/emd_cs.Rd b/man/emd_cs.Rd index 1e580760..99a45be7 100755 --- a/man/emd_cs.Rd +++ b/man/emd_cs.Rd @@ -20,7 +20,7 @@ Distance between the two histograms by summing the absolute difference between the two cumulative histograms. } \references{ -Calculation of the Wasserstein Distance Between Probability Distributions on the Line -S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, 784-786 -\url{http://dx.doi.org/10.1137/1118101} +Calculation of the Wasserstein Distance Between Probability Distributions on +the Line S. S. 
Vallender, Theory of Probability & Its Applications 1974 18:4, +784-786 \url{http://dx.doi.org/10.1137/1118101} } diff --git a/man/exp_counts_bin_gp.Rd b/man/exp_counts_bin_gp.Rd index dc3bc3c5..3e717809 100644 --- a/man/exp_counts_bin_gp.Rd +++ b/man/exp_counts_bin_gp.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{exp_counts_bin_gp} \alias{exp_counts_bin_gp} \title{INTERNAL FUNCTION - DO NOT CALL DIRECTLY @@ -22,8 +22,8 @@ exp_counts_bin_gp( \item{density_interval_indexes}{Density bin indexes for each ego network in \code{graphlet_counts}.} -\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -included in graphlet_counts.} +\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently +only size 4 and 5 are supported. included in graphlet_counts.} } \description{ INTERNAL FUNCTION - DO NOT CALL DIRECTLY diff --git a/man/gdd.Rd b/man/gdd.Rd index 7444bbe7..3102d4b3 100644 --- a/man/gdd.Rd +++ b/man/gdd.Rd @@ -22,7 +22,8 @@ the number of graphlet orbits each node participates in.} Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. Currently only size 4 and 5 are supported.} -\item{ego_neighbourhood_size}{The number of steps from the source node used to select the +\item{ego_neighbourhood_size}{The number of steps from the source node used +to select the neighboring nodes to be included in the source node ego-network.} } \value{ @@ -30,6 +31,7 @@ List of graphlet-based degree distributions, with each distribution represented as a \code{dhist} discrete histogram object. } \description{ -Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object -using the ORCA fast graphlet orbit counting package. 
+Short-cut function to create graphlet-based degree distributions from +\code{igraph} graph object using the ORCA fast graphlet orbit counting +package. } diff --git a/man/gdd_for_all_graphs.Rd b/man/gdd_for_all_graphs.Rd index 7e6956bf..c9703631 100644 --- a/man/gdd_for_all_graphs.Rd +++ b/man/gdd_for_all_graphs.Rd @@ -25,12 +25,12 @@ gdd_for_all_graphs( \item{feature_type}{Type of graphlet-based degree distributions. Can be \code{graphlet} to count graphlets or \code{orbit} to count orbits.} -\item{max_graphlet_size}{Maximum size of graphlets to use when generating GDD. -Currently only size 4 and 5 are supported.} +\item{max_graphlet_size}{Maximum size of graphlets to use when generating +GDD. Currently only size 4 and 5 are supported.} -\item{ego_neighbourhood_size}{The number of steps from the source node used to select the -neighboring nodes to be included in the source node ego-network. If set to 0, ego-networks will not be -used.} +\item{ego_neighbourhood_size}{The number of steps from the source node used +to select the neighboring nodes to be included in the source node +ego-network. If set to 0, ego-networks will not be used.} \item{mc.cores}{Number of cores to use for parallel processing. Defaults to the \code{mc.cores} option set in the R environment.} diff --git a/man/graph_features_to_histograms.Rd b/man/graph_features_to_histograms.Rd index ae526218..816a847d 100644 --- a/man/graph_features_to_histograms.Rd +++ b/man/graph_features_to_histograms.Rd @@ -8,14 +8,16 @@ each feature.} graph_features_to_histograms(features_matrix) } \arguments{ -\item{features_matrix}{A matrix whose rows represent nodes and whose columns represent different node level features. This means that entry ij provides the value of feature j for node i.} +\item{features_matrix}{A matrix whose rows represent nodes and whose columns +represent different node level features. 
This means that entry ij provides +the value of feature j for node i.} } \value{ -Feature histograms: List of "discrete histograms" for each -feature +Feature histograms: List of "discrete histograms" for each feature } \description{ Converts a matrix of node level features (e.g. for example counts of multiple graphlets or orbits at each node) to -a set of histogram like objects (observed frequency distribution of each feature/column) +a set of histogram like objects (observed frequency distribution of each +feature/column) } diff --git a/man/graphlet_key.Rd b/man/graphlet_key.Rd index 23f2289b..b63c9315 100644 --- a/man/graphlet_key.Rd +++ b/man/graphlet_key.Rd @@ -7,7 +7,8 @@ graphlet_key(max_graphlet_size) } \arguments{ -\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported.} +\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. +Currently only size 2 to 5 are supported.} } \value{ Metadata list with the following named fields: diff --git a/man/is_dhist.Rd b/man/is_dhist.Rd index 1f550d6f..114e35e0 100644 --- a/man/is_dhist.Rd +++ b/man/is_dhist.Rd @@ -15,8 +15,8 @@ is set to \code{dhist} (default = \code{TRUE})} } \description{ Checks if the input object is of class \code{dhist}. If \code{fast_check} is -\code{TRUE} then the only check is whether the object has a class attribute of -\code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks +\code{TRUE} then the only check is whether the object has a class attribute +of \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks are also made to ensure that the object has the structure required of a \code{dhist} object. } diff --git a/man/is_numeric_vector_1d.Rd b/man/is_numeric_vector_1d.Rd index 02f41145..9ad1cdd5 100644 --- a/man/is_numeric_vector_1d.Rd +++ b/man/is_numeric_vector_1d.Rd @@ -15,7 +15,8 @@ TRUE if input is a 1D numeric vector. FALSE otherwise. 
\description{ Check if a variable is a 1D numeric vector by checking that: \itemize{ - \item \code{is_numeric(input)}: Input is vector, matrix, array or list of numbers + \item \code{is_numeric(input)}: Input is vector, matrix, array or list of + numbers \item \code{is_null(dim(input))}: Input is not a matrix or array } } diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index cd2d0ad0..864f84c7 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{mean_density_binned_graphlet_counts} \alias{mean_density_binned_graphlet_counts} \title{mean_density_binned_graphlet_counts} diff --git a/man/min_emd.Rd b/man/min_emd.Rd index 2799f48a..08cf27f7 100644 --- a/man/min_emd.Rd +++ b/man/min_emd.Rd @@ -15,8 +15,8 @@ min_emd(dhist1, dhist2, method = "optimise") offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +minimum if multiple local minima EMDs exist. You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} } \value{ diff --git a/man/min_emd_exhaustive.Rd b/man/min_emd_exhaustive.Rd index cd02830b..91b46ce3 100644 --- a/man/min_emd_exhaustive.Rd +++ b/man/min_emd_exhaustive.Rd @@ -25,8 +25,8 @@ to calculate the EMD at all offsets where any knots from the two ECMFs align to ensure that the offset with the global minimum EMD is found. 
This is because of the piece-wise linear nature of the two ECMFs. Between any -two offsets where knots from the two ECMFs align, EMD will be either constant, -or uniformly increasing or decreasing. Therefore, there the EMD between two -sets of aligned knots cannot be smaller than the EMD at one or other of the -bounding offsets. +two offsets where knots from the two ECMFs align, EMD will be either +constant, or uniformly increasing or decreasing. Therefore, the EMD +between two sets of aligned knots cannot be smaller than the EMD at one or +other of the bounding offsets. } diff --git a/man/netdis.Rd b/man/netdis.Rd index cc091357..c838c8c9 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -1,19 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis} \alias{netdis} \title{Netdis - for one graphlet size} \usage{ -netdis( - centred_graphlet_count_vector_1, - centred_graphlet_count_vector_2, - graphlet_size -) +netdis(centred_graphlet_counts_1, centred_graphlet_counts_2, graphlet_size) } \arguments{ -\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for graph 1} +\item{centred_graphlet_counts_1}{Centred Graphlet Counts vector for +graph 1} -\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for graph 2} +\item{centred_graphlet_counts_2}{Centred Graphlet Counts vector for +graph 2} \item{graphlet_size}{The size of graphlets to use for the Netdis calculation (only counts for graphlets of the specified size will be used). 
The size of diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index eb3779cd..477b5034 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_centred_graphlet_counts} \alias{netdis_centred_graphlet_counts} \title{netdis_centred_graphlet_counts} @@ -30,8 +30,8 @@ or a constant numeric value to subtract from all graphlet counts.} \item{binning_fn}{Function used to bin ego network densities. Only needed if \code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are \code{NULL}. Takes densities as its single argument, and returns a named list -including keys \code{breaks} (vector of bin edges) and \code{interval_indexes} -(density bin index for each ego network).} +including keys \code{breaks} (vector of bin edges) and +\code{interval_indexes} (density bin index for each ego network).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Only needed if \code{ref_ego_density_bins} and @@ -44,7 +44,8 @@ expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments.} -\item{max_graphlet_size}{max graphlet size to calculate centred counts for. Currently only size 4 and 5 are supported.} +\item{max_graphlet_size}{max graphlet size to calculate centred counts for. 
+Currently only size 4 and 5 are supported.} } \value{ graphlet_counts minus exp_graphlet_counts for graphlets up to size diff --git a/man/netdis_const_expected_counts.Rd b/man/netdis_const_expected_counts.Rd index 5eb624a6..8c83c73f 100644 --- a/man/netdis_const_expected_counts.Rd +++ b/man/netdis_const_expected_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_const_expected_counts} \alias{netdis_const_expected_counts} \title{Create matrix of constant value to use as expected counts.} diff --git a/man/netdis_expected_counts.Rd b/man/netdis_expected_counts.Rd index f9ee0968..bcb190f9 100644 --- a/man/netdis_expected_counts.Rd +++ b/man/netdis_expected_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_expected_counts} \alias{netdis_expected_counts} \title{netdis_expected_counts} @@ -7,7 +7,7 @@ netdis_expected_counts( graphlet_counts, density_breaks, - density_binned_reference_counts, + density_binned_ref_counts, max_graphlet_size, scale_fn = NULL ) @@ -18,16 +18,17 @@ nummber of ego networks (rows).} \item{density_breaks}{Density values defining bin edges.} -\item{density_binned_reference_counts}{Reference network graphlet counts for +\item{density_binned_ref_counts}{Reference network graphlet counts for each density bin.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.} +Only graphlets containing up to \code{max_graphlet_size} nodes are counted. 
+Currently only size 4 and 5 are supported.} \item{scale_fn}{Optional function to scale calculated expected counts, taking \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and returning a scale factor that the looked up -\code{density_binned_reference_counts} values will be multiplied by.} +\code{density_binned_ref_counts} values will be multiplied by.} } \description{ Calculates expected graphlet counts for each ego network based on its density diff --git a/man/netdis_expected_counts_ego.Rd b/man/netdis_expected_counts_ego.Rd index b565ecab..f2443138 100644 --- a/man/netdis_expected_counts_ego.Rd +++ b/man/netdis_expected_counts_ego.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_expected_counts_ego} \alias{netdis_expected_counts_ego} \title{netdis_expected_counts_ego @@ -9,7 +9,7 @@ netdis_expected_counts_ego( graphlet_counts, max_graphlet_size, density_breaks, - density_binned_reference_counts, + density_binned_ref_counts, scale_fn = NULL ) } @@ -17,17 +17,18 @@ netdis_expected_counts_ego( \item{graphlet_counts}{Node and graphlet counts for an ego network.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.} +Only graphlets containing up to \code{max_graphlet_size} nodes are counted. 
+Currently only size 4 and 5 are supported.} \item{density_breaks}{Density values defining bin edges.} -\item{density_binned_reference_counts}{Reference network graphlet counts for +\item{density_binned_ref_counts}{Reference network graphlet counts for each density bin.} \item{scale_fn}{Optional function to scale calculated expected counts, taking \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and returning a scale factor that the looked up -\code{density_binned_reference_counts} values will be multiplied by.} +\code{density_binned_ref_counts} values will be multiplied by.} } \description{ Calculates expected graphlet counts for one ego network based on its density diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index ef60e0f6..03e8b4ee 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_many_to_many} \alias{netdis_many_to_many} \title{Compute any of the Netdis variants between all graph pairs.} @@ -29,14 +29,15 @@ obtained by using \code{read_simple_graphs}.} graphs. 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL (default) - Expected counts will be calculated based on the properties of the -query graphs themselves. (Geometric-Poisson approximation).} +3) NULL (default) - Expected counts will be calculated based on the +properties of the query graphs themselves. (Geometric-Poisson approximation).} \item{comparisons}{Which comparisons to perform between graphs. Can be "many-to-many" (all pairwise combinations) or "one-to-many" (compare first graph in graphs to all other graphs.)} -\item{max_graphlet_size}{Generate graphlets up to this size. 
Currently only 4 (default) and 5 are supported.} +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 +(default) and 5 are supported.} \item{neighbourhood_size}{Ego network neighbourhood size (default 2).} @@ -46,24 +47,31 @@ than min_ego_nodes nodes (default 3).} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges (default 1).} -\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} -as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used (default: NULL).} +\item{binning_fn}{Function used to bin ego network densities. Takes edge +\code{densities} as its single argument, and returns a named list including, +the input \code{densities}, the resulting bin \code{breaks} (vector of +density bin limits), and the vector \code{interval_indexes} which states to +what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method +\code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +\code{num_intervals = 100} is used (default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. 
If \code{bin_counts_fn} is \code{NULL}, (default), -it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson -approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}.} +(bin indexes) and \code{max_graphlet_size} as arguments. +If \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and +\code{graphlet_counts_ref}.} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +either the approach from the original Netdis paper, or the respective +Geometric-Poisson approximation; depending on the values of \code{ref_graph} +and \code{graphlet_counts_ref}.} \item{graphlet_counts}{Pre-generated graphlet counts (default: NULL). If the \code{graphlet_counts} argument is defined then \code{graphs} will not be @@ -75,14 +83,14 @@ ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for each ego network.} -\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: NULL). Matrix containing counts -of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with -graphlet IDs and rows are labelled with the ID of the central node in each -ego-network. 
As well as graphlet counts, each matrix must contain an -additional column labelled "N" including the node count for -each ego network. -If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -be used.} +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: +NULL). Matrix containing counts of each graphlet (columns) for each +ego-network (rows) in the input graph. Columns are labelled with graphlet IDs +and rows are labelled with the ID of the central node in each ego-network. As +well as graphlet counts, each matrix must contain an additional column +labelled "N" including the node count for each ego network. +If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} +will not be used.} } \value{ Netdis statistics between query graphs for graphlet sizes diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index c50cd4f4..0bb4f606 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_one_to_many} \alias{netdis_one_to_many} \title{Netdis comparisons between one graph and many other graphs.} @@ -35,7 +35,8 @@ expected counts are calculated for all query graphs. 3) NULL - Expected counts will be calculated based on the properties of the query graphs themselves.} -\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 and 5 are supported.} +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 +and 5 are supported.} \item{neighbourhood_size}{Ego network neighbourhood size.} @@ -45,25 +46,31 @@ than min_ego_nodes nodes.} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges.} -\item{binning_fn}{Function used to bin ego network densities. 
Takes edge \code{densities} -as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -(Default: NULL).} +\item{binning_fn}{Function used to bin ego network densities. Takes edge +\code{densities} as its single argument, and returns a named list including, +the input \code{densities}, the resulting bin \code{breaks} (vector of +density bin limits), and the vector \code{interval_indexes} which states to +what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method +\code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +\code{num_intervals = 100} is used (Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), - it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson - approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}.} +(bin indexes) and \code{max_graphlet_size} as arguments. If +\code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and +\code{graphlet_counts_ref}.} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. 
Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +either the approach from the original Netdis paper, or the respective +Geometric-Poisson approximation; depending on the values of \code{ref_graph} +and \code{graphlet_counts_ref}.} \item{graphlet_counts_1}{Pre-generated graphlet counts for the first query graph. If the \code{graphlet_counts_1} argument is defined then diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 28b8a259..ff31956e 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_one_to_one} \alias{netdis_one_to_one} \title{Netdis between two graphs} @@ -21,19 +21,29 @@ netdis_one_to_one( ) } \arguments{ -\item{graph_1}{A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered.} - -\item{graph_2}{A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered.} +\item{graph_1}{A simple graph object from the \code{igraph} package. +\code{graph_1} can be set to \code{NULL} (default) if +\code{graphlet_counts_1} is provided. 
If both \code{graph_1} and +\code{graphlet_counts_1} are not \code{NULL}, then only +\code{graphlet_counts_1} will be considered.} + +\item{graph_2}{A simple graph object from the \code{igraph} package. +\code{graph_2} can be set to \code{NULL} (default) if +\code{graphlet_counts_2} is provided. If both \code{graph_2} and +\code{graphlet_counts_2} are not \code{NULL}, then only +\code{graphlet_counts_2} will be considered.} \item{ref_graph}{Controls how expected counts are calculated. Either: 1) A numeric value - used as a constant expected counts value for all query graphs . 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the -query graphs themselves. (Geometric-Poisson approximation).} +3) NULL (Default) - Used for Netdis-GP, where the expected counts will be +calculated based on the properties of the query graphs themselves +(Geometric-Poisson approximation).} -\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 (default) and 5 are supported.} +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only +4 (default) and 5 are supported.} \item{neighbourhood_size}{Ego network neighborhood size (default: 2).} @@ -43,25 +53,31 @@ than min_ego_nodes nodes (default: 3).} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges (default: 1).} -\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} -as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -ego network). 
If \code{NULL}, then the method \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -(Default: NULL).} +\item{binning_fn}{Function used to bin ego network densities. Takes edge +\code{densities} as its single argument, and returns a named list including, +the input \code{densities}, the resulting bin \code{breaks} (vector of +density bin limits), and the vector \code{interval_indexes} which states to +what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method +\code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} +and \code{num_intervals = 100} is used (Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +(bin indexes) and \code{max_graphlet_size} as arguments. If +\code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and +\code{graphlet_counts_ref}.} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +either the approach from the original Netdis paper, or the respective +Geometric-Poisson approximation; depending on the values of \code{ref_graph} +and \code{graphlet_counts_ref}.} \item{graphlet_counts_1}{Pre-generated graphlet counts for the first query graph. Matrix containing counts of each graphlet (columns) for @@ -69,9 +85,9 @@ each ego-network (rows) in the input graph. Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for -each ego network. (default: NULL). -If the \code{graphlet_counts_1} argument is defined then -\code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}.} +each ego network. If the \code{graphlet_counts_1} argument is defined then +\code{graph_1} will not be used. These counts can be obtained with +\code{count_graphlets_ego}. (default: NULL).} \item{graphlet_counts_2}{Pre-generated graphlet counts for the second query graph. Matrix containing counts of each graphlet (columns) for @@ -81,7 +97,8 @@ ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for each ego network. (default: NULL). If the \code{graphlet_counts_2} argument is defined then -\code{graph_2} will not be used. These counts can be obtained with \code{count_graphlets_ego}.} +\code{graph_2} will not be used. These counts can be obtained with +\code{count_graphlets_ego}.} \item{graphlet_counts_ref}{Pre-generated reference graphlet counts. 
Matrix containing counts of each graphlet (columns) for @@ -89,55 +106,98 @@ each ego-network (rows) in the reference graph. Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for -each ego network. (default: NULL). -If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -be used.} +each ego network. If the \code{graphlet_counts_ref} argument is defined then +\code{ref_graph} will not be used. (default: NULL).} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes up to and including max_graphlet_size. } \description{ -Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. The variants currently supported are Netdis using a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). +Calculates the different variants of the network dissimilarity statistic +Netdis between two graphs. The variants currently supported are Netdis using +a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), +and Netdis using a Geometric Poisson approximation for the expectation +(\code{ref_graph = NULL}). } \examples{ require(netdist) require(igraph) -#Set source directory for Virus PPI graph edge files stored in the netdist package. -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +# Set source directory for Virus PPI graph edge files stored in the +# netdist package. 
+source_dir <- system.file( + file.path("extdata", "VRPINS"), + package = "netdist" +) # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") - -#Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) #This option will focus on detecting more general and global discrepancies between the ego-network structures. - -#Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) - -# Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. -# Two lattice networks of different sizes are used for this example. - goldstd_1 <- graph.lattice(c(8,8)) #A reference net - goldstd_2 <- graph.lattice(c(44,44)) #A reference net - - netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) - netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) - - - #Providing pre-calculated subgraph counts. - - props_1 <- count_graphlets_ego(graph = graph_1) - props_2 <- count_graphlets_ego(graph = graph_2) - props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) - props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) - -#Netdis Geometric-Poisson. -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) - -#Netdis Zero Expectation. 
-netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) - -#Netdis using gold-standard network -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +graph_1 <- read_simple_graph( + file.path(source_dir, "EBV.txt"), + format = "ncol" +) +graph_2 <- read_simple_graph( + file.path(source_dir, "ECL.txt"), + format = "ncol" +) + +# Netdis variant using the Geometric Poisson approximation to remove the +# background expectation of each network. This option will focus on detecting +# more general and global discrepancies between the ego-network structures. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) + +# Comparing the networks via their observed ego counts without centering them +# (equivalent to using expectation equal to zero). This option, will focus on +# detecting small discrepancies. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) + +# Example of the use of netdis with a reference graph.This option will focus +# on detecting discrepancies between the networks relative to the ego-network +# structure of the reference network / gold-standard. +# Two lattice networks of different sizes are used for this example. +goldstd_1 <- graph.lattice(c(8, 8)) # A reference net +goldstd_2 <- graph.lattice(c(44, 44)) # A reference net + +netdis_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + ref_graph = goldstd_1 +) +netdis_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + ref_graph = goldstd_2 +) + + +# Providing pre-calculated subgraph counts. + +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) +props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) + +# Netdis Geometric-Poisson. 
+netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + ref_graph = NULL +) + +# Netdis Zero Expectation. +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + ref_graph = 0 +) + +# Netdis using gold-standard network +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_1 +) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_2 +) } diff --git a/man/netdis.plot.Rd b/man/netdis_plot.Rd similarity index 66% rename from man/netdis.plot.Rd rename to man/netdis_plot.Rd index 44cc33ed..fb937505 100644 --- a/man/netdis.plot.Rd +++ b/man/netdis_plot.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlottingFunctions.R -\name{netdis.plot} -\alias{netdis.plot} +\name{netdis_plot} +\alias{netdis_plot} \title{Heatmap of Netdis many-to-many comparisons} \usage{ -netdis.plot( +netdis_plot( netdislist, whatrow = c(1, 2)[2], clustering_method = "ward.D", @@ -15,17 +15,24 @@ netdis.plot( \arguments{ \item{netdislist}{Default output of \code{netdis_many_to_many}.} -\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting.} +\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} +to be used for plotting.} -\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default).} +\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} +function from the \code{pheatmap} package. The dendrogram will appear if +\code{docluster} is TRUE (default).} \item{main}{Title of the plot.} -\item{docluster}{controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. 
If FALSE, then only the heatmap is drawn.} +\item{docluster}{controls the order of the rows and columns. If TRUE +(default) the rows and columns will be reordered to create the dendrogram. If +FALSE, then only the heatmap is drawn.} } \value{ -Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heatmap and dendrogram for the network comparisons via +\code{pheatmap}. } \description{ -Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heatmap and dendrogram for the network comparisons via +\code{pheatmap}. } diff --git a/man/netdis_subtract_exp_counts.Rd b/man/netdis_subtract_exp_counts.Rd index c03aa42f..a8891ff5 100644 --- a/man/netdis_subtract_exp_counts.Rd +++ b/man/netdis_subtract_exp_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_subtract_exp_counts} \alias{netdis_subtract_exp_counts} \title{netdis_subtract_exp_counts} @@ -17,7 +17,8 @@ nummber of ego networks (rows).} \item{exp_graphlet_counts}{Matrix of expected graphlet counts (columns) for a nummber of ego networks (rows).} -\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported.} +\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. +Currently only size 4 and 5 are supported.} } \description{ Subtract expected graphlet counts from actual graphlet counts. 
diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd index 3ab881c9..545c2fe1 100644 --- a/man/netdis_uptok.Rd +++ b/man/netdis_uptok.Rd @@ -1,23 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{netdis_uptok} \alias{netdis_uptok} \title{Netdis - for all graphlet sizes up to max_graphlet_size} \usage{ netdis_uptok( - centred_graphlet_count_vector_1, - centred_graphlet_count_vector_2, + centred_graphlet_counts_1, + centred_graphlet_counts_2, max_graphlet_size ) } \arguments{ -\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for graph 1} +\item{centred_graphlet_counts_1}{Centred Graphlet Counts vector for +graph 1} -\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for graph 2} +\item{centred_graphlet_counts_2}{Centred Graphlet Counts vector for +graph 2} \item{max_graphlet_size}{max graphlet size to calculate Netdis for. The size of a graphlet is the number of nodes it contains. Netdis is -calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported.} +calculated for all graphlets from size 3 to size max_graphlet_size. Currently +only 4 and 5 are supported.} } \value{ Netdis statistic calculated using centred counts for graphlets of diff --git a/man/netemd_many_to_many.Rd b/man/netemd_many_to_many.Rd index 0e7e8da9..907e5ccd 100644 --- a/man/netemd_many_to_many.Rd +++ b/man/netemd_many_to_many.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_emd.R +% Please edit documentation in R/measures_netemd.R \name{netemd_many_to_many} \alias{netemd_many_to_many} \title{NetEMDs between all graph pairs using provided Graphlet-based Degree @@ -18,16 +18,22 @@ netemd_many_to_many( ) } \arguments{ -\item{graphs}{A list of network/graph objects from the \code{igraph} package. 
\code{graphs} can be set to \code{NULL} (default) if \code{dhists} is provided.} +\item{graphs}{A list of network/graph objects from the \code{igraph} package. +\code{graphs} can be set to \code{NULL} (default) if \code{dhists} is +provided.} -\item{dhists}{A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}.} +\item{dhists}{A list whose elements contain either: A list of \code{dhist} +discrete histogram objects for each graph, or a list a matrix of network +features (each column representing a feature). \code{dhists} can be set to +\code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object +can be obtained from \code{graph_features_to_histograms}.} \item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +minimum if multiple local minima EMDs exist. 
You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to @@ -43,27 +49,28 @@ minimal EMDs and associated offsets for all pairs of histograms} the \code{mc.cores} option set in the R environment.} \item{feature_type}{Type of graphlet-based feature to count: "graphlet" -counts the number of graphlets each node participates in; "orbit" (default) calculates -the number of graphlet orbits each node participates in.} +counts the number of graphlets each node participates in; "orbit" (default) +calculates the number of graphlet orbits each node participates in.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. Possible values are 4, and 5 (default).} \item{ego_neighbourhood_size}{The number of steps from the source node to -include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} +include nodes for each ego-network. NetEmd was proposed for individual nodes +alone, hence the default value is 0.} } \value{ NetEMD measures between all pairs of graphs for which features were provided. Format of returned data depends on the \code{return_details} parameter. If set to FALSE, a list is returned with the following named -elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, +elements:\code{netemd}: a vector of NetEMDs for each pair of graphs, \code{comp_spec}: a comparison specification table containing the graph names and indices within the input GDD list for each pair of graphs compared. 
-If \code{return_details} is set to FALSE, the list also contains the following -matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD -used to compute the NetEMD, \code{min_offsets}: the associated offsets giving -the minimal EMD for each GDD +If \code{return_details} is set to FALSE, the list also contains the +following matrices for each graph pair: \code{min_emds}: the minimal EMD for +each GDD used to compute the NetEMD, \code{min_offsets}: the associated +offsets giving the minimal EMD for each GDD } \description{ NetEMDs between all graph pairs using provided Graphlet-based Degree diff --git a/man/netemd_one_to_one.Rd b/man/netemd_one_to_one.Rd index 59f4b04e..97112b07 100644 --- a/man/netemd_one_to_one.Rd +++ b/man/netemd_one_to_one.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_emd.R +% Please edit documentation in R/measures_netemd.R \name{netemd_one_to_one} \alias{netemd_one_to_one} \title{NetEMD Network Earth Mover's Distance between a pair of networks.} @@ -18,21 +18,30 @@ netemd_one_to_one( ) } \arguments{ -\item{graph_1}{A network/graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is provided.} +\item{graph_1}{A network/graph object from the \code{igraph} package. +\code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is +provided.} -\item{graph_2}{A network/graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is provided.} +\item{graph_2}{A network/graph object from the \code{igraph} package. +\code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is +provided.} -\item{dhists_1}{Either, a \code{dhist} discrete histogram object, or list of such objects, or a matrix of network features (each column representing a feature). \code{dhists_1} can be set to \code{NULL} (default) if \code{graph_1} is provided. 
A \code{dhist} object can be obtained from \code{graph_features_to_histograms}.} +\item{dhists_1}{Either, a \code{dhist} discrete histogram object, or list of +such objects, or a matrix of network features (each column representing a +feature). \code{dhists_1} can be set to \code{NULL} (default) if +\code{graph_1} is provided. A \code{dhist} object can be obtained from +\code{graph_features_to_histograms}.} \item{dhists_2}{Same as \code{dhists_1}.} -\item{method}{The method to be used to find the minimum EMD across all potential -offsets for each pair of histograms. Default is "optimise" to use +\item{method}{The method to be used to find the minimum EMD across all +potential offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the -histograms at all offsets that are candidates for the minimal EMD at the cost of computational time.} +minimum if multiple local minima EMDs exist. You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the +histograms at all offsets that are candidates for the minimal EMD at the cost +of computational time.} \item{return_details}{Logical indicating whether to return the individual minimal EMDs and associated offsets for all pairs of histograms.} @@ -44,64 +53,109 @@ which results in no smoothing. 
Care should be taken to select a (e.g.for the integer domain a width of 1 is the natural choice).} \item{feature_type}{Type of graphlet-based feature to count: "graphlet" -counts the number of graphlets each node participates in; "orbit" (default) calculates -the number of graphlet orbits each node participates in.} +counts the number of graphlets each node participates in; "orbit" (default) +calculates the number of graphlet orbits each node participates in.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. Possible values are 4, and 5 (default).} \item{ego_neighbourhood_size}{The number of steps from the source node to -include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} +include nodes for each ego-network. NetEmd was proposed for individual nodes +alone, hence the default value is 0.} } \value{ -NetEMD measure for the two sets of discrete histograms (or graphs). If -(\code{return_details = FALSE}) then a list with the following named elements is returned -\code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: -the minimal EMD for each pair of histograms, \code{min_offsets}: the associated -offsets giving the minimal EMD for each pair of histograms +NetEMD measure for the two sets of discrete histograms (or graphs). +If (\code{return_details = FALSE}) then a list with the following named +elements is returned \code{netemd}: the NetEMD for the set of histogram +pairs (or graphs), \code{min_emds}: the minimal EMD for each pair of +histograms, \code{min_offsets}: the associated offsets giving the minimal EMD +for each pair of histograms } \description{ -Calculates the network Earth Mover's Distance (EMD) between -two sets of network features. This is done by individually normalising the distribution -of each feature so that they have unit mass and unit variance. 
Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. +Calculates the network Earth Mover's Distance (EMD) between +two sets of network features. This is done by individually normalising the +distribution of each feature so that they have unit mass and unit variance. +Then the minimun EMD between the same pair of features (one for each +corresponding graph) is calculated by considering all possible translations +of the feature distributions. Finally the average over all features is +reported. This is calculated as follows: 1. Normalise each feature histogram to have unit mass and unit variance. - 2. For each feature, find the minimum EMD between each pair of histograms considering all possible histogram translations. + 2. For each feature, find the minimum EMD between each pair of histograms + considering all possible histogram translations. 3. Take the average minimum EMD across all features. 
} \examples{ - require(igraph) - graph_1 <- graph.lattice(c(8,8)) - graph_2 <- graph.lattice(c(44,44)) - netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) - - #Providing a matrix of network features - props_a= count_orbits_per_node(graph = graph_1,max_graphlet_size = 5) - props_b= count_orbits_per_node(graph = graph_2,max_graphlet_size = 5) - - netemd_one_to_one(dhists_1=props_a, dhists_2=props_b,smoothing_window_width = 1) - - #Providing the network features as lists of dhist objects - dhists_1<- graph_features_to_histograms(props_a) - dhists_2<- graph_features_to_histograms(props_b) - - netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) - - - # A variation of NetEmd: Using the Laplacian spectrum - #Laplacian - Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) - Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) - - #Normalized Laplacian - NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) - NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) - - #Spectra (This may take a couple of minutes). - props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) - props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) - - netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
- +require(igraph) +graph_1 <- graph.lattice(c(8, 8)) +graph_2 <- graph.lattice(c(44, 44)) +netemd_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + feature_type = "orbit", + max_graphlet_size = 5 +) + +# Providing a matrix of network features +props_a <- count_orbits_per_node(graph = graph_1, max_graphlet_size = 5) +props_b <- count_orbits_per_node(graph = graph_2, max_graphlet_size = 5) + +netemd_one_to_one( + dhists_1 = props_a, + dhists_2 = props_b, + smoothing_window_width = 1 +) + +# Providing the network features as lists of dhist objects +dhists_1 <- graph_features_to_histograms(props_a) +dhists_2 <- graph_features_to_histograms(props_b) + +netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2) + + +# A variation of NetEmd: Using the Laplacian spectrum +# Laplacian +Lapg_1 <- igraph::laplacian_matrix( + graph = graph_1, + normalized = FALSE, + sparse = FALSE +) +Lapg_2 <- igraph::laplacian_matrix( + graph = graph_2, + normalized = FALSE, + sparse = FALSE +) + +# Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix( + graph = graph_1, + normalized = TRUE, + sparse = FALSE +) +NLapg_2 <- igraph::laplacian_matrix( + graph = graph_2, + normalized = TRUE, + sparse = FALSE +) + +# Spectra (This may take a couple of minutes). +props_1 <- cbind( + L.Spectra = eigen(Lapg_1)$values, + NL.Spectra = eigen(NLapg_1)$values +) +props_2 <- cbind( + L.Spectra = eigen(Lapg_2)$values, + NL.Spectra = eigen(NLapg_2)$values +) + +# Use of smoothing window 1 is given for discrete integer distributions. If +# the network features are considered continuous variables +# smoothing_window_width equal to zero is recommended. 
+netemd_one_to_one( + dhists_1 = props_1, + dhists_2 = props_2, + smoothing_window_width = 0 +) + } diff --git a/man/netemd.plot.Rd b/man/netemd_plot.Rd similarity index 65% rename from man/netemd.plot.Rd rename to man/netemd_plot.Rd index 269e8010..e8738f1e 100644 --- a/man/netemd.plot.Rd +++ b/man/netemd_plot.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlottingFunctions.R -\name{netemd.plot} -\alias{netemd.plot} +\name{netemd_plot} +\alias{netemd_plot} \title{Heatmap of NetEmd many-to-many comparisons} \usage{ -netemd.plot( +netemd_plot( netemdlist, clustering_method = "ward.D", main = "NetEmd", @@ -12,19 +12,26 @@ netemd.plot( ) } \arguments{ -\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default).} +\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} +function from the \code{pheatmap} package. The dendrogram will appear if +\code{docluster} is TRUE (default).} \item{main}{Title of the plot.} -\item{docluster}{controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn.} +\item{docluster}{controls the order of the rows and columns. If TRUE +(default) the rows and columns will be reordered to create the dendrogram. If +FALSE, then only the heatmap is drawn.} \item{netdislist}{Default output of \code{netdis_many_to_many}.} -\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting.} +\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} +to be used for plotting.} } \value{ -Provides a heat map and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heat map and dendrogram for the network comparisons via +\code{pheatmap}. 
} \description{ -Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heatmap and dendrogram for the network comparisons via +\code{pheatmap}. } diff --git a/man/netemd_single_pair.Rd b/man/netemd_single_pair.Rd index 1158d2f5..5d7dc0c4 100644 --- a/man/netemd_single_pair.Rd +++ b/man/netemd_single_pair.Rd @@ -1,8 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_emd.R +% Please edit documentation in R/measures_netemd.R \name{netemd_single_pair} \alias{netemd_single_pair} -\title{Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms} +\title{Internal function to compute the minimum Earth Mover's Distance between +standarized and translated histograms} \usage{ netemd_single_pair( dhist1, @@ -16,8 +17,8 @@ netemd_single_pair( offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +minimum if multiple local minima EMDs exist. You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to @@ -26,30 +27,40 @@ which results in no smoothing. 
Care should be taken to select a \code{smoothing_window_width} that is appropriate for the discrete domain (e.g.for the integer domain a width of 1 is the natural choice)} -\item{dhists_1}{A \code{dhist} discrete histogram object or a list of such objects} +\item{dhists_1}{A \code{dhist} discrete histogram object or a list of such +objects} -\item{dhists_2}{A \code{dhist} discrete histogram object or a list of such objects} +\item{dhists_2}{A \code{dhist} discrete histogram object or a list of such +objects} } \value{ A list with the following named elements -\code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated -offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. +\code{netemd}: the NetEMD for the set of histogram pairs, +\code{min_offsets}: the associated offsets giving the minimal EMD for each +pair of histograms and \code{min_offset_std}: Offset used in the standardised +histograms. } \description{ Calculates the minimum Earth Mover's Distance (EMD) between two -discrete histograms after normalising each histogram to unit mass and variance. +discrete histograms after normalising each histogram to unit mass and +variance. This is calculated as follows: 1. Normalise each histogram to have unit mass and unit variance 2. 
Find the minimum EMD between the histograms } \examples{ - require(igraph) - goldstd_1 <- graph.lattice(c(8,8)) - goldstd_2 <- graph.lattice(c(44,44)) - props_1 <- count_orbits_per_node(graph = goldstd_1,max_graphlet_size = 5) - props_2 <- count_orbits_per_node(graph = goldstd_2,max_graphlet_size = 5) - dhists_1<- graph_features_to_histograms(props_1) - dhists_2<- graph_features_to_histograms(props_2) - # Obtain the minimum NetEMD_edges between the histograms - netemd_single_pair(dhists_1[[1]],dhists_2[[1]],method = "optimise",smoothing_window_width = 0) +require(igraph) +goldstd_1 <- graph.lattice(c(8, 8)) +goldstd_2 <- graph.lattice(c(44, 44)) +props_1 <- count_orbits_per_node(graph = goldstd_1, max_graphlet_size = 5) +props_2 <- count_orbits_per_node(graph = goldstd_2, max_graphlet_size = 5) +dhists_1 <- graph_features_to_histograms(props_1) +dhists_2 <- graph_features_to_histograms(props_2) +# Obtain the minimum NetEMD_edges between the histograms +netemd_single_pair( + dhists_1[[1]], + dhists_2[[1]], + method = "optimise", + smoothing_window_width = 0 +) } diff --git a/man/orbit_key.Rd b/man/orbit_key.Rd index 79d8afb4..e7921d92 100644 --- a/man/orbit_key.Rd +++ b/man/orbit_key.Rd @@ -7,7 +7,8 @@ orbit_key(max_graphlet_size) } \arguments{ -\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported.} +\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. +Currently only size 2 to 5 are supported.} } \value{ Metadata list with the following named fields: diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index 57d45b86..5a4addc9 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -50,10 +50,17 @@ following order: previous alterations) } \examples{ -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. 
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -print(source_dir) +# Set source directory for Virus protein-protein interaction edge files +# stored in the netdist package. +source_dir <- system.file( + file.path("extdata", "VRPINS"), + package = "netdist" +) +print(source_dir) # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_1 <- read_simple_graph( + file.path(source_dir, "EBV.txt"), + format = "ncol" +) graph_1 } diff --git a/man/scale_graphlet_count.Rd b/man/scale_graphlet_count.Rd index 8a313c27..8381bb0c 100644 --- a/man/scale_graphlet_count.Rd +++ b/man/scale_graphlet_count.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{scale_graphlet_count} \alias{scale_graphlet_count} \title{Divide graphlet counts by pre-computed scaling factor from diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd index 477a1ec5..16dfc96c 100644 --- a/man/scale_graphlet_counts_ego.Rd +++ b/man/scale_graphlet_counts_ego.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{scale_graphlet_counts_ego} \alias{scale_graphlet_counts_ego} \title{Scale graphlet counts for an ego network by the n choose k possible diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd index 2c3cd29f..be469af3 100644 --- a/man/single_density_bin.Rd +++ b/man/single_density_bin.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{single_density_bin} \alias{single_density_bin} \title{For case where don't want to use binning, return a single bin which covers diff --git a/man/virusppi.Rd b/man/virusppi.Rd 
index 6c905b2c..174a2d04 100644 --- a/man/virusppi.Rd +++ b/man/virusppi.Rd @@ -9,11 +9,19 @@ A list of \code{igraph} objects. } \source{ -\strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. +\strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, +Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily +Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): +e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table +S2 in the supporting information. -\strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, Parkinson J (2009) The Modular Organization of Protein Interactions in Escherichia coli. PLoS Comput Biol 5(10): e1000523. \url{https://doi.org/10.1371/journal.pcbi.1000523} +\strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, +Parkinson J (2009) The Modular Organization of Protein Interactions in +Escherichia coli. PLoS Comput Biol 5(10): e1000523. +\url{https://doi.org/10.1371/journal.pcbi.1000523} -\strong{Taxonomy ground truth:} NCBI taxonomy database. \url{https://www.ncbi.nlm.nih.gov/taxonomy} +\strong{Taxonomy ground truth:} NCBI taxonomy database. +\url{https://www.ncbi.nlm.nih.gov/taxonomy} } \usage{ virusppi diff --git a/man/worldtradesub.Rd b/man/worldtradesub.Rd index aa537cac..2483480d 100644 --- a/man/worldtradesub.Rd +++ b/man/worldtradesub.Rd @@ -6,23 +6,35 @@ \alias{worldtradesub} \title{World trade networks from 1985–2014} \format{ -A list of two elements. The first element, 'wtnets', is a list of \code{igraph} objects providing a small sample of world trade networks from 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph counts of world trade networks in the years 1985-2014. 
+A list of two elements. The first element, 'wtnets', is a list of +\code{igraph} objects providing a small sample of world trade networks from +2001–2014. The second element, 'Counts', is a list of pre-computed subgraph +counts of world trade networks in the years 1985-2014. } \source{ -\strong{World trade networks:}. United nations commodity trade statistics database (UN comtrade). http://comtrade.un.org/, 2015. +\strong{World trade networks:}. United nations commodity trade +statistics database (UN comtrade). http://comtrade.un.org/, 2015. -\strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). +\strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and +Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau +of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). } \usage{ worldtradesub } \description{ -The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the United Nations division COMTRADE [Division, 2015] for the years 2001-2014. +The world trade data set consists of a small sample of world trade networks +for the years 2001-2014, and pre-computed subgraph counts of a larger set of +world trade networks (1985–2014). The world trade networks are based on the +data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the +United Nations division COMTRADE [Division, 2015] for the years 2001-2014. } \details{ \itemize{ - \item wtnets: List of \code{igraph} objects providing the world trade networks from 2001–2014. 
- \item Counts: Pre-computed graphlet counts for the world trade networks in the years 1985-2014. + \item wtnets: List of \code{igraph} objects providing the world trade + networks from 2001–2014. + \item Counts: Pre-computed graphlet counts for the world trade networks in + the years 1985-2014. } } \keyword{datasets} diff --git a/man/zeros_to_ones.Rd b/man/zeros_to_ones.Rd index a33206bf..32befead 100644 --- a/man/zeros_to_ones.Rd +++ b/man/zeros_to_ones.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R +% Please edit documentation in R/measures_netdis.R \name{zeros_to_ones} \alias{zeros_to_ones} \title{Replace zero values in a vector with ones. Used by diff --git a/src/Makevars b/src/Makevars index 25761e11..fe240994 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ CXX_STD = CXX11 -PKG_CPPFLAGS += -fno-fast-math -msse2 -mfpmath=sse -mstackrealign +PKG_CPPFLAGS += -fno-fast-math -msse2 -mstackrealign diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a1fe416b..f020ef84 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // counts_from_observations NumericMatrix counts_from_observations(NumericMatrix features); RcppExport SEXP _netdist_counts_from_observations(SEXP featuresSEXP) { @@ -30,12 +35,29 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// netemd_smooth +double netemd_smooth(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); +RcppExport SEXP _netdist_netemd_smooth(SEXP loc1SEXP, SEXP val1SEXP, SEXP binWidth1SEXP, SEXP loc2SEXP, SEXP val2SEXP, SEXP binWidth2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type 
loc1(loc1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val1(val1SEXP); + Rcpp::traits::input_parameter< double >::type binWidth1(binWidth1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type loc2(loc2SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val2(val2SEXP); + Rcpp::traits::input_parameter< double >::type binWidth2(binWidth2SEXP); + rcpp_result_gen = Rcpp::wrap(netemd_smooth(loc1, val1, binWidth1, loc2, val2, binWidth2)); + return rcpp_result_gen; +END_RCPP +} RcppExport SEXP run_testthat_tests(); static const R_CallMethodDef CallEntries[] = { {"_netdist_counts_from_observations", (DL_FUNC) &_netdist_counts_from_observations, 1}, {"_netdist_emd_fast_no_smoothing", (DL_FUNC) &_netdist_emd_fast_no_smoothing, 4}, + {"_netdist_netemd_smooth", (DL_FUNC) &_netdist_netemd_smooth, 6}, {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, {NULL, NULL, 0} }; diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp new file mode 100644 index 00000000..d1c208b6 --- /dev/null +++ b/src/fastSmoothV2.cpp @@ -0,0 +1,197 @@ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] +#include +#include +#include +#include +#include +#include "emd_fast_no_smoothing.h" // add_element_kahan() +#include "fastSmoothV2.h" + +using namespace Rcpp; + +double bowtie_area(double length, double val1_start, double val1_end, + double val2_start, double val2_end) +{ + double midPoint = (val1_start - val2_start) / + ((val2_end - val2_start) - (val1_end - val1_start)); + + const double midValue = val1_start + midPoint * (val1_end - val1_start); + + midPoint = midPoint * length; + + double topTriangle = 0.5 * midPoint * (midValue - val1_start); + double topRectangle = midPoint * (val1_start - val2_start); + double bottomTriangle = 0.5 * midPoint * (midValue - val2_start); + + double res = topTriangle + topRectangle - bottomTriangle; + + topTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + 
bottomTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); + + res += topTriangle + topRectangle - bottomTriangle; + return res; +} + +// Compute the unsigned area between two line segments +// assumes that val1_end > val1_start and val2_end > val2_start +double get_segment(double start, double end, double val1_start, + double val1_end, double val2_start, double val2_end) +{ + const double length = end - start; + + double topTriangle; + double topRectangle; + double bottomTriangle; + double midPoint; + double midValue; + double res = 0; + + bool both_differences_positive = val1_start > val2_start && val1_end >= val2_end; + bool both_differences_negative = val1_start <= val2_start && val1_end <= val2_end; + + if (both_differences_positive || both_differences_negative) + { + // They are in the same order: no bowtie + // triangle of seg1 + topTriangle = 0.5 * length * (val1_end - val1_start); + // rectangle between seg1 and seg2 + topRectangle = length * (val1_start - val2_start); + // triangle of seg2 (to be removed) + bottomTriangle = 0.5 * length * (val2_end - val2_start); + + const double sign = both_differences_positive?1.0:-1.0; + return sign * (topTriangle + topRectangle - bottomTriangle); + } + else if (val1_start > val2_start) { // bowtie, first case + return bowtie_area(length, val1_start, val1_end, val2_start, val2_end); + } + else { // bowtie, second case + return bowtie_area(length, val2_start, val2_end, val1_start, val1_end); + } +} + +// cut down and compute segment +double get_segment_constrained(double seg1L1, double seg1L2, + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2) +{ + //We have a valid range + double valStart1, valEnd1, valStart2, valEnd2; + double start,end; + double result; + start = std::max(seg1L1,seg2L1); + end = std::min(seg1L2,seg2L2); + if (start < end) { + valStart1 = seg1V1 + (seg1V2 - seg1V1)*(start - seg1L1)/(seg1L2 - seg1L1); + valEnd1 = seg1V1 + (seg1V2 - seg1V1)*(end - 
seg1L1)/(seg1L2 - seg1L1); + valStart2 = seg2V1 + (seg2V2 - seg2V1)*(start - seg2L1)/(seg2L2 - seg2L1); + valEnd2 = seg2V1 + (seg2V2 - seg2V1)*(end - seg2L1)/(seg2L2 - seg2L1); + result = get_segment(start, end, valStart1, valEnd1, valStart2, valEnd2); + return result; + } + else { + return 0; + } +} + +double get_double_segment_constrained( + double seg1Loc1, double seg1Loc2, double seg1Loc3, + double seg1Val1, double seg1Val2, + double seg2Loc1, double seg2Loc2, double seg2Loc3, + double seg2Val1, double seg2Val2) +{ + double res = 0; + + // compare the linear section with the linear section + res += get_segment_constrained(seg1Loc1, seg1Loc2, seg2Loc1, seg2Loc2, + seg1Val1, seg1Val2, seg2Val1, seg2Val2); + + // compare the linear section with the flat section + // This could be easily special cased (saving ~1 if statements ). + res += get_segment_constrained(seg1Loc1, seg1Loc2, seg2Loc2, seg2Loc3, + seg1Val1, seg1Val2, seg2Val2, seg2Val2); + + // compare the flat section with the linear section + // This could be easily special cased (saving ~1 if statements ). + res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc1, seg2Loc2, + seg1Val2, seg1Val2, seg2Val1, seg2Val2); + + // compare the flat section with the flat section + // This could be easily special cased (saving ~2 if statements ). + res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc2, seg2Loc3, + seg1Val2, seg1Val2, seg2Val2, seg2Val2); + + return res; +} + +//' @title +//' Compute EMD +////' +////' @param loc1 numeric vector. +////' @param val1 numeric vector. +////' @param loc2 numeric vector. +////' @param val2 numeric vector. 
+//' +//' @export +// [[Rcpp::export]] +double netemd_smooth(NumericVector loc1, NumericVector val1, double binWidth1, + NumericVector loc2, NumericVector val2, double binWidth2) +{ + OverlappingSegments segs(loc1, loc2, binWidth1, binWidth2); + + double res = 0.0; + double compensation = 0.0; + + for (OverlappingSegments::iterator it = segs.begin(), endIt = segs.end(); + it != endIt; ++it) + { + // The OverlappingSegments iterator returns pairs of the left + // indices of overlapping endpoints: it->first is the index into + // loc1, it->second is the index into loc2. When the smallest + // element in one of these vectors is larger than the current + // element of the other, an 'index' of -1 is returned. + + // Hist 1 + // Start of the gradient section in Seg1 + double curSeg1Loc1 = segs.loc1_left(it->first); + + // End of the gradient section in Seg1 + double curSeg1Loc2 = segs.loc1_mid(it->first); + + // End of the flat section in Seg1 + double curSeg1Loc3 = segs.loc1_right(it->first); + + // Start and end values in Seg1: val1 gives the values at *right* + // endpoints of the segments. A value of 0.0 is used before the + // first segment. + double curSeg1Val1 = (it->first > 0) ? val1[it->first - 1] : 0.0; + double curSeg1Val2 = (it->first >= 0) ? val1[it->first] : 0.0; + + // Hist 2 + // Start of the gradient section in Seg1 + double curSeg2Loc1 = segs.loc2_left(it->second); + + // End of the gradient section in Seg1 + double curSeg2Loc2 = segs.loc2_mid(it->second); + + // End of the flat section in Seg1 + double curSeg2Loc3 = segs.loc2_right(it->second); + + // Start and end values in Seg2: val2 gives the values at *right* + // endpoints of the segments. A value of 0.0 is used before the + // first segment. + double curSeg2Val1 = (it->second > 0) ? val2[it->second - 1] : 0.0; + double curSeg2Val2 = (it->second >= 0) ? 
val2[it->second] : 0.0; + + double element = get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + + add_element_kahan(res, element, compensation); + } + + return res; +} diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h new file mode 100644 index 00000000..e734080b --- /dev/null +++ b/src/fastSmoothV2.h @@ -0,0 +1,236 @@ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] + +#ifndef FASTSMOOTHV2_H +#define FASTSMOOTHV2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Rcpp; + +class OverlappingSegments { + + // The two sequences of left-hand segment endpoints + const NumericVector& loc1; + const NumericVector& loc2; + const double binWidth1, binWidth2; + + // shorter names for loc1.size() and loc2.size() + const long N1, N2; + + // the minimum and maximum values over both sequences; + double minloc, maxloc; + +public: + OverlappingSegments(NumericVector& loc1_, NumericVector& loc2_, + double binWidth1_ = 1.0, double binWidth2_ = 1.0) + : loc1(loc1_), loc2(loc2_), + binWidth1(binWidth1_), binWidth2(binWidth2_), + N1(loc1.size()), N2(loc2.size()) + { + if (N1 == 0 || N2 == 0) + throw std::invalid_argument("Input vectors must be nonempty"); + + for (int i = 0; i < N1 - 1; i++) + if (loc1[i] > loc1[i + 1] + binWidth1) + throw std::invalid_argument( + "Elements of loc1 must be sorted in ascending order, " + "with elements separated by at least binWidth1"); + + for (int i = 0; i < N2 - 1; i++) + if (loc2[i] > loc2[i + 1] + binWidth2) + throw std::invalid_argument( + "Elements of loc2 must be sorted in ascending order, " + "with elements separated by at least binWidth2"); + + minloc = std::min(loc1[0], loc2[0]); + maxloc = std::max(loc1[N1 - 1] + binWidth1, loc2[N2 - 1] + binWidth2); + } + + // left, mid and right locations of the segments, for loc1 and loc2 + double 
loc1_left(long i) const { + return (i >= 0) ? loc1[i] : minloc; + } + + double loc1_mid(long i) const { + return (i >= 0) ? (loc1[i] + binWidth1) : minloc; + } + + double loc1_right(long i) const { + return (i + 1 < N1) ? loc1[i + 1] : maxloc; + } + + double loc2_left(long i) const { + return (i >= 0) ? loc2[i] : minloc; + } + + double loc2_mid(long i) const { + return (i >= 0) ? (loc2[i] + binWidth2) : minloc; + } + + double loc2_right(long i) const { + return (i + 1 < N2) ? loc2[i + 1] : maxloc; + } + + // Does interval i (from the first collection of segments) overlap + // interval j (from the second)? + bool intervals_overlap(long i, long j) const { + return (loc1_left(i) < loc2_right(j) && loc2_left(j) < loc1_right(i)); + } + + // OverlappingSegments iterator + class iterator { + const OverlappingSegments& segs; + + // the current iteration state + std::pair idx; + + public: + typedef std::pair value_type; + typedef void difference_type; + typedef value_type* pointer; + typedef value_type& reference; + typedef std::input_iterator_tag iterator_category; + + // Iterate over pairs of indices (i,j) into the sequences loc1 and + // loc2, where the intervals [loc1[i], loc1[i+1]] and [loc2[j], + // loc2[j+1]] overlap. + // + // These indices are returned from the iterator as + // std::pair. + // + // A sequence has an implicit segment from minloc (with index -1) + // to its zeroth element. The elements loc1[0] and loc2[0] are + // compared to determine whether, for either sequence, this + // initial implicit segment overlaps the zeroth segment of the + // other one. If both sequences start with the same value, the + // iteration starts at (0,0). 
+ // + explicit iterator(const OverlappingSegments& segs_) + : segs(segs_) + { + if (segs.loc1[0] < segs.loc2[0]) { + idx.first = 0; + idx.second = -1; + } + else if (segs.loc1[0] == segs.loc2[0]) { + idx.first = 0; + idx.second = 0; + } + else { + idx.first = -1; + idx.second = 0; + } + } + + // Is the current iterator at one-past-the-end? Equivalent to an + // equality comparison with segs.end(). + bool at_end() const { + return idx.first == segs.N1 && idx.second == segs.N2 - 1; + } + + // Update the current iterator to point to one-past-the-end + iterator& advance_to_end() { + idx.first = segs.N1; + idx.second = segs.N2 - 1; + return *this; + } + + iterator& operator++() { +#if !NDEBUG + // Verify precondition + if (!segs.intervals_overlap(idx.first, idx.second)) { + throw std::logic_error("Iterator precondition not satisfied: " + "current intervals do not overlap"); + } +#endif + + // Advance the second segment if it would still overlap the first + // + // The condition below is equivalent to + // idx.second < N2 - 1 && intervals_overlap(idx.first, idx.second + 1) + // given that we know (by the precondition) that + // loc1_left(idx.first) < loc2_right(idx.second) + // and therefore that + // loc1_left(idx.first) < loc2_right(idx.second + 1), + // + if (idx.second < segs.N2 - 1 + && segs.loc2_left(idx.second + 1) < segs.loc1_right(idx.first)) { + idx.second++; + } + // Could not advance the second segment above: advance the first instead, + // and the second as well if they share an endpoint + else { + if (idx.second < segs.N2 - 1 + && segs.loc2_left(idx.second + 1) == segs.loc1_right(idx.first)) { + idx.second++; + } + idx.first++; + } + +#if !NDEBUG + // Verify postcondition + if (!(at_end() || segs.intervals_overlap(idx.first, idx.second))) { + throw std::logic_error("Iterator postcondition not satisfied: " + "current intervals do not overlap (not at end)"); + } +#endif + + return *this; + } + + iterator operator++(int) { + iterator res = *this; + 
operator++(); + return res; + } + + value_type operator*() const { return idx; } + + const value_type *operator->() const { return &idx; } + + friend bool operator==(const iterator& lhs, const iterator& rhs) { + return lhs.idx == rhs.idx; + } + + friend bool operator!=(const iterator& lhs, const iterator& rhs) { + return !(lhs == rhs); + } + }; + + iterator begin() { return iterator(*this); } + iterator end() { return iterator(*this).advance_to_end(); } +}; + + +double bowtie_area(double length, double val1_start, double val1_end, + double val2_start, double val2_end); + +double get_segment(double start, double end, double val1_start, + double val1_end, double val2_start, double val2_end); + +double get_segment_constrained(double seg1L1, double seg1L2, + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2); + +double get_double_segment_constrained( + double seg1Loc1, double seg1Loc2, double seg1Loc3, + double seg1Val1, double seg1Val2, + double seg2Loc1, double seg2Loc2, double seg2Loc3, + double seg2Val1, double seg2Val2); + +double netemd_smooth(NumericVector loc1, NumericVector val1, double binWidth1, + NumericVector loc2, NumericVector val2, double binWidth2); + +#endif // FASTSMOOTHV2_H diff --git a/src/test_emd_fast_smooth.cpp b/src/test_emd_fast_smooth.cpp new file mode 100644 index 00000000..8d9f453a --- /dev/null +++ b/src/test_emd_fast_smooth.cpp @@ -0,0 +1,210 @@ +/* + * This file uses the Catch unit testing library, alongside + * testthat's simple bindings, to test a C++ function. + * + * This file should begin with `test` and be in the `src/` folder. + * `LinkingTo: testthat` must also be within the DESCRIPTION file. + */ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] + +// All test files should include the +// header file. 
+#include "fastSmoothV2.h" +#include "emd_fast_no_smoothing.h" + +#include +#include +#include + +// Helper function to test tolerance +bool within_toleranceV2(double actual, double expected, double tolerance) { + if(actual > expected) { + return ((actual - expected) <= tolerance); + } + else { + return ((expected - actual) <= tolerance); + } +} + +double simpleSlowArea(double startx,double endx,double starty1,double endy1,double starty2,double endy2) +{ + // Making this step size smaller + double step = (endx-startx)/100000000.0; + double curX; + double curY1; + double curY2; + double res = 0; + for (int i=0;i<100000000;i++) + { + curX = startx + i*step; + curY1 = starty1 +(endy1-starty1)*i/100000000.0; + curY2 = starty2 +(endy2-starty2)*i/100000000.0; + res += step*std::abs(curY1-curY2); + } + return res; +} + +void runSegmentConstraintTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double tempVal1; + double tempVal2; + tempVal1 = get_segment_constrained(start,end,start,end,val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + std::cout << "\n"; + std::cout << "segment constrained " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); +} + + +void runSegmentTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double tempVal1; + double tempVal2; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + std::cout << "\n"; + std::cout << "segment test " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); +} + +context("emd_fast_smoothing segment constrain simple") { + test_that("emd_fast_smoothing segment constrain simple") { + // Two upward linear segments + 
runSegmentConstraintTest(0.0,1.0,0.0,1.0,0.0,1.0); + // One upward one down linear segments + runSegmentConstraintTest(0.0,1.0,0.0,1.0,1.0,0.0); + runSegmentConstraintTest(0.0,1.0,1.0,0.0,0.0,1.0); + // Two down linear segments + runSegmentConstraintTest(0.0,1.0,1.0,0.0,1.0,0.0); + // One flat one up segments + runSegmentConstraintTest(0.0,1.0,0.0,0.0,0.0,1.0); + runSegmentConstraintTest(0.0,1.0,0.0,1.0,0.0,0.0); + // One flat one down segments + runSegmentConstraintTest(0.0,1.0,1.0,0.0,0.0,0.0); + runSegmentConstraintTest(0.0,1.0,0.0,0.0,1.0,0.0); + // Different gradients segments + runSegmentConstraintTest(0.0,1.0,0.0,3.0,0.0,0.0); + runSegmentConstraintTest(0.0,1.0,0.0,0.0,0.0,3.0); + // Different gradients segments + runSegmentConstraintTest(0.0,1.0,2.0,4.0,1.0,2.0); + runSegmentConstraintTest(0.0,1.0,1.0,2.0,2.0,3.0); +}} + +context("emd_fast_smoothing segment full") { + test_that("emd_fast_smoothing segment full") { + // Two upward linear segments + runSegmentTest(0.0,1.0,0.0,1.0,0.0,1.0); + // One upward one down linear segments + runSegmentTest(0.0,1.0,0.0,1.0,1.0,0.0); + runSegmentTest(0.0,1.0,1.0,0.0,0.0,1.0); + // Two down linear segments + runSegmentTest(0.0,1.0,1.0,0.0,1.0,0.0); + // One flat one up segments + runSegmentTest(0.0,1.0,0.0,0.0,0.0,1.0); + runSegmentTest(0.0,1.0,0.0,1.0,0.0,0.0); + // One flat one down segments + runSegmentTest(0.0,1.0,1.0,0.0,0.0,0.0); + runSegmentTest(0.0,1.0,0.0,0.0,1.0,0.0); + // Different gradients segments + runSegmentTest(0.0,1.0,0.0,3.0,0.0,0.0); + runSegmentTest(0.0,1.0,0.0,0.0,0.0,3.0); + // Different gradients segments + runSegmentTest(0.0,1.0,2.0,4.0,1.0,2.0); + runSegmentTest(0.0,1.0,1.0,2.0,2.0,3.0); +}} + +template +void runIntervalOverlapTest(Container1T& actual, Container2T& expected) +{ + std::cout << "Left endpoints of overlapping intervals:" << std::endl; + for (std::pair p : actual) + std::cout << p.first << ", " << p.second << std::endl; + + std::cout << "Expected:" << std::endl; + for (std::pair p : 
expected) + std::cout << p.first << ", " << p.second << std::endl; + + bool result = std::equal(actual.begin(), actual.end(), expected.begin()); + + std::cout << "Same? " << std::boolalpha << result << std::endl; + std::cout << "~~~~~~~~~~\n"; + + expect_true(result); +} + +context("emd_fast_smooth overlapping interval iterator") { + test_that("emd_fast_smooth overlapping interval iterator") { + { + NumericVector xs {1.0, 3.0, 5.0}; + NumericVector ys {2.0, 4.0, 6.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 0}, {1, 1}, {2, 1}, {2, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 3.0}; + NumericVector ys {4.0, 6.0, 8.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {1, -1}, {1, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {5.0, 5.5}; + NumericVector ys {4.0, 6.0, 8.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {-1, 0}, {0, 0}, {1, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.0, 3.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, 0}, {1, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 3.0}; + NumericVector ys {1.0, 2.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, 0}, {0, 1}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.5, 2.0, 3.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.0, 2.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.5, 
2.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + } +} diff --git a/tests/testthat/test_dhist.R b/tests/testthat/test_dhist.R index 37dc59b2..c7c63b01 100644 --- a/tests/testthat/test_dhist.R +++ b/tests/testthat/test_dhist.R @@ -1,269 +1,320 @@ context("dhist: Discrete histogram from observations") -test_that("discrete_hist generates correct discrete histograms for random integer observations", { - # Method for generating random observations containing specific locations a - # specific number of times - random_observations <- function(locations, counts) { - # Construct vector containing each location replicated "count" times - observations <- purrr::simplify(purrr::map2(locations, counts, rep)) - # Randomise the order of the observations - sample(observations, size = length(observations), replace = FALSE) - } +test_that( + paste( + "discrete_hist generates correct discrete histograms for random integer", + "observations" + ), + { + # Method for generating random observations containing specific locations a + # specific number of times + random_observations <- function(locations, counts) { + # Construct vector containing each location replicated "count" times + observations <- purrr::simplify(purrr::map2(locations, counts, rep)) + # Randomise the order of the observations + sample(observations, size = length(observations), replace = FALSE) + } - set.seed(2684) - num_tests <- 100 - - run_test <- function() { - # Set parameters for generation of random observation sets - num_observations <- 100 - location_range <- -(num_observations * 3):(num_observations * 3) - # Do not allow zero counts as these locations will not be present in the - # observations generated from the locations and counts - count_range <- 1:10 - - # Generate random observation sets - locations <- sample(location_range, num_observations, replace = FALSE) - counts <- sample(count_range, num_observations, 
replace = TRUE) - - # Construct vector containing each location replicated "count" times - observations_orig <- purrr::simplify(purrr::map2(locations, counts, rep)) - # Randomise the order of the observations - observations <- sample(observations_orig, size = length(observations_orig), replace = FALSE) - - # Generate discrete histograms - hist <- dhist_from_obs(observations) - - # discrete_hist will drop bins with zero counts, so remove these from the - # expected data (not necessary now we've restricted counts to be >= 1, but - # the bug where we generated test locations with zero counts was so annoying - # to identify that we're going with a belt and braces approach) - non_zero_count_indexes <- counts != 0 - expected_locations <- locations[non_zero_count_indexes] - expected_counts <- counts[non_zero_count_indexes] - # dhist_from_obs will return results with bins ordered by ascending location, - # so sort expected data to match - sorted_locations <- sort(expected_locations, index.return = TRUE) - sorted_location_indexes <- sorted_locations$ix - expected_locations <- expected_locations[sorted_location_indexes] - expected_counts <- expected_counts[sorted_location_indexes] - - # Check that histogram locations and counts match those used to generate the - # observations - expect_true(all.equal(hist$locations, expected_locations)) - expect_true(all.equal(hist$masses, expected_counts)) - } + set.seed(2684) + num_tests <- 100 + + run_test <- function() { + # Set parameters for generation of random observation sets + num_observations <- 100 + location_range <- -(num_observations * 3):(num_observations * 3) + # Do not allow zero counts as these locations will not be present in the + # observations generated from the locations and counts + count_range <- 1:10 + + # Generate random observation sets + locations <- sample(location_range, num_observations, replace = FALSE) + counts <- sample(count_range, num_observations, replace = TRUE) + + # Construct vector containing each 
location replicated "count" times + observations_orig <- purrr::simplify(purrr::map2(locations, counts, rep)) + # Randomise the order of the observations + observations <- sample( + observations_orig, + size = length(observations_orig), + replace = FALSE + ) + + # Generate discrete histograms + hist <- dhist_from_obs(observations) + + # discrete_hist will drop bins with zero counts, so remove these from the + # expected data (not necessary now we've restricted counts to be >= 1, but + # the bug where we generated test locations with zero counts was so + # annoying to identify that we're going with a belt and braces approach) + non_zero_count_indexes <- counts != 0 + exp_locations <- locations[non_zero_count_indexes] + exp_counts <- counts[non_zero_count_indexes] + # dhist_from_obs will return results with bins ordered by ascending + # location, so sort expected data to match + sorted_locations <- sort(exp_locations, index.return = TRUE) + sorted_location_indexes <- sorted_locations$ix + exp_locations <- exp_locations[sorted_location_indexes] + exp_counts <- exp_counts[sorted_location_indexes] + + # Check that histogram locations and counts match those used to generate + # the observations + expect_true(all.equal(hist$locations, exp_locations)) + expect_true(all.equal(hist$masses, exp_counts)) + } - for (i in 1:num_tests) { - run_test() + for (i in 1:num_tests) { + run_test() + } } -}) - -context("dhist: constructor, equality operator and as_* transformation functions") -test_that("dhist constuctor has correct locations and masses (default smoothing, unsorted)", { - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, sorted = FALSE) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, sorted = FALSE) - - expected_class <- "dhist" - expected_smoothing_window_width <- 0 - - expected1 <- list( - 
locations = locations1, masses = masses1, - smoothing_window_width = expected_smoothing_window_width - ) - class(expected1) <- expected_class - - expected2 <- list( - locations = locations2, masses = masses2, - smoothing_window_width = expected_smoothing_window_width - ) - class(expected2) <- expected_class - - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) - -test_that("dhist constuctor has correct locations and masses (default smoothing, sorted)", { - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, sorted = TRUE) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, sorted = TRUE) - - expected_class <- "dhist" - expected_smoothing_window_width <- 0 - - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected1) <- expected_class - - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected2) <- expected_class - - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) - -test_that("dhist constuctor has correct locations and masses (default smoothing, default sorting)", { - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2) - - expected_class <- "dhist" - expected_smoothing_window_width <- 0 +) + +context( + "dhist: constructor, equality operator and as_* transformation functions" +) +test_that( + paste( + "dhist constuctor has correct locations and masses (default smoothing,", + "unsorted)" + 
), + { + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist(locations = locations1, masses = masses1, sorted = FALSE) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist(locations = locations2, masses = masses2, sorted = FALSE) + + exp_class <- "dhist" + exp_smoothing_window_width <- 0 + + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = exp_smoothing_window_width + ) + class(expected1) <- exp_class - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected1) <- expected_class + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = exp_smoothing_window_width + ) + class(expected2) <- exp_class - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected2) <- expected_class + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (default smoothing,", + "sorted)" + ), + { + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist(locations = locations1, masses = masses1, sorted = TRUE) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist(locations = locations2, masses = masses2, sorted = TRUE) + + exp_class <- "dhist" + exp_smoothing_window_width <- 0 + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = exp_smoothing_window_width + ) + class(expected1) <- exp_class - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 
24, 23, 22, 21), + smoothing_window_width = exp_smoothing_window_width + ) + class(expected2) <- exp_class -test_that("dhist constuctor has correct locations and masses (specified smoothing, unsorted)", { - smoothing_window_width <- 1 + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (default smoothing,", + "default sorting)" + ), + { + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist(locations = locations1, masses = masses1) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist(locations = locations2, masses = masses2) + + exp_class <- "dhist" + exp_smoothing_window_width <- 0 + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = exp_smoothing_window_width + ) + class(expected1) <- exp_class - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = FALSE - ) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = FALSE - ) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = exp_smoothing_window_width + ) + class(expected2) <- exp_class - expected_class <- "dhist" + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (specified smoothing,", + "unsorted)" + ), + { + smoothing_window_width <- 1 - expected1 <- list( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width - ) - class(expected1) <- expected_class 
+ locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) - expected2 <- list( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width - ) - class(expected2) <- expected_class + exp_class <- "dhist" - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) + class(expected1) <- exp_class -test_that("dhist constuctor has correct locations and masses (specified smoothing, sorted)", { - smoothing_window_width <- 1 + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) + class(expected2) <- exp_class - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = TRUE - ) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = TRUE - ) + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (specified smoothing,", + "sorted)" + ), + { + smoothing_window_width <- 1 - expected_class <- "dhist" + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = TRUE + 
) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = smoothing_window_width - ) - class(expected1) <- expected_class + exp_class <- "dhist" - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width - ) - class(expected2) <- expected_class + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) + class(expected1) <- exp_class - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) + class(expected2) <- exp_class -test_that("dhist constuctor has correct locations and masses (specified smoothing, default sorting)", { - smoothing_window_width <- 1 + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (specified smoothing,", + "default sorting)" + ), + { + smoothing_window_width <- 1 - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width - ) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width - ) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = 
smoothing_window_width + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) - expected_class <- "dhist" + exp_class <- "dhist" - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = smoothing_window_width - ) - class(expected1) <- expected_class + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) + class(expected1) <- exp_class - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width - ) - class(expected2) <- expected_class + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) + class(expected2) <- exp_class - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) test_that("as_smoothed_dhist sets smoothing_window_width correctly", { dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14) ) - expected_smoothing_window_width_pre <- 0 - expected_smoothing_window_width_post <- 1 + exp_smooth_window_width_pre <- 0 + exp_smooth_window_width_post <- 1 expect_equal( dhist_pre$smoothing_window_width, - expected_smoothing_window_width_pre + exp_smooth_window_width_pre + ) + dhist_post <- as_smoothed_dhist( + dhist_pre, + exp_smooth_window_width_post ) - dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) expect_equal( dhist_post$smoothing_window_width, - expected_smoothing_window_width_post + exp_smooth_window_width_post ) }) @@ -272,17 +323,20 @@ test_that("as_unsmoothed_dhist sets smoothing_window_width correctly", 
{ masses = c(15, 12, 16, 13, 11, 14), smoothing_window_width <- 1 ) - expected_smoothing_window_width_pre <- 1 - expected_smoothing_window_width_post <- 0 + exp_smooth_window_width_pre <- 1 + exp_smooth_window_width_post <- 0 expect_equal( dhist_pre$smoothing_window_width, - expected_smoothing_window_width_pre + exp_smooth_window_width_pre + ) + dhist_post <- as_smoothed_dhist( + dhist_pre, + exp_smooth_window_width_post ) - dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) expect_equal( dhist_post$smoothing_window_width, - expected_smoothing_window_width_post + exp_smooth_window_width_post ) }) @@ -303,7 +357,8 @@ test_that("Non-identical dhists are NOT considered equal", { # Change a single element of the locations field dhist2_one_location_mismatch <- dhist1 - dhist2_one_location_mismatch$locations[3] <- dhist2_one_location_mismatch$locations[1] + 1 + dhist2_one_location_mismatch$locations[3] <- + dhist2_one_location_mismatch$locations[1] + 1 expect_false(dhist1 == dhist2_one_location_mismatch) # Change a single element of the masses field @@ -323,25 +378,37 @@ test_that("Non-identical dhists are NOT considered equal", { }) context("dhist: Discrete histogram variance") -test_that("dhist_variance difference for smoothed and unsmoothed dhists is smoothing_window_width^2 / 12", { - dhist <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) - # Be careful: ensure that no smoothing window width results in overlapping bins - smoothing_window_width_A <- 1 - smoothing_window_width_B <- 2 - dhist_unsmoothed <- as_unsmoothed_dhist(dhist) - dhist_smoothed_A <- as_smoothed_dhist(dhist, smoothing_window_width_A) - dhist_smoothed_B <- as_smoothed_dhist(dhist, smoothing_window_width_B) - - var_unsmoothed <- dhist_variance(dhist_unsmoothed) - var_smoothed_A <- dhist_variance(dhist_smoothed_A) - var_smoothed_B <- dhist_variance(dhist_smoothed_B) - - expected_var_smoothed_A <- var_unsmoothed + 
((smoothing_window_width_A^2) / 12) - expected_var_smoothed_B <- var_unsmoothed + ((smoothing_window_width_B^2) / 12) - - expect_equal(var_smoothed_A, expected_var_smoothed_A) - expect_equal(var_smoothed_B, expected_var_smoothed_B) -}) +test_that( + paste( + "dhist_variance difference for smoothed and unsmoothed dhists is", + "smoothing_window_width^2 / 12" + ), + { + dhist <- dhist( + locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14) + ) + # Be careful: ensure that no smoothing window width results in overlapping + # bins + smoothing_window_width_a <- 1 + smoothing_window_width_b <- 2 + dhist_unsmoothed <- as_unsmoothed_dhist(dhist) + dhist_smoothed_a <- as_smoothed_dhist(dhist, smoothing_window_width_a) + dhist_smoothed_b <- as_smoothed_dhist(dhist, smoothing_window_width_b) + + var_unsmoothed <- dhist_variance(dhist_unsmoothed) + var_smoothed_a <- dhist_variance(dhist_smoothed_a) + var_smoothed_b <- dhist_variance(dhist_smoothed_b) + + exp_var_smoothed_a <- var_unsmoothed + + ((smoothing_window_width_a^2) / 12) + exp_var_smoothed_b <- var_unsmoothed + + ((smoothing_window_width_b^2) / 12) + + expect_equal(var_smoothed_a, exp_var_smoothed_a) + expect_equal(var_smoothed_b, exp_var_smoothed_b) + } +) test_that("dhist_variance returns sigma^2 for unsmoothed normal histograms", { num_hists <- 5 @@ -395,10 +462,10 @@ test_that("normalise_dhist_mass output sums to 1", { smoothing_window_width = smoothing_window_width )) }) - expected_total_mass <- 1 + exp_total_mass <- 1 # Check total masses match expectations purrr::map_dbl(normalised_dhists, function(dhist) { - expect_equal(sum(dhist$masses), expected_total_mass) + expect_equal(sum(dhist$masses), exp_total_mass) }) # Check other histogram properties unchanged purrr::walk(normalised_dhists, function(dhist) { @@ -410,110 +477,149 @@ test_that("normalise_dhist_mass output sums to 1", { }) context("dhist: Discrete histogram variance normalisation") -test_that("normalise_histogram_variance output 
has variance of 1 for random integer histograms", { - # Generate histograms with random masses and random centres - num_hists <- 10 - num_bins <- 70 - - mass_min <- 0 - mass_max <- 100 - rand_masses <- function() { - return(runif(num_bins, mass_min, mass_max)) - } +test_that( + paste( + "normalise_histogram_variance output has variance of 1 for random integer", + "histograms" + ), + { + # Generate histograms with random masses and random centres + num_hists <- 10 + num_bins <- 70 + + mass_min <- 0 + mass_max <- 100 + rand_masses <- function() { + return(runif(num_bins, mass_min, mass_max)) + } - centre_min <- -30 - centre_max <- 70 - rand_locations <- function() { - return(round(sample(centre_min:centre_max, num_bins), digits = 0)) - } + centre_min <- -30 + centre_max <- 70 + rand_locations <- function() { + return(round(sample(centre_min:centre_max, num_bins), digits = 0)) + } - rand_dhists <- replicate(num_hists, dhist(masses = rand_masses(), locations = rand_locations()), simplify = FALSE) + rand_dhists <- replicate( + num_hists, + dhist(masses = rand_masses(), locations = rand_locations()), + simplify = FALSE + ) - smoothing_window_width <- 1 - rand_dhists_unsmoothed <- purrr::map(rand_dhists, as_unsmoothed_dhist) - rand_dhists_smoothed <- purrr::map(rand_dhists, as_smoothed_dhist, smoothing_window_width = smoothing_window_width) + smoothing_window_width <- 1 + rand_dhists_unsmoothed <- purrr::map(rand_dhists, as_unsmoothed_dhist) + rand_dhists_smoothed <- purrr::map( + rand_dhists, + as_smoothed_dhist, + smoothing_window_width = smoothing_window_width + ) - expected_post_norm_smoothing_windows <- purrr::map_dbl(rand_dhists_smoothed, function(dhist) { - smoothing_window_width / dhist_std(dhist) - }) + exp_post_norm_smooth_windows <- purrr::map_dbl( + rand_dhists_smoothed, + function(dhist) { + smoothing_window_width / dhist_std(dhist) + } + ) - actual_dhist_unsmoothed <- purrr::map(rand_dhists_unsmoothed, function(dhist) { - normalise_dhist_variance(dhist) - }) 
- actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) { - normalise_dhist_variance(dhist) - }) - expected_variance <- 1 - # Check variance of normalised hostograms is as expected - purrr::walk(actual_dhist_unsmoothed, function(dhist) { - expect_equal(dhist_variance(dhist), expected_variance) - }) - purrr::walk(actual_dhist_smoothed, function(dhist) { - expect_equal(dhist_variance(dhist), expected_variance) - }) - # Check smoothing window is as expected (0 for unsmoothe; smoothing_window_width/sigma for smoothed) - purrr::walk(actual_dhist_unsmoothed, function(dhist) { - expect_equal(dhist$smoothing_window_width, 0) - }) - purrr::walk2( - actual_dhist_smoothed, expected_post_norm_smoothing_windows, - function(dhist, sww) { - expect_equal(dhist$smoothing_window_width, sww) - } - ) - # Check masses unaltered - purrr::walk2( - actual_dhist_unsmoothed, rand_dhists_unsmoothed, - function(actual, expected) { - expect_equal(actual$masses, expected$masses) - } - ) - purrr::walk2( - actual_dhist_smoothed, rand_dhists_smoothed, - function(actual, expected) { - expect_equal(actual$masses, expected$masses) + actual_dhist_unsmoothed <- purrr::map( + rand_dhists_unsmoothed, + function(dhist) { + normalise_dhist_variance(dhist) + } + ) + actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) { + normalise_dhist_variance(dhist) + }) + exp_variance <- 1 + # Check variance of normalised hostograms is as expected + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist_variance(dhist), exp_variance) + }) + purrr::walk(actual_dhist_smoothed, function(dhist) { + expect_equal(dhist_variance(dhist), exp_variance) + }) + # Check smoothing window is as expected (0 for unsmoothe; smoothing_window + # width/sigma for smoothed) + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist$smoothing_window_width, 0) + }) + purrr::walk2( + actual_dhist_smoothed, exp_post_norm_smooth_windows, + function(dhist, sww) { + 
expect_equal(dhist$smoothing_window_width, sww) + } + ) + # Check masses unaltered + purrr::walk2( + actual_dhist_unsmoothed, rand_dhists_unsmoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) + purrr::walk2( + actual_dhist_smoothed, rand_dhists_smoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) + } +) + +test_that( + paste( + "normalise_histogram_variance output has variance of 1 for normal", + "histograms" + ), + { + num_hists <- 5 + num_bins <- 100001 + + mus <- runif(num_hists, -10, 10) + sigmas <- runif(num_hists, 0, 10) + + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) } - ) -}) -test_that("normalise_histogram_variance output has variance of 1 for normal histograms", { - num_hists <- 5 - num_bins <- 100001 - - mus <- runif(num_hists, -10, 10) - sigmas <- runif(num_hists, 0, 10) - - rand_locations <- function(mu, sigma) { - return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { + locations <- rand_locations(mu, sigma) + masses <- dnorm(locations, mean = mu, sd = sigma) + return(dhist(masses = masses, locations = locations)) + }) + + actuals <- purrr::map(rand_dhists, function(dhist) { + dhist_variance(normalise_dhist_variance(dhist)) + }) + expected <- 1 + purrr::map_dbl(actuals, function(actual) { + expect_equal(actual, expected) + }) } - - rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { - locations <- rand_locations(mu, sigma) - masses <- dnorm(locations, mean = mu, sd = sigma) - return(dhist(masses = masses, locations = locations)) - }) - - actuals <- purrr::map(rand_dhists, function(dhist) { - dhist_variance(normalise_dhist_variance(dhist)) - }) - expected <- 1 - purrr::map_dbl(actuals, function(actual) { - expect_equal(actual, expected) - }) -}) +) context("dhist: Sort dhist") test_that("sort_dhist works", { # NOTE: 
Need to construct dhist objects explicitly as the dhist constructor # now returns a sorted dhist and we want to be independent of this - dhist1 <- list(locations = c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) + dhist1 <- list( + locations = c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14) + ) class(dhist1) <- "dhist" - dhist2 <- list(locations = c(3, 0, -62, 7, 16, -58), masses = c(23, 24, 26, 22, 21, 25)) + dhist2 <- list( + locations = c(3, 0, -62, 7, 16, -58), + masses = c(23, 24, 26, 22, 21, 25) + ) class(dhist2) <- "dhist" - expected1 <- list(locations = c(1, 7, 9, 21, 42, 101), masses = c(16, 15, 14, 13, 12, 11)) + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11) + ) class(expected1) <- "dhist" - expected2 <- list(locations = c(-62, -58, 0, 3, 7, 16), masses = c(26, 25, 24, 23, 22, 21)) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21) + ) class(expected2) <- "dhist" actual1 <- sort_dhist(dhist1) @@ -524,174 +630,275 @@ test_that("sort_dhist works", { }) context("dhist: ECMF") -test_that("dhist_ecmf returns correct step function when smoothing_window_width is zero", { - dhist1 <- dhist(locations = c(1, 2, 4, 7, 11, 16, 22), masses = c(21, 22, 23, 27, 31, 36, 42)) - dhist1_unsmoothed <- as_unsmoothed_dhist(dhist1) - - ecmf1 <- dhist_ecmf(dhist1) - actual_knots1 <- ecmf_knots(ecmf1) - actual_knots_ecds1 <- ecmf1(actual_knots1) - inter_knots_x <- head(actual_knots1, length(actual_knots1) - 1) - actual_inter_knots_ecds1 <- ecmf1(inter_knots_x) - extra_knots <- c(actual_knots1[1] - 1, actual_knots1[length(actual_knots1)] + 1) - actual_extra_knots_ecds1 <- ecmf1(extra_knots) - - cum_masses1 <- cumsum(dhist1$masses) - max_cum_mass <- cum_masses1[length(cum_masses1)] - expected_knots_ecds1 <- cum_masses1 - expected_inter_knots_ecds1 <- head(expected_knots_ecds1, length(expected_knots_ecds1) - 1) - expected_extra_knots_ecds1 <- c(0, max_cum_mass) 
- expected_knots1 <- dhist1$locations - - expect_equal(actual_knots1, expected_knots1) - expect_equal(actual_knots_ecds1, expected_knots_ecds1) - expect_equal(actual_inter_knots_ecds1, expected_inter_knots_ecds1) - expect_equal(actual_extra_knots_ecds1, expected_extra_knots_ecds1) -}) +test_that( + paste( + "dhist_ecmf returns correct step function when smoothing_window_width is", + "zero" + ), + { + dhist1 <- dhist( + locations = c(1, 2, 4, 7, 11, 16, 22), + masses = c(21, 22, 23, 27, 31, 36, 42) + ) + dhist1_unsmoothed <- as_unsmoothed_dhist(dhist1) + + ecmf1 <- dhist_ecmf(dhist1) + actual_knots1 <- ecmf_knots(ecmf1) + actual_knots_ecds1 <- ecmf1(actual_knots1) + inter_knots_x <- head(actual_knots1, length(actual_knots1) - 1) + actual_inter_knots_ecds1 <- ecmf1(inter_knots_x) + extra_knots <- c( + actual_knots1[1] - 1, + actual_knots1[length(actual_knots1)] + 1 + ) + actual_extra_knots_ecds1 <- ecmf1(extra_knots) + + cum_masses1 <- cumsum(dhist1$masses) + max_cum_mass <- cum_masses1[length(cum_masses1)] + exp_knots_ecds1 <- cum_masses1 + exp_inter_knots_ecds1 <- head( + exp_knots_ecds1, + length(exp_knots_ecds1) - 1 + ) + exp_extra_knots_ecds1 <- c(0, max_cum_mass) + exp_knots1 <- dhist1$locations + + expect_equal(actual_knots1, exp_knots1) + expect_equal(actual_knots_ecds1, exp_knots_ecds1) + expect_equal(actual_inter_knots_ecds1, exp_inter_knots_ecds1) + expect_equal(actual_extra_knots_ecds1, exp_extra_knots_ecds1) + } +) context("dhist: Area between ECMFs (simple integer dhists)") -test_that("area_between_dhist_ecmfs returns correct value for simple integer dhists", { - # Example dhists constructed by hand to result in lots of "bowtie" segments - # for smoothed ECMFs and to allow expected areas to be calculated by hand - # Unsmoothed locations are on an integer grid, smoothed bin edges are on a - # half-integer grid - # Smoothed and unsmoothed ECMF cumulative masses are on integer grid - # Smoothed ECMF crossing points are on a quarter-integer grid - dhistA <- 
dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) - dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) - - # Set up smoothed and unsmoothed versions of histograms - smoothing_window_width <- 1 - dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) - dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) - dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) - dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - - # Set expected area - expected_area_unsmoothed <- 4 - expected_area_smoothed <- 3 - - # Generate ecmfs - ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) - ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) - ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) - ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) - - # Calculate area between ECMFs - actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) - actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - - # Compare caculated areas with expected areas - expect_equal(actual_area_unsmoothed, expected_area_unsmoothed) - expect_equal(actual_area_smoothed, expected_area_smoothed) -}) +test_that( + paste( + "area_between_dhist_ecmfs returns correct value for simple integer dhists" + ), + { + # Example dhists constructed by hand to result in lots of "bowtie" segments + # for smoothed ECMFs and to allow expected areas to be calculated by hand + # Unsmoothed locations are on an integer grid, smoothed bin edges are on a + # half-integer grid + # Smoothed and unsmoothed ECMF cumulative masses are on integer grid + # Smoothed ECMF crossing points are on a quarter-integer grid + dhist_a <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) + dhist_b <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) + + # Set up smoothed and unsmoothed versions of histograms + smoothing_window_width <- 1 + dhist_a_unsmoothed <- as_unsmoothed_dhist(dhist_a) + dhist_b_unsmoothed <- as_unsmoothed_dhist(dhist_b) + dhist_a_smoothed 
<- as_smoothed_dhist(dhist_a, smoothing_window_width) + dhist_b_smoothed <- as_smoothed_dhist(dhist_b, smoothing_window_width) + + # Set expected area + exp_area_unsmoothed <- 4 + exp_area_smoothed <- 3 + + # Generate ecmfs + ecmf_a_unsmoothed <- dhist_ecmf(dhist_a_unsmoothed) + ecmf_b_unsmoothed <- dhist_ecmf(dhist_b_unsmoothed) + ecmf_a_smoothed <- dhist_ecmf(dhist_a_smoothed) + ecmf_b_smoothed <- dhist_ecmf(dhist_b_smoothed) + + # Calculate area between ECMFs + actual_area_unsmoothed <- area_between_dhist_ecmfs( + ecmf_a_unsmoothed, + ecmf_b_unsmoothed + ) + actual_area_smoothed <- area_between_dhist_ecmfs( + ecmf_a_smoothed, + ecmf_b_smoothed + ) -context("dhist: Area between ECMFs (non-integer normalised dhists)") -test_that("area_between_dhist_ecmfs returns correct value for non-integer normalised dhists", { + # Compare caculated areas with expected areas + expect_equal(actual_area_unsmoothed, exp_area_unsmoothed) + expect_equal(actual_area_smoothed, exp_area_smoothed) + } +) - # Previous simple integer grid where both histograms have been separately - # normalised to unit mass and variance. Has locations and masses at a range - # of floating point locations. Has bowties, triangles and trapeziums. - dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) - dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) +context("dhist: Area between ECMFs (non-integer normalised dhists)") +test_that( + paste( + "area_between_dhist_ecmfs returns correct value for non-integer normalised", + "dhists" + ), + { + + # Previous simple integer grid where both histograms have been separately + # normalised to unit mass and variance. Has locations and masses at a range + # of floating point locations. Has bowties, triangles and trapeziums. 
+ dhist_a <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) + dhist_b <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) + + dhist_a <- normalise_dhist_mass(normalise_dhist_variance(dhist_a)) + dhist_b <- normalise_dhist_mass(normalise_dhist_variance(dhist_b)) + + # Set up smoothed and unsmoothed versions of histograms + smoothing_window_width <- 1 + dhist_a_unsmoothed <- as_unsmoothed_dhist(dhist_a) + dhist_b_unsmoothed <- as_unsmoothed_dhist(dhist_b) + dhist_a_smoothed <- as_smoothed_dhist(dhist_a, smoothing_window_width) + dhist_b_smoothed <- as_smoothed_dhist(dhist_b, smoothing_window_width) + + # Generate ecmfs + ecmf_a_unsmoothed <- dhist_ecmf(dhist_a_unsmoothed) + ecmf_b_unsmoothed <- dhist_ecmf(dhist_b_unsmoothed) + ecmf_a_smoothed <- dhist_ecmf(dhist_a_smoothed) + ecmf_b_smoothed <- dhist_ecmf(dhist_b_smoothed) + + # Define some functions to make calculation of manually measured areas + # easier + rectangle_area <- function(width, height) { + return(width * height) + } + triangle_area <- function(base, height) { + return(0.5 * base * height) + } + trapezium_area <- function(side_a, side_b, height) { + return(0.5 * (side_a + side_b) * height) + } + # Measurements of expected area between ECMFs done by hand by printing + # normalised ECMFs on a grid with x-spacing of 0.02 and y-spacing of 0.01) + # Actual grid counts preserved in data to facilitate less tedious manual + # checking if required + # --- Unsmoothed --- + area_a_unsmoothed <- rectangle_area(width = 10 * 0.02, height = 12.5 * 0.01) + area_b_unsmoothed <- rectangle_area( + width = 50.5 * 0.02, height = 37.5 * 0.01 + ) + area_c_unsmoothed <- rectangle_area( + width = 26 * 0.02, height = 12.5 * 0.01 + ) + area_d_unsmoothed <- rectangle_area( + width = 34.5 * 0.02, height = 12.5 * 0.01 + ) + area_e_unsmoothed <- rectangle_area(width = 26.5 * 0.02, height = 25 * 0.01) + exp_area_unsmoothed <- + sum( + area_a_unsmoothed, area_b_unsmoothed, area_c_unsmoothed, + area_d_unsmoothed, 
area_e_unsmoothed + ) + # --- Smoothed --- + area_a_smoothed <- triangle_area(base = 2.75 * 0.01, height = 6.5 * 0.02) + area_b_smoothed <- triangle_area(base = 2.75 * 0.01, height = 3 * 0.02) + area_c_smoothed <- triangle_area(base = 18.5 * 0.01, height = 21 * 0.02) + area_d_smoothed <- trapezium_area( + side_a = 18.5 * 0.01, + side_b = 37.5 * 0.01, + height = 14.5 * 0.02 + ) + area_e_smoothed <- trapezium_area( + side_a = 37.5 * 0.01, + side_b = 37.5 * 0.01, + height = 16 * 0.02 + ) + area_f_smoothed <- triangle_area(base = 37.5 * 0.01, height = 22.5 * 0.02) + area_g_smoothed <- triangle_area(base = 7.5 * 0.01, height = 8 * 0.02) + area_h_smoothed <- triangle_area(base = 7.5 * 0.01, height = 11 * 0.02) + area_i_smoothed <- triangle_area(base = 12.5 * 0.01, height = 19.5 * 0.02) + area_j_smoothed <- trapezium_area( + side_a = 12.5 * 0.01, + side_b = 20 * 0.01, + height = 30.5 * 0.02 + ) + area_k_smoothed <- trapezium_area( + side_a = 20 * 0.01, + side_b = 18 * 0.01, + height = 8 * 0.02 + ) + area_l_smoothed <- triangle_area(base = 18 * 0.01, height = 22 * 0.02) + exp_area_smoothed <- + sum( + area_a_smoothed, area_b_smoothed, area_c_smoothed, area_d_smoothed, + area_e_smoothed, area_f_smoothed, area_g_smoothed, area_h_smoothed, + area_i_smoothed, area_j_smoothed, area_k_smoothed, area_l_smoothed + ) + + # Calculate area between ECMFs + actual_area_unsmoothed <- area_between_dhist_ecmfs( + ecmf_a_unsmoothed, + ecmf_b_unsmoothed + ) + actual_area_smoothed <- area_between_dhist_ecmfs( + ecmf_a_smoothed, + ecmf_b_smoothed + ) - dhistA <- normalise_dhist_mass(normalise_dhist_variance(dhistA)) - dhistB <- normalise_dhist_mass(normalise_dhist_variance(dhistB)) + # Compare caculated areas with expected areas + expect_equalish_manual <- function(actual, expected, relative_tolerance) { + relative_diff <- abs(actual - expected) / expected + expect_lte(relative_diff, relative_tolerance) + } - # Set up smoothed and unsmoothed versions of histograms - smoothing_window_width <- 
1 - dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) - dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) - dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) - dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - - # Generate ecmfs - ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) - ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) - ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) - ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) - - # Define some functions to make calculation of manually measured areas easier - rectangle_area <- function(width, height) { - return(width * height) - } - triangle_area <- function(base, height) { - return(0.5 * base * height) - } - trapezium_area <- function(side_a, side_b, height) { - return(0.5 * (side_a + side_b) * height) + # Given manual measurement of areas between curves, consider area correct + # if actual and expected areas are within 1% of each other + expect_equalish_manual(actual_area_unsmoothed, exp_area_unsmoothed, 0.01) + expect_equalish_manual(actual_area_smoothed, exp_area_smoothed, 0.01) } - # Measurements of expected area between ECMFs done by hand by printing - # normalised ECMFs on a grid with x-spacing of 0.02 and y-spacing of 0.01) - # Actual grid counts preserved in data to facilitate less tedious manual - # checking if required - # --- Unsmoothed --- - area_A_unsmoothed <- rectangle_area(width = 10 * 0.02, height = 12.5 * 0.01) - area_B_unsmoothed <- rectangle_area(width = 50.5 * 0.02, height = 37.5 * 0.01) - area_C_unsmoothed <- rectangle_area(width = 26 * 0.02, height = 12.5 * 0.01) - area_D_unsmoothed <- rectangle_area(width = 34.5 * 0.02, height = 12.5 * 0.01) - area_E_unsmoothed <- rectangle_area(width = 26.5 * 0.02, height = 25 * 0.01) - expected_area_unsmoothed <- - sum( - area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, - area_D_unsmoothed, area_E_unsmoothed - ) - # --- Smoothed --- - area_A_smoothed <- triangle_area(base = 2.75 * 0.01, height = 6.5 * 0.02) - 
area_B_smoothed <- triangle_area(base = 2.75 * 0.01, height = 3 * 0.02) - area_C_smoothed <- triangle_area(base = 18.5 * 0.01, height = 21 * 0.02) - area_D_smoothed <- trapezium_area(side_a = 18.5 * 0.01, side_b = 37.5 * 0.01, height = 14.5 * 0.02) - area_E_smoothed <- trapezium_area(side_a = 37.5 * 0.01, side_b = 37.5 * 0.01, height = 16 * 0.02) - area_F_smoothed <- triangle_area(base = 37.5 * 0.01, height = 22.5 * 0.02) - area_G_smoothed <- triangle_area(base = 7.5 * 0.01, height = 8 * 0.02) - area_H_smoothed <- triangle_area(base = 7.5 * 0.01, height = 11 * 0.02) - area_I_smoothed <- triangle_area(base = 12.5 * 0.01, height = 19.5 * 0.02) - area_J_smoothed <- trapezium_area(side_a = 12.5 * 0.01, side_b = 20 * 0.01, height = 30.5 * 0.02) - area_K_smoothed <- trapezium_area(side_a = 20 * 0.01, side_b = 18 * 0.01, height = 8 * 0.02) - area_L_smoothed <- triangle_area(base = 18 * 0.01, height = 22 * 0.02) - expected_area_smoothed <- - sum( - area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, - area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, - area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed - ) - - # Calculate area between ECMFs - actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) - actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - - # Compare caculated areas with expected areas - expect_equalish_manual <- function(actual, expected, relative_tolerance) { - relative_diff <- abs(actual - expected) / expected - expect_lte(relative_diff, relative_tolerance) - } - - # Given manual measurement of areas between curves, consider area correct - # if actual and expected areas are within 1% of each other - expect_equalish_manual(actual_area_unsmoothed, expected_area_unsmoothed, 0.01) - expect_equalish_manual(actual_area_smoothed, expected_area_smoothed, 0.01) -}) +) context("dhist: Harmonise dhist locations") test_that("harmonise_dhist_locations works A", 
{ - dhist1 <- dhist(masses = c(11, 12, 13), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) - dhist2 <- dhist(masses = c(21, 22, 23), locations = c(2, 4, 6), smoothing_window_width = 1, sorted = FALSE) + dhist1 <- dhist( + masses = c(11, 12, 13), + locations = c(1, 3, 5), + smoothing_window_width = 1, + sorted = FALSE + ) + dhist2 <- dhist( + masses = c(21, 22, 23), + locations = c(2, 4, 6), + smoothing_window_width = 1, + sorted = FALSE + ) expected <- list( - dhist1 = dhist(masses = c(11, 12, 13, 0, 0, 0), locations = c(1, 3, 5, 2, 4, 6), smoothing_window_width = 1, sorted = FALSE), - dhist2 = dhist(masses = c(21, 22, 23, 0, 0, 0), locations = c(2, 4, 6, 1, 3, 5), smoothing_window_width = 1, sorted = FALSE) + dhist1 = dhist( + masses = c(11, 12, 13, 0, 0, 0), + locations = c(1, 3, 5, 2, 4, 6), + smoothing_window_width = 1, + sorted = FALSE + ), + dhist2 = dhist( + masses = c(21, 22, 23, 0, 0, 0), + locations = c(2, 4, 6, 1, 3, 5), + smoothing_window_width = 1, + sorted = FALSE + ) ) actual <- harmonise_dhist_locations(dhist1, dhist2) expect_equal(actual, expected) }) test_that("harmonise_dhist_locations works B", { - dhist1 <- dhist(masses = c(1, 1, 1), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) - dhist2 <- dhist(masses = c(1, 1, 1), locations = c(4, 5, 6), smoothing_window_width = 1, sorted = FALSE) + dhist1 <- dhist( + masses = c(1, 1, 1), + locations = c(1, 3, 5), + smoothing_window_width = 1, + sorted = FALSE + ) + dhist2 <- dhist( + masses = c(1, 1, 1), + locations = c(4, 5, 6), + smoothing_window_width = 1, + sorted = FALSE + ) expected <- list( - dhist1 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(1, 3, 5, 4, 6), smoothing_window_width = 1, sorted = FALSE), - dhist2 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(4, 5, 6, 1, 3), smoothing_window_width = 1, sorted = FALSE) + dhist1 = dhist( + masses = c(1, 1, 1, 0, 0), + locations = c(1, 3, 5, 4, 6), + smoothing_window_width = 1, + sorted = FALSE + ), + 
dhist2 = dhist( + masses = c(1, 1, 1, 0, 0), + locations = c(4, 5, 6, 1, 3), + smoothing_window_width = 1, + sorted = FALSE + ) ) actual <- harmonise_dhist_locations(dhist1, dhist2) diff --git a/tests/testthat/test_emd.R b/tests/testthat/test_emd.R index 960a00c0..3c782aec 100644 --- a/tests/testthat/test_emd.R +++ b/tests/testthat/test_emd.R @@ -1,16 +1,19 @@ context("EMD: Cost matrix") # COST_MATRIX: Property-based tests -test_that("cost_matrix returns all zeros when all bin locations are identical", { - bin_centres1 <- c(1, 1, 1, 1, 1, 1, 1) - bin_centres2 <- bin_centres1 - expected <- matrix(0, - nrow = length(bin_centres1), - ncol = length(bin_centres2) - ) - expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) -}) +test_that( + "cost_matrix returns all zeros when all bin locations are identical", + { + bin_centres1 <- c(1, 1, 1, 1, 1, 1, 1) + bin_centres2 <- bin_centres1 + expected <- matrix(0, + nrow = length(bin_centres1), + ncol = length(bin_centres2) + ) + expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) + } +) -test_that("cost_matrix returns zeros along diagonal when both sets of bin +test_that("cost_matrix returns zeros along diagonal when both sets of bin locations are the same", { bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) bin_centres2 <- bin_centres1 @@ -18,8 +21,8 @@ test_that("cost_matrix returns zeros along diagonal when both sets of bin expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) }) -test_that("cost_matrix returns zeros along diagonal and taxicab distance from - all zeros for all other elements when both sets of bin locations are +test_that("cost_matrix returns zeros along diagonal and taxicab distance from + all zeros for all other elements when both sets of bin locations are the same and are a sequence of consecutive integers", { bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) bin_centres2 <- bin_centres1 @@ -28,7 +31,7 @@ test_that("cost_matrix returns zeros along 
diagonal and taxicab distance from expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) }) -test_that("cost_matrix is correct size when the two histograms are of different +test_that("cost_matrix is correct size when the two histograms are of different lengths", { bin_centres1 <- c(1, 2, 3, 4, 5, 6, 7) bin_centres2 <- c(8, 9, 10) @@ -41,7 +44,7 @@ test_that("cost_matrix is correct size when the two histograms are of different context("EMD: EMD") # EMD: Property-based tests -test_that("EMD methods return 0 when comparing a 1D feature distribution to +test_that("EMD methods return 0 when comparing a 1D feature distribution to itself", { bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) bin_masses2 <- bin_masses1 @@ -59,7 +62,7 @@ test_that("EMD methods return 0 when comparing a 1D feature distribution to expect_equal(emd(histogram1, histogram2), expected) }) -test_that("EMD methods return numBins/2 when offsetting a symmetric discrete +test_that("EMD methods return numBins/2 when offsetting a symmetric discrete triangle distribution by 1", { cost_fn <- function(triangle_width) { move_dist <- ceiling((triangle_width + 1) / 2) @@ -191,7 +194,7 @@ test_that("EMD methods return numBins/2 when offsetting a symmetric discrete expect_equal(emd(histogram1, histogram2), expected) }) -test_that("EMD methods return same result for densely and sparsely specified +test_that("EMD methods return same result for densely and sparsely specified bins", { sparse_bin_masses1 <- c(1, 1, 1, 1, 1, 1) sparse_bin_masses2 <- c(1, 1, 1, 1, 1, 1) @@ -242,7 +245,7 @@ test_that("EMD methods return same result for densely and sparsely specified ) }) -test_that("EMD methods return same result when order of densely specified bins +test_that("EMD methods return same result when order of densely specified bins is changed", { bin_masses1 <- c(1, 1, 1, 1, 0, 0, 0, 0, 0) bin_masses2 <- c(0, 0, 0, 0, 0, 1, 1, 1, 1) @@ -284,7 +287,7 @@ test_that("EMD methods return same result when order of 
densely specified bins ) }) -test_that("EMD methods return same result when order of sparsely specified bins +test_that("EMD methods return same result when order of sparsely specified bins is changed", { bin_masses1 <- c(1, 1, 1, 1, 1, 1) bin_masses2 <- c(1, 1, 1, 1, 1, 1) @@ -392,7 +395,7 @@ test_that("min_emd_ methods correctly compare a non-offset 1D feature expect_equal(actual_optimise, expected) }) -test_that("min_emd_ methods correctly compare an offset 1D feature +test_that("min_emd_ methods correctly compare an offset 1D feature distribution to itself", { offset <- 10 bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R new file mode 100644 index 00000000..8c85769a --- /dev/null +++ b/tests/testthat/test_fastEMD.R @@ -0,0 +1,420 @@ +context("Name of test context") + +library(Rcpp) + +make_function <- function(x1, v1, w1, x2, v2, w2) { + f1 <- function(x) { + res1 <- rep(0, length(x)) + m1 <- x1[1] < x + m2 <- x <= (x1[1] + w1) + m3 <- m1 * m2 + res1 <- res1 + m3 * (x - x1[1]) * (v1[1] - 0) / w1 + if (length(x1) > 1) { + m1 <- (x1[1] + w1) < x + m2 <- x <= (x1[2]) + m3 <- m1 * m2 + res1 <- res1 + m3 * v1[1] + for (i in 2:length(x1)) + { + m1 <- x1[i] < x + m2 <- x <= (x1[i] + w1) + m3 <- m1 * m2 + res1 <- res1 + m3 * ((x - x1[i]) * (v1[i] - v1[i - 1]) / w1 + v1[i - 1]) + if (i < length(x1)) { + m1 <- (x1[i] + w1) < x + m2 <- x <= (x1[i + 1]) + m3 <- m1 * m2 + res1 <- res1 + m3 * v1[i] + } + } + } + m1 <- x > (x1[length(v1)] + w1) + res1 <- res1 + m1 * v1[length(v1)] + + + + res2 <- rep(0, length(x)) + m1 <- x2[1] < x + m2 <- x <= (x2[1] + w2) + m3 <- m1 * m2 + res2 <- m3 * (x - x2[1]) * (v2[1] - 0) / w2 + + if (length(x2) > 1) { + m1 <- (x2[1] + w2) < x + m2 <- x <= (x2[2]) + m3 <- m1 * m2 + res2 <- res2 + m3 * v2[1] + } + + for (i in 2:length(x2)) + { + m1 <- x2[i] < x + m2 <- x <= (x2[i] + w2) + m3 <- m1 * m2 + res2 <- res2 + m3 * ((x - x2[i]) * (v2[i] - v2[i - 1]) / w2 + v2[i - 1]) + 
if (i < length(x2)) { + m1 <- (x2[i] + w2) < x + m2 <- x <= (x2[i + 1]) + m3 <- m1 * m2 + res2 <- res2 + m3 * v2[i] + } + } + m1 <- x > (w2 + x2[length(v2)]) + res2 <- res2 + m1 * v2[length(v2)] + + abs(res1 - res2) + } + f1 +} + +get_val <- function(x1, v1, w1, x2, v2, w2) { + f1 <- make_function(x1, v1, w1, x2, v2, w2) + mi123 <- min(min(x1), min(x2)) + ma123 <- max(max(x1) + w1, max(x2) + w2) + integrate(f1, mi123, ma123) +} + +get_r_value <- function(x1, v1, w1, x2, v2, w2) { + v1s <- c(v1[1], diff(v1)) + v2s <- c(v2[1], diff(v2)) + d1 <- dhist(x1 + w1 / 2, v1s, smoothing_window_width = w1) + d2 <- dhist(x2 + w2 / 2, v2s, smoothing_window_width = w2) + d1ec <- dhist_ecmf(d1) + d2ec <- dhist_ecmf(d2) + area_between_dhist_ecmfs(d1ec, d2ec) +} + + +compar_r_only_vs_opt <- function(x1, v1, w1, x2, v2, w2) { + v1s <- c(v1[1], diff(v1)) + v2s <- c(v2[1], diff(v2)) + d1 <- dhist(x1, v1s, smoothing_window_width = w1) + d2 <- dhist(x2, v2s, smoothing_window_width = w2) + res1 <- netdist::netemd_single_pair(d1, d2, method = "optimise") + res2 <- netdist::netemd_single_pair(d1, d2, method = "optimiseRonly") + testthat::expect_lt(abs(res1$min_emd - res2$min_emd), 10**(-4)) + c(res1$min_emd, res2$min_emd) +} + + + +test_that("3 element test", { + for (w1 in (1:10) / 10.0) + { + for (w2 in (1:10) / 10.0) + { + x1 <- c(1, 2, 3) + v1 <- c(0.25, 0.70, 1.00) + x2 <- c(1, 2, 3) + v2 <- c(0.25, 0.70, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[3], x1[3]) + max(w1, w2), + abs.tol = 0.000000001 + )[[1]] + + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + expect_lt(abs(res1 - res2), 10**(-3)) + } + } +}) + +test_that("2 element test w1=0.1, w2=0.2", { + w1 <- 0.1 + w2 <- 0.2 + x1 <- c(1, 2) + v1 <- c(0.25, 0.75) + x2 <- c(1, 2) + v2 <- c(0.5, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[2], x1[2]) + max(w1, w2), + abs.tol = 0.000000001 + )[[1]] + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + 
+ expect_lt(abs(res1 - res2), 10**(-4)) +}) + + +test_that("1 element at 0 vs many test Mixture", { + for (w1 in (1:10) * 2) + { + x1 <- c(0) + v1 <- c(1.00) + x2 <- 1:w1 + v2 <- (1:w1) / w1 + f1 <- make_function(x1, v1, 1, x2, v2, 1) + res2 <- integrate(f1, 0, w1 + 1, abs.tol = 0.000000001)[[1]] + + res1 <- netemd_smooth(x1, v1, 1, x2, v2, 1) + + expect_lt(abs(res1 - res2), 10**(-3)) + } +}) + + +test_that("1 element vs many test Mixture", { + for (w1 in (1:10) * 2) + { + x1 <- c(w1 / 2) + v1 <- c(1.00) + x2 <- 1:w1 + v2 <- (1:w1) / w1 + f1 <- make_function(x1, v1, 1, x2, v2, 1) + res2 <- integrate(f1, 0, w1 + 1, abs.tol = 0.000000001)[[1]] + + res1 <- netemd_smooth(x1, v1, 1, x2, v2, 1) + + expect_lt(abs(res1 - res2), 10**(-3)) + } +}) + + + +test_that("3 element test Mixture", { + for (w1 in (1:10) / 10.0) + { + for (w2 in (1:10) / 10.0) + { + x1 <- c(1, 2, 3) + v1 <- c(0.65, 0.70, 1.00) + x2 <- c(1, 2, 3) + v2 <- c(0.25, 0.70, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[3], x1[3]) + max(w1, w2), + abs.tol = 0.000000001 + )[[1]] + + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + expect_lt(abs(res1 - res2), 10**(-3)) + } + } +}) + +test_that("3 element test Mixture MidPoint", { + w1 <- 1 + w2 <- 1 + for (v1_2 in (1:10) / 10.0) + { + for (v2_2 in (1:10) / 10.0) + { + x1 <- c(1, 2, 3) + v1 <- c(0.1, v1_2, 1.00) + x2 <- c(1, 2, 3) + v2 <- c(0.1, v2_2, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[3], x1[3]) + max(w1, w2), + abs.tol = 0.000000001 + )[[1]] + + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + expect_lt(abs(res1 - res2), 10**(-3)) + } + } +}) + + +test_that("3 element test Mixture StartPoint", { + w1 <- 1 + w2 <- 1 + for (v1_1 in (1:5) / 10.0) + { + for (v2_1 in (1:5) / 10.0) + { + x1 <- c(1, 2, 3) + v1 <- c(v1_1, 0.5, 1.00) + x2 <- c(1, 2, 3) + v2 <- c(v2_1, 0.5, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[3], x1[3]) 
+ max(w1, w2), + abs.tol = 0.000000001 + )[[1]] + + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + expect_lt(abs(res1 - res2), 10**(-3)) + } + } +}) + + +test_that("3 element test Mixture StartLoc", { + w1 <- 1 + w2 <- 1 + for (x1_1 in (1:9) / 10.0) + { + for (x2_1 in (1:9) / 10.0) + { + x1 <- c(x1_1, 2, 3) + v1 <- c(0.25, 0.5, 1.00) + x2 <- c(x2_1, 2, 4) + v2 <- c(0.3, 0.5, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[3], x1[3]) + max(w1, w2), + abs.tol = 0.000000001 + )[[1]] + + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + expect_lt(abs(res1 - res2), 10**(-3)) + } + } +}) + + +test_that("many element test Mixture ", { + w1 <- 1 + w2 <- 1 + for (i in (2:10) * 1) + { + for (j in (2:10) * 1) + { + for (x123 in (1:2)) + { + for (y123 in (1:2)) + { + x1 <- cumsum(abs(rnorm(i))) + v1 <- cumsum(abs(rnorm(i))) + w1 <- min(diff(x1)) / x123 + v1 <- v1 / v1[length(v1)] + x2 <- cumsum(abs(rnorm(j))) + w2 <- min(diff(x2)) / y123 + v2 <- cumsum(abs(rnorm(j))) + v2 <- v2 / v2[length(v2)] + f1 <- make_function(x1, v1, w1, x2, v2, w2) + top1 <- max(x2[length(x2)], x1[length(x1)]) + max(w1, w2) + bottom1 <- min(x2[1], x1[1]) + + q1 <- compar_r_only_vs_opt(x1, v1, w1, x2, v2, w2) + + res2 <- 0 + res2 <- res2 + integrate( + f1, bottom1, top1, + abs.tol = 0.000000001, subdivisions = 100000000 + )[[1]] + + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + res3 <- get_r_value(x1, v1, w1, x2, v2, w2) + # Swapped to percentage error + expect_lt(abs(res1 - res3), 10**(-3)) + } + } + } + } +}) + + +test_that("3 element test w1=0.1, w2=0.2", { + w1 <- 0.1 + w2 <- 0.2 + x1 <- c(1, 2, 3) + v1 <- c(0.25, 0.70, 1.00) + x2 <- c(1, 2, 3) + v2 <- c(0.25, 0.70, 1.00) + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res2 <- integrate( + f1, 0, max(x2[3], x1[3]) + max(w1, w2), + abs.tol = 0.0000000001 + )[[1]] + res1 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + + expect_lt(abs(res1 - res2), 10**(-4)) +}) + +test_that("Old failure case", { + d1 <- 
list() + attr(d1, "class") <- "dhist" + d1$locations <- 0 + d1$masses <- 1000 + d2 <- list() + attr(d2, "class") <- "dhist" + d2$locations <- c(0, 1, 2, 3) + d2$masses <- c(8634, 1242, 114, 10) + sq1 <- netemd_single_pair( + d1, d2, + method = "optimise", smoothing_window_width = 1 + ) + sq2 <- netemd_single_pair( + d1, d2, + method = "optimiseRonly", smoothing_window_width = 1 + ) + expect_lt(abs(sq1$min_emd - sq2$min_emd), 10**(-4)) +}) + + +test_that("Old failure case 2", { + x1 <- c(0.2862492, 0.6917626) + v1 <- c(0.6519357, 1.0000000) + w1 <- 0.2027567 + x2 <- c(0.9990626, 1.4882579) + v2 <- c(0.6519357, 1.0000000) + w2 <- 0.2445976 + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res1 <- integrate( + f1, min(min(x1), min(x2)), max(max(x1), max(x2)) + max(w1, w2) + )[[1]] + res2 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + expect_lt(abs(res2 - res1), 10**(-4)) +}) + + + +test_that("Old Failure Case 2 reverse", { + x2 <- c(0.2862492, 0.6917626) + v2 <- c(0.6519357, 1.0000000) + w2 <- 0.2027567 + x1 <- c(0.9990626, 1.4882579) + v1 <- c(0.6519357, 1.0000000) + w1 <- 0.2445976 + f1 <- make_function(x1, v1, w1, x2, v2, w2) + res1 <- integrate( + f1, min(min(x1), min(x2)), max(max(x1), max(x2)) + max(w1, w2) + )[[1]] + res2 <- netemd_smooth(x1, v1, w1, x2, v2, w2) + expect_lt(abs(res2 - res1), 10**(-4)) +}) + + +test_that("equal distributions moving upwards", { + x2 <- 1:10 + v2 <- 1:10 + w2 <- 0.5 + x1 <- 1:10 + v1 <- c(1, 2) + v1 <- 1:10 + w1 <- 0.5 + for (i in 0:10) + { + res2 <- netemd_smooth(x1, v1 + i, w1, x2, v2, w2) + res3 <- get_r_value(x1, v1 + i, w1, x2, v2, w2) + print(c(i, res2, res3)) + expect_lt(abs(res2 - 9.25 * i), 10**(-4)) + expect_lt(abs(res2 - res3), 10**(-4)) + } +}) + +test_that("equal distributions moving upwards diff width", { + x2 <- 1:10 + v2 <- 1:10 + w2 <- 0.5 + x1 <- 1:10 + v1 <- c(1, 2) + v1 <- 1:10 + w1 <- 0.25 + for (i in 0:10) + { + res2 <- netemd_smooth(x1, v1 + i, w1, x2, v2, w2) + res3 <- get_r_value(x1, v1 + i, w1, x2, v2, w2) + 
print(c(i, res2, res3)) + expect_lt(abs(res2 - res3), 10**(-4)) + } +}) diff --git a/tests/testthat/test_graph_binning.R b/tests/testthat/test_graph_binning.R index b54d4796..d2bfa1c5 100644 --- a/tests/testthat/test_graph_binning.R +++ b/tests/testthat/test_graph_binning.R @@ -1,29 +1,46 @@ context("Graph binning: Adaptive binning") -test_that("adaptive_breaks merges 2 lowest bins where only first bin is below minimum", { - min_count <- 5 - x <- c( - 1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 3, 4, 5, 6, 7) - - expect_equal(final_breaks_actual, final_breaks_expected) -}) +test_that( + "adaptive_breaks merges 2 lowest bins where only first bin is below minimum", + { + min_count <- 5 + x <- c( + 1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 3, 4, 5, 6, 7) -test_that("adaptive_breaks merges 3 lowest bins where lowest 2 combined are below minimum", { - min_count <- 5 - x <- c( - 1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 4, 5, 6, 7) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) + +test_that( + paste( + "adaptive_breaks merges 3 lowest bins where lowest 2 combined are below", + "minimum" + ), + { + min_count <- 5 + x <- c( + 1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) + initial_breaks <- 1:7 + final_breaks_actual <- 
adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 4, 5, 6, 7) - expect_equal(final_breaks_actual, final_breaks_expected) -}) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) test_that("adaptive_breaks merges pair of bins in middle", { min_count <- 5 @@ -32,7 +49,11 @@ test_that("adaptive_breaks merges pair of bins in middle", { rep(5.5, min_count), rep(6.5, min_count + 1) ) initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) final_breaks_expected <- c(1, 2, 3, 5, 6, 7) expect_equal(final_breaks_actual, final_breaks_expected) @@ -45,57 +66,92 @@ test_that("adaptive_breaks merges two spearated pairs of bins in middle", { rep(5.5, 3), rep(6.5, 2), rep(7.8, min_count) ) initial_breaks <- 1:8 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) final_breaks_expected <- c(1, 2, 4, 5, 7, 8) expect_equal(final_breaks_actual, final_breaks_expected) }) -test_that("adaptive_breaks merges 2 uppermost bins where both are below minimum", { - min_count <- 5 - x <- c( - rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2, 3, 4, 5, 7) - - expect_equal(final_breaks_actual, final_breaks_expected) -}) +test_that( + "adaptive_breaks merges 2 uppermost bins where both are below minimum", + { + min_count <- 5 + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + 
min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 2, 3, 4, 5, 7) -test_that("adaptive_breaks merges 2 uppermost bins where only last bin is below minimum", { - min_count <- 5 - x <- c( - rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2, 3, 4, 5, 7) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) + +test_that( + paste( + "adaptive_breaks merges 2 uppermost bins where only last bin is below", + "minimum" + ), + { + min_count <- 5 + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 2, 3, 4, 5, 7) - expect_equal(final_breaks_actual, final_breaks_expected) -}) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) test_that("adaptive_breaks merges bins with no members with the next bin", { min_count <- 5 x <- c(rep(1.5, min_count), rep(5.5, min_count), rep(6.5, min_count)) initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) final_breaks_expected <- c(1, 2, 6, 7) expect_equal(final_breaks_actual, final_breaks_expected) }) -test_that("adaptive_breaks merges 2 bins below minimum, plus the empty bins between them", { - min_count <- 5 - x <- c(rep(1.5, min_count), rep(2.3, 1), rep(5.5, 4), rep(6.5, min_count)) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2, 6, 7) +test_that( + paste( + 
"adaptive_breaks merges 2 bins below minimum, plus the empty bins between", + "them" + ), + { + min_count <- 5 + x <- c(rep(1.5, min_count), rep(2.3, 1), rep(5.5, 4), rep(6.5, min_count)) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 2, 6, 7) - expect_equal(final_breaks_actual, final_breaks_expected) -}) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) context("Graph binning: Adaptively binned densities") test_that("binned_densities_adaptive works", { @@ -129,7 +185,20 @@ test_that("binned_densities_adaptive works", { expected_interval_indexes = expected_interval_indexes ) # Test 2: - densities <- c(0, 0.012, 0.099, 0.201, 0.299, 0.402, 0.49, 0.596, 0.699, 0.803, 0.899, 1.0) + densities <- c( + 0, + 0.012, + 0.099, + 0.201, + 0.299, + 0.402, + 0.49, + 0.596, + 0.699, + 0.803, + 0.899, + 1.0 + ) min_counts_per_interval <- 2 num_intervals <- 100 expected_breaks <- c(0, 0.02, 0.21, 0.41, 0.6, 0.81, 1.0) @@ -140,13 +209,3 @@ test_that("binned_densities_adaptive works", { expected_interval_indexes = expected_interval_indexes ) }) - -expected_binned_graphlet_counts <- - function(graphs, binning_fn, max_graphlet_size) { - binned_graphs <- binning_fn(graphs) - ref_counts <- purrr::map( - binned_graphs$graphs, count_graphlets_for_graph, - max_graphlet_size - ) - ref_counts - } diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index ac1a68c2..604b0caf 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -39,33 +39,34 @@ test_that(test_message, { # === TEST count_graphlet_tuples === # Generate expected tuple counts for graphlets up to size 4 and 5 - expected_tuple_count_n11_gs4 <- graphlet_tuple_counts(11, 4) - expected_tuple_count_n37_gs4 <- graphlet_tuple_counts(37, 4) - expected_tuple_count_n73_gs4 <- graphlet_tuple_counts(73, 4) - 
expected_tuple_count_n11_gs5 <- graphlet_tuple_counts(11, 5) - expected_tuple_count_n37_gs5 <- graphlet_tuple_counts(37, 5) - expected_tuple_count_n73_gs5 <- graphlet_tuple_counts(73, 5) + exp_tuple_count_n11_gs4 <- graphlet_tuple_counts(11, 4) + exp_tuple_count_n37_gs4 <- graphlet_tuple_counts(37, 4) + exp_tuple_count_n73_gs4 <- graphlet_tuple_counts(73, 4) + exp_tuple_count_n11_gs5 <- graphlet_tuple_counts(11, 5) + exp_tuple_count_n37_gs5 <- graphlet_tuple_counts(37, 5) + exp_tuple_count_n73_gs5 <- graphlet_tuple_counts(73, 5) # Generate actual tuple counts for graphlets up to size 4 and 5 - actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graphlet_counts_n11, 4) - actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graphlet_counts_n37, 4) - actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graphlet_counts_n73, 4) - actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graphlet_counts_n11, 5) - actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graphlet_counts_n37, 5) - actual_tuple_count_n73_gs5 <- count_graphlet_tuples(graphlet_counts_n73, 5) + act_tuple_count_n11_gs4 <- count_graphlet_tuples(graphlet_counts_n11, 4) + act_tuple_count_n37_gs4 <- count_graphlet_tuples(graphlet_counts_n37, 4) + act_tuple_count_n73_gs4 <- count_graphlet_tuples(graphlet_counts_n73, 4) + act_tuple_count_n11_gs5 <- count_graphlet_tuples(graphlet_counts_n11, 5) + act_tuple_count_n37_gs5 <- count_graphlet_tuples(graphlet_counts_n37, 5) + act_tuple_count_n73_gs5 <- count_graphlet_tuples(graphlet_counts_n73, 5) # Compare expected tuple counts with actual - expect_equal(expected_tuple_count_n11_gs4, actual_tuple_count_n11_gs4) - expect_equal(expected_tuple_count_n37_gs4, actual_tuple_count_n37_gs4) - expect_equal(expected_tuple_count_n73_gs4, actual_tuple_count_n73_gs4) - expect_equal(expected_tuple_count_n11_gs5, actual_tuple_count_n11_gs5) - expect_equal(expected_tuple_count_n37_gs5, actual_tuple_count_n37_gs5) - expect_equal(expected_tuple_count_n73_gs5, actual_tuple_count_n73_gs5) 
+ expect_equal(exp_tuple_count_n11_gs4, act_tuple_count_n11_gs4) + expect_equal(exp_tuple_count_n37_gs4, act_tuple_count_n37_gs4) + expect_equal(exp_tuple_count_n73_gs4, act_tuple_count_n73_gs4) + expect_equal(exp_tuple_count_n11_gs5, act_tuple_count_n11_gs5) + expect_equal(exp_tuple_count_n37_gs5, act_tuple_count_n37_gs5) + expect_equal(exp_tuple_count_n73_gs5, act_tuple_count_n73_gs5) # === TEST count_graphlet_tuples_ego === # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar # to the method under test. However, it's a simple method so maybe that's ok? - graphlet_tuple_counts_ego <- function(graphlet_counts_ego, max_graphlet_size) { + graphlet_tuple_counts_ego <- function(graphlet_counts_ego, + max_graphlet_size) { t(apply(graphlet_counts_ego, 1, count_graphlet_tuples, max_graphlet_size = max_graphlet_size @@ -81,53 +82,130 @@ test_that(test_message, { # Generate expected tuple counts for graphlets up to size 4 and 5 # 1. For ego-networks of order 1 - expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) - expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego1, 4) - expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego1, 4) - expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego1, 5) - expected_tuple_count_n37_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego1, 5) - expected_tuple_count_n73_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego1, 5) + exp_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego( + graph_n11_ego1, 4 + ) + exp_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego( + graph_n37_ego1, 4 + ) + exp_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego( + graph_n73_ego1, 4 + ) + exp_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego( + graph_n11_ego1, 5 + ) + exp_tuple_count_n37_ego1_gs5 <- graphlet_tuple_counts_ego( + graph_n37_ego1, 5 + ) + exp_tuple_count_n73_ego1_gs5 <- graphlet_tuple_counts_ego( + 
graph_n73_ego1, 5 + ) # 2. For ego-networks of order 2 - expected_tuple_count_n11_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego2, 4) - expected_tuple_count_n37_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego2, 4) - expected_tuple_count_n73_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego2, 4) - expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego2, 5) - expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego2, 5) - expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego2, 5) + exp_tuple_count_n11_ego2_gs4 <- graphlet_tuple_counts_ego( + graph_n11_ego2, 4 + ) + exp_tuple_count_n37_ego2_gs4 <- graphlet_tuple_counts_ego( + graph_n37_ego2, 4 + ) + exp_tuple_count_n73_ego2_gs4 <- graphlet_tuple_counts_ego( + graph_n73_ego2, 4 + ) + exp_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego( + graph_n11_ego2, 5 + ) + exp_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego( + graph_n37_ego2, 5 + ) + exp_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego( + graph_n73_ego2, 5 + ) # Calculate actual tuple counts # 1. 
For ego-networks of order 1 - actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego(graph_n11_ego1, 4) - actual_tuple_count_n37_ego1_gs4 <- count_graphlet_tuples_ego(graph_n37_ego1, 4) - actual_tuple_count_n73_ego1_gs4 <- count_graphlet_tuples_ego(graph_n73_ego1, 4) - actual_tuple_count_n11_ego1_gs5 <- count_graphlet_tuples_ego(graph_n11_ego1, 5) - actual_tuple_count_n37_ego1_gs5 <- count_graphlet_tuples_ego(graph_n37_ego1, 5) - actual_tuple_count_n73_ego1_gs5 <- count_graphlet_tuples_ego(graph_n73_ego1, 5) + act_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego( + graph_n11_ego1, 4 + ) + act_tuple_count_n37_ego1_gs4 <- count_graphlet_tuples_ego( + graph_n37_ego1, 4 + ) + act_tuple_count_n73_ego1_gs4 <- count_graphlet_tuples_ego( + graph_n73_ego1, 4 + ) + act_tuple_count_n11_ego1_gs5 <- count_graphlet_tuples_ego( + graph_n11_ego1, 5 + ) + act_tuple_count_n37_ego1_gs5 <- count_graphlet_tuples_ego( + graph_n37_ego1, 5 + ) + act_tuple_count_n73_ego1_gs5 <- count_graphlet_tuples_ego( + graph_n73_ego1, 5 + ) # 2. 
For ego-networks of order 2 - actual_tuple_count_n11_ego2_gs4 <- count_graphlet_tuples_ego(graph_n11_ego2, 4) - actual_tuple_count_n37_ego2_gs4 <- count_graphlet_tuples_ego(graph_n37_ego2, 4) - actual_tuple_count_n73_ego2_gs4 <- count_graphlet_tuples_ego(graph_n73_ego2, 4) - actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego(graph_n11_ego2, 5) - actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego(graph_n37_ego2, 5) - actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) + act_tuple_count_n11_ego2_gs4 <- count_graphlet_tuples_ego( + graph_n11_ego2, 4 + ) + act_tuple_count_n37_ego2_gs4 <- count_graphlet_tuples_ego( + graph_n37_ego2, 4 + ) + act_tuple_count_n73_ego2_gs4 <- count_graphlet_tuples_ego( + graph_n73_ego2, 4 + ) + act_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego( + graph_n11_ego2, 5 + ) + act_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego( + graph_n37_ego2, 5 + ) + act_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego( + graph_n73_ego2, 5 + ) # Compare expected with actual - expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) - expect_equal(expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4) - expect_equal(expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4) - expect_equal(expected_tuple_count_n11_ego1_gs5, actual_tuple_count_n11_ego1_gs5) - expect_equal(expected_tuple_count_n37_ego1_gs5, actual_tuple_count_n37_ego1_gs5) - expect_equal(expected_tuple_count_n73_ego1_gs5, actual_tuple_count_n73_ego1_gs5) + expect_equal( + exp_tuple_count_n11_ego1_gs4, act_tuple_count_n11_ego1_gs4 + ) + expect_equal( + exp_tuple_count_n37_ego1_gs4, act_tuple_count_n37_ego1_gs4 + ) + expect_equal( + exp_tuple_count_n73_ego1_gs4, act_tuple_count_n73_ego1_gs4 + ) + expect_equal( + exp_tuple_count_n11_ego1_gs5, act_tuple_count_n11_ego1_gs5 + ) + expect_equal( + exp_tuple_count_n37_ego1_gs5, act_tuple_count_n37_ego1_gs5 + ) + expect_equal( + 
exp_tuple_count_n73_ego1_gs5, act_tuple_count_n73_ego1_gs5 + ) # 2. For ego-networks of order 2 - expect_equal(expected_tuple_count_n11_ego2_gs4, actual_tuple_count_n11_ego2_gs4) - expect_equal(expected_tuple_count_n37_ego2_gs4, actual_tuple_count_n37_ego2_gs4) - expect_equal(expected_tuple_count_n73_ego2_gs4, actual_tuple_count_n73_ego2_gs4) - expect_equal(expected_tuple_count_n11_ego2_gs5, actual_tuple_count_n11_ego2_gs5) - expect_equal(expected_tuple_count_n37_ego2_gs5, actual_tuple_count_n37_ego2_gs5) - expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) + expect_equal( + exp_tuple_count_n11_ego2_gs4, act_tuple_count_n11_ego2_gs4 + ) + expect_equal( + exp_tuple_count_n37_ego2_gs4, act_tuple_count_n37_ego2_gs4 + ) + expect_equal( + exp_tuple_count_n73_ego2_gs4, act_tuple_count_n73_ego2_gs4 + ) + expect_equal( + exp_tuple_count_n11_ego2_gs5, act_tuple_count_n11_ego2_gs5 + ) + expect_equal( + exp_tuple_count_n37_ego2_gs5, act_tuple_count_n37_ego2_gs5 + ) + expect_equal( + exp_tuple_count_n73_ego2_gs5, act_tuple_count_n73_ego2_gs5 + ) }) -context("Measures Netdis: Ego-network density values match those for manually verified networks") +context( + paste( + "Measures Netdis: Ego-network density values match those for manually", + "verified networks" + ) +) test_that("Ego-network 4-node density values match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets @@ -163,19 +241,19 @@ test_that("Ego-network 4-node density values match manually verified totals", { # Set manually verified ego-network node counts and edge densities # 1 . 
Ego-networks of order 1 - expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) - expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) - names(expected_densities_o1) <- node_labels + exp_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + exp_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(exp_node_counts_o1, 2) + exp_densities_o1 <- c(exp_edge_counts_o1 / max_edge_counts_o1) + names(exp_densities_o1) <- node_labels # Order 1 expected densities should be: # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 # 2. Ego-networks of order 2 - expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) - expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) - names(expected_densities_o2) <- node_labels + exp_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + exp_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(exp_node_counts_o2, 2) + exp_densities_o2 <- c(exp_edge_counts_o2 / max_edge_counts_o2) + names(exp_densities_o2) <- node_labels # Order 2 expected densities should be: # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 @@ -193,178 +271,196 @@ test_that("Ego-network 4-node density values match manually verified totals", { ) # Calculate densities - actual_densities_o1 <- ego_network_density(graphlet_counts_ego_o1) - actual_densities_o2 <- ego_network_density(graphlet_counts_ego_o2) + act_densities_o1 <- ego_network_density(graphlet_counts_ego_o1) + act_densities_o2 <- ego_network_density(graphlet_counts_ego_o2) # Check densities match expected values - expect_equal(actual_densities_o1, expected_densities_o1) - expect_equal(actual_densities_o2, expected_densities_o2) + expect_equal(act_densities_o1, 
exp_densities_o1) + expect_equal(act_densities_o2, exp_densities_o2) }) -context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") -test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set parameters for test - max_graphlet_size <- 4 - min_counts_per_interval <- 2 - num_intervals <- 100 - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - - # Set manually verified ego-network node counts and edge densities - # 1 . Ego-networks of order 1 - expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) - expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) - # Order 1 expected densities should be: - # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 - # 2. Ego-networks of order 2 - expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) - expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) - # Order 2 expected densities should be: - # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 - - # Set manually verified density bins for ego-networks - # 1. 
Ego-networks of order 1 - expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) - expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - expected_binned_densities_o1 <- list( - densities = expected_densities_o1, - interval_indexes = expected_interval_indexes_o1, - breaks = expected_breaks_o1 - ) - # Check binned densities are as expected - actual_binned_densities_o1 <- binned_densities_adaptive( - expected_densities_o1, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) - # 2. Ego-networks of order 2 - expected_min_break_o2 <- 1 / 3 - expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- - (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 - expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) - expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - expected_binned_densities_o2 <- list( - densities = expected_densities_o2, - interval_indexes = expected_interval_indexes_o2, - breaks = expected_breaks_o2 - ) - # Check binned densities are as expected - actual_binned_densities_o2 <- binned_densities_adaptive( - expected_densities_o2, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - expected_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 
3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_o1) <- node_labels - colnames(expected_counts_o1) <- graphlet_labels - # 2-step ego networks - expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_o2) <- node_labels - colnames(expected_counts_o2) <- graphlet_labels - - # Calculate binned average expected counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + - expected_counts_o1[8, ]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + - expected_counts_o1[5, ] + expected_counts_o1[9, ] + - expected_counts_o1[10, ]) / 5 - expected_mean_density_binned_counts_o1 <- rbind( - mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 - ) - rownames(expected_mean_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 - 
mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + - expected_counts_o2[7, ]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + - expected_counts_o2[10, ]) / 3 - expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, - mean_counts_bin4_o2 - ) - rownames(expected_mean_density_binned_counts_o2) <- 1:4 +context( + paste( + "Measures Netdis: Ego-network density-binned reference counts for manually", + "verified networks" + ) +) +test_that( + paste( + "Ego-network 4-node density-binned reference counts match manually", + "verified totals" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + + # Set node and graphlet labels to use for row and col names in expected + # counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + exp_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + exp_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(exp_node_counts_o1, 2) + exp_densities_o1 <- c(exp_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. 
Ego-networks of order 2 + exp_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + exp_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(exp_node_counts_o2, 2) + exp_densities_o2 <- c(exp_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. Ego-networks of order 1 + exp_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + exp_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + exp_binned_densities_o1 <- list( + densities = exp_densities_o1, + interval_indexes = exp_interval_indexes_o1, + breaks = exp_breaks_o1 + ) + # Check binned densities are as expected + act_binned_densities_o1 <- binned_densities_adaptive( + exp_densities_o1, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(act_binned_densities_o1, exp_binned_densities_o1) + # 2. Ego-networks of order 2 + exp_min_break_o2 <- 1 / 3 + exp_max_break_o2 <- 0.6 + exp_initial_interval_o2 <- + (exp_max_break_o2 - exp_min_break_o2) / (num_intervals) + exp_breaks_o2 <- exp_min_break_o2 + + (exp_initial_interval_o2 * c(0, 9, 50, 63, 100)) + exp_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + exp_binned_densities_o2 <- list( + densities = exp_densities_o2, + interval_indexes = exp_interval_indexes_o2, + breaks = exp_breaks_o2 + ) + # Check binned densities are as expected + act_binned_densities_o2 <- binned_densities_adaptive( + exp_densities_o2, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(act_binned_densities_o2, exp_binned_densities_o2) + + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + exp_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / 
zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(exp_counts_o1) <- node_labels + colnames(exp_counts_o1) <- graphlet_labels + # 2-step ego networks + exp_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(exp_counts_o2) <- node_labels + colnames(exp_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (exp_counts_o1[1, ] + + exp_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (exp_counts_o1[6, ] + exp_counts_o1[7, ] + + exp_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (exp_counts_o1[3, ] + exp_counts_o1[4, ] + + exp_counts_o1[5, ] + exp_counts_o1[9, ] + + exp_counts_o1[10, ]) / 5 + exp_mean_density_binned_counts_o1 <- rbind( # nolint: object_length_linter. 
+ mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(exp_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (exp_counts_o2[1, ] + + exp_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (exp_counts_o2[2, ] + exp_counts_o2[6, ] + + exp_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (exp_counts_o2[3, ] + + exp_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (exp_counts_o2[8, ] + + exp_counts_o2[9, ] + exp_counts_o2[10, ]) / 3 + exp_mean_density_binned_counts_o2 <- rbind( # nolint: object_length_linter. + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(exp_mean_density_binned_counts_o2) <- 1:4 - # Calculate actual output of function under test - actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( - expected_counts_o1, expected_interval_indexes_o1 - ) - actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( - expected_counts_o2, expected_interval_indexes_o2 - ) + # Calculate actual output of function under test + act_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( # nolint: object_length_linter. + exp_counts_o1, exp_interval_indexes_o1 + ) + act_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( # nolint: object_length_linter. 
+ exp_counts_o2, exp_interval_indexes_o2 + ) - # Check actual output vs expected - expect_equal( - actual_mean_density_binned_counts_o1, - expected_mean_density_binned_counts_o1 - ) - expect_equal( - actual_mean_density_binned_counts_o2, - expected_mean_density_binned_counts_o2 - ) -}) + # Check actual output vs expected + expect_equal( + act_mean_density_binned_counts_o1, + exp_mean_density_binned_counts_o1 + ) + expect_equal( + act_mean_density_binned_counts_o2, + exp_mean_density_binned_counts_o2 + ) + } +) -context("Measures Netdis: scale_graphlet_counts_ego for manually verified networks") +context( + "Measures Netdis: scale_graphlet_counts_ego for manually verified networks" +) test_that("Ego-network 4-node graphlet counts match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets @@ -391,13 +487,14 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 + # Count graphlets in each ego network of the graph with neighbourhood sizes of + # 1 and 2 max_graphlet_size <- 4 min_ego_edges <- 0 min_ego_nodes <- 0 - # Use previously tested functions to generate ego networks and calcualte graphlet - # counts. + # Use previously tested functions to generate ego networks and calcualte + # graphlet counts. # ego nets ego_networks_o1 <- make_named_ego_graph(graph, order = 1, @@ -423,11 +520,11 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { # Calculate scaled counts with scale_graphlet_counts_ego # (function to test). 
- actual_counts_o1 <- + act_counts_o1 <- scale_graphlet_counts_ego(graphlet_counts_o1, max_graphlet_size = max_graphlet_size ) - actual_counts_o2 <- + act_counts_o2 <- scale_graphlet_counts_ego(graphlet_counts_o2, max_graphlet_size = max_graphlet_size ) @@ -436,7 +533,7 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { k <- graphlet_key$node_count # Set manually verified counts # 1-step ego networks - expected_counts_o1 <- rbind( + exp_counts_o1 <- rbind( c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), @@ -448,10 +545,10 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) ) - rownames(expected_counts_o1) <- node_labels - colnames(expected_counts_o1) <- graphlet_labels + rownames(exp_counts_o1) <- node_labels + colnames(exp_counts_o1) <- graphlet_labels # 2-step ego networks - expected_counts_o2 <- rbind( + exp_counts_o2 <- rbind( c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), @@ -463,301 +560,336 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) ) - rownames(expected_counts_o2) <- node_labels - colnames(expected_counts_o2) <- graphlet_labels + rownames(exp_counts_o2) <- node_labels + colnames(exp_counts_o2) <- graphlet_labels # Test that actual counts match expected - expect_equal(actual_counts_o1, expected_counts_o1) - expect_equal(actual_counts_o2, expected_counts_o2) + expect_equal(act_counts_o1, exp_counts_o1) + expect_equal(act_counts_o2, exp_counts_o2) }) 
-context("Measures Netdis: Ego-network density-binned counts for manually verified networks") -test_that("density_binned_counts output matches manually verified totals with different scaling and aggregation functions", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +context( + paste( + "Measures Netdis: Ego-network density-binned counts for manually", + "verified networks" + ) +) +test_that( + paste( + "density_binned_counts output matches manually verified totals with", + "different scaling and aggregation functions" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Set node and graphlet labels to use for row and col names in + # expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . 
Ego-networks of order 1 + exp_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + exp_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(exp_node_counts_o1, 2) + exp_densities_o1 <- c(exp_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + exp_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + exp_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(exp_node_counts_o2, 2) + exp_densities_o2 <- c(exp_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. Ego-networks of order 1 + exp_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + exp_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + exp_binned_densities_o1 <- list( + densities = exp_densities_o1, + interval_indexes = exp_interval_indexes_o1, + breaks = exp_breaks_o1 + ) + # 2. 
Ego-networks of order 2 + exp_min_break_o2 <- 1 / 3 + exp_max_break_o2 <- 0.6 + exp_initial_interval_o2 <- + (exp_max_break_o2 - exp_min_break_o2) / (num_intervals) + exp_breaks_o2 <- exp_min_break_o2 + + (exp_initial_interval_o2 * c(0, 9, 50, 63, 100)) + exp_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + exp_binned_densities_o2 <- list( + densities = exp_densities_o2, + interval_indexes = exp_interval_indexes_o2, + breaks = exp_breaks_o2 + ) - # Set parameters for test - max_graphlet_size <- 4 - min_counts_per_interval <- 2 - num_intervals <- 100 - min_ego_edges <- 0 - min_ego_nodes <- 0 + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + exp_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(exp_counts_o1) <- node_labels + colnames(exp_counts_o1) <- graphlet_labels + # 2-step ego networks + exp_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / 
zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(exp_counts_o2) <- node_labels + colnames(exp_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (exp_counts_o1[1, ] + + exp_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (exp_counts_o1[6, ] + exp_counts_o1[7, ] + + exp_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (exp_counts_o1[3, ] + exp_counts_o1[4, ] + + exp_counts_o1[5, ] + exp_counts_o1[9, ] + + exp_counts_o1[10, ]) / 5 + exp_mean_density_binned_counts_o1 <- rbind( # nolint: object_length_linter. + mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(exp_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 2, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (exp_counts_o2[1, ] + + exp_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (exp_counts_o2[2, ] + exp_counts_o2[6, ] + + exp_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (exp_counts_o2[3, ] + + exp_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (exp_counts_o2[8, ] + exp_counts_o2[9, ] + + exp_counts_o2[10, ]) / 3 + exp_mean_density_binned_counts_o2 <- rbind( # nolint: object_length_linter. 
+ mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(exp_mean_density_binned_counts_o2) <- 1:4 - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + # density_binned_counts with default arguments should give + # mean graphlet count in each density bin + act_density_binned_counts_o1 <- density_binned_counts( + exp_counts_o1, + exp_interval_indexes_o1 + ) - # Set manually verified ego-network node counts and edge densities - # 1 . Ego-networks of order 1 - expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) - expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) - # Order 1 expected densities should be: - # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 - # 2. Ego-networks of order 2 - expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) - expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) - # Order 2 expected densities should be: - # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + act_density_binned_counts_o2 <- density_binned_counts( + exp_counts_o2, + exp_interval_indexes_o2 + ) - # Set manually verified density bins for ego-networks - # 1. Ego-networks of order 1 - expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) - expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - expected_binned_densities_o1 <- list( - densities = expected_densities_o1, - interval_indexes = expected_interval_indexes_o1, - breaks = expected_breaks_o1 - ) - # 2. 
Ego-networks of order 2 - expected_min_break_o2 <- 1 / 3 - expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- - (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 - expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) - expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - expected_binned_densities_o2 <- list( - densities = expected_densities_o2, - interval_indexes = expected_interval_indexes_o2, - breaks = expected_breaks_o2 - ) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - expected_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_o1) <- node_labels - colnames(expected_counts_o1) <- graphlet_labels - # 2-step ego networks - expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / 
zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_o2) <- node_labels - colnames(expected_counts_o2) <- graphlet_labels - - # Calculate binned average expected counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + - expected_counts_o1[8, ]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + - expected_counts_o1[5, ] + expected_counts_o1[9, ] + - expected_counts_o1[10, ]) / 5 - expected_mean_density_binned_counts_o1 <- rbind( - mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 - ) - rownames(expected_mean_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 - mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + - expected_counts_o2[7, ]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + - expected_counts_o2[10, ]) / 3 - expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, - mean_counts_bin4_o2 - ) - rownames(expected_mean_density_binned_counts_o2) <- 1:4 - - # density_binned_counts with default arguments should give - # mean graphlet count in each density bin - actual_density_binned_counts_o1 <- density_binned_counts( - expected_counts_o1, - expected_interval_indexes_o1 - ) - - actual_density_binned_counts_o2 <- density_binned_counts( - expected_counts_o2, - expected_interval_indexes_o2 - ) - - # Check actual output vs expected - 
expect_equal( - actual_density_binned_counts_o1, - expected_mean_density_binned_counts_o1 - ) - expect_equal( - actual_density_binned_counts_o2, - expected_mean_density_binned_counts_o2 - ) - - # Calculate max binned counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - # apply(x, 2, max): returns max of each column in x - max_counts_bin1_o1 <- apply(rbind(expected_counts_o1[1, ], expected_counts_o1[2, ]), 2, max) - max_counts_bin2_o1 <- apply(rbind( - expected_counts_o1[6, ], expected_counts_o1[7, ], - expected_counts_o1[8, ] - ), 2, max) - max_counts_bin3_o1 <- apply(rbind( - expected_counts_o1[3, ], expected_counts_o1[4, ], - expected_counts_o1[5, ], expected_counts_o1[9, ], - expected_counts_o1[10, ] - ), 2, max) - - expected_max_density_binned_counts_o1 <- rbind( - max_counts_bin1_o1, max_counts_bin2_o1, max_counts_bin3_o1 - ) - rownames(expected_max_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - max_counts_bin1_o2 <- apply(rbind(expected_counts_o2[1, ], expected_counts_o2[4, ]), 2, max) - max_counts_bin2_o2 <- apply(rbind( - expected_counts_o2[2, ], expected_counts_o2[6, ], - expected_counts_o2[7, ] - ), 2, max) - max_counts_bin3_o2 <- apply(rbind(expected_counts_o2[3, ], expected_counts_o2[5, ]), 2, max) - max_counts_bin4_o2 <- apply(rbind( - expected_counts_o2[8, ], expected_counts_o2[9, ], - expected_counts_o2[10, ] - ), 2, max) - - expected_max_density_binned_counts_o2 <- rbind( - max_counts_bin1_o2, max_counts_bin2_o2, max_counts_bin3_o2, - max_counts_bin4_o2 - ) - rownames(expected_max_density_binned_counts_o2) <- 1:4 - - # density_binned_counts with agg_fn = max should give - # max graphlet count in each density bin - agg_fn <- max - scale_fn <- NULL - - actual_max_density_binned_counts_o1 <- density_binned_counts( - expected_counts_o1, - expected_interval_indexes_o1, - agg_fn = agg_fn, - scale_fn = scale_fn - ) - - 
actual_max_density_binned_counts_o2 <- density_binned_counts( - expected_counts_o2, - expected_interval_indexes_o2, - agg_fn = agg_fn, - scale_fn = scale_fn - ) - - # Check actual output vs expected - expect_equal( - actual_max_density_binned_counts_o1, - expected_max_density_binned_counts_o1 - ) - expect_equal( - actual_max_density_binned_counts_o2, - expected_max_density_binned_counts_o2 - ) + # Check actual output vs expected + expect_equal( + act_density_binned_counts_o1, + exp_mean_density_binned_counts_o1 + ) + expect_equal( + act_density_binned_counts_o2, + exp_mean_density_binned_counts_o2 + ) - # density_binned_counts with scale_fn = scale_graphlet_counts_ego - # should give mean graphlet counts in each density bin scaled by - # count_graphlet_tuples. - agg_fn <- mean - scale_fn <- scale_graphlet_counts_ego + # Calculate max binned counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + # apply(x, 2, max): returns max of each column in x + max_counts_bin1_o1 <- apply( + rbind(exp_counts_o1[1, ], exp_counts_o1[2, ]), 2, max + ) + max_counts_bin2_o1 <- apply(rbind( + exp_counts_o1[6, ], exp_counts_o1[7, ], + exp_counts_o1[8, ] + ), 2, max) + max_counts_bin3_o1 <- apply(rbind( + exp_counts_o1[3, ], exp_counts_o1[4, ], + exp_counts_o1[5, ], exp_counts_o1[9, ], + exp_counts_o1[10, ] + ), 2, max) + + exp_max_density_binned_counts_o1 <- rbind( # nolint: object_length_linter. 
+ max_counts_bin1_o1, max_counts_bin2_o1, max_counts_bin3_o1 + ) + rownames(exp_max_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 2, 3, 1, 3, 2, 2, 4, 4, 4 + max_counts_bin1_o2 <- apply( + rbind(exp_counts_o2[1, ], exp_counts_o2[4, ]), 2, max + ) + max_counts_bin2_o2 <- apply(rbind( + exp_counts_o2[2, ], exp_counts_o2[6, ], + exp_counts_o2[7, ] + ), 2, max) + max_counts_bin3_o2 <- apply( + rbind(exp_counts_o2[3, ], exp_counts_o2[5, ]), 2, max + ) + max_counts_bin4_o2 <- apply(rbind( + exp_counts_o2[8, ], exp_counts_o2[9, ], + exp_counts_o2[10, ] + ), 2, max) + + exp_max_density_binned_counts_o2 <- rbind( # nolint: object_length_linter. + max_counts_bin1_o2, max_counts_bin2_o2, max_counts_bin3_o2, + max_counts_bin4_o2 + ) + rownames(exp_max_density_binned_counts_o2) <- 1:4 + + # density_binned_counts with agg_fn = max should give + # max graphlet count in each density bin + agg_fn <- max + scale_fn <- NULL + + act_max_density_binned_counts_o1 <- density_binned_counts( # nolint: object_length_linter. + exp_counts_o1, + exp_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn + ) - # calculate expected counts using previously tested function - expected_scaled_counts_o1 <- - scale_graphlet_counts_ego(expected_counts_o1, - max_graphlet_size = max_graphlet_size + act_max_density_binned_counts_o2 <- density_binned_counts( # nolint: object_length_linter. 
+ exp_counts_o2, + exp_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn ) - expected_scaled_counts_o2 <- - scale_graphlet_counts_ego(expected_counts_o2, - max_graphlet_size = max_graphlet_size + + # Check actual output vs expected + expect_equal( + act_max_density_binned_counts_o1, + exp_max_density_binned_counts_o1 + ) + expect_equal( + act_max_density_binned_counts_o2, + exp_max_density_binned_counts_o2 ) - # calculate mean expected counts using expected density bins - mean_scaled_counts_bin1_o1 <- (expected_scaled_counts_o1[1, ] + expected_scaled_counts_o1[2, ]) / 2 - mean_scaled_counts_bin2_o1 <- (expected_scaled_counts_o1[6, ] + expected_scaled_counts_o1[7, ] + - expected_scaled_counts_o1[8, ]) / 3 - mean_scaled_counts_bin3_o1 <- (expected_scaled_counts_o1[3, ] + expected_scaled_counts_o1[4, ] + - expected_scaled_counts_o1[5, ] + expected_scaled_counts_o1[9, ] + - expected_scaled_counts_o1[10, ]) / 5 - expected_scaled_density_binned_counts_o1 <- rbind( - mean_scaled_counts_bin1_o1, mean_scaled_counts_bin2_o1, mean_scaled_counts_bin3_o1 - ) - rownames(expected_scaled_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_scaled_counts_bin1_o2 <- (expected_scaled_counts_o2[1, ] + expected_scaled_counts_o2[4, ]) / 2 - mean_scaled_counts_bin2_o2 <- (expected_scaled_counts_o2[2, ] + expected_scaled_counts_o2[6, ] + - expected_scaled_counts_o2[7, ]) / 3 - mean_scaled_counts_bin3_o2 <- (expected_scaled_counts_o2[3, ] + expected_scaled_counts_o2[5, ]) / 2 - mean_scaled_counts_bin4_o2 <- (expected_scaled_counts_o2[8, ] + expected_scaled_counts_o2[9, ] + - expected_scaled_counts_o2[10, ]) / 3 - expected_scaled_density_binned_counts_o2 <- rbind( - mean_scaled_counts_bin1_o2, mean_scaled_counts_bin2_o2, mean_scaled_counts_bin3_o2, - mean_scaled_counts_bin4_o2 - ) - rownames(expected_scaled_density_binned_counts_o2) <- 1:4 - - # Calculate scaled binned counts with density_binned_counts (function to test) - 
actual_scaled_density_binned_counts_o1 <- density_binned_counts( - expected_counts_o1, - expected_interval_indexes_o1, - agg_fn = agg_fn, - scale_fn = scale_fn, - max_graphlet_size = max_graphlet_size - ) + # density_binned_counts with scale_fn = scale_graphlet_counts_ego + # should give mean graphlet counts in each density bin scaled by + # count_graphlet_tuples. + agg_fn <- mean + scale_fn <- scale_graphlet_counts_ego + + # calculate expected counts using previously tested function + exp_scaled_counts_o1 <- + scale_graphlet_counts_ego(exp_counts_o1, + max_graphlet_size = max_graphlet_size + ) + exp_scaled_counts_o2 <- + scale_graphlet_counts_ego(exp_counts_o2, + max_graphlet_size = max_graphlet_size + ) + + # calculate mean expected counts using expected density bins + mean_scaled_counts_bin1_o1 <- (exp_scaled_counts_o1[1, ] + + exp_scaled_counts_o1[2, ]) / 2 + mean_scaled_counts_bin2_o1 <- (exp_scaled_counts_o1[6, ] + + exp_scaled_counts_o1[7, ] + + exp_scaled_counts_o1[8, ]) / 3 + mean_scaled_counts_bin3_o1 <- (exp_scaled_counts_o1[3, ] + + exp_scaled_counts_o1[4, ] + + exp_scaled_counts_o1[5, ] + + exp_scaled_counts_o1[9, ] + + exp_scaled_counts_o1[10, ]) / 5 + exp_scaled_density_binned_counts_o1 <- rbind( # nolint: object_length_linter. + mean_scaled_counts_bin1_o1, + mean_scaled_counts_bin2_o1, + mean_scaled_counts_bin3_o1 + ) + rownames(exp_scaled_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 2, 3, 1, 3, 2, 2, 4, 4, 4 + mean_scaled_counts_bin1_o2 <- (exp_scaled_counts_o2[1, ] + + exp_scaled_counts_o2[4, ]) / 2 + mean_scaled_counts_bin2_o2 <- (exp_scaled_counts_o2[2, ] + + exp_scaled_counts_o2[6, ] + + exp_scaled_counts_o2[7, ]) / 3 + mean_scaled_counts_bin3_o2 <- (exp_scaled_counts_o2[3, ] + + exp_scaled_counts_o2[5, ]) / 2 + mean_scaled_counts_bin4_o2 <- (exp_scaled_counts_o2[8, ] + + exp_scaled_counts_o2[9, ] + + exp_scaled_counts_o2[10, ]) / 3 + exp_scaled_density_binned_counts_o2 <- rbind( # nolint: object_length_linter. 
+ mean_scaled_counts_bin1_o2, + mean_scaled_counts_bin2_o2, + mean_scaled_counts_bin3_o2, + mean_scaled_counts_bin4_o2 + ) + rownames(exp_scaled_density_binned_counts_o2) <- 1:4 + + # Calculate scaled binned counts with density_binned_counts (function to + # test) + act_scaled_density_binned_counts_o1 <- density_binned_counts( # nolint: object_length_linter. + exp_counts_o1, + exp_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn, + max_graphlet_size = max_graphlet_size + ) - actual_scaled_density_binned_counts_o2 <- density_binned_counts( - expected_counts_o2, - expected_interval_indexes_o2, - agg_fn = agg_fn, - scale_fn = scale_fn, - max_graphlet_size = max_graphlet_size - ) + act_scaled_density_binned_counts_o2 <- density_binned_counts( # nolint: object_length_linter. + exp_counts_o2, + exp_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn, + max_graphlet_size = max_graphlet_size + ) - # Check actual output vs expected - expect_equal( - actual_scaled_density_binned_counts_o1, - expected_scaled_density_binned_counts_o1 - ) - expect_equal( - actual_scaled_density_binned_counts_o2, - expected_scaled_density_binned_counts_o2 - ) -}) + # Check actual output vs expected + expect_equal( + act_scaled_density_binned_counts_o1, + exp_scaled_density_binned_counts_o1 + ) + expect_equal( + act_scaled_density_binned_counts_o2, + exp_scaled_density_binned_counts_o2 + ) + } +) context("Measures Netdis: Expected graphlet counts") test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { @@ -801,17 +933,17 @@ test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { ) # WITH scale_fn = NULL (bin counts directly with no scaling) # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index) { + exp_exp_graphlet_counts_fn <- function(density_index) { scaled_reference_counts[density_index, ] } # Determine expected and actual expected graphlet counts - 
expected_expected_graphlet_counts <- - purrr::map(density_indexes, expected_expected_graphlet_counts_fn) - actual_expected_graphlet_counts <- + exp_exp_graphlet_counts <- + purrr::map(density_indexes, exp_exp_graphlet_counts_fn) + act_exp_graphlet_counts <- purrr::map(graphlet_counts, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, - density_binned_reference_counts = scaled_reference_counts, + density_binned_ref_counts = scaled_reference_counts, scale_fn = NULL ) @@ -819,37 +951,41 @@ test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { # NOTE: v2.0.0 of testthat library made a breaking change that means using # map, mapply etc can cause failures under certain conditions # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 - for (i in 1:length(actual_expected_graphlet_counts)) { + for (i in 1:length(act_exp_graphlet_counts)) { expect_equal( - actual_expected_graphlet_counts[i], - expected_expected_graphlet_counts[i] + act_exp_graphlet_counts[i], + exp_exp_graphlet_counts[i] ) } # WITH scale_fn = count_graphlet_tuples (default netdis from paper) # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index, node_count) { + exp_exp_graphlet_counts_fn <- function(density_index, node_count) { reference_counts <- scaled_reference_counts[density_index, ] reference_counts * choose(node_count, graphlet_sizes) } # Determine expected and actual expected graphlet counts - expected_expected_graphlet_counts <- - purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) - actual_expected_graphlet_counts <- + exp_exp_graphlet_counts <- + purrr::map2( + density_indexes, + num_nodes, + exp_exp_graphlet_counts_fn + ) + act_exp_graphlet_counts <- purrr::map(graphlet_counts, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, - density_binned_reference_counts = 
scaled_reference_counts, + density_binned_ref_counts = scaled_reference_counts, scale_fn = count_graphlet_tuples ) # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using # map, mapply etc can cause failures under certain conditions # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 - for (i in 1:length(actual_expected_graphlet_counts)) { + for (i in 1:length(act_exp_graphlet_counts)) { expect_equal( - actual_expected_graphlet_counts[i], - expected_expected_graphlet_counts[i] + act_exp_graphlet_counts[i], + exp_exp_graphlet_counts[i] ) } }) @@ -899,13 +1035,17 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) max_edges_o1 <- choose(num_nodes_o1, 2) densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 + # Order 1 densities should be: + # 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 + # 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 # 2. 
Ego-networks of order 2 num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) max_edges_o2 <- choose(num_nodes_o2, 2) densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 + # Order 2 densities should be: + # 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 + # 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 # Set manually defined density breaks and indexes breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) @@ -924,7 +1064,7 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { c(91, 92, 93, 94, 95, 96, 97, 98, 99) ) colnames(scaled_reference_counts) <- graphlet_labels - expected_dims <- dim(scaled_reference_counts) + exp_dims <- dim(scaled_reference_counts) min_ego_nodes <- 3 min_ego_edges <- 1 @@ -932,31 +1072,33 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { # With scale_fn = count_graphlet_tuples (default netdis paper) #------------------------------------------------------- # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index, node_count) { + exp_exp_graphlet_counts_fn <- function(density_index, node_count) { reference_counts <- scaled_reference_counts[density_index, ] reference_counts * choose(node_count, graphlet_sizes) } # Calculate expected graphlet counts. 
NOTE: We expect a matrix with graphlet # types as columns and ego networks for nodes in graph as rows - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( - density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn + exp_exp_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( + density_indexes_o1, num_nodes_o1, exp_exp_graphlet_counts_fn ))) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( - density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn + exp_exp_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( + density_indexes_o2, num_nodes_o2, exp_exp_graphlet_counts_fn ))) # Sanity check for expected output shape. Should be matrix with graphlet types # as columns and nodes as rows - expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) - expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) + expect_equal(dim(exp_exp_graphlet_counts_ego_o1), exp_dims) + expect_equal(dim(exp_exp_graphlet_counts_ego_o2), exp_dims) # Set column labels to graphlet names - colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels - colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels + colnames(exp_exp_graphlet_counts_ego_o1) <- graphlet_labels + colnames(exp_exp_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o1) + rownames(exp_exp_graphlet_counts_ego_o1) <- + rownames(graphlet_counts_ego_o1) + rownames(exp_exp_graphlet_counts_ego_o2) <- + rownames(graphlet_counts_ego_o1) # Calculate actual output of function under test - actual_expected_graphlet_counts_ego_o1 <- + act_exp_graphlet_counts_ego_o1 <- netdis_expected_counts( graphlet_counts_ego_o1, breaks, @@ -964,7 +1106,7 @@ test_that("netdis_expected_counts works for graphlets 
up to 4 nodes", { max_graphlet_size, scale_fn = count_graphlet_tuples ) - actual_expected_graphlet_counts_ego_o2 <- + act_exp_graphlet_counts_ego_o2 <- netdis_expected_counts( graphlet_counts_ego_o2, breaks, @@ -975,42 +1117,44 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { # Compare actual to expected expect_equal( - actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1 + act_exp_graphlet_counts_ego_o1, + exp_exp_graphlet_counts_ego_o1 ) expect_equal( - actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2 + act_exp_graphlet_counts_ego_o2, + exp_exp_graphlet_counts_ego_o2 ) #------------------------------------------------------- # With scale_fn = NULL (take reference counts directly) #------------------------------------------------------- # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index) { + exp_exp_graphlet_counts_fn <- function(density_index) { scaled_reference_counts[density_index, ] } # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet # types as columns and ego networks for nodes in graph as rows - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map( - density_indexes_o1, expected_expected_graphlet_counts_fn + exp_exp_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map( + density_indexes_o1, exp_exp_graphlet_counts_fn ))) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map( - density_indexes_o2, expected_expected_graphlet_counts_fn + exp_exp_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map( + density_indexes_o2, exp_exp_graphlet_counts_fn ))) # Sanity check for expected output shape. 
Should be matrix with graphlet types # as columns and nodes as rows - expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) - expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) + expect_equal(dim(exp_exp_graphlet_counts_ego_o1), exp_dims) + expect_equal(dim(exp_exp_graphlet_counts_ego_o2), exp_dims) # Set column labels to graphlet names - colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels - colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels + colnames(exp_exp_graphlet_counts_ego_o1) <- graphlet_labels + colnames(exp_exp_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o2) + rownames(exp_exp_graphlet_counts_ego_o1) <- + rownames(graphlet_counts_ego_o1) + rownames(exp_exp_graphlet_counts_ego_o2) <- + rownames(graphlet_counts_ego_o2) # Calculate actual output of function under test - actual_expected_graphlet_counts_ego_o1 <- + act_exp_graphlet_counts_ego_o1 <- netdis_expected_counts( graphlet_counts_ego_o1, breaks, @@ -1018,7 +1162,7 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { max_graphlet_size, scale_fn = NULL ) - actual_expected_graphlet_counts_ego_o2 <- + act_exp_graphlet_counts_ego_o2 <- netdis_expected_counts( graphlet_counts_ego_o2, breaks, @@ -1029,12 +1173,12 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { # Compare actual to expected expect_equal( - actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1 + act_exp_graphlet_counts_ego_o1, + exp_exp_graphlet_counts_ego_o1 ) expect_equal( - actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2 + act_exp_graphlet_counts_ego_o2, + exp_exp_graphlet_counts_ego_o2 ) }) @@ -1056,44 +1200,47 @@ test_that("netdis statistic 
function output matches manually verified result", { names(counts_2) <- ids # manually verified results - expected_netdis_3 <- 0.03418796 - expected_netdis_4 <- 0.02091792 - expected_netdis_5 <- 0.03826385 + exp_netdis_3 <- 0.03418796 + exp_netdis_4 <- 0.02091792 + exp_netdis_5 <- 0.03826385 # check function to test - actual_netdis_3 <- netdis(counts_1, counts_2, 3) - actual_netdis_4 <- netdis(counts_1, counts_2, 4) - actual_netdis_5 <- netdis(counts_1, counts_2, 5) + act_netdis_3 <- netdis(counts_1, counts_2, 3) + act_netdis_4 <- netdis(counts_1, counts_2, 4) + act_netdis_5 <- netdis(counts_1, counts_2, 5) - expect_equal(expected_netdis_3, actual_netdis_3) - expect_equal(expected_netdis_4, actual_netdis_4) - expect_equal(expected_netdis_5, actual_netdis_5) + expect_equal(exp_netdis_3, act_netdis_3) + expect_equal(exp_netdis_4, act_netdis_4) + expect_equal(exp_netdis_5, act_netdis_5) }) -test_that("netdis_uptok gives expected netdis result for graphlets up to size k", { - # arbitrary counts of correct size for graphlets up to size 5 - counts_1 <- c( - 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, - 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 - ) - counts_2 <- c( - 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, - 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 - ) +test_that( + "netdis_uptok gives expected netdis result for graphlets up to size k", + { + # arbitrary counts of correct size for graphlets up to size 5 + counts_1 <- c( + 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 + ) + counts_2 <- c( + 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 + ) - # add graphlet names - ids <- graphlet_key(5)$id - names(counts_1) <- ids - names(counts_2) <- ids + # add graphlet names + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids - # manually verified results - expected_netdis <- c(0.03418796, 
0.02091792, 0.03826385) - names(expected_netdis) <- c("netdis3", "netdis4", "netdis5") + # manually verified results + exp_netdis <- c(0.03418796, 0.02091792, 0.03826385) + names(exp_netdis) <- c("netdis3", "netdis4", "netdis5") - # check function to test - actual_netdis <- netdis_uptok(counts_1, counts_2, 5) + # check function to test + act_netdis <- netdis_uptok(counts_1, counts_2, 5) - expect_equal(expected_netdis, actual_netdis) -}) + expect_equal(exp_netdis, act_netdis) + } +) context("Netdis: full calculation pipeline") test_that("netdis_many_to_many gives expected result", { @@ -1129,18 +1276,18 @@ test_that("netdis_many_to_many gives expected result", { # 4 ECL HSV-1 2 3 # 5 ECL KSHV 2 4 # 6 HSV-1 KSHV 3 4 - expected_netdis_netdis <- matrix(nrow = 2, ncol = 6) - expected_netdis_netdis[1, ] <- c( + exp_netdis_netdis <- matrix(nrow = 2, ncol = 6) + exp_netdis_netdis[1, ] <- c( 0.1846655, 0.008264222, 0.01005385, 0.2065762, 0.2091241, 0.0001335756 ) - expected_netdis_netdis[2, ] <- c( + exp_netdis_netdis[2, ] <- c( 0.1749835, 0.165264120, 0.01969246, 0.2917612, 0.2215579, 0.0760242643 ) - rownames(expected_netdis_netdis) <- c("netdis3", "netdis4") + rownames(exp_netdis_netdis) <- c("netdis3", "netdis4") - expected_netdis_comp_spec <- cross_comparison_spec( + exp_netdis_comp_spec <- cross_comparison_spec( list( "EBV" = c(), "ECL" = c(), @@ -1149,14 +1296,14 @@ test_that("netdis_many_to_many gives expected result", { ) ) - expected_netdis <- list( - netdis = expected_netdis_netdis, - comp_spec = expected_netdis_comp_spec + exp_netdis <- list( + netdis = exp_netdis_netdis, + comp_spec = exp_netdis_comp_spec ) # Calculate netdis statistics - actual_netdis <- netdis_many_to_many(graphs, + act_netdis <- netdis_many_to_many(graphs, ref_graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = neighbourhood_size, @@ -1165,7 +1312,7 @@ test_that("netdis_many_to_many gives expected result", { ) # Check results as expected - expect_equal(expected_netdis, 
actual_netdis, tolerance = .001, scale = 1) + expect_equal(exp_netdis, act_netdis, tolerance = .001, scale = 1) }) context("Netdis: functions for different pairwise comparisons") @@ -1194,11 +1341,11 @@ test_that("netdis_one_to_one gives expected result", { min_ego_edges <- 1 # manually verified results - expected_netdis <- c(0.1846655, 0.1749835) - names(expected_netdis) <- c("netdis3", "netdis4") + exp_netdis <- c(0.1846655, 0.1749835) + names(exp_netdis) <- c("netdis3", "netdis4") # check function to test - actual_netdis <- netdis_one_to_one(graph_1, + act_netdis <- netdis_one_to_one(graph_1, graph_2, ref_graph, max_graphlet_size = max_graphlet_size, @@ -1207,7 +1354,7 @@ test_that("netdis_one_to_one gives expected result", { min_ego_edges = min_ego_edges ) - expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) + expect_equal(exp_netdis, act_netdis, tolerance = .001, scale = 1) }) test_that("netdis_one_to_many gives expected result", { # Set source directory for Virus PPI graph edge files @@ -1233,14 +1380,14 @@ test_that("netdis_one_to_many gives expected result", { # ECL HSV-1 KSHV VZV # netdis3 0.1846655 0.008264222 0.01005385 0.006777578 # netdis4 0.1749835 0.165264120 0.01969246 0.159711160 - expected_netdis <- matrix(nrow = 2, ncol = 4) - colnames(expected_netdis) <- c("ECL", "HSV-1", "KSHV", "VZV") - rownames(expected_netdis) <- c("netdis3", "netdis4") - expected_netdis[1, ] <- c(0.1846655, 0.008264222, 0.01005385, 0.006777578) - expected_netdis[2, ] <- c(0.1749835, 0.165264120, 0.01969246, 0.159711160) + exp_netdis <- matrix(nrow = 2, ncol = 4) + colnames(exp_netdis) <- c("ECL", "HSV-1", "KSHV", "VZV") + rownames(exp_netdis) <- c("netdis3", "netdis4") + exp_netdis[1, ] <- c(0.1846655, 0.008264222, 0.01005385, 0.006777578) + exp_netdis[2, ] <- c(0.1749835, 0.165264120, 0.01969246, 0.159711160) # Calculate netdis statistics - actual_netdis <- netdis_one_to_many(graph_1, graphs_compare, + act_netdis <- netdis_one_to_many(graph_1, 
graphs_compare, ref_graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = neighbourhood_size, @@ -1249,7 +1396,7 @@ test_that("netdis_one_to_many gives expected result", { ) # Check results as expected - expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) + expect_equal(exp_netdis, act_netdis, tolerance = .001, scale = 1) }) context("Netdis: error if no query graphs or graphlet counts provided") @@ -1411,10 +1558,10 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson # manually verified result for graphlets of size 4 # verified using a different implementation of geometric poisson with these # networks. - expected_netdis4 <- 0.1892716 + exp_netdis4 <- 0.1892716 # check function to test - actual_netdis <- netdis_one_to_one(graph_1, + act_netdis <- netdis_one_to_one(graph_1, graph_2, ref_graph = NULL, max_graphlet_size = max_graphlet_size, @@ -1423,7 +1570,7 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson min_ego_edges = min_ego_edges, ) - expect_equal(expected_netdis4, actual_netdis[["netdis4"]], + expect_equal(exp_netdis4, act_netdis[["netdis4"]], tolerance = .0001, scale = 1 ) }) @@ -1449,8 +1596,3 @@ test_that("netdis_uptok errors for unsupported max_graphlet_size", { # graphlet size less than 3 expect_error(netdis_uptok(counts_1, counts_2, 2)) }) - -context("Netdis: works correctly when using a single density bin") -test_that("netdis single density bin works correctly", { - # TODO -}) diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index 6a99c71b..6f68111e 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -1,6 +1,10 @@ -self_net_emd <- function(histogram, shift, method) { - netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method) +self_netemd <- function(histogram, shift, method) { + netemd_one_to_one( + dhists_1 = histogram, + 
dhists_2 = shift_dhist(histogram, shift), + method = method + ) } expected <- 0 @@ -8,32 +12,30 @@ locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) histogram <- dhist(locations = locations, masses = masses) -expect_equal(self_net_emd(histogram, shift = 1, "optimise"), expected) -expect_equal(self_net_emd(histogram, shift = 1, "exhaustive"), expected) -expect_equal(self_net_emd(histogram, shift = 0.5, "optimise"), expected) -expect_equal(self_net_emd(histogram, shift = 0.5, "exhaustive"), expected) -expect_equal(self_net_emd(histogram, shift = 0.1, "optimise"), expected) -expect_equal(self_net_emd(histogram, shift = 0.1, "exhaustive"), expected) -expect_equal(self_net_emd(histogram, shift = 0.05, "optimise"), expected) -expect_equal(self_net_emd(histogram, shift = 0.05, "exhaustive"), expected) -expect_equal(self_net_emd(histogram, shift = 0.01, "optimise"), expected) -expect_equal(self_net_emd(histogram, shift = 0.01, "exhaustive"), expected) -expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) -expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) +expect_equal(self_netemd(histogram, shift = 1, "optimise"), expected) +expect_equal(self_netemd(histogram, shift = 1, "exhaustive"), expected) +expect_equal(self_netemd(histogram, shift = 0.5, "optimise"), expected) +expect_equal(self_netemd(histogram, shift = 0.5, "exhaustive"), expected) +expect_equal(self_netemd(histogram, shift = 0.1, "optimise"), expected) +expect_equal(self_netemd(histogram, shift = 0.1, "exhaustive"), expected) +expect_equal(self_netemd(histogram, shift = 0.05, "optimise"), expected) +expect_equal(self_netemd(histogram, shift = 0.05, "exhaustive"), expected) +expect_equal(self_netemd(histogram, shift = 0.01, "optimise"), expected) +expect_equal(self_netemd(histogram, shift = 0.01, "exhaustive"), expected) +expect_equal(self_netemd(histogram, shift = 0, "optimise"), expected) +expect_equal(self_netemd(histogram, 
shift = 0, "exhaustive"), expected) expect_self_netemd_correct <- function(histogram, shift, method, - return_details = FALSE) { - self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), + return_details = FALSE) { + self_netemd <- netemd_one_to_one( + dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method, return_details = return_details ) - loc <- histogram$locations - mass <- histogram$masses - var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 expected <- list( - net_emd = 0, min_emds = 0, min_offsets = shift, + netemd = 0, min_emds = 0, min_offsets = shift, min_offsets_std = 0 ) - expect_equal(self_net_emd, expected) + expect_equal(self_netemd, expected) # nolint: object_usage_linter. } locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) @@ -88,40 +90,48 @@ expect_self_netemd_correct(histogram, return_details = TRUE ) -test_that("net_emd returns 0 when comparing any normal histogram against itself (no offset)", { - num_hists <- 5 - num_bins <- 101 +test_that( + paste( + "netemd returns 0 when comparing any normal histogram against itself", + "(no offset)" + ), + { + num_hists <- 5 + num_bins <- 101 - mus <- runif(num_hists, -10, 10) - sigmas <- runif(num_hists, 0, 10) + mus <- runif(num_hists, -10, 10) + sigmas <- runif(num_hists, 0, 10) - rand_locations <- function(mu, sigma) { - return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) - } + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } - rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { - locations <- rand_locations(mu, sigma) - masses <- dnorm(locations, mean = mu, sd = sigma) - return(dhist(masses = masses, locations = locations)) - }) + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { + locations <- rand_locations(mu, sigma) + masses <- dnorm(locations, mean = mu, sd = sigma) + return(dhist(masses = masses, 
locations = locations)) + }) - expected <- 0 - actuals_opt <- purrr::map(rand_dhists, function(dhist) { - netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") - }) - purrr::walk(actuals_opt, function(actual) { - expect_equal(actual, expected) - }) + expected <- 0 + actuals_opt <- purrr::map(rand_dhists, function(dhist) { + netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") + }) + purrr::walk(actuals_opt, function(actual) { + expect_equal(actual, expected) + }) - actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { - netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive") - }) - purrr::walk(actuals_exhaustive_default, function(actual) { - expect_equal(actual, expected) - }) -}) + actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { + netemd_one_to_one( + dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive" + ) + }) + purrr::walk(actuals_exhaustive_default, function(actual) { + expect_equal(actual, expected) + }) + } +) -test_that("net_emd returns 0 when comparing any normal histogram randomly offset +test_that("netemd returns 0 when comparing any normal histogram randomly offset against itself", { num_hists <- 2 num_bins <- 101 @@ -145,7 +155,11 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset netemd_offset_self <- function(dhist, offsets, method) { netemds <- purrr::map_dbl(offsets, function(offset) { - netemd_one_to_one(dhists_1 = dhist, dhists_2 = shift_dhist(dhist, offset), method = method) + netemd_one_to_one( + dhists_1 = dhist, + dhists_2 = shift_dhist(dhist, offset), + method = method + ) }) return(netemds) } @@ -175,7 +189,7 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset }) }) -test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any +test_that("netemd returns min_emd = 0 and min_offset = 0 when comparing any normal histogram randomly offset against itself", 
{ num_hists <- 2 num_bins <- 101 @@ -199,16 +213,20 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any expect_self_netemd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift),method = method, return_details = return_details + self_netemd <- netemd_one_to_one( + dhists_1 = histogram, + dhists_2 = shift_dhist(histogram, shift), + method = method, + return_details = return_details ) loc <- histogram$locations mass <- histogram$masses var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 expected <- list( - net_emd = 0, min_emds = 0, min_offsets = shift, + netemd = 0, min_emds = 0, min_offsets = shift, min_offsets_std = 0 ) - expect_equal(self_net_emd, expected) + expect_equal(self_netemd, expected) } purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { @@ -228,34 +246,59 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any }) }) -test_that("net_emd returns analytically derived non-zero solutions for distributions - where the analytical solution is known", { - # Helper functions to create dhists for a given value of "p" - two_bin_dhist <- function(p) { - dhist(locations = c(0, 1), masses = c(p, 1 - p)) - } - three_bin_dhist <- function(p) { - dhist(locations = c(-1, 0, 1), masses = c(0.5 * p * (1 - p), 1 - (p * (1 - p)), 0.5 * p * (1 - p))) - } +test_that( + paste( + "netemd returns analytically derived non-zero solutions for", + "distributions where the analytical solution is known" + ), + { + # Helper functions to create dhists for a given value of "p" + two_bin_dhist <- function(p) { + dhist(locations = c(0, 1), masses = c(p, 1 - p)) + } + three_bin_dhist <- function(p) { + dhist( + locations = c(-1, 0, 1), + masses = c(0.5 * p * (1 - p), 1 - (p * (1 - p)), 0.5 * p * (1 - p)) + ) + } - # Helper function to test actual vs expected - test_pair <- function(p, 
expected) { - dhistA <- two_bin_dhist(p) - dhistB <- three_bin_dhist(p) - expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "exhaustive"), expected, tolerance = 1e-12) - # Even setting the stats::optimise method tolerance to machine double precision, the - # optimised NetEMD is ~1e-09, so set a slightly looser tolerance here - expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "optimise"), expected, tolerance = 1e-08) - } + # Helper function to test actual vs expected + test_pair <- function(p, expected) { + dhist_a <- two_bin_dhist(p) + dhist_b <- three_bin_dhist(p) + expect_equal( + netemd_one_to_one( + dhists_1 = dhist_a, + dhists_2 = dhist_b, + method = "exhaustive" + ), + expected, + tolerance = 1e-12 + ) + # Even setting the stats::optimise method tolerance to machine double + # precision, the optimised NetEMD is ~1e-09, so set a slightly looser + # tolerance here + expect_equal( + netemd_one_to_one( + dhists_1 = dhist_a, + dhists_2 = dhist_b, + method = "optimise" + ), + expected, + tolerance = 1e-08 + ) + } - # Test for p values with analytically calculated NetEMD - test_pair(1 / 2, 1) - test_pair(1 / 3, 1 / sqrt(2)) - test_pair(1 / 5, 1 / 2) -}) + # Test for p values with analytically calculated NetEMD + test_pair(1 / 2, 1) + test_pair(1 / 3, 1 / sqrt(2)) + test_pair(1 / 5, 1 / 2) + } +) context("Measures NetEMD: Virus PPI (EMD)") -# EMD and NET_EMD: Virus PPI datasets +# EMD and netemd: Virus PPI datasets test_that("emd return 0 when comparing graphlet orbit degree distributions of virus PPI graphs to themselves", { # Load viurs PPI network data in ORCA-compatible edge list format @@ -267,14 +310,14 @@ test_that("emd return 0 when comparing graphlet orbit degree distributions of # Map over virus PPI networks purrr::walk(virus_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { + purrr::walk(gdd, function(gdd_Ox) { # nolint: object_length_linter expect_equal(emd(gdd_Ox, gdd_Ox), 0) }) }) }) 
context("Measures NetEMD: Virus PPI (NetEMD)") -test_that("net_emd return 0 when comparing graphlet orbit degree distributions +test_that("netemd return 0 when comparing graphlet orbit degree distributions of virus PPI graphs to themselves", { # Load virus PPI network data in ORCA-compatible edge list format data_indexes <- 1:length(virusppi) @@ -291,8 +334,9 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over virus PPI networks purrr::walk(virus_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + purrr::walk(gdd, function(gdd_Ox) { # nolint: object_length_linter + expect_equalish(netemd_one_to_one( + dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, method = "optimise", smoothing_window_width = 0 ), 0) @@ -301,7 +345,7 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions }) context("Measures NetEMD: Random graphs (EMD)") -# EMD and NET_EMD: Random graph datasets +# EMD and netemd: Random graph datasets test_that("emd return 0 when comparing graphlet orbit degree distributions of random graphs to themselves", { # Load random graph data in ORCA-compatible edge list format @@ -317,14 +361,14 @@ test_that("emd return 0 when comparing graphlet orbit degree distributions of # Map over random graphs purrr::walk(random_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { + purrr::walk(gdd, function(gdd_Ox) { # nolint: object_length_linter expect_equal(emd(gdd_Ox, gdd_Ox), 0) }) }) }) context("Measures NetEMD: Random graphs (NetEMD)") -test_that("net_emd return 0 when comparing graphlet orbit degree distributions +test_that("netemd return 0 when comparing graphlet orbit degree distributions of random graphs to themselves", { # Load random graph data in ORCA-compatible edge list format random_graphs <- read_simple_graphs( @@ -345,8 +389,9 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over 
random graphs purrr::walk(random_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + purrr::walk(gdd, function(gdd_Ox) { # nolint: object_length_linter + expect_equalish(netemd_one_to_one( + dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, method = "optimise", smoothing_window_width = 0 ), 0) @@ -403,11 +448,16 @@ test_that("netemd_many_to_many works", { expected_netemd_fn <- function(gdds) { list( netemds = c( - netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$HSV), - netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$VZV), - netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 =gdds$ECL, dhists_2 = gdds$KSHV), - netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$HSV, dhists_2 = gdds$KSHV), - netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$KSHV, dhists_2 = gdds$VZV) + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$KSHV, dhists_2 = gdds$VZV) ), comp_spec = cross_comparison_spec(gdds) ) @@ -415,7 +465,7 @@ test_that("netemd_many_to_many works", { # Comparison function for clarity compare_fn <- function(gdds) { - expect_equal(netemd_many_to_many(dhists=gdds), expected_netemd_fn(gdds)) + 
expect_equal(netemd_many_to_many(dhists = gdds), expected_netemd_fn(gdds)) } # Map over test parameters, comparing actual gdds to expected diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index 5c48dbc4..67ab7212 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -1,48 +1,60 @@ context("ORCA interface: Graph to ORCA edgelist round-trip") test_that("Graph to indexed edge list round trip conversion works", { data_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - g_orig <- igraph::read_graph(file = file.path(data_dir, "EBV.txt"), format = "ncol") + g_orig <- igraph::read_graph( + file = file.path(data_dir, "EBV.txt"), + format = "ncol" + ) g_rtrip <- netdist::indexed_edges_to_graph(graph_to_indexed_edges(g_orig)) - expect_true(all.equal(igraph::get.edgelist(g_orig), igraph::get.edgelist(g_orig))) + expect_true(all.equal( + igraph::get.edgelist(g_orig), + igraph::get.edgelist(g_orig) + )) }) context("ORCA interface: Graphlet key") -test_that("graphlet_key gives correct output for all supported max graphlet sizes", { - correct_graphlet_key_2 <- list(max_nodes = 2, id = c("G0"), node_count = c(2)) - correct_graphlet_key_3 <- list( - max_nodes = 3, id = c("G0", "G1", "G2"), - node_count = c(2, 3, 3) - ) - correct_graphlet_key_4 <- list( - max_nodes = 4, - id = c( - "G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8" - ), - node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4) - ) - correct_graphlet_key_5 <- list( - max_nodes = 5, - id = c( - "G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8", "G9", "G10", "G11", "G12", - "G13", "G14", "G15", "G16", "G17", - "G18", "G19", "G20", "G21", "G22", - "G23", "G24", "G25", "G26", "G27", - "G28", "G29" - ), - node_count = c( - 2, 3, 3, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5 +test_that( + "graphlet_key gives correct output for all supported max graphlet sizes", + { + 
correct_graphlet_key_2 <- list( + max_nodes = 2, id = c("G0"), + node_count = c(2) ) - ) - expect_equal(graphlet_key(2), correct_graphlet_key_2) - expect_equal(graphlet_key(3), correct_graphlet_key_3) - expect_equal(graphlet_key(4), correct_graphlet_key_4) - expect_equal(graphlet_key(5), correct_graphlet_key_5) -}) + correct_graphlet_key_3 <- list( + max_nodes = 3, id = c("G0", "G1", "G2"), + node_count = c(2, 3, 3) + ) + correct_graphlet_key_4 <- list( + max_nodes = 4, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8" + ), + node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4) + ) + correct_graphlet_key_5 <- list( + max_nodes = 5, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8", "G9", "G10", "G11", "G12", + "G13", "G14", "G15", "G16", "G17", + "G18", "G19", "G20", "G21", "G22", + "G23", "G24", "G25", "G26", "G27", + "G28", "G29" + ), + node_count = c( + 2, 3, 3, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5 + ) + ) + expect_equal(graphlet_key(2), correct_graphlet_key_2) + expect_equal(graphlet_key(3), correct_graphlet_key_3) + expect_equal(graphlet_key(4), correct_graphlet_key_4) + expect_equal(graphlet_key(5), correct_graphlet_key_5) + } +) test_that("graphlet_key gives error for unsupported max graphlet sizes", { max_size_too_low <- c(1, 0, -1, -2, -3, -4, -5, -6) @@ -60,70 +72,73 @@ test_that("graphlet_key gives error for unsupported max graphlet sizes", { }) context("ORCA interface: Orbit key") -test_that("orbit_key gives correct output for all supported max graphlet sizes", { - correct_orbit_key_2 <- list(max_nodes = 2, id = c("O0"), node_count = c(2)) - correct_orbit_key_3 <- list( - max_nodes = 3, id = c("O0", "O1", "O2", "O3"), - node_count = c(2, 3, 3, 3) - ) - correct_orbit_key_4 <- list( - max_nodes = 4, - id = c( - "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14" - ), - node_count = c( - 2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 
+test_that( + "orbit_key gives correct output for all supported max graphlet sizes", + { + correct_orbit_key_2 <- list(max_nodes = 2, id = c("O0"), node_count = c(2)) + correct_orbit_key_3 <- list( + max_nodes = 3, id = c("O0", "O1", "O2", "O3"), + node_count = c(2, 3, 3, 3) ) - ) - correct_orbit_key_5 <- list( - max_nodes = 5, - id = c( - "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", - "O18", "O19", "O20", "O21", "O22", - "O23", "O24", "O25", "O26", "O27", "O28", "O29", - "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", - "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", - "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", - "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", - "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", - "O70", "O71", "O72" - ), - node_count = c( - 2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5 + correct_orbit_key_4 <- list( + max_nodes = 4, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 + ) ) - ) - expect_equal(orbit_key(2), correct_orbit_key_2) - expect_equal(orbit_key(3), correct_orbit_key_3) - expect_equal(orbit_key(4), correct_orbit_key_4) - expect_equal(orbit_key(5), correct_orbit_key_5) -}) + correct_orbit_key_5 <- list( + max_nodes = 5, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", + "O18", "O19", "O20", "O21", "O22", + "O23", "O24", "O25", "O26", "O27", "O28", "O29", + "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", + "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", + "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", + 
"O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", + "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", + "O70", "O71", "O72" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5 + ) + ) + expect_equal(orbit_key(2), correct_orbit_key_2) + expect_equal(orbit_key(3), correct_orbit_key_3) + expect_equal(orbit_key(4), correct_orbit_key_4) + expect_equal(orbit_key(5), correct_orbit_key_5) + } +) context("ORCA interface: Graph cross comparison") test_that("cross_comparison_spec works for virus PPI data", { # Load viurs PPI network data in ORCA-compatible edge list format - expected_name_A <- c( + exp_name_a <- c( rep("EBV", 4), rep("ECL", 3), rep("HSV-1", 2), rep("KSHV", 1), rep("VZV", 0) ) - expected_index_A <- c(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1), rep(5, 0)) - expected_name_B <- c( + exp_index_a <- c(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1), rep(5, 0)) + exp_name_b <- c( c("ECL", "HSV-1", "KSHV", "VZV"), c("HSV-1", "KSHV", "VZV"), c("KSHV", "VZV"), c("VZV") ) - expected_index_B <- c(c(2, 3, 4, 5), c(3, 4, 5), c(4, 5), c(5)) + exp_index_b <- c(c(2, 3, 4, 5), c(3, 4, 5), c(4, 5), c(5)) expected <- as.data.frame(cbind( - expected_name_A, expected_name_B, - expected_index_A, expected_index_B + exp_name_a, exp_name_b, + exp_index_a, exp_index_b )) colnames(expected) <- c("name_a", "name_b", "index_a", "index_b") @@ -143,7 +158,10 @@ test_that("cross_comparison_spec works for virus PPI data", { context("ORCA interface: Orbit count wrapper") test_that("Single and zero node graphs are gracefully handled", { - single_node_graph <- igraph::graph_from_adjacency_matrix(0, mode = "undirected") + single_node_graph <- igraph::graph_from_adjacency_matrix( + 0, + mode = "undirected" + ) zero_node_graph <- igraph::delete.vertices(single_node_graph, 1) names4 <- c( 
"O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", @@ -159,31 +177,31 @@ test_that("Single and zero node graphs are gracefully handled", { "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", "O70", "O71", "O72" )) - expected_zero_node_counts4 <- matrix(0, nrow = 0, ncol = length(names4)) - colnames(expected_zero_node_counts4) <- names4 - expected_zero_node_counts5 <- matrix(0, nrow = 0, ncol = length(names5)) - colnames(expected_zero_node_counts5) <- names5 + exp_zero_node_counts4 <- matrix(0, nrow = 0, ncol = length(names4)) + colnames(exp_zero_node_counts4) <- names4 + exp_zero_node_counts5 <- matrix(0, nrow = 0, ncol = length(names5)) + colnames(exp_zero_node_counts5) <- names5 - expected_single_node_counts4 <- matrix(0, nrow = 1, ncol = length(names4)) - colnames(expected_single_node_counts4) <- names4 - expected_single_node_counts5 <- matrix(0, nrow = 1, ncol = length(names5)) - colnames(expected_single_node_counts5) <- names5 + exp_single_node_counts4 <- matrix(0, nrow = 1, ncol = length(names4)) + colnames(exp_single_node_counts4) <- names4 + exp_single_node_counts5 <- matrix(0, nrow = 1, ncol = length(names5)) + colnames(exp_single_node_counts5) <- names5 expect_equal( - expected_zero_node_counts4, + exp_zero_node_counts4, count_orbits_per_node(zero_node_graph, max_graphlet_size = 4) ) expect_equal( - expected_zero_node_counts5, + exp_zero_node_counts5, count_orbits_per_node(zero_node_graph, max_graphlet_size = 5) ) expect_equal( - expected_single_node_counts4, + exp_single_node_counts4, count_orbits_per_node(single_node_graph, max_graphlet_size = 4) ) expect_equal( - expected_single_node_counts5, + exp_single_node_counts5, count_orbits_per_node(single_node_graph, max_graphlet_size = 5) ) }) @@ -248,7 +266,10 @@ test_that("simplify_graph works", { # 1a. 
Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_loops(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -259,7 +280,10 @@ test_that("simplify_graph works", { # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -270,7 +294,10 @@ test_that("simplify_graph works", { # 1c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -281,7 +308,10 @@ test_that("simplify_graph works", { # 1ab. Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -292,7 +322,10 @@ test_that("simplify_graph works", { # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -303,7 +336,10 @@ test_that("simplify_graph works", { # 1bc. 
Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -314,7 +350,10 @@ test_that("simplify_graph works", { # 1abc. Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -339,7 +378,10 @@ test_that("simplify_graph works", { # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -350,7 +392,10 @@ test_that("simplify_graph works", { # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "plus" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -358,11 +403,14 @@ test_that("simplify_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2ab. 
Loop + multiple edge removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -373,7 +421,10 @@ test_that("simplify_graph works", { # 2ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "plus" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -381,11 +432,14 @@ test_that("simplify_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -393,11 +447,15 @@ test_that("simplify_graph works", { remove_multiple = TRUE, remove_isolates = TRUE )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid + # generating multiple edges where nodes are mutually connected in adjacency + # matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -464,7 +522,10 @@ test_that("gdd simplifies works", { # 1a. Loop removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_loops(adj_mat), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -475,7 +536,10 @@ test_that("gdd simplifies works", { # 1b. Multiple edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -486,7 +550,10 @@ test_that("gdd simplifies works", { # 1c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -497,7 +564,10 @@ test_that("gdd simplifies works", { # 1ab. Loop + multiple edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -508,7 +578,10 @@ test_that("gdd simplifies works", { # 1ac. 
Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -519,7 +592,10 @@ test_that("gdd simplifies works", { # 1bc. Multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -530,7 +606,10 @@ test_that("gdd simplifies works", { # 1abc. Loop + multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -555,7 +634,10 @@ test_that("gdd simplifies works", { # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -566,7 +648,10 @@ test_that("gdd simplifies works", { # 2c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "plus" + ) ), gdd(simplify_graph( graph, @@ -574,11 +659,14 @@ test_that("gdd simplifies works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2ab. 
Loop + multiple edge removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -589,7 +677,10 @@ test_that("gdd simplifies works", { # 2ac. Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "plus" + ) ), gdd(simplify_graph( graph, @@ -597,11 +688,14 @@ test_that("gdd simplifies works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -609,11 +703,15 @@ test_that("gdd simplifies works", { remove_multiple = TRUE, remove_isolates = TRUE )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid + # generating multiple edges where nodes are mutually connected in adjacency + # matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -658,7 +756,10 @@ test_that("Features to Histograms Test", { expect_equal(res[[1]]$locations, c(10^-10, 10^-9, 10^-8, 10^-2, 10^3)) expect_equal(res[[1]]$masses, c(1, 1, 2, 1, 1)) # irrational - c1 <- matrix(c(pi, sqrt(2), sqrt(2) / pi, sqrt(3), sqrt(2), sqrt(2) / pi), nrow = 6) + c1 <- matrix( + c(pi, sqrt(2), sqrt(2) / pi, sqrt(3), sqrt(2), sqrt(2) / pi), + nrow = 6 + ) res <- graph_features_to_histograms(c1) expect_equal(res[[1]]$locations, c(sqrt(2) / pi, sqrt(2), sqrt(3), pi)) expect_equal(res[[1]]$masses, c(2, 2, 1, 1)) @@ -736,7 +837,10 @@ test_that("read_simple_graph works", { # 1a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_loops(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -747,7 +851,10 @@ test_that("read_simple_graph works", { # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -758,7 +865,10 @@ test_that("read_simple_graph works", { # 1c. 
Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -769,7 +879,10 @@ test_that("read_simple_graph works", { # 1ab. Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -780,7 +893,10 @@ test_that("read_simple_graph works", { # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -791,7 +907,10 @@ test_that("read_simple_graph works", { # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -802,7 +921,10 @@ test_that("read_simple_graph works", { # 1abc. 
Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -827,7 +949,10 @@ test_that("read_simple_graph works", { # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -838,7 +963,10 @@ test_that("read_simple_graph works", { # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "plus" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -846,11 +974,14 @@ test_that("read_simple_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -861,7 +992,10 @@ test_that("read_simple_graph works", { # 2ac. 
Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "plus" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -869,11 +1003,14 @@ test_that("read_simple_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -881,11 +1018,15 @@ test_that("read_simple_graph works", { remove_multiple = TRUE, remove_isolates = TRUE )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid + # generating multiple edges where nodes are mutually connected in adjacency + # matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -912,10 +1053,26 @@ test_that("read_simple_files works (all files in a directory)", { # Save graphs to temp directory format <- "graphml" base_dir <- tempdir() - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_1.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_2.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_3.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_4.txt"), format = format) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_1.txt"), + format = format + ) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_2.txt"), + format = format + ) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_3.txt"), + format = format + ) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_4.txt"), + format = format + ) # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { @@ -967,8 +1124,9 @@ test_that("orbit_to_graphlet_counts summation works", { edges <- graph_to_indexed_edges(graph) orbit_counts_4 <- orca::count4(edges) orbit_counts_5 <- orca::count5(edges) - # Define orbit indexes belonging to each graphlet using the xero-based indexing - # from the journal papers, adding one to conver tot he one-based indexing of R + # Define orbit indexes belonging to each graphlet using 
the xero-based + # indexing from the journal papers, adding one to conver to the one-based + # indexing of R g0_indexes <- c(0) + 1 g1_indexes <- c(1:2) + 1 g2_indexes <- c(3) + 1 @@ -1031,7 +1189,7 @@ test_that("orbit_to_graphlet_counts summation works", { g28_counts <- rowSums(orbit_counts_5[, g28_indexes, drop = FALSE]) g29_counts <- rowSums(orbit_counts_5[, g29_indexes, drop = FALSE]) # Define expected graphlet count matrix for graphlets up to 5 nodes - expected_graphlet_counts_5 <- + exp_graphlet_counts_5 <- cbind( g0_counts, g1_counts, g2_counts, g3_counts, g4_counts, g5_counts, g6_counts, g7_counts, g8_counts, g9_counts, g10_counts, g11_counts, @@ -1040,156 +1198,161 @@ test_that("orbit_to_graphlet_counts summation works", { g22_counts, g23_counts, g24_counts, g25_counts, g26_counts, g27_counts, g28_counts, g29_counts ) - colnames(expected_graphlet_counts_5) <- + colnames(exp_graphlet_counts_5) <- c( "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", "G20", "G21", "G22", "G23", "G24", "G25", "G26", "G27", "G28", "G29" ) - # Define epected graphlet count matrix for graphlets up to 4 nodes by selecting - # a subset of the matrix for graphlets up to 5 nodes - expected_graphlet_counts_4 <- expected_graphlet_counts_5[, 1:9] + # Define epected graphlet count matrix for graphlets up to 4 nodes by + # selecting a subset of the matrix for graphlets up to 5 nodes + exp_graphlet_counts_4 <- exp_graphlet_counts_5[, 1:9] # Calculate actual graphlet counts from functions under test actual_graphlet_counts_4 <- orbit_to_graphlet_counts(orbit_counts_4) actual_graphlet_counts_5 <- orbit_to_graphlet_counts(orbit_counts_5) # Check expected and actual graphlet counts match - expect_equal(actual_graphlet_counts_4, expected_graphlet_counts_4) - expect_equal(actual_graphlet_counts_5, expected_graphlet_counts_5) + expect_equal(actual_graphlet_counts_4, exp_graphlet_counts_4) + 
expect_equal(actual_graphlet_counts_5, exp_graphlet_counts_5) }) context("ORCA interface: Named ego networks") -test_that("make_named_ego_graph labels each ego-network with the correct node name", { - # Helper function to sort edgelists in consistent order - sort_edge_list <- function(edge_list) { - edge_list[order(edge_list[, 1], edge_list[, 2], decreasing = FALSE), ] - } - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - # The expectation below is based on igraph::graph_from_edgelist adding nodes - # in the order they appear in the edge list, and igraph::V returning them - # in this same order - expected_node_names <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10") +test_that( + "make_named_ego_graph labels each ego-network with the correct node name", + { + # Helper function to sort edgelists in consistent order + sort_edge_list <- function(edge_list) { + edge_list[order(edge_list[, 1], edge_list[, 2], decreasing = FALSE), ] + } + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + # The expectation below is based on igraph::graph_from_edgelist adding nodes + # in the order they appear in the edge list, and igraph::V returning them + # in this same 
order + exp_node_names <- c( + "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10" + ) - # Expected edgelists for ego networks of order 1 - expected_ego_elist_n1_o1 <- rbind( - c("n1", "n2"), - c("n1", "n4"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6") - ) - expected_ego_elist_n2_o1 <- rbind( - c("n1", "n2"), - c("n1", "n4"), - c("n2", "n3"), - c("n2", "n4"), - c("n2", "n5") - ) - expected_ego_elist_n3_o1 <- rbind( - c("n2", "n3") - ) - expected_ego_elist_n4_o1 <- rbind( - c("n1", "n2"), - c("n1", "n4"), - c("n1", "n6"), - c("n2", "n4"), - c("n4", "n6") - ) - expected_ego_elist_n5_o1 <- rbind( - c("n2", "n5") - ) - expected_ego_elist_n6_o1 <- rbind( - c("n1", "n4"), - c("n1", "n6"), - c("n4", "n6"), - c("n6", "n8") - ) - expected_ego_elist_n7_o1 <- rbind( - c("n1", "n7"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - expected_ego_elist_n8_o1 <- rbind( - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - expected_ego_elist_n9_o1 <- rbind( - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - expected_ego_elist_n10_o1 <- rbind( - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) + # Expected edgelists for ego networks of order 1 + exp_ego_elist_n1_o1 <- rbind( + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6") + ) + exp_ego_elist_n2_o1 <- rbind( + c("n1", "n2"), + c("n1", "n4"), + c("n2", "n3"), + c("n2", "n4"), + c("n2", "n5") + ) + exp_ego_elist_n3_o1 <- rbind( + c("n2", "n3") + ) + exp_ego_elist_n4_o1 <- rbind( + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n2", "n4"), + c("n4", "n6") + ) + exp_ego_elist_n5_o1 <- rbind( + c("n2", "n5") + ) + exp_ego_elist_n6_o1 <- rbind( + c("n1", "n4"), + c("n1", "n6"), + c("n4", "n6"), + c("n6", 
"n8") + ) + exp_ego_elist_n7_o1 <- rbind( + c("n1", "n7"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + exp_ego_elist_n8_o1 <- rbind( + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + exp_ego_elist_n9_o1 <- rbind( + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + exp_ego_elist_n10_o1 <- rbind( + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) - # Test ego-networks of order 1. - # We compare edgelists as igraphs do not implement comparison - order <- 1 - min_ego_nodes <- 0 - min_ego_edges <- 0 + # Test ego-networks of order 1. + # We compare edgelists as igraphs do not implement comparison + order <- 1 + min_ego_nodes <- 0 + min_ego_edges <- 0 - expected_ego_elists_o1 <- list( - n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), - n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), - n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), - n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), - n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), - n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), - n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), - n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), - n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), - n10 = dplyr::arrange(data.frame(expected_ego_elist_n10_o1), X1, X2) - ) - # Generate actual ego-networks and convert to edge lists for comparison - actual_ego_elists_o1 <- - purrr::map( - make_named_ego_graph(graph, order, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ), - function(g) { - dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) - } + exp_ego_elists_o1 <- list( + n1 = 
dplyr::arrange(data.frame(exp_ego_elist_n1_o1), X1, X2), + n2 = dplyr::arrange(data.frame(exp_ego_elist_n2_o1), X1, X2), + n3 = dplyr::arrange(data.frame(exp_ego_elist_n3_o1), X1, X2), + n4 = dplyr::arrange(data.frame(exp_ego_elist_n4_o1), X1, X2), + n5 = dplyr::arrange(data.frame(exp_ego_elist_n5_o1), X1, X2), + n6 = dplyr::arrange(data.frame(exp_ego_elist_n6_o1), X1, X2), + n7 = dplyr::arrange(data.frame(exp_ego_elist_n7_o1), X1, X2), + n8 = dplyr::arrange(data.frame(exp_ego_elist_n8_o1), X1, X2), + n9 = dplyr::arrange(data.frame(exp_ego_elist_n9_o1), X1, X2), + n10 = dplyr::arrange(data.frame(exp_ego_elist_n10_o1), X1, X2) ) - expect_equal(actual_ego_elists_o1, expected_ego_elists_o1) -}) + # Generate actual ego-networks and convert to edge lists for comparison + actual_ego_elists_o1 <- + purrr::map( + make_named_ego_graph(graph, order, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ), + function(g) { + dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) + } + ) + expect_equal(actual_ego_elists_o1, exp_ego_elists_o1) + } +) context("ORCA interface: Graphlet counts") test_that("count_graphlets_for_graph works", { @@ -1215,546 +1378,301 @@ test_that("count_graphlets_for_graph works", { graph <- igraph::graph_from_edgelist(elist, directed = FALSE) # Setgraphlet labels to use for names in expected counts - graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_labels <- c( + "N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8" + ) # Manually verified graphlet counts - expected_counts <- c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1) - names(expected_counts) <- graphlet_labels + exp_counts <- c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1) + names(exp_counts) <- graphlet_labels # Test actual_counts <- count_graphlets_for_graph(graph, max_graphlet_size = 4) - expect_equal(expected_counts, actual_counts) + expect_equal(exp_counts, actual_counts) }) context("ORCA interface: Ego-network graphlet counts") 
-test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manually verified totals for test graph", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +test_that( + paste( + "count_graphlets_ego: Ego-network 4-node graphlet counts match manually", + "verified totals for test graph" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + # Set node and graphlet labels to use for row and col names in expected + # counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c( + "N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8" + ) - max_graphlet_size <- 4 - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # Set manually verified counts - # 1-step ego networks - expected_counts_order_1 <- rbind( - c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), - c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), - 
c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) - ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels - # 2-step ego networks - expected_counts_order_2 <- rbind( - c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), - c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels + max_graphlet_size <- 4 + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + exp_counts_order_1 <- rbind( + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) + ) + rownames(exp_counts_order_1) <- node_labels + colnames(exp_counts_order_1) <- graphlet_labels + # 2-step ego networks + exp_counts_order_2 <- rbind( + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) + ) + rownames(exp_counts_order_2) <- node_labels + colnames(exp_counts_order_2) <- graphlet_labels - # Count graphlets in each ego network of the 
graph with only counts requested - min_ego_nodes <- 0 - min_ego_edges <- 0 + # Count graphlets in each ego network of the graph with only counts + # requested + min_ego_nodes <- 0 + min_ego_edges <- 0 - actual_counts_order_1 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, + actual_counts_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + actual_counts_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # Test that actual counts match expected with only counts requested + # (default) + expect_equal(actual_counts_order_1, exp_counts_order_1) + expect_equal(actual_counts_order_2, exp_counts_order_2) + + # Test that actual and returned ego networks match expected + # 1. Define expected + exp_ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - actual_counts_order_2 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, + exp_ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - - # Test that actual counts match expected with only counts requested (default) - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) - - # Test that actual and returned ego networks match expected - # 1. 
Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, - order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, - order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - expected_counts_with_networks_order_1 <- - list( - graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1 - ) - expected_counts_with_networks_order_2 <- - list( - graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2 - ) - # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE + exp_counts_with_networks_order_1 <- # nolint: object_length_linter + list( + graphlet_counts = exp_counts_order_1, + ego_networks = exp_ego_networks_order_1 + ) + exp_counts_with_networks_order_2 <- # nolint: object_length_linter + list( + graphlet_counts = exp_counts_order_2, + ego_networks = exp_ego_networks_order_2 + ) + # 2. 
Calculate actual + actual_counts_with_networks_order_1 <- # nolint: object_length_linter + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + actual_counts_with_networks_order_2 <- # nolint: object_length_linter + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + # Test that actual counts match expected with ego-networks requested + expect_equal( + actual_counts_with_networks_order_1$graphlet_counts, + exp_counts_order_1 ) - actual_counts_with_networks_order_2 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE + expect_equal( + actual_counts_with_networks_order_2$graphlet_counts, + exp_counts_order_2 ) - # Test that actual counts match expected with ego-networks requested - expect_equal(actual_counts_with_networks_order_1$graphlet_counts, expected_counts_order_1) - expect_equal(actual_counts_with_networks_order_2$graphlet_counts, expected_counts_order_2) - # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to - # indexed edge list and then compare. Do in-situ replacement of igraphs with - # indexed edge lists to ensure we are checking full properties of returned - # objects (i.e. named lists with matching elements). - # 3a. 
Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map( - expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges - ) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map( - expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges - ) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map( - actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges + # 3. Compare + # Comparison is not implemented for igraph objects, so convert all igraphs + # to indexed edge list and then compare. Do in-situ replacement of igraphs + # with indexed edge lists to ensure we are checking full properties of + # returned objects (i.e. named lists with matching elements). + # 3a. Convert expected and actual ego networks from igraphs to indexed edges + exp_counts_with_networks_order_1$ego_networks <- + purrr::map( + exp_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + exp_counts_with_networks_order_2$ego_networks <- + purrr::map( + exp_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_1$ego_networks <- + purrr::map( + actual_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_2$ego_networks <- + purrr::map( + actual_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + # 3b. Do comparison + expect_equal( + actual_counts_with_networks_order_1, + exp_counts_with_networks_order_1 ) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map( - actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges + expect_equal( + actual_counts_with_networks_order_2, + exp_counts_with_networks_order_2 ) - # 3b. 
Do comparison - expect_equal( - actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1 - ) - expect_equal( - actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2 - ) -}) + } +) context("ORCA interface: Ego-network graphlet counts") -test_that("ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manually verified totals for test graph", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") +test_that( + paste( + "ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manually", + "verified totals for test graph" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - max_graphlet_size <- 4 - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # Set manually verified counts - # 1-step ego networks - expected_counts_order_1 <- rbind( - c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), - 
c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) - ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels - # 2-step ego networks - expected_counts_order_2 <- rbind( - c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), - c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels + # Set node and graphlet labels to use for row and col names in expected + # counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c( + "N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8" + ) - # Count graphlets in each ego network of the graph with only counts requested - min_ego_nodes <- 0 - min_ego_edges <- 0 + max_graphlet_size <- 4 + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + exp_counts_order_1 <- rbind( + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) + ) + rownames(exp_counts_order_1) <- node_labels + colnames(exp_counts_order_1) <- graphlet_labels + # 2-step ego networks + exp_counts_order_2 <- rbind( + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + 
c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) + ) + rownames(exp_counts_order_2) <- node_labels + colnames(exp_counts_order_2) <- graphlet_labels - # Test that actual and returned ego graphlet counts match - # 1. Generate ego networks with previously tested function. - ego_networks_order_1 <- make_named_ego_graph(graph, - order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - ego_networks_order_2 <- make_named_ego_graph(graph, - order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) + # Count graphlets in each ego network of the graph with only counts + # requested + min_ego_nodes <- 0 + min_ego_edges <- 0 - # 2. Calculate counts with ego_to_graphlet_counts. - actual_counts_order_1 <- - ego_to_graphlet_counts(ego_networks_order_1, - max_graphlet_size = max_graphlet_size + # Test that actual and returned ego graphlet counts match + # 1. Generate ego networks with previously tested function. + ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges ) - actual_counts_order_2 <- - ego_to_graphlet_counts(ego_networks_order_2, - max_graphlet_size = max_graphlet_size + ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges ) - # 3. Test that actual counts match expected - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) -}) + # 2. Calculate counts with ego_to_graphlet_counts. 
+ actual_counts_order_1 <- + ego_to_graphlet_counts(ego_networks_order_1, + max_graphlet_size = max_graphlet_size + ) + actual_counts_order_2 <- + ego_to_graphlet_counts(ego_networks_order_2, + max_graphlet_size = max_graphlet_size + ) -# context("ORCA interface: Graphlet-based degree distributions") -# test_that("gdd works", { -# graph <- netdist::virusppi$EBV -# edges <- graph_to_indexed_edges(graph) -# # Caclulate expected outputs (NOTE: relies on orbit_to_graphlet_counts and -# # orca_counts_to_graphlet_orbit_degree_distribution methods) -# orbit_counts_4 <- orca::count4(edges) -# orbit_counts_5 <- orca::count5(edges) -# graphlet_counts_4 <- orbit_to_graphlet_counts(orbit_counts_4) -# graphlet_counts_5 <- orbit_to_graphlet_counts(orbit_counts_5) -# gdd_orbit_4_expected <- orca_counts_to_graphlet_orbit_degree_distribution(orbit_counts_4) -# gdd_orbit_5_expected <- orca_counts_to_graphlet_orbit_degree_distribution(orbit_counts_5) -# gdd_graphlet_4_expected <- orca_counts_to_graphlet_orbit_degree_distribution(graphlet_counts_4) -# gdd_graphlet_5_expected <- orca_counts_to_graphlet_orbit_degree_distribution(graphlet_counts_5) -# # Calculate actual outputs from code under test -# gdd_orbit_4_actual <- gdd(graph, feature_type = "orbit", max_graphlet_size = 4) -# gdd_orbit_5_actual <- gdd(graph, feature_type = "orbit", max_graphlet_size = 5) -# gdd_graphlet_4_actual <- gdd(graph, feature_type = "graphlet", max_graphlet_size = 4) -# gdd_graphlet_5_actual <- gdd(graph, feature_type = "graphlet", max_graphlet_size = 5) -# gdd_default_4_actual <- gdd(graph, max_graphlet_size = 4) -# gdd_default_5_actual <- gdd(graph, max_graphlet_size = 5) -# gdd_orbit_default_actual <- gdd(graph, feature_type = "orbit") -# gdd_graphlet_default_actual <- gdd(graph, feature_type = "graphlet") -# gdd_default_default_actual <- gdd(graph) -# # Compare actual gdd with expected gdd -# expect_equal(gdd_orbit_4_actual, gdd_orbit_4_expected) -# expect_equal(gdd_orbit_5_actual, 
gdd_orbit_5_expected) -# expect_equal(gdd_graphlet_4_actual, gdd_graphlet_4_expected) -# expect_equal(gdd_graphlet_5_actual, gdd_graphlet_5_expected) -# expect_equal(gdd_default_4_actual, gdd_orbit_4_expected) -# expect_equal(gdd_default_5_actual, gdd_orbit_5_expected) -# expect_equal(gdd_orbit_default_actual, gdd_orbit_4_expected) -# expect_equal(gdd_graphlet_default_actual, gdd_graphlet_4_expected) -# expect_equal(gdd_default_default_actual, gdd_orbit_4_expected) -# -# # Check gdd throws error for invalid feature type -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5)) -# # Check gdd throws error for invalid maximum graphlet size -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 2)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 3)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 6)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6)) -# -# }) -# -# context("ORCA interface: Ego-network graphlet outputs for manually verified networks") -# test_that("Ego-network 4-node graphlet counts match manually verified totals -# and gdd gives expected discrete histograms",{ -# # Set up a small sample network with at least one ego-network that contains -# # at least one of each graphlets -# elist <- rbind( -# c("n1","n2"), -# c("n2","n3"), -# c("n1","n4"), -# c("n2","n5"), -# c("n1","n6"), -# c("n1","n7"), -# c("n2","n4"), -# c("n4","n6"), -# c("n6","n8"), -# c("n7","n8"), -# c("n7","n9"), -# c("n7","n10"), -# c("n8","n9"), -# c("n8","n10"), -# c("n9","n10") -# ) -# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) -# -# # Set node and graphlet labels to use for row and col names in expected counts -# node_labels <- 
igraph::V(graph)$name -# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# -# # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 -# max_graphlet_size <- 4 -# actual_counts_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1) -# actual_counts_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2) -# -# # Set manually verified ego-network graphlet counts -# # 1-step ego networks -# expected_counts_order_1 <- rbind( -# c(6, 5, 2, 0, 1, 0, 2, 1, 0), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0), -# c(5, 2, 2, 0, 0, 0, 0, 1, 0), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0), -# c(4, 2, 1, 0, 0, 0, 1, 0, 0), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1) -# ) -# rownames(expected_counts_order_1) <- node_labels -# colnames(expected_counts_order_1) <- graphlet_labels -# # 2-step ego networks -# expected_counts_order_2 <- rbind( -# c(15, 18, 6, 21, 3, 1, 11, 1, 1), -# c( 8, 10, 2, 6, 3, 0, 4, 1, 0), -# c( 5, 5, 1, 0, 2, 0, 2, 0, 0), -# c(10, 14, 2, 11, 3, 1, 5, 1, 0), -# c( 5, 5, 1, 0, 2, 0, 2, 0, 0), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1), -# c(11, 10, 5, 10 ,0 ,1, 8, 0, 1), -# c( 9, 8, 4, 4, 0, 1, 6, 0, 1), -# c( 9, 8, 4, 4, 0, 1, 6, 0, 1) -# ) -# rownames(expected_counts_order_2) <- node_labels -# colnames(expected_counts_order_2) <- graphlet_labels -# -# # Test that actual counts match expected with only counts requested (default) -# expect_equal(actual_counts_order_1, expected_counts_order_1) -# expect_equal(actual_counts_order_2, expected_counts_order_2) -# -# # Test that actual counts and returned ego networks match expected -# # 1. 
Define expected -# expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) -# expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) -# expected_counts_with_networks_order_1 <- -# list(graphlet_counts = expected_counts_order_1, -# ego_networks = expected_ego_networks_order_1) -# expected_counts_with_networks_order_2 <- -# list(graphlet_counts = expected_counts_order_2, -# ego_networks = expected_ego_networks_order_2) -# # 2. Calculate actual -# actual_counts_with_networks_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, return_ego_networks = TRUE) -# actual_counts_with_networks_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, return_ego_networks = TRUE) -# # 3. Compare -# # Comparison is not implemented for igraph objects, so convert all igraphs to -# # indexed edge list and then compare. Do in-situ replacement of igraphs with -# # indexed edge lists to ensure we are checking full properties of returned -# # objects (i.e. named lists with matching elements). -# # 3a. Convert expected and actual ego networks from igraphs to indexed edges -# expected_counts_with_networks_order_1$ego_networks <- -# purrr::map(expected_counts_with_networks_order_1$ego_networks, -# graph_to_indexed_edges) -# expected_counts_with_networks_order_2$ego_networks <- -# purrr::map(expected_counts_with_networks_order_2$ego_networks, -# graph_to_indexed_edges) -# actual_counts_with_networks_order_1$ego_networks <- -# purrr::map(actual_counts_with_networks_order_1$ego_networks, -# graph_to_indexed_edges) -# actual_counts_with_networks_order_2$ego_networks <- -# purrr::map(actual_counts_with_networks_order_2$ego_networks, -# graph_to_indexed_edges) -# # 3b. 
Do comparison -# expect_equal(actual_counts_with_networks_order_1, -# expected_counts_with_networks_order_1) -# expect_equal(actual_counts_with_networks_order_2, -# expected_counts_with_networks_order_2) -# -# # Test that gdd method gives the expected graphlet degree distributions -# # 1-step ego-networks -# actual_gdd_order_1 <- gdd(graph, feature_type = "graphlet", -# max_graphlet_size = 4, ego_neighbourhood_size = 1) -# expected_gdd_order_1 <- list( -# G0 = dhist(locations = c(1, 4, 5, 6, 7), masses = c(2, 1, 2, 3, 2)), -# G1 = dhist(locations = c(0, 2, 3, 5), masses = c(4, 2, 2, 2)), -# G2 = dhist(locations = c(0, 1, 2, 4), masses = c(2, 2, 2, 4)), -# G3 = dhist(locations = c(0), masses = c(10)), -# G4 = dhist(locations = c(0, 1, 2), masses = c(8, 1, 1)), -# G5 = dhist(locations = c(0), masses = c(10)), -# G6 = dhist(locations = c(0, 1, 2, 3), masses = c(5, 1, 2, 2)), -# G7 = dhist(locations = c(0, 1), masses = c(8, 2)), -# G8 = dhist(locations = c(0, 1), masses = c(6, 4)) -# ) -# expect_equal(actual_gdd_order_1, expected_gdd_order_1) -# # 2-step ego-networks -# actual_gdd_order_2 <- gdd(graph, feature_type = "graphlet", -# max_graphlet_size = 4, ego_neighbourhood_size = 2) -# expected_gdd_order_2 <- list( -# G0 = dhist(locations = c(5, 8, 9, 10, 11, 13, 15), masses = c(2, 1, 2, 1, 1, 2, 1)), -# G1 = dhist(locations = c(5, 8, 10, 13, 14, 18), masses = c(2, 2, 2, 2, 1, 1)), -# G2 = dhist(locations = c(1, 2, 4, 5, 6), masses = c(2, 2, 2, 1, 3)), -# G3 = dhist(locations = c(0, 4, 6, 10, 11, 15, 21), masses = c(2, 2, 1, 1, 1, 2, 1)), -# G4 = dhist(locations = c(0, 1, 2, 3), masses = c(3, 2, 2, 3)), -# G5 = dhist(locations = c(0, 1), masses = c(3, 7)), -# G6 = dhist(locations = c(2, 4, 5, 6, 8, 9, 11), masses = c(2, 1, 1, 2, 1, 2, 1)), -# G7 = dhist(locations = c(0, 1), masses = c(5, 5)), -# G8 = dhist(locations = c(0, 1), masses = c(4, 6)) -# ) -# expect_equal(actual_gdd_order_2, expected_gdd_order_2) -# -# # Check gdd throws error for invalid feature type -# 
expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, -# ego_neighbourhood_size = 1)) -# # We don't support orbit feature type for ego networks (i.e. neighbourhood > 0) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 4, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 5, -# ego_neighbourhood_size = 1)) -# # Check gdd throws error for invalid maximum graphlet size -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, -# ego_neighbourhood_size = 1)) -# }) -# -# context("ORCA interface: GDD for all graphs in a directory") -# test_that("gdd_for_all_graphs works", { -# # Set source directory and file properties for Virus PPI graph edge files -# source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -# edge_format = "ncol" -# file_pattern = ".txt" -# -# # Set number of threads to use at once for parallel processing. 
-# num_threads = getOption("mc.cores", 2L) -# -# # Use previously tested gdd code to calculate expected gdds -# expected_gdd_fn <- function(feature_type, max_graphlet_size, ego_neighbourhood_size) { -# gdds <- list( -# gdd(virusppi$EBV, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$ECL, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$HSV, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$KSHV, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$VZV, feature_type, max_graphlet_size, ego_neighbourhood_size) -# ) -# names(gdds) <- c("EBV", "ECL", "HSV-1", "KSHV", "VZV") -# gdds -# } -# -# # Use code under test to generate actual gdds -# actual_gdd_fn <- function (feature_type, max_graphlet_size, ego_neighbourhood_size) { -# gdd_for_all_graphs(source_dir = source_dir, format = edge_format, -# pattern = file_pattern, feature_type = feature_type, -# max_graphlet_size = max_graphlet_size, -# ego_neighbourhood_size = ego_neighbourhood_size, -# mc.cores = num_threads) -# } -# # Helper function to make comparison code clearer -# compare_fn <- function(feature_type, max_graphlet_size, ego_neighbourhood_size) { -# actual_gdds <- actual_gdd_fn(feature_type, max_graphlet_size, ego_neighbourhood_size) -# expected_gdds <- expected_gdd_fn(feature_type, max_graphlet_size, ego_neighbourhood_size) -# expect_equal(actual_gdds, expected_gdds) -# } -# # Map over test parameters, comparing actual gdds to expected -# # No ego-networks -# compare_fn(feature_type = "orbit", max_graphlet_size = 4, ego_neighbourhood_size = 0) -# compare_fn(feature_type = "orbit", max_graphlet_size = 5, ego_neighbourhood_size = 0) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 0) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 0) -# # Ego networks of order 1 -# compare_fn(feature_type = "graphlet", max_graphlet_size = 4, 
ego_neighbourhood_size = 1) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 1) -# # Ego networks of order 2 -# compare_fn(feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 2) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 2) -# }) + # 3. Test that actual counts match expected + expect_equal(actual_counts_order_1, exp_counts_order_1) + expect_equal(actual_counts_order_2, exp_counts_order_2) + } +) diff --git a/tests/testthat/test_utility_functions.R b/tests/testthat/test_utility_functions.R index 091e02af..370e6e8b 100755 --- a/tests/testthat/test_utility_functions.R +++ b/tests/testthat/test_utility_functions.R @@ -2,63 +2,171 @@ context("Utility Functions") test_that("rotl_vec rotates left by specified number of places", { test_vec <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) - expect_equal(rotl_vec(test_vec, -13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotl_vec(test_vec, -12), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotl_vec(test_vec, -11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotl_vec(test_vec, -10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotl_vec(test_vec, -9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotl_vec(test_vec, -8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotl_vec(test_vec, -7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotl_vec(test_vec, -6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotl_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotl_vec(test_vec, -4), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotl_vec(test_vec, -3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotl_vec(test_vec, -2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotl_vec(test_vec, -1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - 
expect_equal(rotl_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotl_vec(test_vec, 1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotl_vec(test_vec, 2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotl_vec(test_vec, 3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotl_vec(test_vec, 4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotl_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotl_vec(test_vec, 6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotl_vec(test_vec, 7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotl_vec(test_vec, 8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotl_vec(test_vec, 9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotl_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotl_vec(test_vec, 11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotl_vec(test_vec, 12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotl_vec(test_vec, 13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) + expect_equal( + rotl_vec(test_vec, -13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal(rotl_vec( + test_vec, -12 + ), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) + expect_equal( + rotl_vec(test_vec, -11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotl_vec(test_vec, -10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotl_vec(test_vec, -9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotl_vec(test_vec, -8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotl_vec(test_vec, -7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotl_vec(test_vec, -6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotl_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotl_vec(test_vec, -4), 
c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotl_vec(test_vec, -3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotl_vec(test_vec, -2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotl_vec(test_vec, -1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotl_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotl_vec(test_vec, 1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotl_vec(test_vec, 2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotl_vec(test_vec, 3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotl_vec(test_vec, 4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotl_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotl_vec(test_vec, 6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotl_vec(test_vec, 7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotl_vec(test_vec, 8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotl_vec(test_vec, 9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotl_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotl_vec(test_vec, 11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotl_vec(test_vec, 12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotl_vec(test_vec, 13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) }) test_that("rotr_vec rotates right by specified number of places", { test_vec <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) - expect_equal(rotr_vec(test_vec, 13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotr_vec(test_vec, 12), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotr_vec(test_vec, 11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotr_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 
90, 100)) - expect_equal(rotr_vec(test_vec, 9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotr_vec(test_vec, 8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotr_vec(test_vec, 7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotr_vec(test_vec, 6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotr_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotr_vec(test_vec, 4), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotr_vec(test_vec, 3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotr_vec(test_vec, 2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotr_vec(test_vec, 1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotr_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotr_vec(test_vec, -1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotr_vec(test_vec, -2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotr_vec(test_vec, -3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotr_vec(test_vec, -4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotr_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotr_vec(test_vec, -6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotr_vec(test_vec, -7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotr_vec(test_vec, -8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotr_vec(test_vec, -9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotr_vec(test_vec, -10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotr_vec(test_vec, -11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotr_vec(test_vec, -12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotr_vec(test_vec, -13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) + expect_equal( + rotr_vec(test_vec, 13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + 
) + expect_equal( + rotr_vec(test_vec, 12), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotr_vec(test_vec, 11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotr_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotr_vec(test_vec, 9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotr_vec(test_vec, 8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotr_vec(test_vec, 7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotr_vec(test_vec, 6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotr_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotr_vec(test_vec, 4), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotr_vec(test_vec, 3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotr_vec(test_vec, 2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotr_vec(test_vec, 1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotr_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotr_vec(test_vec, -1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotr_vec(test_vec, -2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotr_vec(test_vec, -3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotr_vec(test_vec, -4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotr_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotr_vec(test_vec, -6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotr_vec(test_vec, -7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotr_vec(test_vec, -8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotr_vec(test_vec, -9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotr_vec(test_vec, -10), c(10, 
20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotr_vec(test_vec, -11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotr_vec(test_vec, -12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotr_vec(test_vec, -13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) }) diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd index 722a0c0b..ffc7bf06 100644 --- a/vignettes/ManyToMany.Rmd +++ b/vignettes/ManyToMany.Rmd @@ -12,10 +12,10 @@ editor_options: --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -46,20 +46,20 @@ Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 ```{r, netwokrs,fig.align='center',fig.dim=c(8,4)} # Create networks set.seed(3171) -gLat_1 <- igraph::graph.lattice(c(20,20)) -gLat_2 <- igraph::graph.lattice(c(40,40)) -gRing_1 <- igraph::make_ring(20^2) -gRing_2 <- igraph::make_ring(40^2) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) - -par(mfrow=c(1,2)) -plot(gLat_1,vertex.size=0.8,vertex.label=NA) -plot(gLat_2,vertex.size=0.8,vertex.label=NA) -plot(gRing_1,vertex.size=0.8,vertex.label=NA) -plot(gRing_2,vertex.size=0.8,vertex.label=NA) -plot(gTree_1,vertex.size=0.8,vertex.label=NA) -plot(gTree_2,vertex.size=0.8,vertex.label=NA) +glat_1 <- igraph::graph.lattice(c(20, 20)) +glat_2 <- igraph::graph.lattice(c(40, 40)) +gring_1 <- igraph::make_ring(20^2) +gring_2 <- igraph::make_ring(40^2) +gtree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gtree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) + +par(mfrow = c(1, 2)) +plot(glat_1, vertex.size = 0.8, vertex.label = NA) +plot(glat_2, vertex.size = 0.8, vertex.label = NA) +plot(gring_1, vertex.size = 0.8, vertex.label = NA) +plot(gring_2, vertex.size = 0.8, vertex.label = NA) +plot(gtree_1, vertex.size = 0.8, 
vertex.label = NA) +plot(gtree_2, vertex.size = 0.8, vertex.label = NA) ``` ## NetEmd using subgraph counts @@ -67,9 +67,23 @@ plot(gTree_2,vertex.size=0.8,vertex.label=NA) Subgraph count based NetEmd comparisons: ```{r, netemdorbits,fig.align='center'} # NetEMD using subgraph counts. -glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) - -netemdlist <- netemd_many_to_many(graphs = glist,smoothing_window_width = 1,mc.cores = 1) #Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +glist <- list( + Lat_1 = glat_1, + Lat_2 = glat_2, + Ring_1 = gring_1, + Ring_2 = gring_1, + Tree_1 = gtree_1, + Tree_2 = gtree_2 +) + +# Use of smoothing window 1 is given for discrete integer distributions. If the +# network features are considered continuous variables smoothing_window_width +# equal to zero is recommended. +netemdlist <- netemd_many_to_many( + graphs = glist, + smoothing_window_width = 1, + mc.cores = 1 +) netemdlist ``` @@ -77,13 +91,19 @@ netemdlist To read the results in a matrix form: ```{r,fig.align='center'} # Creating a comparison matrix: -mat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) +mat <- cross_comp_to_matrix( + measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec +) mat ``` Illustration of the multiple NetEmd comparisons based on subgraph counts. 
```{r,netemdorbitsPLOT,fig.align='center',fig.dim=c(8,8)} -netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd subgraph counts") +netemd_plot( + netemdlist = netemdlist, + clustering_method = "ward.D", + main = "NetEmd subgraph counts" +) ``` ## NetEmd using the Laplacian and Normalized Laplacian Spectrum @@ -91,20 +111,29 @@ netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd subgra Pre-compute the Laplacian and normalized Laplacian for each graph considered: ```{r, netemdspectrum} # NetEMD using the Laplacian and normalized Laplacian Spectrum. -SPECT<-list() +spect <- list() -#This step may take several minutes. -for(i in 1:length(glist)){ - Lapg <- igraph::laplacian_matrix(graph = glist[[i]],normalized = FALSE,sparse = FALSE) - NLap <- igraph::laplacian_matrix(graph = glist[[i]],normalized = TRUE,sparse = FALSE) - SPECT[[ names(glist)[i] ]] <- cbind(L.Spectra= eigen(Lapg)$values, NL.Spectra= eigen(NLap)$values) +# This step may take several minutes. +for (i in 1:length(glist)) { + lapg <- igraph::laplacian_matrix( + graph = glist[[i]], normalized = FALSE, sparse = FALSE + ) + nlap <- igraph::laplacian_matrix( + graph = glist[[i]], normalized = TRUE, sparse = FALSE + ) + spect[[names(glist)[i]]] <- cbind( + L.Spectra = eigen(lapg)$values, NL.Spectra = eigen(nlap)$values + ) } -str(SPECT) +str(spect) ``` Compute NetEmd: ```{r} -netemdlist <- netemd_many_to_many(dhists = SPECT,smoothing_window_width = 0) #Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# Use of smoothing window 1 is given for discrete integer distributions. If the +# network features are considered continuous variables +# smoothing_window_width equal to zero is recommended. 
+netemdlist <- netemd_many_to_many(dhists = spect, smoothing_window_width = 0) netemdlist ``` @@ -112,7 +141,9 @@ netemdlist ### Illustration of the multiple NetEmd comparisons based on the Laplacian and Normalized Laplacian spectra ```{r,netemdspectrumPLOT ,fig.align='center',fig.dim=c(8,8)} -netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd Spectra") +netemd_plot( + netemdlist = netemdlist, clustering_method = "ward.D", main = "NetEmd Spectra" +) ``` @@ -135,26 +166,33 @@ Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 ```{r, netdisgoldstandnetworks,fig.align='center',fig.dim=c(8,4)} # Create networks set.seed(3171) -gLat_1 <- igraph::graph.lattice(c(20,20)) -gLat_2 <- igraph::graph.lattice(c(40,40)) -gRing_1 <- igraph::make_ring(20^2) -gRing_2 <- igraph::make_ring(40^2) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +glat_1 <- igraph::graph.lattice(c(20, 20)) +glat_2 <- igraph::graph.lattice(c(40, 40)) +gring_1 <- igraph::make_ring(20^2) +gring_2 <- igraph::make_ring(40^2) +gtree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gtree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) # Create a random graph to be used as a gold-standard -gst_1 <- igraph::as.undirected( graph.star(20^2) ) -gst_2 <- igraph::as.undirected( graph.star(40^2) ) +gst_1 <- igraph::as.undirected(graph.star(20^2)) +gst_2 <- igraph::as.undirected(graph.star(40^2)) -par(mfrow=c(1,2)) -plot(gst_1,vertex.size=0.8,vertex.label=NA) -plot(gst_2,vertex.size=0.8,vertex.label=NA) +par(mfrow = c(1, 2)) +plot(gst_1, vertex.size = 0.8, vertex.label = NA) +plot(gst_2, vertex.size = 0.8, vertex.label = NA) ``` Obtain the comparison via Netdis using each of the reference graph networks. 
```{r,netdisgoldstand ,fig.align='center'} -glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) +glist <- list( + Lat_1 = glat_1, + Lat_2 = glat_2, + Ring_1 = gring_1, + Ring_2 = gring_1, + Tree_1 = gtree_1, + Tree_2 = gtree_2 +) # Netdis using the goldstd_1 graph as gold-standard reference point netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = gst_1) @@ -170,18 +208,28 @@ netdis_mat_gst2 To read the results in a matrix form: ```{r,fig.align='center'} # Creating a comparison matrix: -cross_comp_to_matrix(measure = netdis_mat_gst1$netdis, cross_comparison_spec = netdis_mat_gst1$comp_spec) - -cross_comp_to_matrix(measure = netdis_mat_gst2$netdis, cross_comparison_spec = netdis_mat_gst2$comp_spec) +cross_comp_to_matrix( + measure = netdis_mat_gst1$netdis, + cross_comparison_spec = netdis_mat_gst1$comp_spec +) + +cross_comp_to_matrix( + measure = netdis_mat_gst2$netdis, + cross_comparison_spec = netdis_mat_gst2$comp_spec +) ``` Heatmap of the Netdis comparisons: ```{r,netdisgoldstandPLOT ,fig.align='center',fig.dim=c(8,8)} -#Network comparisons heatmap with Gold-Standard 1 -netdis.plot(netdislist = netdis_mat_gst1, whatrow = 2,main = "Netdis GoldStd-1") - -#Network comparisons heatmap with Gold-Standard 2 -netdis.plot(netdislist = netdis_mat_gst2, whatrow = 2,main = "Netdis GoldStd-2") +# Network comparisons heatmap with Gold-Standard 1 +netdis_plot( + netdislist = netdis_mat_gst1, whatrow = 2, main = "Netdis GoldStd-1" +) + +# Network comparisons heatmap with Gold-Standard 2 +netdis_plot( + netdislist = netdis_mat_gst2, whatrow = 2, main = "Netdis GoldStd-2" +) ``` @@ -196,7 +244,7 @@ netdis_mat ``` ```{r,netdisGPPLOT ,fig.align='center',fig.dim=c(8,8)} -netdis.plot(netdislist = netdis_mat, whatrow = 2,main = "Netdis-GP") +netdis_plot(netdislist = netdis_mat, whatrow = 2, main = "Netdis-GP") ``` ### Using Netdis with no expectation ($E_w=0$) @@ -210,7 +258,7 @@ netdis_mat ``` ```{r,netdiszeroPLOT 
,fig.align='center',fig.dim=c(8,8)} -netdis.plot(netdislist = netdis_mat, whatrow = 2,main = "Netdis-zero") +netdis_plot(netdislist = netdis_mat, whatrow = 2, main = "Netdis-zero") ``` ------------------------- diff --git a/vignettes/NetEmdTimeOrdering.Rmd b/vignettes/NetEmdTimeOrdering.Rmd index 01c91197..3f4ead81 100644 --- a/vignettes/NetEmdTimeOrdering.Rmd +++ b/vignettes/NetEmdTimeOrdering.Rmd @@ -28,17 +28,17 @@ For other vignettes in this package see the ["Menu"](V-Menu.html). The package contains the world trade networks and pre-computed subgraph/graphlet counts in the R data object ` worldtradesub`. This object contains a list of two lists. The first list is `worldtradesub$wtnets` which contains a small sample of the trade networks (2001-2014) and the second `worldtradesub$Counts` which contains pre-computed counts for a larger set of trade networks going from 1985 to 2014. ```{r, message=FALSE} - library("netdist") - library("igraph") - data(worldtradesub) - summary(worldtradesub) - wtnets<- worldtradesub$wtnets - summary(wtnets) +library("netdist") +library("igraph") +data(worldtradesub) +summary(worldtradesub) +wtnets <- worldtradesub$wtnets +summary(wtnets) ``` These world trade networks are denser than typically sparse social networks. For example, the edge density for the network in 2001 is `r igraph::graph.density(worldtradesub$wtnets$wtn2001)`. Here is a plot of this network highlighting the relatively large number of edges: ```{r,fig.align='center',fig.dim=c(5,5)} - plot(wtnets$wtn2001,vertex.size=5,vertex.label.cex=0.4) +plot(wtnets$wtn2001, vertex.size = 5, vertex.label.cex = 0.4) ``` @@ -47,11 +47,11 @@ These world trade networks are denser than typically sparse social networks. For In this example **NetEmd** will consider orbit counts of subgraphs containing up to 5 nodes. If NetEmd is to be called a single time, then the command `netemd_many_to_many(graphs = wtnets)` would suffice. 
The following code provides such an example: ```{r} - # As the trade networks are considerable dense, this example first considers a small number of networks. - #This example may take some minutes to run. - netemd_result <- netemd_many_to_many(graphs = wtnets[1:4],mc.cores = 1) +# As the trade networks are considerably dense, this example first considers a +# small number of networks. This example may take some minutes to run. +netemd_result <- netemd_many_to_many(graphs = wtnets[1:4], mc.cores = 1) - print(netemd_result) +print(netemd_result) ``` However, if there are pre-computed counts or features NetEmd can be called via these features instead. @@ -60,27 +60,34 @@ However, if there are pre-computed counts or features NetEmd can be called via t World trade networks consist of relatively dense networks, thus leading to longer computational times for the calculation of the subgraph counts. Hence, it is advisable to pre-compute counts in case there is a need to call NetEmd multiple times. This may, for example, be the case when adding a new network to the data set. The following illustrates the extraction of subgraph counts for the small network sample. ```{r} -# This example may take more than a few minutes to run (approx. 20 mins) , and it is not necessary to run it for the upcoming examples as a larger set of counts has been already computed. 
+if (FALSE) { + counts <- list() + for (i in 1:length(wtnets)) { + counts[[names(wtnets)[i]]] <- count_orbits_per_node( + graph = wtnets[[i]], max_graphlet_size = 5 + ) + } } ``` Now, with pre-computed counts NetEmd can be calculated more rapidly as the computations of the counts are often the bottle neck in the computational time of NetEmd. NetEmd will be called `r length(worldtradesub$Counts) * (length(worldtradesub$Counts) - 1)/2 ` times in order to obtain all pairwise comparisons between the trade networks from 1985 to 2014 (networks with pre-computed subgraph counts): ```{r} - # The pre-computed counts already in the package - Counts <- worldtradesub$Counts - - #Calling NetEmd - netemd_result <- netemd_many_to_many(dhists = Counts ,mc.cores = 1) - - #Results - netemd_matrix <- cross_comp_to_matrix(measure = netemd_result$netemds, cross_comparison_spec = netemd_result$comp_spec) - - print(netemd_matrix[1:10,1:5]) +# The pre-computed counts already in the package +counts <- worldtradesub$Counts + +# Calling NetEmd +netemd_result <- netemd_many_to_many(dhists = counts, mc.cores = 1) + +# Results +netemd_matrix <- cross_comp_to_matrix( + measure = netemd_result$netemds, + cross_comparison_spec = netemd_result$comp_spec +) + +print(netemd_matrix[1:10, 1:5]) ``` # Evidence of change in world trade @@ -88,7 +95,12 @@ Now, with pre-computed counts NetEmd can be calculated more rapidly as the compu Based on the comparison of the world trade networks across the years, we can identify periods of time where possible considerable changes in world trade have occurred. The following heat map clearly shows the existence of two potential changes in the world trade system, and which correspond to 1995-1996 and 2010-2011. 
```{r,fig.align='center',fig.dim=c(8.5,8.5)} - netemd.plot(netemdlist=netemd_result,clustering_method="ward.D",main="NetEmd",docluster = FALSE) +netemd_plot( + netemdlist = netemd_result, + clustering_method = "ward.D", + main = "NetEmd", + docluster = FALSE +) ``` The World Trade Organization (WTO) said the following about these years: diff --git a/vignettes/NetdisGPStepByStep.Rmd b/vignettes/NetdisGPStepByStep.Rmd index f7ee738f..5e4288db 100644 --- a/vignettes/NetdisGPStepByStep.Rmd +++ b/vignettes/NetdisGPStepByStep.Rmd @@ -13,10 +13,10 @@ chunk_output_type: console --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -92,11 +92,11 @@ Generation of tree-like networks with 400 nodes and 1600 nodes. ```{r, netwokrs,fig.align='center',fig.dim=c(6,6)} # Create networks set.seed(34) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gtree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gtree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -plot(gTree_1,vertex.size=0.8,vertex.label=NA) -plot(gTree_2,vertex.size=0.8,vertex.label=NA) +plot(gtree_1, vertex.size = 0.8, vertex.label = NA) +plot(gtree_2, vertex.size = 0.8, vertex.label = NA) ``` @@ -116,7 +116,8 @@ neighbourhood_size <- 2 min_ego_nodes <- 3 min_ego_edges <- 1 -# Ego-network density binning parameters. Here, the minimum number of ego-networks per bin and the starting number of bins +# Ego-network density binning parameters. 
Here, the minimum number of +# ego-networks per bin and the starting number of bins min_bin_count <- 5 num_bins <- 100 ``` @@ -126,25 +127,31 @@ num_bins <- 100 One of the first steps in Netdis is the extraction of all ego-networks in each of the query networks: ```{r} # Get ego-networks for query graphs -ego_1 <- make_named_ego_graph(gTree_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(gTree_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) -head(ego_1,n=2) -head(ego_2,n=2) +ego_1 <- make_named_ego_graph(gtree_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +ego_2 <- make_named_ego_graph(gtree_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) +head(ego_1, n = 2) +head(ego_2, n = 2) ``` ## Count the number of nodes and the subgraphs in the ego-networks of each graph ($N_w$) Once the ego-networks are extracted, the subgraph counts for all ego-network are obtained for each network being compared: ```{r} # Subgraphs counts for ego-networks in query graphs -subgraph_counts_1 <- ego_to_graphlet_counts(ego_networks = ego_1, max_graphlet_size = max_subgraph_size) -subgraph_counts_2 <- ego_to_graphlet_counts(ego_networks = ego_2, max_graphlet_size = max_subgraph_size) +subgraph_counts_1 <- ego_to_graphlet_counts( + ego_networks = ego_1, max_graphlet_size = max_subgraph_size +) +subgraph_counts_2 <- ego_to_graphlet_counts( + ego_networks = ego_2, max_graphlet_size = max_subgraph_size +) head(subgraph_counts_1) head(subgraph_counts_2) @@ -159,13 +166,17 @@ densities_1 <- ego_network_density(graphlet_counts = subgraph_counts_1) densities_2 <- ego_network_density(graphlet_counts = subgraph_counts_2) # Adaptively bin ego-network densities -binned_densities_1 <- binned_densities_adaptive(densities = densities_1, - min_counts_per_interval = 
min_bin_count, - num_intervals = num_bins) - -binned_densities_2 <- binned_densities_adaptive(densities = densities_2, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) +binned_densities_1 <- binned_densities_adaptive( + densities = densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) + +binned_densities_2 <- binned_densities_adaptive( + densities = densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) str(binned_densities_1) str(binned_densities_2) @@ -174,13 +185,17 @@ str(binned_densities_2) ## Calculate expected subgraph counts in each density bin by the Geometric-Poisson approximation ($E_w$) With the ego-network binning obtained, the Geometric-Poisson approximation of the expected subgraph counts, $E_w$, can be obtained for each subgraph $w$ and each density bin: ```{r} -binned_gp_subgraph_counts_1 <- density_binned_counts_gp(graphlet_counts = subgraph_counts_1, - density_interval_indexes = binned_densities_1$interval_indexes, - max_graphlet_size = max_subgraph_size) - -binned_gp_subgraph_counts_2 <- density_binned_counts_gp(graphlet_counts = subgraph_counts_2, - density_interval_indexes = binned_densities_2$interval_indexes, - max_graphlet_size = max_subgraph_size) +binned_gp_subgraph_counts_1 <- density_binned_counts_gp( + graphlet_counts = subgraph_counts_1, + density_interval_indexes = binned_densities_1$interval_indexes, + max_graphlet_size = max_subgraph_size +) + +binned_gp_subgraph_counts_2 <- density_binned_counts_gp( + graphlet_counts = subgraph_counts_2, + density_interval_indexes = binned_densities_2$interval_indexes, + max_graphlet_size = max_subgraph_size +) binned_gp_subgraph_counts_1 binned_gp_subgraph_counts_2 ``` @@ -189,27 +204,35 @@ binned_gp_subgraph_counts_2 With $E_w$ now obtained, Netdis-GP, can be compute as per its construction by first centring the observed counts: ```{r} # Calculate expected subgraph counts for each ego network 
-exp_gp_subgraph_counts_1 <- netdis_expected_counts(graphlet_counts = subgraph_counts_1, - density_breaks = binned_densities_1$breaks, - density_binned_reference_counts = binned_gp_subgraph_counts_1, - max_graphlet_size = max_subgraph_size, - scale_fn=NULL) - - -exp_gp_subgraph_counts_2 <- netdis_expected_counts(graphlet_counts = subgraph_counts_2, - density_breaks = binned_densities_2$breaks, - density_binned_reference_counts = binned_gp_subgraph_counts_2, - max_graphlet_size = max_subgraph_size, - scale_fn=NULL) +exp_gp_subgraph_counts_1 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_1, + density_breaks = binned_densities_1$breaks, + density_binned_ref_counts = binned_gp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size, + scale_fn = NULL +) + + +exp_gp_subgraph_counts_2 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_2, + density_breaks = binned_densities_2$breaks, + density_binned_ref_counts = binned_gp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size, + scale_fn = NULL +) # Centre subgraph counts by subtracting expected counts -centred_subgraph_counts_1 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_1, - exp_graphlet_counts = exp_gp_subgraph_counts_1, - max_graphlet_size = max_subgraph_size) - -centred_subgraph_counts_2 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_2, - exp_graphlet_counts = exp_gp_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +centred_subgraph_counts_1 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_1, + exp_graphlet_counts = exp_gp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size +) + +centred_subgraph_counts_2 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_2, + exp_graphlet_counts = exp_gp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) head(centred_subgraph_counts_1) head(centred_subgraph_counts_2) @@ -229,9 +252,11 @@ sum_subgraph_counts_2 Finally, the total centred counts can be 
used to obtain the Netdis statistic: ```{r} -netdis_result <- netdis_uptok(centred_graphlet_count_vector_1 = sum_subgraph_counts_1, - centred_graphlet_count_vector_2 = sum_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +netdis_result <- netdis_uptok( + centred_graphlet_counts_1 = sum_subgraph_counts_1, + centred_graphlet_counts_2 = sum_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) print(netdis_result) ``` diff --git a/vignettes/NetdisStepByStep.Rmd b/vignettes/NetdisStepByStep.Rmd index 9b69845e..198b10ad 100644 --- a/vignettes/NetdisStepByStep.Rmd +++ b/vignettes/NetdisStepByStep.Rmd @@ -13,10 +13,10 @@ chunk_output_type: console --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -80,11 +80,11 @@ Generation of tree-like networks with 400 nodes and 1600 nodes. ```{r, netwokrs,fig.align='center',fig.dim=c(6,6)} # Create networks set.seed(34) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gtree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gtree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -plot(gTree_1,vertex.size=0.8,vertex.label=NA) -plot(gTree_2,vertex.size=0.8,vertex.label=NA) +plot(gtree_1, vertex.size = 0.8, vertex.label = NA) +plot(gtree_2, vertex.size = 0.8, vertex.label = NA) ``` @@ -104,7 +104,8 @@ neighbourhood_size <- 2 min_ego_nodes <- 3 min_ego_edges <- 1 -# Ego-network density binning parameters. Here, the minimum number of ego-networks per bin and the starting number of bins +# Ego-network density binning parameters. 
Here, the minimum number of +# ego-networks per bin and the starting number of bins min_bin_count <- 5 num_bins <- 100 ``` @@ -115,17 +116,19 @@ num_bins <- 100 One of the first steps in Netdis is the extraction of all ego-networks in each of the query networks: ```{r} # Get ego-networks for query graphs -ego_1 <- make_named_ego_graph(gTree_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(gTree_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) -tail(ego_1,n=2) -tail(ego_2,n=2) +ego_1 <- make_named_ego_graph(gtree_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +ego_2 <- make_named_ego_graph(gtree_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) +tail(ego_1, n = 2) +tail(ego_2, n = 2) ``` ## Count the number of nodes and the subgraphs in ego-networks of query graphs ($N_w$) @@ -133,8 +136,12 @@ tail(ego_2,n=2) Once the ego-networks are extracted, the subgraph counts for all ego-network are obtained for each network being compared: ```{r} # Subgraphs counts for ego-networks in query graphs -subgraph_counts_1 <- ego_to_graphlet_counts(ego_networks = ego_1, max_graphlet_size = max_subgraph_size) -subgraph_counts_2 <- ego_to_graphlet_counts(ego_networks = ego_2, max_graphlet_size = max_subgraph_size) +subgraph_counts_1 <- ego_to_graphlet_counts( + ego_networks = ego_1, max_graphlet_size = max_subgraph_size +) +subgraph_counts_2 <- ego_to_graphlet_counts( + ego_networks = ego_2, max_graphlet_size = max_subgraph_size +) tail(subgraph_counts_1) tail(subgraph_counts_2) @@ -158,8 +165,8 @@ For this case the user must provide the gold-standard network of their choosing. The following considers a tree-like network with `r 30^2` nodes as the gold-standard. 
```{r,fig.align='center',fig.dim=c(6,6)} # Network used as gold-standard -gst_1 <- erdos.renyi.game(n = 30^2,p.or.m = graph.density(graph = gTree_2)) -plot(gst_1,vertex.size=0.8,vertex.label=NA) +gst_1 <- erdos.renyi.game(n = 30^2, p.or.m = graph.density(graph = gtree_2)) +plot(gst_1, vertex.size = 0.8, vertex.label = NA) ``` ### Obtain the gold-standard ego-network counts and their binning according to their edge-density ($\rho(.)$) @@ -167,25 +174,31 @@ plot(gst_1,vertex.size=0.8,vertex.label=NA) To calculate the expected counts, $E_w$, the counts of the ego-networks of the gold-standard network need to be obtained first: ```{r} # Obtain subgraph counts and binning for gold-standard -ego_gst_1 <- make_named_ego_graph(graph = gst_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -subgraph_counts_gst_1 <- ego_to_graphlet_counts(ego_networks = ego_gst_1, - max_graphlet_size = max_subgraph_size) +ego_gst_1 <- make_named_ego_graph( + graph = gst_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +subgraph_counts_gst_1 <- ego_to_graphlet_counts( + ego_networks = ego_gst_1, + max_graphlet_size = max_subgraph_size +) head(subgraph_counts_gst_1) ``` Subsequently, these ego-networks are binned according to their edge density: ```{r} -densities_gst_1<- ego_network_density(graphlet_counts = subgraph_counts_gst_1) +densities_gst_1 <- ego_network_density(graphlet_counts = subgraph_counts_gst_1) # Adaptively bin ego-network densities -binned_densities_gst_1 <- binned_densities_adaptive(densities = densities_gst_1, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) +binned_densities_gst_1 <- binned_densities_adaptive( + densities = densities_gst_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) str(binned_densities_gst_1) ``` @@ -195,17 +208,22 @@ str(binned_densities_gst_1) $E_w$ is estimated based on the average subgraph counts of 
ego-networks per density bin for each given subgraph. However, as the query networks and the gold-standard may have different number of nodes, the counts of the gold-standard network are first scaled to a "standard" or "canonical" scale from which they can be scaled back towards networks of different sizes. The following code first shows the computation of the subgraph counts for the ego-networks in the gold-standard network with their corresponding scaling: ```{r} -# Scale ego-network subgraph counts by dividing by the total number of k-tuples in the -# ego-network (where k is the subgraph size) -scaled_subgraph_counts_ref <- scale_graphlet_counts_ego(graphlet_counts = subgraph_counts_gst_1, - max_graphlet_size =max_subgraph_size) +# Scale ego-network subgraph counts by dividing by the total number of k-tuples +# in the ego-network (where k is the subgraph size) +scaled_subgraph_counts_ref <- scale_graphlet_counts_ego( + graphlet_counts = subgraph_counts_gst_1, + max_graphlet_size = max_subgraph_size +) str(scaled_subgraph_counts_ref) ``` Finally, the standard or canonical $E_w$ can be obtained by taking the average per bin of the scaled subgraph counts: ```{r} # Average of the scaled reference subgraph counts in each density bin -ref_binned_canonical_subgraph_counts <- mean_density_binned_graphlet_counts(graphlet_counts = scaled_subgraph_counts_ref, density_interval_indexes = binned_densities_gst_1$interval_indexes) +ref_binned_canonical_subgraph_counts <- mean_density_binned_graphlet_counts( + graphlet_counts = scaled_subgraph_counts_ref, + density_interval_indexes = binned_densities_gst_1$interval_indexes +) ref_binned_canonical_subgraph_counts ``` @@ -214,28 +232,37 @@ ref_binned_canonical_subgraph_counts After obtaining the average scaled subgraph counts per density bin, the subgraph counts of the query networks can be centred: ```{r} -# Scale the reference counts of the gold-standard network to the sizes of each of the query ego-networks. 
-exp_subgraph_counts_1 <- netdis_expected_counts(graphlet_counts = subgraph_counts_1, - density_breaks = binned_densities_gst_1$breaks, - density_binned_reference_counts = ref_binned_canonical_subgraph_counts, - max_graphlet_size = max_subgraph_size, - scale_fn=count_graphlet_tuples) - - -exp_subgraph_counts_2 <- netdis_expected_counts(graphlet_counts = subgraph_counts_2, - density_breaks = binned_densities_gst_1$breaks, - density_binned_reference_counts = ref_binned_canonical_subgraph_counts, - max_graphlet_size = max_subgraph_size, - scale_fn=count_graphlet_tuples) +# Scale the reference counts of the gold-standard network to the sizes of each +# of the query ego-networks. +exp_subgraph_counts_1 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_1, + density_breaks = binned_densities_gst_1$breaks, + density_binned_ref_counts = ref_binned_canonical_subgraph_counts, + max_graphlet_size = max_subgraph_size, + scale_fn = count_graphlet_tuples +) + + +exp_subgraph_counts_2 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_2, + density_breaks = binned_densities_gst_1$breaks, + density_binned_ref_counts = ref_binned_canonical_subgraph_counts, + max_graphlet_size = max_subgraph_size, + scale_fn = count_graphlet_tuples +) # Centre subgraph counts by subtracting expected counts -centred_subgraph_counts_1 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_1, - exp_graphlet_counts = exp_subgraph_counts_1, - max_graphlet_size = max_subgraph_size) - -centred_subgraph_counts_2 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_2, - exp_graphlet_counts = exp_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +centred_subgraph_counts_1 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_1, + exp_graphlet_counts = exp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size +) + +centred_subgraph_counts_2 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_2, + exp_graphlet_counts = 
exp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) tail(centred_subgraph_counts_1) tail(centred_subgraph_counts_2) @@ -256,9 +283,11 @@ sum_subgraph_counts_2 Finally, the total centred counts can be used to obtain the Netdis statistic: ```{r} -netdis_result <- netdis_uptok(centred_graphlet_count_vector_1 = sum_subgraph_counts_1, - centred_graphlet_count_vector_2 = sum_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +netdis_result <- netdis_uptok( + centred_graphlet_counts_1 = sum_subgraph_counts_1, + centred_graphlet_counts_2 = sum_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) print(netdis_result) ``` diff --git a/vignettes/NewNetdisCustomisations.Rmd b/vignettes/NewNetdisCustomisations.Rmd index ba6c49b6..7e69dd5f 100644 --- a/vignettes/NewNetdisCustomisations.Rmd +++ b/vignettes/NewNetdisCustomisations.Rmd @@ -12,10 +12,10 @@ chunk_output_type: console --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -43,14 +43,21 @@ Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 # Create lattice, Ring and Tree like networks of sizes 20^2 and 40^2. 
# Create networks set.seed(3171) -gLat_1 <- graph.lattice(c(20,20)) -gLat_2 <- graph.lattice(c(40,40)) -gRing_1 <- make_ring(20^2) -gRing_2 <- make_ring(40^2) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) - -glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) +glat_1 <- graph.lattice(c(20, 20)) +glat_2 <- graph.lattice(c(40, 40)) +gring_1 <- make_ring(20^2) +gring_2 <- make_ring(40^2) +gtree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gtree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) + +glist <- list( + Lat_1 = glat_1, + Lat_2 = glat_2, + Ring_1 = gring_1, + Ring_2 = gring_1, + Tree_1 = gtree_1, + Tree_2 = gtree_2 +) # Create a random graph to be used as a gold-standard gst <- igraph::as.undirected(graph.star(1000)) @@ -61,35 +68,42 @@ gst <- igraph::as.undirected(graph.star(1000)) ## Using Netdis with a reference graph as a proxy for $E_w$ For this variant a reference graph or gold-standard graph has to be given in `ref_graph`: ```{r,netdisgoldstand,fig.align='center',fig.dim=c(7,7)} -netdis_mat_gst <- netdis_many_to_many(graphs = glist, - ref_graph = gst - ) -netdis.plot(netdislist = netdis_mat_gst,whatrow = 2, main = "Netdis with reference graph") +netdis_mat_gst <- netdis_many_to_many( + graphs = glist, + ref_graph = gst +) +netdis_plot( + netdislist = netdis_mat_gst, whatrow = 2, + main = "Netdis with reference graph" +) ``` ## Using Netdis with a constant valued expectation, $E_w=k$ For this variant please set `ref_graph` to the desired constant $k$. In this example we consider $k=0$ and $k=5$. 
Considering $k=0$ is equivalent to computing Netdis without background expectations: ```{r,netdisconstant,fig.align='center',fig.dim=c(7,7)} -netdis_mat_zero <- netdis_many_to_many(graphs = glist, - ref_graph = 0 - ) -netdis.plot(netdislist = netdis_mat_zero,whatrow = 2, main = "Netdis Ew=0") - -netdis_mat_5 <- netdis_many_to_many(graphs = glist, - ref_graph = 5 - ) -netdis.plot(netdislist = netdis_mat_5,whatrow = 2, main = "Netdis Ew=5") +netdis_mat_zero <- netdis_many_to_many( + graphs = glist, + ref_graph = 0 +) +netdis_plot(netdislist = netdis_mat_zero, whatrow = 2, main = "Netdis Ew=0") + +netdis_mat_5 <- netdis_many_to_many( + graphs = glist, + ref_graph = 5 +) +netdis_plot(netdislist = netdis_mat_5, whatrow = 2, main = "Netdis Ew=5") ``` ## Using Netdis-GP, Geometric-Poisson approximation for $E_w$ In order to obtain the Netdis-GP variant set `ref_graph=NULL` (default). ```{r,fig.align='center',fig.dim=c(7,7)} -netdisgp_mat <- netdis_many_to_many(graphs = glist, - ref_graph = NULL - ) -netdis.plot(netdisgp_mat, whatrow = 2, main = "Netdis-GP") +netdisgp_mat <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL +) +netdis_plot(netdisgp_mat, whatrow = 2, main = "Netdis-GP") ``` --------------------------- @@ -103,7 +117,9 @@ mybinning <- function(densities) { min_counts_per_interval <- 5 num_intervals <- 3 # - if( length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + if (length(densities) < min_counts_per_interval) { + min_counts_per_interval <- length(densities) + } breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals @@ -118,7 +134,7 @@ mybinning <- function(densities) { breaks = breaks ) } - + # Let us see an example output of the binning function binning_example <- mybinning(runif(20)) binning_example$breaks @@ -126,12 +142,17 @@ binning_example$interval_indexes binning_example$densities # Calculate Netdis -netdisgp_mat_mybin <- netdis_many_to_many(graphs = glist, - 
ref_graph = NULL, - binning_fn = mybinning - ) - -netdis.plot(netdislist = netdisgp_mat_mybin,whatrow = 2, main = "Netdis-GP with mybinning") +netdisgp_mat_mybin <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL, + binning_fn = mybinning +) + +netdis_plot( + netdislist = netdisgp_mat_mybin, + whatrow = 2, + main = "Netdis-GP with mybinning" +) ``` Note that whenever $E_w$ is taken as a constant value, then the binning will not have an effect on the computation of Netdis. @@ -162,43 +183,55 @@ num_bins <- 100 These values can be directly imputed and changed into the shortcut Netdis function calls. However, not all combinations may be possible. The following shows the use of subgraphs up to size 4, with 3-step ego-networks and where only ego-networks with at least 5 nodes and 4 edges can be considered. Furthermore, the binning of the ego-networks will be sett to start with 20 bins and each bin will be required to have at least 20 elements. ```{r,fig.align='center',fig.dim=c(7,7)} -# (We only recommend changing these default values for those users that have a clear understanding of graph theory behind it) -#(change values with care as not all combinations may be possible). +# (We only recommend changing these default values for those users that have a +# clear understanding of graph theory behind it. Change values with care as not +# all combinations may be possible). -#Defining a new binning function: +# Defining a new binning function: binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 10, #10-egos required per bin - num_intervals = 20) #Start binning with 20 bins - -#Changing parameter values in Netdis: -netdisgp_mat_custom <- netdis_many_to_many(graphs = glist, - ref_graph = NULL, - max_graphlet_size = 4, #Subgraphs/graphlets up to size 4 considered. 
- neighbourhood_size = 3,# 3-step ego-networks - min_ego_nodes = 5, #ego-networks with at least five nodes - min_ego_edges = 4, #ego-networks with at least 4 edges - binning_fn = binning_fn #Providing a custom binning function - ) + min_counts_per_interval = 10, # 10-egos required per bin + num_intervals = 20 +) # Start binning with 20 bins + +# Changing parameter values in Netdis: +netdisgp_mat_custom <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL, + max_graphlet_size = 4, # Subgraphs/graphlets up to size 4 considered. + neighbourhood_size = 3, # 3-step ego-networks + min_ego_nodes = 5, # ego-networks with at least five nodes + min_ego_edges = 4, # ego-networks with at least 4 edges + binning_fn = binning_fn # Providing a custom binning function +) ``` Here the default parameters are used, and a heatmap of the result of Netdis with default parameters and Netdis with the previously modified parameters is given: ```{r ,fig.align='center',fig.dim=c(7,7)} -#Default binning +# Default binning binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) -#Default computation of Netdis -netdisgp_mat <- netdis_many_to_many(graphs = glist, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn - ) -netdis.plot(netdislist = netdisgp_mat,whatrow = 2, main = "Netdis-GP: Default parameter values") -netdis.plot(netdislist = netdisgp_mat_custom,whatrow = 2, main = "Netdis-GP: illustrative parameter changes") - + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) +# Default computation of Netdis +netdisgp_mat <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn 
+) +netdis_plot( + netdislist = netdisgp_mat, + whatrow = 2, + main = "Netdis-GP: Default parameter values" +) +netdis_plot( + netdislist = netdisgp_mat_custom, + whatrow = 2, + main = "Netdis-GP: illustrative parameter changes" +) ``` diff --git a/vignettes/PreComputedProps.Rmd b/vignettes/PreComputedProps.Rmd index 815f7e85..c258820d 100644 --- a/vignettes/PreComputedProps.Rmd +++ b/vignettes/PreComputedProps.Rmd @@ -41,35 +41,44 @@ NetEmd and Netdis use subgraph counts, however, NetEmd takes counts directly fro ```{r, netemd,fig.align='center',fig.dim=c(8,4)} # Create lattice networks -gLat_1 <- igraph::graph.lattice(c(20,20)) -gLat_2 <- igraph::graph.lattice(c(44,44)) +glat_1 <- igraph::graph.lattice(c(20, 20)) +glat_2 <- igraph::graph.lattice(c(44, 44)) -par(mfrow=c(1,2)) -plot(gLat_1,vertex.size=4,vertex.label=NA) -plot(gLat_2,vertex.size=4,vertex.label=NA) +par(mfrow = c(1, 2)) +plot(glat_1, vertex.size = 4, vertex.label = NA) +plot(glat_2, vertex.size = 4, vertex.label = NA) ``` The simple computation of NetEmd without pre-computed features: ```{r} -netemd_one_to_one(graph_1=gLat_1,graph_2=gLat_2,feature_type="orbit",smoothing_window_width = 1) +netemd_one_to_one( + graph_1 = glat_1, + graph_2 = glat_2, + feature_type = "orbit", + smoothing_window_width = 1 +) ``` ### Providing a matrix of network features ```{r} -counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) -counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) -head(counts_1[,1:4]) +counts_1 <- count_orbits_per_node(graph = glat_1, max_graphlet_size = 5) +counts_2 <- count_orbits_per_node(graph = glat_2, max_graphlet_size = 5) +head(counts_1[, 1:4]) -netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2,smoothing_window_width = 1) +netemd_one_to_one( + dhists_1 = counts_1, dhists_2 = counts_2, smoothing_window_width = 1 +) ``` ### Providing the network features as lists of dhist objects ```{r} -dhists_1<- graph_features_to_histograms(features_matrix = counts_1) 
-dhists_2<- graph_features_to_histograms(features_matrix = counts_2) +dhists_1 <- graph_features_to_histograms(features_matrix = counts_1) +dhists_2 <- graph_features_to_histograms(features_matrix = counts_2) -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2,smoothing_window_width = 1) +netemd_one_to_one( + dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 1 +) ``` ___ @@ -78,35 +87,52 @@ ___ Computation of the Laplacian and Normalized Laplacian: ```{r, netemdEigen} # Networks -gLat_1 <- graph.lattice(c(20,20)) -gLat_2 <- graph.lattice(c(44,44)) +glat_1 <- graph.lattice(c(20, 20)) +glat_2 <- graph.lattice(c(44, 44)) -#Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = FALSE,sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = FALSE,sparse = FALSE) +# Laplacian +lapg_1 <- igraph::laplacian_matrix( + graph = glat_1, normalized = FALSE, sparse = FALSE +) +lapg_2 <- igraph::laplacian_matrix( + graph = glat_2, normalized = FALSE, sparse = FALSE +) -#Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = TRUE,sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = TRUE,sparse = FALSE) +# Normalized Laplacian +nlapg_1 <- igraph::laplacian_matrix( + graph = glat_1, normalized = TRUE, sparse = FALSE +) +nlapg_2 <- igraph::laplacian_matrix( + graph = glat_2, normalized = TRUE, sparse = FALSE +) -# Providing a matrix of network features (e.g. Spectra). (This may take a couple of minutes). -spec_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -spec_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) +# Providing a matrix of network features (e.g. Spectra). (This may take a +# couple of minutes). 
+spec_1 <- cbind( + L.Spectra = eigen(lapg_1)$values, NL.Spectra = eigen(nlapg_1)$values +) +spec_2 <- cbind( + L.Spectra = eigen(lapg_2)$values, NL.Spectra = eigen(nlapg_2)$values +) head(spec_1) ``` Similarly to counts, all other features can be given as a matrix or as dhist objects: ```{r} -netemd_one_to_one(dhists_1 = spec_1,dhists_2 = spec_2, smoothing_window_width = 0) +netemd_one_to_one( + dhists_1 = spec_1, dhists_2 = spec_2, smoothing_window_width = 0 +) # Providing pre-computed dhist objects from network features -dhists_1<- graph_features_to_histograms(spec_1) -dhists_2<- graph_features_to_histograms(spec_2) +dhists_1 <- graph_features_to_histograms(spec_1) +dhists_2 <- graph_features_to_histograms(spec_2) -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2, smoothing_window_width = 0) +netemd_one_to_one( + dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 0 +) ``` ------------------------- @@ -120,19 +146,20 @@ Netdis uses counts from the resulting ego-networks of each of the nodes in a gra The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard graph and which is summarized in $E_w$. ```{r,netdisgoldstand} -# Set source directory for virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for virus protein-protein interaction edge files stored +# in the netdist package. 
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") ``` For illustration purposes consider the lattice networks as possible gold-standard networks: ```{r,fig.align='center'} # Lattice graphs to be used as gold-standard as a reference point comparison -goldstd_1 <- igraph::graph.lattice(c(20,20)) #Graph with 8^2 nodes -goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes +goldstd_1 <- igraph::graph.lattice(c(20, 20)) # Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44, 44)) # Graph with 44^2 nodes ``` Now obtain the subgraph counts for all networks. @@ -146,17 +173,25 @@ props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) Compute Netdis using the pre-computed counts and any of the example gold-standard networks. ```{r} -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_1 +) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_2 +) ``` Comparison to the result of Netdis without pre-computed counts. 
```{r} # Netdis using the goldstd_1 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) # Netdis using the goldstd_2 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) ``` @@ -165,21 +200,25 @@ netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) This Netdis variant focuses on detecting more meso-level discrepancies between the ego-network structures. ```{r, netdisGP} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction edge files stored +# in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +# Netdis using the Geometric-Poisson approximation as a way to obtain background +# expectations. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # Providing pre-calculated subgraph counts. 
props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) +netdis_one_to_one( + graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL +) ``` @@ -188,21 +227,24 @@ Comparing the networks via their observed ego counts without centring them, (equ ```{r,netdiszero} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction edge files stored +# in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -#Netdis using no expectations (or equivalently, expectation equal to zero). -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) +# Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) # Providing pre-calculated subgraph counts. 
props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +netdis_one_to_one( + graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0 +) ``` ------------------------- diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd index 74db95f1..c1bf5206 100644 --- a/vignettes/default_pairwise_usage.Rmd +++ b/vignettes/default_pairwise_usage.Rmd @@ -11,8 +11,8 @@ vignette: > ```{r, include = FALSE} knitr::opts_chunk$set( -collapse = TRUE, -comment = "#>" + collapse = TRUE, + comment = "#>" ) ``` @@ -37,29 +37,37 @@ The `netdist` package also includes examples of a few real networks. These are p Although the `virusppi` list of PPI networks is loaded along with the `netdist` package, the following code shows how to read a network data from a file in disk: ```{r, graphs,fig.align='center',fig.dim=c(6,6)} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction edge files stored +# in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -# Load query graphs as undirected igraph objects, with no loops, multiple edges or degree zero nodes. -graph_1 <- read_simple_graph(file = file.path(source_dir, "EBV.txt"), - format = "ncol") +# Load query graphs as undirected igraph objects, with no loops, multiple edges +# or degree zero nodes. +graph_1 <- read_simple_graph( + file = file.path(source_dir, "EBV.txt"), + format = "ncol" +) -graph_2 <- read_simple_graph(file = file.path(source_dir, "ECL.txt"), - format = "ncol") +graph_2 <- read_simple_graph( + file = file.path(source_dir, "ECL.txt"), + format = "ncol" +) -# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. 
+# Herpes virus EBV protein-protein interaction graph with 60 nodes +# and 208 edges. graph_1 -#Note this graph is the same as +# Note this graph is the same as # virusppi$EBV -# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +# Herpes virus ECL protein-protein interaction graph with 1941 nodes +# and 3989 edges. graph_2 -#Note this graph is the same as +# Note this graph is the same as # virusppi$ECL -#A simple visualization of the graphs. -plot(graph_1,vertex.size=4,vertex.label=NA) -plot(graph_2,vertex.size=4,vertex.label=NA) +# A simple visualization of the graphs. +plot(graph_1, vertex.size = 4, vertex.label = NA) +plot(graph_2, vertex.size = 4, vertex.label = NA) ``` Other networks loaded in this package are discussed in ["NetEmd: World trade networks"](NetEmdTimeOrdering.html). You can also see `?virusppi` and `?worldtradesub`. @@ -91,40 +99,69 @@ and where $p_{t_i}(G)$ and $p_{t_i}(G')$ are the distributions of ${t_i}$ on $G$ ## Comparing two graphs with NetEmd ```{r, netemd,fig.align='center'} -# Set source directory for Virus protein-protein interaction network edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction network edge files +# stored in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +# Herpes virus EBV protein-protein interaction graph with 60 nodes +# and 208 edges. graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") + format = "ncol" +) -# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +# Herpes virus ECL protein-protein interaction graph with 1941 nodes +# and 3989 edges. graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") + format = "ncol" +) # One to one NetEmd comparison. 
-netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# Use of smoothing window 1 is given for discrete integer distributions. If the +# network features are considered continuous variables smoothing_window_width +# equal to zero is recommended. +netemd_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + feature_type = "orbit", + smoothing_window_width = 1 +) ``` ## Comparing two graphs with NetEmd via their Laplacian spectrum ```{r, netemdEigen,fig.align='center'} -#Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) +# Laplacian +lapg_1 <- igraph::laplacian_matrix( + graph = graph_1, normalized = FALSE, sparse = FALSE +) +lapg_2 <- igraph::laplacian_matrix( + graph = graph_2, normalized = FALSE, sparse = FALSE +) -#Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) +# Normalized Laplacian +nlapg_1 <- igraph::laplacian_matrix( + graph = graph_1, normalized = TRUE, sparse = FALSE +) +nlapg_2 <- igraph::laplacian_matrix( + graph = graph_2, normalized = TRUE, sparse = FALSE +) -#Spectra (this may take a couple of minutes). -props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) +# Spectra (this may take a couple of minutes). 
+props_1 <- cbind( + L.Spectra = eigen(lapg_1)$values, NL.Spectra = eigen(nlapg_1)$values +) +props_2 <- cbind( + L.Spectra = eigen(lapg_2)$values, NL.Spectra = eigen(nlapg_2)$values +) -head(props_1,n=3) -head(props_2,n=3) +head(props_1, n = 3) +head(props_2, n = 3) -netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# If the network features are considered continuous variables +# smoothing_window_width equal to zero is recommended. +netemd_one_to_one( + dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0 +) ``` ------------------------- @@ -168,18 +205,18 @@ The selection of a gold-standard graph as a substitute for $E_w$ could be done w ```{r,netdisgoldstand,fig.align='center',fig.dim=c(6,6)} # Lattice graphs to be used as a gold-standard reference point -goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes -goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes +goldstd_1 <- igraph::graph.lattice(c(8, 8)) # Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44, 44)) # Graph with 44^2 nodes -plot(goldstd_1,vertex.size=4,vertex.label=NA) -plot(goldstd_2,vertex.size=4,vertex.label=NA) +plot(goldstd_1, vertex.size = 4, vertex.label = NA) +plot(goldstd_2, vertex.size = 4, vertex.label = NA) # Netdis using the goldstd_1 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) # Netdis using the goldstd_2 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) ``` ## Netdis-GP: Using a Geometric-Poisson approximation @@ -201,8 +238,9 @@ where $l$ is the number of connected subgraphs of size $k$, for 
example, $l=6$ f This variant focuses on detecting more meso-level discrepancies between the ego-network structures. ```{r, netdisGP} -#Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +# Netdis using the Geometric-Poisson approximation as a way to obtain background +# expectations. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) ``` @@ -210,9 +248,8 @@ netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) Comparing the networks via their observed ego counts without centring them, (equivalent to using expectation equal to zero). This variant thus focus on detecting small discrepancies between the networks. ```{r,netdiszero} -#Netdis using no expectations (or equivalently, expectation equal to zero). -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) - +# Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) ``` ------------------------- diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index 5c0a3445..b091e175 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -33,31 +33,35 @@ supported values for the `format` parameter. ```{r, message=FALSE} library("netdist") library("igraph") -edge_format = "ncol" +edge_format <- "ncol" # Load reference graph (used for Netdis. 
Not required for NetEmd -ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), - "ER_1250_10_1") +ref_path <- file.path( + system.file(file.path("extdata", "random"), package = "netdist"), + "ER_1250_10_1" +) ref_graph <- read_simple_graph(ref_path, format = edge_format) # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), - package = "netdist") + package = "netdist" +) edge_format <- "ncol" file_pattern <- "*" # Load all graphs in the source folder matching the filename pattern query_graphs <- read_simple_graphs(source_dir, - format = edge_format, - pattern = file_pattern) + format = edge_format, + pattern = file_pattern +) print(names(query_graphs)) ``` ```{r,fig.align='center',fig.dim=c(5,5)} -plot(query_graphs$EBV,vertex.label=NA,vertex.size=8) -plot(query_graphs$`HSV-1`,vertex.label=NA,vertex.size=8) -plot(query_graphs$KSHV,vertex.label=NA,vertex.size=8) -plot(query_graphs$VZV,vertex.label=NA,vertex.size=8) -plot(query_graphs$ECL,vertex.label=NA,vertex.size=4) +plot(query_graphs$EBV, vertex.label = NA, vertex.size = 8) +plot(query_graphs$`HSV-1`, vertex.label = NA, vertex.size = 8) +plot(query_graphs$KSHV, vertex.label = NA, vertex.size = 8) +plot(query_graphs$VZV, vertex.label = NA, vertex.size = 8) +plot(query_graphs$ECL, vertex.label = NA, vertex.size = 4) ``` # Generate Netdis measures between each pair of query graphs @@ -74,14 +78,18 @@ neighbourhood_size <- 2 ## Netdis using an ER reference graph ```{r} # Calculate netdis measure for subgraphs up to size max_subgraph_size -netdis_result <- netdis_many_to_many(graphs = query_graphs, - ref_graph = ref_graph, - max_graphlet_size = max_subgraph_size, - neighbourhood_size = neighbourhood_size) +netdis_result <- netdis_many_to_many( + graphs = query_graphs, + ref_graph = ref_graph, + max_graphlet_size = max_subgraph_size, + neighbourhood_size = neighbourhood_size +) # Netdis measure for subgraphs of 
size 3 res3 <- netdis_result$netdis["netdis3", ] -netdis3_mat <- cross_comp_to_matrix(measure = res3, cross_comparison_spec = netdis_result$comp_spec) +netdis3_mat <- cross_comp_to_matrix( + measure = res3, cross_comparison_spec = netdis_result$comp_spec +) print("Netdis: subgraph size = 3") print(netdis3_mat) @@ -105,55 +113,67 @@ cex <- 1 # Dendrogram based on Netdis measure for subgraphs of size 3 title <- paste("Netdis: subgraph size = ", 3, sep = "") plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), - use.edge.length = FALSE, - edge.width = cex*2, - main = title, - cex.lab = cex, cex.axis = cex, - cex.main = cex, cex.sub = cex, - cex = cex) + use.edge.length = FALSE, + edge.width = cex * 2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex +) # Dendrogram based on Netdis measure for subgraphs of size 4 -title = paste("Netdis: subgraph size = ", 4, sep = "") +title <- paste("Netdis: subgraph size = ", 4, sep = "") plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), - use.edge.length = FALSE, - edge.width = cex*2, - main = title, - cex.lab = cex, cex.axis = cex, - cex.main = cex, cex.sub = cex, - cex = cex) + use.edge.length = FALSE, + edge.width = cex * 2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex +) ``` diff --git a/vignettes/dendrogram_example_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd index 4a70fb8d..061ff1b6 100644 --- a/vignettes/dendrogram_example_net_emd.Rmd +++ b/vignettes/dendrogram_example_net_emd.Rmd @@ -13,41 +13,42 @@ vignette: > library("netdist") # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = ".txt" +edge_format <- "ncol" +file_pattern <- ".txt" -# Calculate graphlet-based degree distributions for all orbits in graphlets -# comprising up to 4 nodes for all graphs. 
This only needs to be done once -# per graph (feature_type = "orbit", max_graphlet_size = 4).. +# Calculate graphlet-based degree distributions for all orbits in graphlets +# comprising up to 4 nodes for all graphs. This only needs to be done once +# per graph (feature_type = "orbit", max_graphlet_size = 4).. # If feature_type is set to "feature_type", orbit counts for orbits in the # same graphlet will be summed to generate graphlet counts -# If max_graphlet_size is set to 5, graphlet-based degree distributions will +# If max_graphlet_size is set to 5, graphlet-based degree distributions will # be calculated for graphlets comprising up to 5 nodes. virus_gdds <- gdd_for_all_graphs( - source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 4) + source_dir = source_dir, format = edge_format, pattern = file_pattern, + feature_type = "orbit", max_graphlet_size = 4 +) names(virus_gdds) -# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- +# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- # based degree distributions using the default fast "optimise" method and no # smoothing (default). The "optimise" method uses the built-in R optimise # function to efficiently find the offset with the minimum EMD, but is not # guaranteed to find the global minimum if EMD as a function of offset -# is non-convex and/or multimodal. The smoothing window width determines +# is non-convex and/or multimodal. The smoothing window width determines # whether to calculate the NetEMD from the unaltered discrete GDD histograms -# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" -# smoothing by "smearing" the discrete GDD histogram point masses across bins +# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" +# smoothing by "smearing" the discrete GDD histogram point masses across bins # of unit width (smoothing_window_width = 1). 
Returns a named list containing: -# (i) the NetEMDs and (ii) a table containing the graph names and indices +# (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists = virus_gdds, smoothing_window_width = 0) -# You can also specify method = "fixed_step" to use the much slower method of -# exhaustively evaluating the EMD at all offsets separated by a fixed step. -# The default step size is 1/2 the the minimum spacing between locations in -# either histogram after normalising to unit variance. However, you can +# You can also specify method = "fixed_step" to use the much slower method of +# exhaustively evaluating the EMD at all offsets separated by a fixed step. +# The default step size is 1/2 the the minimum spacing between locations in +# either histogram after normalising to unit variance. However, you can # specifiy your own fixed step using the optional "step_size" parameter. 
-# Note that this step size is applied to the histograms after they have been +# Note that this step size is applied to the histograms after they have been # normalised to unit variance # Convert to matrix for input to dendrogram method @@ -56,25 +57,42 @@ netemd_mat ``` ```{r} -cex=1 -title = paste("NetEMD: max graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +cex <- 1 +title <- paste("NetEMD: max graphlet size = ", 4, sep = "") +plot( + phangorn::upgma(as.dist(netemd_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex * 2, + main = title, + cex.lab = cex, + cex.axis = cex, + cex.main = cex, + cex.sub = cex, + cex = cex +) -# The gdd_for_all_graphs and netemd_many_to_many functions will run in +# The gdd_for_all_graphs and netemd_many_to_many functions will run in # parallel using multiple threads where supported. The number of threads -# used is determined by the global R option "mc.cores". You can inspect the -# current value of this using options("mc.cores") and set it with +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with # options("mc.cores" = ). To fully utilise a modern consumer -# processor, this should be set to 2x the number of available processor +# processor, this should be set to 2x the number of available processor # cores as each core supports two threads. 
``` ```{r} -cex=1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("NetEMD: max graphlet size = ", 4, sep = "") -heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +cex <- 1.5 +col <- colorRampPalette(colors = c("blue", "white"))(100) +title <- paste("NetEMD: max graphlet size = ", 4, sep = "") +heatmap( + netemd_mat, + Rowv = NULL, + Colv = NULL, + col = col, + main = title, + cexRow = cex, + cexCol = cex, + symm = TRUE +) ``` diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd index 0c1c724a..0040450c 100644 --- a/vignettes/netdis_customisations.Rmd +++ b/vignettes/netdis_customisations.Rmd @@ -29,10 +29,10 @@ min_ego_nodes <- 3 min_ego_edges <- 1 # Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" +) ref_graph <- read_simple_graph(ref_path, format = "ncol") - ``` ## Load query graphs @@ -46,11 +46,12 @@ graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) print(results$netdis) print(results$comp_spec) @@ -60,23 +61,23 @@ print(results$comp_spec) ```{r} binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 10, - num_intervals = 50) + min_counts_per_interval = 10, + num_intervals = 50 +) # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - 
min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn +) print(results$netdis) print(results$comp_spec) - - ``` ## With Modified Expected Counts: Geometric Poisson @@ -84,17 +85,19 @@ print(results$comp_spec) bin_counts_fn <- density_binned_counts_gp exp_counts_fn <- purrr::partial(netdis_expected_counts, - scale_fn = NULL) + scale_fn = NULL +) # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn) + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn +) print(results$netdis) print(results$comp_spec) @@ -108,15 +111,16 @@ exp_counts_fn <- netdis_expected_counts # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn) + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn +) print(results$netdis) print(results$comp_spec) -``` \ No newline at end of file +``` diff --git a/vignettes/netdis_pairwise_comparisons.Rmd 
b/vignettes/netdis_pairwise_comparisons.Rmd index d5809c4d..650454c5 100644 --- a/vignettes/netdis_pairwise_comparisons.Rmd +++ b/vignettes/netdis_pairwise_comparisons.Rmd @@ -33,10 +33,10 @@ min_bin_count <- 5 num_bins <- 100 # Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" +) ref_graph <- read_simple_graph(ref_path, format = "ncol") - ``` ## Compare two graphs @@ -45,18 +45,21 @@ ref_graph <- read_simple_graph(ref_path, format = "ncol") source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") + format = "ncol" +) graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") + format = "ncol" +) # Calculate netdis statistics netdis_one_to_one(graph_1, graph_2, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) ``` ## Compare one graph to many other graphs @@ -68,11 +71,12 @@ graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] # Calculate netdis statistics netdis_one_to_many(graph_1, graphs_compare, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) ``` ## Do pairwise netdis calculations for many graphs @@ -83,12 +87,13 @@ graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") # Calculate netdis statistics results <- 
netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) print(results$netdis) print(results$comp_spec) -``` \ No newline at end of file +``` diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index a55f1d82..65b8d8a6 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -23,11 +23,12 @@ source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") + format = "ncol" +) graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - + format = "ncol" +) ``` ## Set Netdis parameters @@ -50,86 +51,108 @@ num_bins <- 100 ## Generate ego networks ```{r} # Get ego networks for query graphs and reference graph -ego_1 <- make_named_ego_graph(graph_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(graph_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) ``` ## Count graphlets in ego networks ```{r} # Count graphlets for ego networks in query and reference graphs -graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) -graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) 
+graphlet_counts_1 <- ego_to_graphlet_counts( + ego_1, + max_graphlet_size = max_graphlet_size +) +graphlet_counts_2 <- ego_to_graphlet_counts( + ego_2, + max_graphlet_size = max_graphlet_size +) ``` ## Use a reference graph to calculate expected graphlet counts in ego network density bins ```{r} # Load reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" +) ref_graph <- read_simple_graph(ref_path, format = "ncol") -ego_ref <- make_named_ego_graph(ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) -graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) +graphlet_counts_ref <- ego_to_graphlet_counts( + ego_ref, + max_graphlet_size = max_graphlet_size +) # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) -scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, - max_graphlet_size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego( + graphlet_counts_ref, + max_graphlet_size +) # Get ego-network densities densities_ref <- ego_network_density(graphlet_counts_ref) # Adaptively bin ref ego-network densities -binned_densities <- binned_densities_adaptive(densities_ref, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) ref_ego_density_bins <- binned_densities$breaks # Average ref graphlet counts across density bins ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts_ref, - 
binned_densities$interval_indexes) - + scaled_graphlet_counts_ref, + binned_densities$interval_indexes +) ``` ## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} # Calculate expected graphlet counts (using ref graph ego network density bins) -exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, - ref_ego_density_bins, - ref_binned_graphlet_counts, - max_graphlet_size, - scale_fn=count_graphlet_tuples) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples +) -exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, - ref_ego_density_bins, - ref_binned_graphlet_counts, - max_graphlet_size, - scale_fn=count_graphlet_tuples) +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples +) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) - -centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) +centred_graphlet_counts_1 <- netdis_subtract_exp_counts( + graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size +) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts( + graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size +) ``` ## Sum centred graphlet counts across all ego networks @@ -142,9 +165,11 @@ sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) ## Calculate netdis statistics ```{r} -netdis_result <- netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) +netdis_result <- netdis_uptok( + sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size +) print(netdis_result) ```