From 93ea66c24c973cb3d4a99451ed3c6c567751465e Mon Sep 17 00:00:00 2001 From: ande Date: Fri, 1 Jun 2018 12:12:17 +0100 Subject: [PATCH 01/84] added smooth c++ file --- src/fastSmooth.cpp | 153 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 src/fastSmooth.cpp diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp new file mode 100644 index 00000000..ac664ff5 --- /dev/null +++ b/src/fastSmooth.cpp @@ -0,0 +1,153 @@ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] +#include +#include +#include +#include +#include +using namespace Rcpp; + +// [[Rcpp::plugins("cpp11")]] + +//' @title +//' Compute EMD +////' +////' @param loc1 numeric vector. +////' @param val1 numeric vector. +////' @param loc2 numeric vector. +////' @param val2 numeric vector. +//' +//' @export +// [[Rcpp::export]] +double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,NumericVector loc2,NumericVector val2,double binWidth2) +{ + //init + double res=0; + double curVal1,curVal2; + double curPos; + double temp1; + int count; + int i,j,k; + //place start of windows before + //start of histogram so we can start the loop + // stores the result + res=0; + //TODO be worried about adding lots of small numbers + + // current location on hist 1 and hist 2 + i=0; + j=0; + double cdfLower=0; + double loc1SegStart=loc1[0]; + double loc1SegEnd=loc1[0]+binWidth1; + double loc1SegValStart=0; + double loc1SegValEnd=val1[0]; + double loc2SegStart=loc2[0]; + double loc2SegEnd=loc2[0]+binWidth2; + double loc2SegValStart=0; + double loc2SegValEnd=val2[0]; + double curStartVal; + double curEndVal; + double loc1Start; + double loc2Start; + double loc1End; + double loc2End; + double h; + res=0; + while (1) + { + // lets compute the area for these segments + if (loc1SegValStart Date: Fri, 1 Jun 2018 18:03:34 +0100 Subject: [PATCH 02/84] fixed a few annoying bugs in the implementation. 
This version appears to work, not sure if doing this from the other perspective would be better --- src/fastSmooth.cpp | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index ac664ff5..eda90de4 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -54,6 +54,7 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer double loc2End; double h; res=0; + int count123=0; while (1) { // lets compute the area for these segments @@ -61,38 +62,37 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer { curStartVal=loc2SegValStart; loc2Start=loc2SegStart; - loc1Start=loc1SegStart+(loc1SegEnd-loc1SegValStart)*(loc2SegValStart-loc1SegValStart)/(loc1SegValEnd-loc1SegValStart); + loc1Start=loc1SegStart+(loc1SegEnd-loc1SegStart)*(loc2SegValStart-loc1SegValStart)/(loc1SegValEnd-loc1SegValStart); } else { curStartVal=loc1SegValStart; loc1Start=loc1SegStart; - loc2Start=loc2SegStart+(loc2SegEnd-loc2SegValStart)*(loc1SegValStart-loc2SegValStart)/(loc2SegValEnd-loc2SegValStart); + loc2Start=loc2SegStart+(loc2SegEnd-loc2SegStart)*(loc1SegValStart-loc2SegValStart)/(loc2SegValEnd-loc2SegValStart); } if (loc1SegValEnd Date: Sun, 3 Jun 2018 16:35:40 +0100 Subject: [PATCH 03/84] Modifications to add the c code into the function Also remoing some uneeded c code. 
--- NAMESPACE | 3 +++ R/RcppExports.R | 10 ++++++++++ R/emd.R | 45 ++++++++++++++++++++++++++++++++++++++++++--- src/RcppExports.cpp | 17 +++++++++++++++++ src/fastSmooth.cpp | 5 ++--- 5 files changed, 74 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 55867c79..7031e634 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(NetEmdSmooth) export(adaptive_breaks) export(area_between_dhist_ecmfs) export(as_smoothed_dhist) @@ -64,5 +65,7 @@ export(shift_dhist) export(simplify_graph) export(sort_dhist) export(zeros_to_ones) +import(Rcpp) +importFrom(Rcpp,evalCpp) importFrom(Rcpp,sourceCpp) useDynLib(netdist) diff --git a/R/RcppExports.R b/R/RcppExports.R index f85dc216..87d17d1b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -44,3 +44,13 @@ emd_fast_no_smoothing <- function(locations1, values1, locations2, values2) { .Call('_netdist_emd_fast_no_smoothing', PACKAGE = 'netdist', locations1, values1, locations2, values2) } +#' @title +#' Compute EMD +NULL + +#' +#' @export +NetEmdSmooth <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { + .Call('_netdist_NetEmdSmooth', PACKAGE = 'netdist', loc1, val1, binWidth1, loc2, val2, binWidth2) +} + diff --git a/R/emd.R b/R/emd.R index 696e72cd..678be30f 100644 --- a/R/emd.R +++ b/R/emd.R @@ -42,6 +42,9 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' @return Earth Mover's Distance between the two discrete histograms #' @export min_emd_optimise_fast <- function(dhist1, dhist2) { + print("there is me") + print(dhist1$smoothing_window_width) + print(dhist2$smoothing_window_width) # Can we run the C++ fast implementation (only works with no smoothing)? 
if ((dhist1$smoothing_window_width==0) && (dhist2$smoothing_window_width==0)) { @@ -80,11 +83,47 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { min_offset <- soln$minimum return(list(min_emd = min_emd, min_offset = min_offset)) } - else + else #if ((dhist1$smoothing_window_width==1) && (dhist2$smoothing_window_width==1)) { - # Fall back on other version if either dhist is smoothed - return(min_emd_optimise(dhist1, dhist2)); + print("hi im here") + # Determine minimum and maximum offset of range in which histograms overlap + # (based on sliding histogram 1) + min_offset <- min(dhist2$locations) - max(dhist1$locations) + max_offset <- max(dhist2$locations) - min(dhist1$locations) + # Set lower and upper range for optimise algorithm to be somewhat wider than + # range defined by the minimum and maximum offset. This guards against a + # couple of issues that arise if the optimise range is exactly min_offset + # to max_offset + # 1) If lower and upper are equal, the optimise method will throw and error + # 2) It seems that optimise is not guaranteed to explore its lower and upper + # bounds, even in the case where one of them is the offset with minimum + # EMD + buffer <- 0.1 + min_offset <- min_offset - buffer + max_offset <- max_offset + buffer + # Define a single parameter function to minimise emd as a function of offset + val1 <- cumsum(dhist1$masses) + val2 <- cumsum(dhist2$masses) + val1 <- val1/val1[length(val1)] + val2 <- val2/val2[length(val2)] + loc1=dhist1$locations + loc2=dhist2$locations + binWidth1=dhist1$smoothing_window_width + binWidth2=dhist2$smoothing_window_width + count=0 + emd_offset <- function(offset) { + temp1<- NetEmdSmooth(loc1+offset,val1,binWidth1,loc2,val2,binWidth2) + temp1 + } + # Get solution from optimiser + soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, + tol = .Machine$double.eps*1000) + # Return mnimum EMD and associated offset + min_emd <- soln$objective + min_offset <- soln$minimum + 
return(list(min_emd = min_emd, min_offset = min_offset)) } + } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a9ac60fb..8db3345e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -30,10 +30,27 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// NetEmdSmooth +double NetEmdSmooth(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); +RcppExport SEXP _netdist_NetEmdSmooth(SEXP loc1SEXP, SEXP val1SEXP, SEXP binWidth1SEXP, SEXP loc2SEXP, SEXP val2SEXP, SEXP binWidth2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type loc1(loc1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val1(val1SEXP); + Rcpp::traits::input_parameter< double >::type binWidth1(binWidth1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type loc2(loc2SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val2(val2SEXP); + Rcpp::traits::input_parameter< double >::type binWidth2(binWidth2SEXP); + rcpp_result_gen = Rcpp::wrap(NetEmdSmooth(loc1, val1, binWidth1, loc2, val2, binWidth2)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_netdist_counts_from_observations", (DL_FUNC) &_netdist_counts_from_observations, 1}, {"_netdist_emd_fast_no_smoothing", (DL_FUNC) &_netdist_emd_fast_no_smoothing, 4}, + {"_netdist_NetEmdSmooth", (DL_FUNC) &_netdist_NetEmdSmooth, 6}, {NULL, NULL, 0} }; diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index eda90de4..a0161147 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -55,18 +55,17 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer double h; res=0; int count123=0; + curStartVal=0; while (1) { // lets compute the area for these segments if (loc1SegValStart Date: Sun, 3 Jun 2018 17:05:30 +0100 Subject: [PATCH 04/84] removed 1 if statement out of the loop --- src/fastSmooth.cpp | 23 
+++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index a0161147..a6f84274 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -56,19 +56,20 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer res=0; int count123=0; curStartVal=0; + if (loc1SegValStart Date: Tue, 5 Jun 2018 11:18:11 +0100 Subject: [PATCH 05/84] added speed test for smooth version --- R/net_emd_speed_benchmark.R | 38 +++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/R/net_emd_speed_benchmark.R b/R/net_emd_speed_benchmark.R index 285e3e14..757811e8 100644 --- a/R/net_emd_speed_benchmark.R +++ b/R/net_emd_speed_benchmark.R @@ -1,3 +1,4 @@ +#' @export netEMDSpeedTest <- function() { ##load the data @@ -32,4 +33,41 @@ netEMDSpeedTest <- function() } } list(gddBuildTime = gddBuildTime, netEMDtime = netEMDtime) +} + +#' @export +netEMDSpeedTestSmooth <- function() +{ + ##load the data + source_dir <- system.file(file.path("extdata", "random"), package = "netdist") + print(source_dir) + edge_format = "ncol" + file_pattern = "" + # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + # edge_format = "ncol" + # file_pattern = ".txt" + graphs <- read_simple_graphs(source_dir = source_dir, format = edge_format, pattern = file_pattern) + n1=names(graphs) + lab1=c() + gddBuildTime=c() + netEMDtime=c() + for (i in 1:length(graphs)) + { + for (j in 1:(i)) + { + g1=graphs[[i]] + g2=graphs[[j]] + lab1=append(lab1,paste(n1[i],n1[j],sep=',')) + print(paste(n1[i],n1[j],sep=',')) + fulltimeStart=Sys.time() + gdd1=gdd(g1) + gdd2=gdd(g2) + netEMDStart=Sys.time() + net_emd(gdd1,gdd2,smoothing_window_width = 1) + endTime=Sys.time() + gddBuildTime=append(gddBuildTime,as.double(netEMDStart-fulltimeStart)) + netEMDtime=append(netEMDtime,as.double(endTime-netEMDStart)) + } + } + list(gddBuildTime = gddBuildTime, netEMDtime = netEMDtime) } \ No newline 
at end of file From 449609733a0e1e8fc387a3ee21983423d56a5bd4 Mon Sep 17 00:00:00 2001 From: ande Date: Mon, 18 Jun 2018 13:14:41 +0100 Subject: [PATCH 06/84] removed some of the print messages --- R/emd.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/R/emd.R b/R/emd.R index 678be30f..c83c05c0 100644 --- a/R/emd.R +++ b/R/emd.R @@ -42,9 +42,6 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' @return Earth Mover's Distance between the two discrete histograms #' @export min_emd_optimise_fast <- function(dhist1, dhist2) { - print("there is me") - print(dhist1$smoothing_window_width) - print(dhist2$smoothing_window_width) # Can we run the C++ fast implementation (only works with no smoothing)? if ((dhist1$smoothing_window_width==0) && (dhist2$smoothing_window_width==0)) { @@ -85,7 +82,6 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { } else #if ((dhist1$smoothing_window_width==1) && (dhist2$smoothing_window_width==1)) { - print("hi im here") # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) From 80cd6df8951662daae65334a8a3915655b56431b Mon Sep 17 00:00:00 2001 From: ande Date: Mon, 18 Jun 2018 13:15:16 +0100 Subject: [PATCH 07/84] added the new methods to the namespace for exports --- NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 7031e634..56e0b0fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,6 +45,8 @@ export(min_emd) export(min_emd_exhaustive) export(min_emd_optimise) export(min_emd_optimise_fast) +export(netEMDSpeedTest) +export(netEMDSpeedTestSmooth) export(net_emd) export(net_emds_for_all_graphs) export(netdis) From c0a28e8aa72072e9a3cea0089760c041b69346c9 Mon Sep 17 00:00:00 2001 From: ande Date: Thu, 14 Nov 2019 09:30:56 +0000 Subject: [PATCH 08/84] commented and removing unneeded code --- src/fastSmooth.cpp | 62 ++++++++++++++++++++++++++++++++-------------- 1 file 
changed, 44 insertions(+), 18 deletions(-) diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index a6f84274..66ebe166 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -37,27 +37,36 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer // current location on hist 1 and hist 2 i=0; j=0; - double cdfLower=0; - double loc1SegStart=loc1[0]; - double loc1SegEnd=loc1[0]+binWidth1; - double loc1SegValStart=0; - double loc1SegValEnd=val1[0]; + // hist1 variables + double loc1SegStart=loc1[0]; //- start of a Segment in x + double loc1SegEnd=loc1[0]+binWidth1; //- end of a Segment in x + double loc1SegValStart=0; //- start of a Segment in y + double loc1SegValEnd=val1[0]; //- end of a Segment in y + + // hist2 variables double loc2SegStart=loc2[0]; double loc2SegEnd=loc2[0]+binWidth2; double loc2SegValStart=0; double loc2SegValEnd=val2[0]; - double curStartVal; - double curEndVal; - double loc1Start; - double loc2Start; - double loc1End; - double loc2End; + + double curStartVal; // start value in y + double curEndVal; // end value in y + double loc1Start; // start value in x hist1 + double loc2Start; // start value in x hist2 + double loc1End; // end value in x hist1 + double loc2End; // end value in x hist2 double h; res=0; - int count123=0; + // set as 0 as at bottom of hist curStartVal=0; + + // need to know if first y segment ends with hist1 or hist2 + // Need to set the first start locations + // Commented this section as they are both set to zero + /* if (loc1SegValStart Date: Thu, 14 Nov 2019 09:55:06 +0000 Subject: [PATCH 09/84] figuring out what is wrong --- R/RcppExports.R | 2 +- src/fastSmooth.cpp | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 81fc83ae..1185ed09 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -32,6 +32,6 @@ NULL #' #' @export NetEmdSmooth <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { - .Call('_netdist_NetEmdSmooth', 
PACKAGE = 'netdist', loc1, val1, binWidth1, loc2, val2, binWidth2) + .Call(`_netdist_NetEmdSmooth`, loc1, val1, binWidth1, loc2, val2, binWidth2) } diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index 66ebe166..5b5d99bf 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -63,7 +63,6 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer // need to know if first y segment ends with hist1 or hist2 // Need to set the first start locations // Commented this section as they are both set to zero - /* if (loc1SegValStart Date: Thu, 14 Nov 2019 10:18:49 +0000 Subject: [PATCH 10/84] removing changes to see if it helps --- src/fastSmooth.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index 5b5d99bf..b2d43847 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -74,11 +74,10 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer loc1Start=loc1SegStart; loc2Start=loc2SegStart+(loc2SegEnd-loc2SegStart)*(loc1SegValStart-loc2SegValStart)/(loc2SegValEnd-loc2SegValStart); } - loc1Start=loc1SegStart; - loc2Start=loc2SegStart; + // loc1Start=loc1SegStart; + // loc2Start=loc2SegStart; while (1) { - std::cout << res << "\n"; // lets compute the area for this segments // // Case where hist1 ends first From 761898d6be4a7ec2c60ea007fc783e833e226e9d Mon Sep 17 00:00:00 2001 From: ande Date: Wed, 20 Nov 2019 14:34:44 +0000 Subject: [PATCH 11/84] Inital working version of fastEmdSmooth Very inefficient (double for loop), but it works and can easily be refined --- src/fastSmoothV2.cpp | 268 ++++++++++++++++++++++++++++++++++ tests/testthat/test_fastEMD.R | 154 +++++++++++++++++++ 2 files changed, 422 insertions(+) create mode 100644 src/fastSmoothV2.cpp create mode 100644 tests/testthat/test_fastEMD.R diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp new file mode 100644 index 00000000..c22d7d11 --- /dev/null +++ b/src/fastSmoothV2.cpp @@ 
-0,0 +1,268 @@ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] +#include +#include +#include +#include +#include +using namespace Rcpp; + + +double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double length; + length = end - start; + double topTriangle; + double topRectangle; + double bottomTriangle; + double midPoint; + double midValue; + double res=0; + if (val1_start > val2_start) + { + if (val1_end >= val2_end) + { + // They are in the same order no bowtie + // top triangle + std::cout << "\n Path1"; + topTriangle = 0.5*length*(val1_end-val1_start); + topRectangle = length*(val1_start-val2_start); + bottomTriangle = 0.5*length*(val2_end-val2_start); + return topTriangle+topRectangle-bottomTriangle; + } + else + { + std::cout << "\n Path2"; + //bowtie + // lets make this really simple as the compiler + // will combine the expressions as needed + midPoint = (val1_start-val2_start)/((val2_end-val2_start) - (val1_end-val1_start)); + midValue = val1_start + midPoint*(val1_end-val1_start); + midPoint = midPoint*length; + std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; + + topTriangle = 0.5*midPoint*(midValue-val1_start); + topRectangle = midPoint*(val1_start-val2_start); + bottomTriangle = 0.5*midPoint*(midValue-val2_start); + + res = topTriangle+topRectangle-bottomTriangle; + + topTriangle = 0.5*(length-midPoint)*(val2_end-midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5*(length - midPoint)*(val1_end - midValue); + res += topTriangle+topRectangle-bottomTriangle; + return res; + } + } + else + { + if (val1_end > val2_end) + { + std::cout << "\n Path3"; + //bowtie + midPoint = (val2_start-val1_start)/((val1_end-val1_start) - (val2_end-val2_start)); + midValue = val2_start + midPoint*(val2_end-val2_start); + midPoint = midPoint*length; + std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; + + topTriangle = 
0.5*midPoint*(midValue-val2_start); + topRectangle = midPoint*(val2_start-val1_start); + bottomTriangle = 0.5*midPoint*(midValue-val1_start); + + res = topTriangle+topRectangle-bottomTriangle; + + topTriangle = 0.5*(length-midPoint)*(val1_end-midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5*(length - midPoint)*(val2_end - midValue); + res += topTriangle+topRectangle-bottomTriangle; + return res; + + } + else // same order + { + std::cout << "\n Path4"; + topTriangle = 0.5*length*(val2_end-val2_start); + topRectangle = length*(val2_start-val1_start); + bottomTriangle = 0.5*length*(val1_end-val1_start); + return topTriangle+topRectangle-bottomTriangle; + } + } +} + + +//' @title +//' Compute EMD +////' +////' @param loc1 numeric vector. +////' @param val1 numeric vector. +////' @param loc2 numeric vector. +////' @param val2 numeric vector. +//' +//' @export +// [[Rcpp::export]] +double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,NumericVector loc2,NumericVector val2,double binWidth2) +{ + double loc1_start, loc1_end, loc2_start, loc2_end; + double val1_start, val1_end, val2_start, val2_end; + int index1,index2; + + // Index in both lists + index1 = 0; + index2 = 0; + + + // Loc1 data + loc1_start = loc1[0]; + loc1_end = loc1[0]; + val1_start = 0; + val1_end = 0; + + // Loc2 data + loc2_start = loc2[0]; + loc2_end = loc2[0]; + val2_start = 0; + val2_end = 0; + + double endX; + endX = std::min(loc2_end,loc1_end); + // while statement + double res=0; + + // Hist 1 + double curSeg1Loc1; + double curSeg1Loc2; + double curSeg1Val1; + double curSeg1Val2; + + // Hist 1 + double curSeg2Loc1; + double curSeg2Loc2; + double curSeg2Val1; + double curSeg2Val2; + + // need to iterate through regions of constant gradient + double h1,h2; + h1=0; + h2=0; + int i123,j123; + double tempStart; + double tempEnd; + double valStart1; + double valEnd1; + double valStart2; + double valEnd2; + double tempDouble; + for 
(index1=-1;index1(x1[length(v1)]+w1) + res1 = res1 + m1*v1[length(v1)] + + + + res2= rep(0,length(x)) + m1 = x2[1](w2+x2[length(v2)]) + res2 = res2 + m1*v2[length(v2)] + + abs(res1-res2) + } + f1 +} + +test_that("3 element test", { + + sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") + for (w1 in (1:10)/10.0) + { + for (w2 in (1:10)/10.0) + { + x1 <- c(1,2,3) + v1 <- c(0.25,0.70,1.00) + x2 <- c(1,2,3) + v2 <- c(0.25,0.70,1.00) + print(x1) + f1 <- makeFunction(x1,v1,w1,x2,v2,w2) + res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] + + res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) + + expect_lt(abs(res1-res2),10**(-3)) + } + } + }) + +test_that("2 element test w1=0.1, w2=0.2", { + + sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") + w1 = 0.1 + w2 = 0.2 + x1 <- c(1,2) + v1 <- c(0.25,0.75) + x2 <- c(1,2) + v2 <- c(0.5,1.00) + print(x1) + f1 <- makeFunction(x1,v1,w1,x2,v2,w2) + res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] + res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) + + expect_lt(abs(res1-res2),10**(-4)) +}) + + + +test_that("3 element test Mixture", { + + sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") + for (w1 in (1:10)/10.0) + { + for (w2 in (1:10)/10.0) + { + x1 <- c(1,2,3) + v1 <- c(0.65,0.70,1.00) + x2 <- c(1,2,3) + v2 <- c(0.25,0.70,1.00) + print(x1) + f1 <- makeFunction(x1,v1,w1,x2,v2,w2) + res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] + + res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) + + expect_lt(abs(res1-res2),10**(-3)) + } + } + }) + + + + +test_that("3 element test w1=0.1, w2=0.2", { + + sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") + w1 = 0.1 + w2 = 0.2 + x1 <- c(1,2,3) + v1 <- c(0.25,0.70,1.00) + x2 <- c(1,2,3) + v2 <- c(0.25,0.70,1.00) + print(x1) + f1 <- makeFunction(x1,v1,w1,x2,v2,w2) + res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] + res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) 
+ + expect_lt(abs(res1-res2),10**(-4)) + }) From b7a667436517b93b564dda0db84146626396e857 Mon Sep 17 00:00:00 2001 From: ande Date: Wed, 20 Nov 2019 14:42:37 +0000 Subject: [PATCH 12/84] fixed small bug --- tests/testthat/test_fastEMD.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index 6d024356..9c5764cd 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -113,7 +113,6 @@ test_that("2 element test w1=0.1, w2=0.2", { test_that("3 element test Mixture", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") for (w1 in (1:10)/10.0) { for (w2 in (1:10)/10.0) @@ -138,7 +137,6 @@ test_that("3 element test Mixture", { test_that("3 element test w1=0.1, w2=0.2", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") w1 = 0.1 w2 = 0.2 x1 <- c(1,2,3) From 6b64eb2d7fe8f9e4c96e73acbb5e1f678135432a Mon Sep 17 00:00:00 2001 From: ande Date: Wed, 20 Nov 2019 14:50:07 +0000 Subject: [PATCH 13/84] missed a few internal paths used for testing --- tests/testthat/test_fastEMD.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index 9c5764cd..3e17928c 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -72,7 +72,6 @@ makeFunction <- function(x1,v1,w1,x2,v2,w2) test_that("3 element test", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") for (w1 in (1:10)/10.0) { for (w2 in (1:10)/10.0) @@ -94,7 +93,6 @@ test_that("3 element test", { test_that("2 element test w1=0.1, w2=0.2", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") w1 = 0.1 w2 = 0.2 x1 <- c(1,2) From 40dddeb9a166245074e1eebf6d6a798cda72d476 Mon Sep 17 00:00:00 2001 From: ande Date: Wed, 20 Nov 2019 15:05:42 +0000 Subject: [PATCH 14/84] adding the V2 to namespace --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 7e023e49..24fa6913 
100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(NetEmdSmoothV2) export(NetEmdSmooth) export(adaptive_breaks) export(area_between_dhist_ecmfs) From b19b6fe2bcc478e49336ba5efb837d07b9908944 Mon Sep 17 00:00:00 2001 From: ande Date: Wed, 20 Nov 2019 15:11:52 +0000 Subject: [PATCH 15/84] updated versions of wrapper files --- R/RcppExports.R | 10 ++++++++++ src/RcppExports.cpp | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index 1185ed09..e718b542 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -35,3 +35,13 @@ NetEmdSmooth <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { .Call(`_netdist_NetEmdSmooth`, loc1, val1, binWidth1, loc2, val2, binWidth2) } +#' @title +#' Compute EMD +NULL + +#' +#' @export +NetEmdSmoothV2 <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { + .Call(`_netdist_NetEmdSmoothV2`, loc1, val1, binWidth1, loc2, val2, binWidth2) +} + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 2f77e0ea..99086611 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -46,6 +46,22 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// NetEmdSmoothV2 +double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); +RcppExport SEXP _netdist_NetEmdSmoothV2(SEXP loc1SEXP, SEXP val1SEXP, SEXP binWidth1SEXP, SEXP loc2SEXP, SEXP val2SEXP, SEXP binWidth2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type loc1(loc1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val1(val1SEXP); + Rcpp::traits::input_parameter< double >::type binWidth1(binWidth1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type loc2(loc2SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val2(val2SEXP); + Rcpp::traits::input_parameter< double >::type 
binWidth2(binWidth2SEXP); + rcpp_result_gen = Rcpp::wrap(NetEmdSmoothV2(loc1, val1, binWidth1, loc2, val2, binWidth2)); + return rcpp_result_gen; +END_RCPP +} RcppExport SEXP run_testthat_tests(); @@ -53,6 +69,7 @@ static const R_CallMethodDef CallEntries[] = { {"_netdist_counts_from_observations", (DL_FUNC) &_netdist_counts_from_observations, 1}, {"_netdist_emd_fast_no_smoothing", (DL_FUNC) &_netdist_emd_fast_no_smoothing, 4}, {"_netdist_NetEmdSmooth", (DL_FUNC) &_netdist_NetEmdSmooth, 6}, + {"_netdist_NetEmdSmoothV2", (DL_FUNC) &_netdist_NetEmdSmoothV2, 6}, {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, {NULL, NULL, 0} }; From 15ee86ba11606daef02ab76ac2eff0657917a5bd Mon Sep 17 00:00:00 2001 From: ande Date: Wed, 20 Nov 2019 23:03:39 +0000 Subject: [PATCH 16/84] stable version --- src/fastSmoothV2.cpp | 6 +- tests/testthat/test_fastEMD.R | 122 ++++++++++++++++++++++++++++++++-- 2 files changed, 123 insertions(+), 5 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index c22d7d11..9fe8a84f 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -152,10 +152,13 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num double valStart2; double valEnd2; double tempDouble; + int secondStart=-1; + int h; for (index1=-1;index10.001) + { + browser() + } + # Swapped to percentage error + expect_lt(abs(res1-res2),10**(-3)) + + } + } + } + } + }) test_that("3 element test w1=0.1, w2=0.2", { + sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") w1 = 0.1 w2 = 0.2 x1 <- c(1,2,3) @@ -143,7 +257,7 @@ test_that("3 element test w1=0.1, w2=0.2", { v2 <- c(0.25,0.70,1.00) print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) - res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] + res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.0000000001)[[1]] res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) expect_lt(abs(res1-res2),10**(-4)) From d959a79c94dd9f33ef05d854923b3b0d5a12f917 Mon Sep 17 
00:00:00 2001 From: ande Date: Thu, 21 Nov 2019 11:58:29 +0000 Subject: [PATCH 17/84] updated iupdated --- src/fastSmoothV2.cpp | 187 ++++++++++++++++++++++++++++--------------- 1 file changed, 123 insertions(+), 64 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 9fe8a84f..788a3458 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -24,7 +24,7 @@ double get_segment(double start,double end,double val1_start,double val1_end,dou { // They are in the same order no bowtie // top triangle - std::cout << "\n Path1"; +// std::cout << "\n Path1"; topTriangle = 0.5*length*(val1_end-val1_start); topRectangle = length*(val1_start-val2_start); bottomTriangle = 0.5*length*(val2_end-val2_start); @@ -32,14 +32,14 @@ double get_segment(double start,double end,double val1_start,double val1_end,dou } else { - std::cout << "\n Path2"; +// std::cout << "\n Path2"; //bowtie // lets make this really simple as the compiler // will combine the expressions as needed midPoint = (val1_start-val2_start)/((val2_end-val2_start) - (val1_end-val1_start)); midValue = val1_start + midPoint*(val1_end-val1_start); midPoint = midPoint*length; - std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; +// std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; topTriangle = 0.5*midPoint*(midValue-val1_start); topRectangle = midPoint*(val1_start-val2_start); @@ -58,12 +58,12 @@ double get_segment(double start,double end,double val1_start,double val1_end,dou { if (val1_end > val2_end) { - std::cout << "\n Path3"; +// std::cout << "\n Path3"; //bowtie midPoint = (val2_start-val1_start)/((val1_end-val1_start) - (val2_end-val2_start)); midValue = val2_start + midPoint*(val2_end-val2_start); midPoint = midPoint*length; - std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; +// std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; topTriangle = 
0.5*midPoint*(midValue-val2_start); topRectangle = midPoint*(val2_start-val1_start); @@ -80,7 +80,7 @@ double get_segment(double start,double end,double val1_start,double val1_end,dou } else // same order { - std::cout << "\n Path4"; +// std::cout << "\n Path4"; topTriangle = 0.5*length*(val2_end-val2_start); topRectangle = length*(val2_start-val1_start); bottomTriangle = 0.5*length*(val1_end-val1_start); @@ -154,61 +154,134 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num double tempDouble; int secondStart=-1; int h; + +auto start = std::chrono::high_resolution_clock::now(); + std::chrono::duration t1=start-start; +int count=0; +double avCount=0; for (index1=-1;index10) + { + if (index2curSeg1Loc2) + {break;} + if (tempStartcurSeg1Loc2) + {break;} if (tempStart Date: Thu, 21 Nov 2019 15:55:38 +0000 Subject: [PATCH 18/84] updated test --- tests/testthat/test_fastEMD.R | 82 ++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index 3e17928c..71c1eed7 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -10,26 +10,28 @@ makeFunction <- function(x1,v1,w1,x2,v2,w2) m2 = x<=(x1[1]+w1) m3 = m1*m2 res1 = res1 + m3*(x-x1[1])*(v1[1]-0)/w1 - - m1 = (x1[1]+w1)1) { - m1 = x1[i](x1[length(v1)]+w1) res1 = res1 + m1*v1[length(v1)] @@ -42,11 +44,13 @@ makeFunction <- function(x1,v1,w1,x2,v2,w2) m3 = m1*m2 res2 = m3*(x-x2[1])*(v2[1]-0)/w2 - - m1 = (x2[1]+w2)1) + { + m1 = (x2[1]+w2) Date: Thu, 21 Nov 2019 16:21:28 +0000 Subject: [PATCH 19/84] updated tests --- tests/testthat/test_emd.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test_emd.R b/tests/testthat/test_emd.R index 58e39c22..59145923 100644 --- a/tests/testthat/test_emd.R +++ b/tests/testthat/test_emd.R @@ -14,7 +14,7 @@ test_that("cost_matrix returns zeros along diagonal when both sets of bin bin_centres2 
<- bin_centres1 expected <- rep(0, length(bin_centres1)) expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) - }) + }) test_that("cost_matrix returns zeros along diagonal and taxicab distance from all zeros for all other elements when both sets of bin locations are @@ -24,7 +24,7 @@ test_that("cost_matrix returns zeros along diagonal and taxicab distance from num_bins <- length(bin_centres1) expected <- toeplitz(1:num_bins)-1 expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) - }) + }) test_that("cost_matrix is correct size when the two histograms are of different lengths", { @@ -35,7 +35,7 @@ test_that("cost_matrix is correct size when the two histograms are of different expect_equal(nrow(cm), length(bin_centres1)) expect_equal(ncol(cm), length(bin_centres2)) - }) + }) context("EMD: EMD") # EMD: Property-based tests @@ -53,7 +53,7 @@ test_that("EMD methods return 0 when comparing a 1D feature distribution to bin_centres1, bin_centres2), expected) expect_equal(emd_cs(histogram1, histogram2), expected) expect_equal(emd(histogram1, histogram2), expected) - }) + }) test_that("EMD methods return numBins/2 when offsetting a symmetric discrete triangle distribution by 1", { @@ -170,7 +170,7 @@ test_that("EMD methods return numBins/2 when offsetting a symmetric discrete expect_equal(emd_cs(histogram1, histogram2), expected) expect_equal(emd(histogram1, histogram2), expected) - }) + }) test_that("EMD methods return same result for densely and sparsely specified bins", { @@ -203,7 +203,7 @@ test_that("EMD methods return same result for densely and sparsely specified emd_cs(sparse_histogram1,sparse_histogram2)) expect_equal(emd(dense_histogram1, dense_histogram2), emd(sparse_histogram1, sparse_histogram2)) - }) + }) test_that("EMD methods return same result when order of densely specified bins is changed", { From 5ed553ef63c5efa8168c36697b31a73a747ce9e2 Mon Sep 17 00:00:00 2001 From: ande Date: Fri, 22 Nov 2019 11:50:14 +0000 Subject: [PATCH 
20/84] added old failure case. as a test --- R/emd.R | 25 +++++++++++++------------ tests/testthat/test_fastEMD.R | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/R/emd.R b/R/emd.R index 5219dd49..3d486c81 100644 --- a/R/emd.R +++ b/R/emd.R @@ -82,10 +82,19 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { } else #if ((dhist1$smoothing_window_width==1) && (dhist2$smoothing_window_width==1)) { + val1 <- cumsum(dhist1$masses) + val2 <- cumsum(dhist2$masses) + val1 <- val1/val1[length(val1)] + val2 <- val2/val2[length(val2)] + loc1=dhist1$locations + loc2=dhist2$locations + binWidth1=dhist1$smoothing_window_width + binWidth2=dhist2$smoothing_window_width + count=0 # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) - min_offset <- min(dhist2$locations) - max(dhist1$locations) - max_offset <- max(dhist2$locations) - min(dhist1$locations) + min_offset <- min(dhist2$locations) - max(dhist1$locations) - max(binWidth1,binWidth2) + max_offset <- max(dhist2$locations) - min(dhist1$locations) + max(binWidth1,binWidth2) # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. 
This guards against a # couple of issues that arise if the optimise range is exactly min_offset @@ -97,18 +106,10 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { buffer <- 0.1 min_offset <- min_offset - buffer max_offset <- max_offset + buffer + # Define a single parameter function to minimise emd as a function of offset - val1 <- cumsum(dhist1$masses) - val2 <- cumsum(dhist2$masses) - val1 <- val1/val1[length(val1)] - val2 <- val2/val2[length(val2)] - loc1=dhist1$locations - loc2=dhist2$locations - binWidth1=dhist1$smoothing_window_width - binWidth2=dhist2$smoothing_window_width - count=0 emd_offset <- function(offset) { - temp1<- NetEmdSmooth(loc1+offset,val1,binWidth1,loc2,val2,binWidth2) + temp1<- NetEmdSmoothV2(loc1+offset,val1,binWidth1,loc2,val2,binWidth2) temp1 } # Get solution from optimiser diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index 23c3df36..8af30326 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -304,3 +304,18 @@ test_that("3 element test w1=0.1, w2=0.2", { expect_lt(abs(res1-res2),10**(-4)) }) + +test_that("Old failure case", { + d1 = list() + attr(d1,'class') <- "dhist" + d1$locations <- 0 + d1$masses <- 1000 + d2 = list() + attr(d2,'class') <- "dhist" + d2$locations <- c(0,1,2,3) + d2$masses <- c(8634,1242,114,10) + sq1 <- net_emd_single_pair(d1,d2,method='optimise',smoothing_window_width = 1) + sq2 <- net_emd_single_pair(d1,d2,method='optimiseRonly',smoothing_window_width = 1) + expect_lt(abs(sq1$min_emd-sq2$min_emd),10**(-4)) +}) + \ No newline at end of file From cac3bab0640c1876d17ce6f5405c271ecee93560 Mon Sep 17 00:00:00 2001 From: ande Date: Fri, 22 Nov 2019 13:33:05 +0000 Subject: [PATCH 21/84] first real fast version --- src/fastSmoothV2.cpp | 98 +++++++++----------------------------------- 1 file changed, 20 insertions(+), 78 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 788a3458..383ab488 100644 --- a/src/fastSmoothV2.cpp +++ 
b/src/fastSmoothV2.cpp @@ -144,7 +144,7 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num double h1,h2; h1=0; h2=0; - int i123,j123; + int i123; double tempStart; double tempEnd; double valStart1; @@ -155,15 +155,10 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num int secondStart=-1; int h; -auto start = std::chrono::high_resolution_clock::now(); - std::chrono::duration t1=start-start; -int count=0; -double avCount=0; for (index1=-1;index10) - { - if (index20) + { + if (index2curSeg1Loc2) - {break;} if (tempStartcurSeg1Loc2) - {break;} if (tempStart Date: Fri, 22 Nov 2019 14:20:48 +0000 Subject: [PATCH 22/84] working partially cleaned version next step pulling edge case out of loop --- src/fastSmoothV2.cpp | 184 +++++++++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 85 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 383ab488..871cb07b 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -8,7 +8,7 @@ using namespace Rcpp; -double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +inline double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) { double length; length = end - start; @@ -89,6 +89,16 @@ double get_segment(double start,double end,double val1_start,double val1_end,dou } } +inline double get_segment_constrained(double start,double end, double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) +{ + //We have a valid range + double valStart1, valEnd1, valStart2, valEnd2; + valStart1 = seg1V1 + (seg1V2-seg1V1)*(start - seg1L1)/(seg1L2 - seg1L1); + valEnd1 = seg1V1 + (seg1V2-seg1V1)*(end - seg1L1)/(seg1L2 - seg1L1); + valStart2 = seg2V1 + (seg2V2-seg2V1)*(start - seg2L1)/(seg2L2 - seg2L1); + valEnd2 = seg2V1 + (seg2V2-seg2V1)*(end - seg2L1)/(seg2L2 - seg2L1); + 
return get_segment(start,end,valStart1,valEnd1,valStart2,valEnd2); +} //' @title //' Compute EMD @@ -102,30 +112,8 @@ double get_segment(double start,double end,double val1_start,double val1_end,dou // [[Rcpp::export]] double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,NumericVector loc2,NumericVector val2,double binWidth2) { - double loc1_start, loc1_end, loc2_start, loc2_end; - double val1_start, val1_end, val2_start, val2_end; int index1,index2; - // Index in both lists - index1 = 0; - index2 = 0; - - - // Loc1 data - loc1_start = loc1[0]; - loc1_end = loc1[0]; - val1_start = 0; - val1_end = 0; - - // Loc2 data - loc2_start = loc2[0]; - loc2_end = loc2[0]; - val2_start = 0; - val2_end = 0; - - double endX; - endX = std::min(loc2_end,loc1_end); - // while statement double res=0; // Hist 1 @@ -141,57 +129,97 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num double curSeg2Val2; // need to iterate through regions of constant gradient - double h1,h2; - h1=0; - h2=0; int i123; double tempStart; double tempEnd; - double valStart1; - double valEnd1; - double valStart2; - double valEnd2; - double tempDouble; - int secondStart=-1; - int h; + double secondStart=-1; + + double maxLoc = std::max(loc1[loc1.size()-1] +binWidth1,loc2[loc2.size()-1]+binWidth2 ); + double minLoc = std::min(loc1[0],loc2[0]); + + if (loc2[0]0) @@ -207,7 +235,7 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num } if (index2==-1) { - curSeg2Loc1=std::min(loc1[0],loc2[0]); + curSeg2Loc1=minLoc; curSeg2Loc2=loc2[0]; curSeg2Val1=0; curSeg2Val2=0; @@ -227,14 +255,7 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num tempEnd = std::min(curSeg1Loc2,curSeg2Loc2); if (tempStart Date: Mon, 25 Nov 2019 08:35:51 +0000 Subject: [PATCH 23/84] current working version --- src/fastSmoothV2.cpp | 226 +++++++++++++++++++------------------------ 1 file changed, 98 insertions(+), 128 deletions(-) 
diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 871cb07b..46166ed5 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -7,7 +7,7 @@ #include using namespace Rcpp; - +//compute segment inline double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) { double length; @@ -23,8 +23,6 @@ inline double get_segment(double start,double end,double val1_start,double val1_ if (val1_end >= val2_end) { // They are in the same order no bowtie - // top triangle -// std::cout << "\n Path1"; topTriangle = 0.5*length*(val1_end-val1_start); topRectangle = length*(val1_start-val2_start); bottomTriangle = 0.5*length*(val2_end-val2_start); @@ -32,7 +30,6 @@ inline double get_segment(double start,double end,double val1_start,double val1_ } else { -// std::cout << "\n Path2"; //bowtie // lets make this really simple as the compiler // will combine the expressions as needed @@ -89,15 +86,26 @@ inline double get_segment(double start,double end,double val1_start,double val1_ } } -inline double get_segment_constrained(double start,double end, double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) +// cut down and compute segment +inline double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) { - //We have a valid range - double valStart1, valEnd1, valStart2, valEnd2; - valStart1 = seg1V1 + (seg1V2-seg1V1)*(start - seg1L1)/(seg1L2 - seg1L1); - valEnd1 = seg1V1 + (seg1V2-seg1V1)*(end - seg1L1)/(seg1L2 - seg1L1); - valStart2 = seg2V1 + (seg2V2-seg2V1)*(start - seg2L1)/(seg2L2 - seg2L1); - valEnd2 = seg2V1 + (seg2V2-seg2V1)*(end - seg2L1)/(seg2L2 - seg2L1); - return get_segment(start,end,valStart1,valEnd1,valStart2,valEnd2); + //We have a valid range + double valStart1, valEnd1, valStart2, valEnd2; + double start,end; + start = std::max(seg1L1,seg2L1); + end = 
std::min(seg1L2,seg2L2); + if (start0) - { - if (index2 Date: Mon, 25 Nov 2019 14:00:00 +0000 Subject: [PATCH 24/84] updated file naming --- src/{fastSmoothV2.cpp => fastSmoothV2_old.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{fastSmoothV2.cpp => fastSmoothV2_old.cpp} (100%) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2_old.cpp similarity index 100% rename from src/fastSmoothV2.cpp rename to src/fastSmoothV2_old.cpp From 9c99088e587504230f67a28e85fa300a2e4249b6 Mon Sep 17 00:00:00 2001 From: ande Date: Mon, 25 Nov 2019 14:01:00 +0000 Subject: [PATCH 25/84] new fast smooth file --- src/fastSmoothV2.cpp | 289 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 src/fastSmoothV2.cpp diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp new file mode 100644 index 00000000..587310c2 --- /dev/null +++ b/src/fastSmoothV2.cpp @@ -0,0 +1,289 @@ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] +#include +#include +#include +#include +#include +using namespace Rcpp; + +//compute segment +inline double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double length; + length = end - start; + double topTriangle; + double topRectangle; + double bottomTriangle; + double midPoint; + double midValue; + double res=0; + if (val1_start > val2_start) + { + if (val1_end >= val2_end) + { + // They are in the same order no bowtie + // seg1 is above seg2 + // triangle of seg1 + topTriangle = 0.5*length*(val1_end-val1_start); + // rectangle between seg1 and seg2 + topRectangle = length*(val1_start-val2_start); + // triangle of seg2 (to be removed) + bottomTriangle = 0.5*length*(val2_end-val2_start); + return topTriangle+topRectangle-bottomTriangle; + } + else + { + //bowtie + // lets make this really simple as the compiler + // will combine the expressions as needed + midPoint = (val1_start-val2_start)/((val2_end-val2_start) - (val1_end-val1_start)); + 
midValue = val1_start + midPoint*(val1_end-val1_start); + midPoint = midPoint*length; + + topTriangle = 0.5*midPoint*(midValue-val1_start); + topRectangle = midPoint*(val1_start-val2_start); + bottomTriangle = 0.5*midPoint*(midValue-val2_start); + + res = topTriangle+topRectangle-bottomTriangle; + + topTriangle = 0.5*(length-midPoint)*(val2_end-midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5*(length - midPoint)*(val1_end - midValue); + res += topTriangle+topRectangle-bottomTriangle; + return res; + } + } + else + { + if (val1_end > val2_end) + { + //bowtie + // Find the point where they cross. + // (Solution of linear equations) + midPoint = (val2_start-val1_start)/((val1_end-val1_start) - (val2_end-val2_start)); + midValue = val2_start + midPoint*(val2_end-val2_start); + midPoint = midPoint*length; + + topTriangle = 0.5*midPoint*(midValue-val2_start); + topRectangle = midPoint*(val2_start-val1_start); + bottomTriangle = 0.5*midPoint*(midValue-val1_start); + + res = topTriangle+topRectangle-bottomTriangle; + + topTriangle = 0.5*(length-midPoint)*(val1_end-midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5*(length - midPoint)*(val2_end - midValue); + res += topTriangle+topRectangle-bottomTriangle; + return res; + + } + else // same order + { + // seg2 is above seg1 + // Triangle seg2 above seg1 + topTriangle = 0.5*length*(val2_end-val2_start); + // rectangle between seg2 and seg1 + topRectangle = length*(val2_start-val1_start); + // Seg1 triangle to be removed + bottomTriangle = 0.5*length*(val1_end-val1_start); + return topTriangle+topRectangle-bottomTriangle; + } + } +} + +// cut down and compute segment +inline double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) +{ + //We have a valid range + double valStart1, valEnd1, valStart2, valEnd2; + double start,end; + start = 
std::max(seg1L1,seg2L1); + end = std::min(seg1L2,seg2L2); + if (startcurSeg2Loc3) + {break;} + } + } + else + { + // loc2 starts before loc1 so lets deal with those segments first + // Fix the position of Seg2 and then interate over Seg1 until we have all + // of the segments of Seg1 before Seg2 starts. + curSeg2Loc1=minLoc; + curSeg2Loc2=loc2[0]; + curSeg2Loc3=loc2[0]; + curSeg2Val1=0; + curSeg2Val2=0; + curSeg1Val2=0; + for (index1=0;index1curSeg1Loc3) + {break;} + } + } + // Add both the overlapping sections and the non overlapping section on the right + for (index1=0;index1curSeg1Loc3) + {break;} + res += get_double_segment_constrained(curSeg1Loc1,curSeg1Loc2,curSeg1Loc3,curSeg1Val1,curSeg1Val2,curSeg2Loc1,curSeg2Loc2,curSeg2Loc3,curSeg2Val1,curSeg2Val2); + } + } + return res; +} From 10d7de8a6453d61d3aff636376835ccc7b96b8d3 Mon Sep 17 00:00:00 2001 From: ande Date: Mon, 25 Nov 2019 14:01:51 +0000 Subject: [PATCH 26/84] changed name of old version --- src/fastSmoothV2_old.cpp | 42 +++++++++++++++++++---- tests/testthat/test_fastEMD.R | 64 ++++++++++++++++++++++++++++------- 2 files changed, 87 insertions(+), 19 deletions(-) diff --git a/src/fastSmoothV2_old.cpp b/src/fastSmoothV2_old.cpp index 46166ed5..8eb72d8c 100644 --- a/src/fastSmoothV2_old.cpp +++ b/src/fastSmoothV2_old.cpp @@ -118,7 +118,7 @@ inline double get_segment_constrained(double seg1L1, double seg1L2, double seg2L //' //' @export // [[Rcpp::export]] -double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,NumericVector loc2,NumericVector val2,double binWidth2) +double NetEmdSmoothV2_old(NumericVector loc1,NumericVector val1,double binWidth1,NumericVector loc2,NumericVector val2,double binWidth2) { int index1,index2; @@ -138,8 +138,6 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num // need to iterate through regions of constant gradient int i123; - double tempStart; - double tempEnd; double secondStart=-1; double maxLoc = 
std::max(loc1[loc1.size()-1] +binWidth1,loc2[loc2.size()-1]+binWidth2 ); @@ -178,9 +176,41 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num if (curSeg1Loc20.001) - { - browser() - } + # if (abs(res1-res2)>0.001) + # { + # browser() + # } # Swapped to percentage error expect_lt(abs(res1-res2),10**(-3)) @@ -290,7 +300,7 @@ test_that("many element test Mixture ", { test_that("3 element test w1=0.1, w2=0.2", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV2.cpp") + sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 0.1 w2 = 0.2 x1 <- c(1,2,3) @@ -318,4 +328,32 @@ test_that("Old failure case", { sq2 <- net_emd_single_pair(d1,d2,method='optimiseRonly',smoothing_window_width = 1) expect_lt(abs(sq1$min_emd-sq2$min_emd),10**(-4)) }) - \ No newline at end of file + + +test_that("Old failure case 2", { + x1 = c(0.2862492, 0.6917626) + v1 = c(0.6519357, 1.0000000) + w1 = 0.2027567 + x2 = c(0.9990626, 1.4882579) + v2 = c(0.6519357, 1.0000000) + w2 = 0.2445976 + f1 <- makeFunction(x1,v1,w1,x2,v2,w2) + res1 <- integrate(f1,min(min(x1),min(x2)),max(max(x1),max(x2))+max(w1,w2))[[1]] + res2 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) + expect_lt(abs(res2-res1),10**(-4)) +}) + + + +test_that("Old Failure Case 2 reverse", { + x2 = c(0.2862492, 0.6917626) + v2 = c(0.6519357, 1.0000000) + w2 = 0.2027567 + x1 = c(0.9990626, 1.4882579) + v1 = c(0.6519357, 1.0000000) + w1 = 0.2445976 + f1 <- makeFunction(x1,v1,w1,x2,v2,w2) + res1 <- integrate(f1,min(min(x1),min(x2)),max(max(x1),max(x2))+max(w1,w2))[[1]] + res2 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) + expect_lt(abs(res2-res1),10**(-4)) +}) \ No newline at end of file From cc76a3bf10ff10326a66e870dfb95312738a4296 Mon Sep 17 00:00:00 2001 From: ande Date: Mon, 25 Nov 2019 14:46:21 +0000 Subject: [PATCH 27/84] removed sourceCpp --- tests/testthat/test_fastEMD.R | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index 
e062ec4a..459b7aec 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -86,7 +86,6 @@ getVal <- function(x1,v1,w1,x2,v2,w2) test_that("3 element test", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") for (w1 in (1:10)/10.0) { for (w2 in (1:10)/10.0) @@ -108,7 +107,6 @@ test_that("3 element test", { test_that("2 element test w1=0.1, w2=0.2", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 0.1 w2 = 0.2 x1 <- c(1,2) @@ -164,7 +162,6 @@ test_that("1 element vs many test Mixture", { test_that("3 element test Mixture", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") for (w1 in (1:10)/10.0) { for (w2 in (1:10)/10.0) @@ -185,7 +182,6 @@ test_that("3 element test Mixture", { }) test_that("3 element test Mixture MidPoint", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 1 w2 = 1 for (v1_2 in (1:10)/10.0) @@ -209,7 +205,6 @@ test_that("3 element test Mixture MidPoint", { test_that("3 element test Mixture StartPoint", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 1 w2 = 1 for (v1_1 in (1:5)/10.0) @@ -233,7 +228,6 @@ test_that("3 element test Mixture StartPoint", { test_that("3 element test Mixture StartLoc", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 1 w2 = 1 for (x1_1 in (1:9)/10.0) @@ -257,7 +251,6 @@ test_that("3 element test Mixture StartLoc", { test_that("many element test Mixture ", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 1 w2 = 1 for (i in (2:10)*1) @@ -300,7 +293,6 @@ test_that("many element test Mixture ", { test_that("3 element test w1=0.1, w2=0.2", { - sourceCpp("~/Documents/network-comparison/src/fastSmoothV3.cpp") w1 = 0.1 w2 = 0.2 x1 <- c(1,2,3) From aa1f6e82496f2c14317bf3d318c3b2b416bdcde9 Mon Sep 17 00:00:00 2001 From: ande Date: Mon, 25 Nov 2019 15:24:40 +0000 Subject: [PATCH 28/84] fixed introduced bug --- src/fastSmoothV2.cpp | 8 ++++++-- 
tests/testthat/test_fastEMD.R | 10 ---------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 587310c2..b87cf747 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -237,7 +237,12 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num {break;} } } + std::cout << "res = " << res <<"\n"; // Add both the overlapping sections and the non overlapping section on the right + // Note we reiterate over the first few sections loc1 + // Could store where we are upto from above to save time + // Reset Val counter + curSeg1Val2=0; for (index1=0;index1curSeg1Loc3) {break;} - res += get_double_segment_constrained(curSeg1Loc1,curSeg1Loc2,curSeg1Loc3,curSeg1Val1,curSeg1Val2,curSeg2Loc1,curSeg2Loc2,curSeg2Loc3,curSeg2Val1,curSeg2Val2); } } return res; diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index 459b7aec..cd091401 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -94,7 +94,6 @@ test_that("3 element test", { v1 <- c(0.25,0.70,1.00) x2 <- c(1,2,3) v2 <- c(0.25,0.70,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] @@ -113,7 +112,6 @@ test_that("2 element test w1=0.1, w2=0.2", { v1 <- c(0.25,0.75) x2 <- c(1,2) v2 <- c(0.5,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[2],x1[2])+max(w1,w2),abs.tol=0.000000001)[[1]] res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) @@ -130,7 +128,6 @@ test_that("1 element at 0 vs many test Mixture", { v1 <- c(1.00) x2 <- 1:w1 v2 <- (1:w1)/w1 - print(x1) f1 <- makeFunction(x1,v1,1,x2,v2,1) res2 <- integrate(f1,0,w1+1,abs.tol=0.000000001)[[1]] @@ -149,7 +146,6 @@ test_that("1 element vs many test Mixture", { v1 <- c(1.00) x2 <- 1:w1 v2 <- (1:w1)/w1 - print(x1) f1 <- makeFunction(x1,v1,1,x2,v2,1) res2 <- integrate(f1,0,w1+1,abs.tol=0.000000001)[[1]] @@ -170,7 +166,6 @@ 
test_that("3 element test Mixture", { v1 <- c(0.65,0.70,1.00) x2 <- c(1,2,3) v2 <- c(0.25,0.70,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] @@ -192,7 +187,6 @@ test_that("3 element test Mixture MidPoint", { v1 <- c(0.1,v1_2,1.00) x2 <- c(1,2,3) v2 <- c(0.1,v2_2,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] @@ -215,7 +209,6 @@ test_that("3 element test Mixture StartPoint", { v1 <- c(v1_1,0.5,1.00) x2 <- c(1,2,3) v2 <- c(v2_1,0.5,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] @@ -238,7 +231,6 @@ test_that("3 element test Mixture StartLoc", { v1 <- c(0.25,0.5,1.00) x2 <- c(x2_1,2,4) v2 <- c(0.3,0.5,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.000000001)[[1]] @@ -269,7 +261,6 @@ test_that("many element test Mixture ", { w2 <- min(diff(x2))/y123 v2 <- cumsum(abs(rnorm(j))) v2 = v2/v2[length(v2)] - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) top1 = max(x2[length(x2)],x1[length(x1)])+max(w1,w2) bottom1 = min(x2[1],x1[1]) @@ -299,7 +290,6 @@ test_that("3 element test w1=0.1, w2=0.2", { v1 <- c(0.25,0.70,1.00) x2 <- c(1,2,3) v2 <- c(0.25,0.70,1.00) - print(x1) f1 <- makeFunction(x1,v1,w1,x2,v2,w2) res2 <- integrate(f1,0,max(x2[3],x1[3])+max(w1,w2),abs.tol=0.0000000001)[[1]] res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) From 6f7a515955db3a929ddb718437aa1b6c95d1e31a Mon Sep 17 00:00:00 2001 From: ande Date: Tue, 26 Nov 2019 12:21:13 +0000 Subject: [PATCH 29/84] updated the code --- R/emd.R | 24 +++++++---------------- src/fastSmoothV2.cpp | 45 ++++++++++++++++++++++---------------------- 2 files changed, 29 insertions(+), 40 deletions(-) diff --git a/R/emd.R b/R/emd.R index 3d486c81..99f6ec8d 100644 --- a/R/emd.R +++ b/R/emd.R @@ -91,27 +91,16 @@ 
min_emd_optimise_fast <- function(dhist1, dhist2) { binWidth1=dhist1$smoothing_window_width binWidth2=dhist2$smoothing_window_width count=0 + # Offset the histograms to make the alignments work + loc1Mod <- loc1 - binWidth1/2 + loc2Mod <- loc2 - binWidth2/2 # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) - min_offset <- min(dhist2$locations) - max(dhist1$locations) - max(binWidth1,binWidth2) - max_offset <- max(dhist2$locations) - min(dhist1$locations) + max(binWidth1,binWidth2) + min_offset <- min(loc2Mod) - max(loc1Mod) - max(binWidth1,binWidth2) + max_offset <- max(loc2Mod) - min(loc1Mod) + max(binWidth1,binWidth2) # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. This guards against a - # couple of issues that arise if the optimise range is exactly min_offset - # to max_offset - # 1) If lower and upper are equal, the optimise method will throw and error - # 2) It seems that optimise is not guaranteed to explore its lower and upper - # bounds, even in the case where one of them is the offset with minimum - # EMD - buffer <- 0.1 - min_offset <- min_offset - buffer - max_offset <- max_offset + buffer - - # Define a single parameter function to minimise emd as a function of offset - emd_offset <- function(offset) { - temp1<- NetEmdSmoothV2(loc1+offset,val1,binWidth1,loc2,val2,binWidth2) - temp1 - } + # couple of issues that arise if the op # Get solution from optimiser soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, tol = .Machine$double.eps*1000) @@ -165,6 +154,7 @@ min_emd_optimise <- function(dhist1, dhist2) { soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, tol = .Machine$double.eps*1000) + browser() # Return mnimum EMD and associated offset min_emd <- soln$objective min_offset <- soln$minimum diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 
b87cf747..07491846 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -130,7 +130,7 @@ double get_double_segment_constrained(double seg1Loc1, double seg1Loc2, double s // This could be easily special cased (saving ~1 if statements ). res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc1, seg2Loc2, seg1Val2, seg1Val2, seg2Val1, seg2Val2); - // compare the flat section with the linear section + // compare the flat section with the flat section // This could be easily special cased (saving ~2 if statements ). res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc2, seg2Loc3, seg1Val2, seg1Val2, seg2Val2, seg2Val2); return res; @@ -157,28 +157,28 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num double res=0; // Hist 1 - double curSeg1Loc1; - double curSeg1Loc2; - double curSeg1Loc3; - double curSeg1Val1; - double curSeg1Val2; + double curSeg1Loc1; // Start of the gradient section in Seg1 + double curSeg1Loc2; // End of the gradient section in Seg1 + double curSeg1Loc3; // End of the flat section in Seg1 + double curSeg1Val1; // Start value in Seg1 + double curSeg1Val2; // End value in Seg1 - // Hist 1 - double curSeg2Loc1; - double curSeg2Loc2; - double curSeg2Loc3; - double curSeg2Val1; - double curSeg2Val2; + // Hist 2 + double curSeg2Loc1; // Start of the gradient section in Seg2 + double curSeg2Loc2; // End of the gradient section in Seg2 + double curSeg2Loc3; // End of the flat section in Seg2 + double curSeg2Val1; // Start value in Seg2 + double curSeg2Val2; // End value in Seg2 - double tempDouble; - // need to iterate through regions of constant gradient + // Starting index for the second histogram double secondStart=0; - + + // Smallest and largest location values double maxLoc = std::max(loc1[loc1.size()-1] +binWidth1,loc2[loc2.size()-1]+binWidth2 ); double minLoc = std::min(loc1[0],loc2[0]); - // warning index1==-1 case is not well tested + // warning area before loc2[0] is not well tested // As 
values outside of the range appear to be zero // Dealing with segments which are to the left of the region covered by both @@ -189,10 +189,11 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num // Fix the position of Seg1 and then interate over Seg2 until we have all // of the segments of Seg2 before Seg1 starts. curSeg1Loc1=minLoc; - curSeg1Loc2=loc1[0]; + curSeg1Loc2=minLoc; curSeg1Loc3=loc1[0]; curSeg1Val1=0; curSeg1Val2=0; + // Set this value so we can update in the loop curSeg2Val2=0; for (index2=0;index2curSeg2Loc3) {break;} } @@ -217,10 +216,11 @@ double NetEmdSmoothV2(NumericVector loc1,NumericVector val1,double binWidth1,Num // Fix the position of Seg2 and then interate over Seg1 until we have all // of the segments of Seg1 before Seg2 starts. curSeg2Loc1=minLoc; - curSeg2Loc2=loc2[0]; + curSeg2Loc2=minLoc; curSeg2Loc3=loc2[0]; curSeg2Val1=0; curSeg2Val2=0; + // Set this value so we can update in the lopp curSeg1Val2=0; for (index1=0;index1curSeg1Loc3) + if (curSeg2Loc3>curSeg1Loc3) {break;} } } From 8000fa1fc6afda32899f4978f0725c693e7f9c79 Mon Sep 17 00:00:00 2001 From: ande Date: Tue, 26 Nov 2019 13:28:18 +0000 Subject: [PATCH 30/84] replace missing file --- R/emd.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/emd.R b/R/emd.R index 99f6ec8d..e9eee6f3 100644 --- a/R/emd.R +++ b/R/emd.R @@ -100,7 +100,20 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { max_offset <- max(loc2Mod) - min(loc1Mod) + max(binWidth1,binWidth2) # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. 
This guards against a - # couple of issues that arise if the op + # couple of issues that arise if the optimise range is exactly min_offset + # to max_offset + # 1) If lower and upper are equal, the optimise method will throw and error + # 2) It seems that optimise is not guaranteed to explore its lower and upper + # bounds, even in the case where one of them is the offset with minimum + # EMD + buffer <- 0.1 + min_offset <- min_offset - buffer + max_offset <- max_offset + buffer + # Define a single parameter function to minimise emd as a function of offset + emd_offset <- function(offset) { + temp1<- NetEmdSmoothV2(loc1Mod+offset,val1,binWidth1,loc2Mod,val2,binWidth2) + temp1 + } # Get solution from optimiser soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, tol = .Machine$double.eps*1000) From 1f5d9d7c57f069ebe65dfd46110c7a5c4b041240 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 07:19:21 +0100 Subject: [PATCH 31/84] Formatting and whitespace --- src/fastSmoothV2.cpp | 441 +++++++++++++++++++++++-------------------- 1 file changed, 235 insertions(+), 206 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 07491846..242a6b8a 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -5,141 +5,149 @@ #include #include #include + using namespace Rcpp; //compute segment -inline double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +inline double get_segment(double start, double end, double val1_start, + double val1_end, double val2_start, double val2_end) { - double length; - length = end - start; + const double length = end - start; + double topTriangle; double topRectangle; double bottomTriangle; double midPoint; double midValue; - double res=0; - if (val1_start > val2_start) - { - if (val1_end >= val2_end) - { + double res = 0; + + if (val1_start > val2_start) { + if (val1_end >= val2_end) { // They are in the same order no 
bowtie - // seg1 is above seg2 + // seg1 is above seg2 // triangle of seg1 topTriangle = 0.5*length*(val1_end-val1_start); // rectangle between seg1 and seg2 topRectangle = length*(val1_start-val2_start); // triangle of seg2 (to be removed) - bottomTriangle = 0.5*length*(val2_end-val2_start); + bottomTriangle = 0.5*length*(val2_end-val2_start); return topTriangle+topRectangle-bottomTriangle; } - else - { - //bowtie - // lets make this really simple as the compiler - // will combine the expressions as needed - midPoint = (val1_start-val2_start)/((val2_end-val2_start) - (val1_end-val1_start)); - midValue = val1_start + midPoint*(val1_end-val1_start); - midPoint = midPoint*length; - - topTriangle = 0.5*midPoint*(midValue-val1_start); - topRectangle = midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*midPoint*(midValue-val2_start); - - res = topTriangle+topRectangle-bottomTriangle; - - topTriangle = 0.5*(length-midPoint)*(val2_end-midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*(length - midPoint)*(val1_end - midValue); - res += topTriangle+topRectangle-bottomTriangle; - return res; + else { + //bowtie + // lets make this really simple as the compiler + // will combine the expressions as needed + midPoint = (val1_start - val2_start) / + ((val2_end - val2_start) - (val1_end - val1_start)); + + midValue = val1_start + midPoint * (val1_end - val1_start); + + midPoint = midPoint * length; + + topTriangle = 0.5 * midPoint * (midValue - val1_start); + topRectangle = midPoint * (val1_start - val2_start); + bottomTriangle = 0.5 * midPoint * (midValue - val2_start); + + res = topTriangle + topRectangle - bottomTriangle; + + topTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); + res += topTriangle + topRectangle - bottomTriangle; + return res; } } - else - { - if (val1_end > val2_end) - { - 
//bowtie - // Find the point where they cross. - // (Solution of linear equations) - midPoint = (val2_start-val1_start)/((val1_end-val1_start) - (val2_end-val2_start)); - midValue = val2_start + midPoint*(val2_end-val2_start); - midPoint = midPoint*length; - - topTriangle = 0.5*midPoint*(midValue-val2_start); - topRectangle = midPoint*(val2_start-val1_start); - bottomTriangle = 0.5*midPoint*(midValue-val1_start); - - res = topTriangle+topRectangle-bottomTriangle; - - topTriangle = 0.5*(length-midPoint)*(val1_end-midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*(length - midPoint)*(val2_end - midValue); - res += topTriangle+topRectangle-bottomTriangle; - return res; - - } - else // same order - { - // seg2 is above seg1 - // Triangle seg2 above seg1 - topTriangle = 0.5*length*(val2_end-val2_start); - // rectangle between seg2 and seg1 - topRectangle = length*(val2_start-val1_start); - // Seg1 triangle to be removed - bottomTriangle = 0.5*length*(val1_end-val1_start); - return topTriangle+topRectangle-bottomTriangle; - } + else { + if (val1_end > val2_end) { + //bowtie + // Find the point where they cross. 
+ // (Solution of linear equations) + midPoint = (val2_start - val1_start) / + ((val1_end - val1_start) - (val2_end - val2_start)); + + midValue = val2_start + midPoint * (val2_end - val2_start); + midPoint = midPoint * length; + + topTriangle = 0.5 * midPoint * (midValue - val2_start); + topRectangle = midPoint * (val2_start - val1_start); + bottomTriangle = 0.5 * midPoint * (midValue - val1_start); + + res = topTriangle + topRectangle - bottomTriangle; + + topTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); + res += topTriangle + topRectangle - bottomTriangle; + return res; } + else { // same order + // seg2 is above seg1 + // Triangle seg2 above seg1 + topTriangle = 0.5 * length * (val2_end - val2_start); + // rectangle between seg2 and seg1 + topRectangle = length * (val2_start - val1_start); + // Seg1 triangle to be removed + bottomTriangle = 0.5 * length * (val1_end - val1_start); + return topTriangle + topRectangle - bottomTriangle; + } + } } // cut down and compute segment -inline double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) +inline double get_segment_constrained(double seg1L1, double seg1L2, + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2) { - //We have a valid range - double valStart1, valEnd1, valStart2, valEnd2; - double start,end; - start = std::max(seg1L1,seg2L1); - end = std::min(seg1L2,seg2L2); - if (startcurSeg2Loc3) - {break;} + if (loc2[0] < loc1[0]) { + // loc2 starts before loc1 so lets deal with those segments first + // Fix the position of Seg1 and then interate over Seg2 until we have all + // of the segments of Seg2 before Seg1 starts. 
+ curSeg1Loc1 = minLoc; + curSeg1Loc2 = minLoc; + curSeg1Loc3 = loc1[0]; + curSeg1Val1 = 0; + curSeg1Val2 = 0; + + // Set this value so we can update in the loop + curSeg2Val2 = 0; + for (index2 = 0; index2 < loc2.size(); index2++) { + curSeg2Loc1 = loc2[index2]; + curSeg2Loc2 = loc2[index2] + binWidth2; + if (index2 == loc2.size() - 1) { + curSeg2Loc3 = maxLoc; + } + else { + curSeg2Loc3 = loc2[index2 + 1]; } + curSeg2Val1 = curSeg2Val2; + curSeg2Val2 = val2[index2]; + res += get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + + if (curSeg1Loc1 > curSeg2Loc3) { + break; + } + } } else { - // loc2 starts before loc1 so lets deal with those segments first - // Fix the position of Seg2 and then interate over Seg1 until we have all - // of the segments of Seg1 before Seg2 starts. - curSeg2Loc1=minLoc; - curSeg2Loc2=minLoc; - curSeg2Loc3=loc2[0]; - curSeg2Val1=0; - curSeg2Val2=0; - // Set this value so we can update in the lopp - curSeg1Val2=0; - for (index1=0;index1curSeg1Loc3) - {break;} + // loc2 starts before loc1 so lets deal with those segments first + // Fix the position of Seg2 and then interate over Seg1 until we have all + // of the segments of Seg1 before Seg2 starts. 
+ curSeg2Loc1 = minLoc; + curSeg2Loc2 = minLoc; + curSeg2Loc3 = loc2[0]; + curSeg2Val1 = 0; + curSeg2Val2 = 0; + + // Set this value so we can update in the lopp + curSeg1Val2 = 0; + for (index1 = 0; index1 < loc1.size(); index1++) { + curSeg1Loc1 = loc1[index1]; + curSeg1Loc2 = loc1[index1] + binWidth1; + if (index1 == loc1.size() - 1) { + curSeg1Loc3 = maxLoc; } + else { + curSeg1Loc3 = loc1[index1 + 1]; + } + curSeg1Val1 = curSeg1Val2; + curSeg1Val2 = val1[index1]; + res += get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + if (curSeg2Loc1 > curSeg1Loc3) { + break; + } + } } + // Add both the overlapping sections and the non overlapping section on the right // Note we reiterate over the first few sections loc1 // Could store where we are upto from above to save time // Reset Val counter - curSeg1Val2=0; - for (index1=0;index1curSeg1Loc3) - {break;} - } + curSeg1Val2 = 0; + for (index1 = 0; index1 < loc1.size(); index1++) { + // Get the three relevant locations + // Start; end of linear section; end of flat section + curSeg1Loc1 = loc1[index1]; + curSeg1Loc2 = loc1[index1] + binWidth1; + // could pull this check outside of the loop with final case not sure if worth it + if (index1 == loc1.size() - 1) { + curSeg1Loc3 = maxLoc; + } + else { + curSeg1Loc3 = loc1[index1 + 1]; + } + // Update value to the start and end of the current section + curSeg1Val1 = curSeg1Val2; + curSeg1Val2 = val1[index1]; + // Setting up the previous value for the next loop + // Could replace this loop with a while + // but if so would need to be careful about overlaps + if (secondStart == 0) { + curSeg2Val2 = 0; + } + else { + curSeg2Val2 = val2[secondStart - 1]; + } + for (index2 = secondStart; index2 < loc2.size(); index2++) { + // Construct 3 sections for second seg. 
+ curSeg2Loc1=loc2[index2]; + curSeg2Loc2=loc2[index2] + binWidth2; + if (index2 == loc2.size() - 1) { + curSeg2Loc3 = maxLoc; + } + else { + curSeg2Loc3 = loc2[index2 + 1]; + } + //update values + curSeg2Val1 = curSeg2Val2; + curSeg2Val2 = val2[index2]; + // If this section is behind Seg1 + // Do not consider again + if (curSeg2Loc3 < curSeg1Loc1) { + secondStart = index2 + 1; + continue; + } + // If current Seg2 is beyond Seg1 break out of loop + res += get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + + if (curSeg2Loc3 > curSeg1Loc3) { + break; + } + } } return res; } From 91c9561da39de69e64c7e9a755d5e76e63242abc Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 08:09:49 +0100 Subject: [PATCH 32/84] Formatting --- src/fastSmoothV2.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 242a6b8a..d222256d 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -26,12 +26,12 @@ inline double get_segment(double start, double end, double val1_start, // They are in the same order no bowtie // seg1 is above seg2 // triangle of seg1 - topTriangle = 0.5*length*(val1_end-val1_start); + topTriangle = 0.5*length * (val1_end - val1_start); // rectangle between seg1 and seg2 - topRectangle = length*(val1_start-val2_start); + topRectangle = length * (val1_start - val2_start); // triangle of seg2 (to be removed) - bottomTriangle = 0.5*length*(val2_end-val2_start); - return topTriangle+topRectangle-bottomTriangle; + bottomTriangle = 0.5 * length * (val2_end - val2_start); + return topTriangle + topRectangle - bottomTriangle; } else { //bowtie From 9151e520eb2eceb39ae1e6e84f6ebf500c2bd2c0 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 09:20:10 +0100 Subject: [PATCH 33/84] Local for-loop index --- src/fastSmoothV2.cpp | 52 
+++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index d222256d..33a76c44 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -161,8 +161,6 @@ double get_double_segment_constrained( double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2) { - int index1, index2; - double res = 0; // Hist 1 @@ -206,17 +204,17 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, // Set this value so we can update in the loop curSeg2Val2 = 0; - for (index2 = 0; index2 < loc2.size(); index2++) { - curSeg2Loc1 = loc2[index2]; - curSeg2Loc2 = loc2[index2] + binWidth2; - if (index2 == loc2.size() - 1) { + for (int index = 0; index < loc2.size(); index++) { + curSeg2Loc1 = loc2[index]; + curSeg2Loc2 = loc2[index] + binWidth2; + if (index == loc2.size() - 1) { curSeg2Loc3 = maxLoc; } else { - curSeg2Loc3 = loc2[index2 + 1]; + curSeg2Loc3 = loc2[index + 1]; } curSeg2Val1 = curSeg2Val2; - curSeg2Val2 = val2[index2]; + curSeg2Val2 = val2[index]; res += get_double_segment_constrained( curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); @@ -239,17 +237,17 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, // Set this value so we can update in the lopp curSeg1Val2 = 0; - for (index1 = 0; index1 < loc1.size(); index1++) { - curSeg1Loc1 = loc1[index1]; - curSeg1Loc2 = loc1[index1] + binWidth1; - if (index1 == loc1.size() - 1) { + for (int index = 0; index < loc1.size(); index++) { + curSeg1Loc1 = loc1[index]; + curSeg1Loc2 = loc1[index] + binWidth1; + if (index == loc1.size() - 1) { curSeg1Loc3 = maxLoc; } else { - curSeg1Loc3 = loc1[index1 + 1]; + curSeg1Loc3 = loc1[index + 1]; } curSeg1Val1 = curSeg1Val2; - curSeg1Val2 = val1[index1]; + curSeg1Val2 = val1[index]; 
res += get_double_segment_constrained( curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); @@ -264,21 +262,21 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, // Could store where we are upto from above to save time // Reset Val counter curSeg1Val2 = 0; - for (index1 = 0; index1 < loc1.size(); index1++) { + for (int index = 0; index < loc1.size(); index++) { // Get the three relevant locations // Start; end of linear section; end of flat section - curSeg1Loc1 = loc1[index1]; - curSeg1Loc2 = loc1[index1] + binWidth1; + curSeg1Loc1 = loc1[index]; + curSeg1Loc2 = loc1[index] + binWidth1; // could pull this check outside of the loop with final case not sure if worth it - if (index1 == loc1.size() - 1) { + if (index == loc1.size() - 1) { curSeg1Loc3 = maxLoc; } else { - curSeg1Loc3 = loc1[index1 + 1]; + curSeg1Loc3 = loc1[index + 1]; } // Update value to the start and end of the current section curSeg1Val1 = curSeg1Val2; - curSeg1Val2 = val1[index1]; + curSeg1Val2 = val1[index]; // Setting up the previous value for the next loop // Could replace this loop with a while // but if so would need to be careful about overlaps @@ -288,23 +286,23 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, else { curSeg2Val2 = val2[secondStart - 1]; } - for (index2 = secondStart; index2 < loc2.size(); index2++) { + for (int index = secondStart; index < loc2.size(); index++) { // Construct 3 sections for second seg. 
- curSeg2Loc1=loc2[index2]; - curSeg2Loc2=loc2[index2] + binWidth2; - if (index2 == loc2.size() - 1) { + curSeg2Loc1 = loc2[index]; + curSeg2Loc2 = loc2[index] + binWidth2; + if (index == loc2.size() - 1) { curSeg2Loc3 = maxLoc; } else { - curSeg2Loc3 = loc2[index2 + 1]; + curSeg2Loc3 = loc2[index + 1]; } //update values curSeg2Val1 = curSeg2Val2; - curSeg2Val2 = val2[index2]; + curSeg2Val2 = val2[index]; // If this section is behind Seg1 // Do not consider again if (curSeg2Loc3 < curSeg1Loc1) { - secondStart = index2 + 1; + secondStart = index + 1; continue; } // If current Seg2 is beyond Seg1 break out of loop From b35563eb90e4dfe050e4970d3eb5fcdca1d6121c Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 10:24:46 +0100 Subject: [PATCH 34/84] Combine some conditional cases in get_segment --- src/fastSmoothV2.cpp | 119 ++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 69 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 33a76c44..8955f429 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -8,7 +8,36 @@ using namespace Rcpp; -//compute segment +int signum(double val) { + return (0.0 < val) - (val < 0.0); +} + +inline double bowtie_area(double length, double val1_start, double val1_end, + double val2_start, double val2_end) +{ + double midPoint = (val1_start - val2_start) / + ((val2_end - val2_start) - (val1_end - val1_start)); + + const double midValue = val1_start + midPoint * (val1_end - val1_start); + + midPoint = midPoint * length; + + double topTriangle = 0.5 * midPoint * (midValue - val1_start); + double topRectangle = midPoint * (val1_start - val2_start); + double bottomTriangle = 0.5 * midPoint * (midValue - val2_start); + + double res = topTriangle + topRectangle - bottomTriangle; + + topTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5 * (length - midPoint) * (val1_end - 
midValue); + + res += topTriangle + topRectangle - bottomTriangle; + return res; +} + +// Compute the unsigned area between two line segments +// assumes that val1_end > val1_start and val2_end > val2_start inline double get_segment(double start, double end, double val1_start, double val1_end, double val2_start, double val2_end) { @@ -21,75 +50,27 @@ inline double get_segment(double start, double end, double val1_start, double midValue; double res = 0; - if (val1_start > val2_start) { - if (val1_end >= val2_end) { - // They are in the same order no bowtie - // seg1 is above seg2 - // triangle of seg1 - topTriangle = 0.5*length * (val1_end - val1_start); - // rectangle between seg1 and seg2 - topRectangle = length * (val1_start - val2_start); - // triangle of seg2 (to be removed) - bottomTriangle = 0.5 * length * (val2_end - val2_start); - return topTriangle + topRectangle - bottomTriangle; - } - else { - //bowtie - // lets make this really simple as the compiler - // will combine the expressions as needed - midPoint = (val1_start - val2_start) / - ((val2_end - val2_start) - (val1_end - val1_start)); - - midValue = val1_start + midPoint * (val1_end - val1_start); - - midPoint = midPoint * length; - - topTriangle = 0.5 * midPoint * (midValue - val1_start); - topRectangle = midPoint * (val1_start - val2_start); - bottomTriangle = 0.5 * midPoint * (midValue - val2_start); - - res = topTriangle + topRectangle - bottomTriangle; - - topTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); - res += topTriangle + topRectangle - bottomTriangle; - return res; - } + bool both_differences_positive = val1_start > val2_start && val1_end >= val2_end; + bool both_differences_negative = val1_start <= val2_start && val1_end <= val2_end; + + if (both_differences_positive || both_differences_negative) + { + // They are in the same order: no bowtie + // 
triangle of seg1 + topTriangle = 0.5 * length * (val1_end - val1_start); + // rectangle between seg1 and seg2 + topRectangle = length * (val1_start - val2_start); + // triangle of seg2 (to be removed) + bottomTriangle = 0.5 * length * (val2_end - val2_start); + + const double sign = both_differences_positive?1.0:-1.0; + return sign * (topTriangle + topRectangle - bottomTriangle); } - else { - if (val1_end > val2_end) { - //bowtie - // Find the point where they cross. - // (Solution of linear equations) - midPoint = (val2_start - val1_start) / - ((val1_end - val1_start) - (val2_end - val2_start)); - - midValue = val2_start + midPoint * (val2_end - val2_start); - midPoint = midPoint * length; - - topTriangle = 0.5 * midPoint * (midValue - val2_start); - topRectangle = midPoint * (val2_start - val1_start); - bottomTriangle = 0.5 * midPoint * (midValue - val1_start); - - res = topTriangle + topRectangle - bottomTriangle; - - topTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); - res += topTriangle + topRectangle - bottomTriangle; - return res; - } - else { // same order - // seg2 is above seg1 - // Triangle seg2 above seg1 - topTriangle = 0.5 * length * (val2_end - val2_start); - // rectangle between seg2 and seg1 - topRectangle = length * (val2_start - val1_start); - // Seg1 triangle to be removed - bottomTriangle = 0.5 * length * (val1_end - val1_start); - return topTriangle + topRectangle - bottomTriangle; - } + else if (val1_start > val2_start) { // bowtie, first case + return bowtie_area(length, val1_start, val1_end, val2_start, val2_end); + } + else { // bowtie, second case + return bowtie_area(length, val2_start, val2_end, val1_start, val1_end); } } From 0ca7c893be926a1365d145e3cbec67e8629ef35e Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 10:59:53 +0100 Subject: [PATCH 35/84] Combine cases for 
handling the leftmost segments (NetEmdSmoothV2) --- src/fastSmoothV2.cpp | 115 ++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 62 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 8955f429..604ea673 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -129,6 +129,57 @@ double get_double_segment_constrained( } +// Dealing with segments which are to the left of the region covered by both +// segs +inline double leftmost_segments(const NumericVector& loc1, + const NumericVector& loc2, + const NumericVector& val1, + const NumericVector& val2, + double binWidth1, + double maxLoc) +{ + double res = 0.0; + + assert(loc1[0] < loc2[0]); + + // Fix the position of Seg2 and then interate over Seg1 until we have all + // of the segments of Seg1 before Seg2 starts. + + // are these all used? + + double curSeg2Loc1 = loc1[0]; + double curSeg2Loc2 = loc1[0]; + double curSeg2Loc3 = loc2[0]; + double curSeg2Val1 = 0; + double curSeg2Val2 = 0; + + // Set this value so we can update in the lopp + double curSeg1Val2 = 0; + for (int index = 0; index < loc1.size(); index++) { + double curSeg1Loc1 = loc1[index]; + double curSeg1Loc2 = loc1[index] + binWidth1; + double curSeg1Loc3; + if (index == loc1.size() - 1) { + curSeg1Loc3 = maxLoc; + } + else { + curSeg1Loc3 = loc1[index + 1]; + } + double curSeg1Val1 = curSeg1Val2; + curSeg1Val2 = val1[index]; + res += get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + + if (curSeg2Loc1 > curSeg1Loc3) { + break; + } + } + + return res; +} + + //' @title //' Compute EMD ////' @@ -166,76 +217,16 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, double maxLoc = std::max(loc1[loc1.size()-1] + binWidth1, loc2[loc2.size()-1] + binWidth2); - double minLoc = std::min(loc1[0], loc2[0]); // warning area before loc2[0] is not well tested // As 
values outside of the range appear to be zero - // Dealing with segments which are to the left of the region covered by both - // segs if (loc2[0] < loc1[0]) { - // loc2 starts before loc1 so lets deal with those segments first - // Fix the position of Seg1 and then interate over Seg2 until we have all - // of the segments of Seg2 before Seg1 starts. - curSeg1Loc1 = minLoc; - curSeg1Loc2 = minLoc; - curSeg1Loc3 = loc1[0]; - curSeg1Val1 = 0; - curSeg1Val2 = 0; - - // Set this value so we can update in the loop - curSeg2Val2 = 0; - for (int index = 0; index < loc2.size(); index++) { - curSeg2Loc1 = loc2[index]; - curSeg2Loc2 = loc2[index] + binWidth2; - if (index == loc2.size() - 1) { - curSeg2Loc3 = maxLoc; - } - else { - curSeg2Loc3 = loc2[index + 1]; - } - curSeg2Val1 = curSeg2Val2; - curSeg2Val2 = val2[index]; - res += get_double_segment_constrained( - curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, - curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); - - if (curSeg1Loc1 > curSeg2Loc3) { - break; - } - } + res += leftmost_segments(loc2, loc1, val2, val1, binWidth2, maxLoc); } else { - // loc2 starts before loc1 so lets deal with those segments first - // Fix the position of Seg2 and then interate over Seg1 until we have all - // of the segments of Seg1 before Seg2 starts. 
- curSeg2Loc1 = minLoc; - curSeg2Loc2 = minLoc; - curSeg2Loc3 = loc2[0]; - curSeg2Val1 = 0; - curSeg2Val2 = 0; - - // Set this value so we can update in the lopp - curSeg1Val2 = 0; - for (int index = 0; index < loc1.size(); index++) { - curSeg1Loc1 = loc1[index]; - curSeg1Loc2 = loc1[index] + binWidth1; - if (index == loc1.size() - 1) { - curSeg1Loc3 = maxLoc; - } - else { - curSeg1Loc3 = loc1[index + 1]; - } - curSeg1Val1 = curSeg1Val2; - curSeg1Val2 = val1[index]; - res += get_double_segment_constrained( - curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, - curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); - if (curSeg2Loc1 > curSeg1Loc3) { - break; - } - } + res += leftmost_segments(loc1, loc2, val1, val2, binWidth1, maxLoc); } // Add both the overlapping sections and the non overlapping section on the right From c36c3f66d1e5f7fed614ca23dacc95d02810a87e Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 15:39:04 +0100 Subject: [PATCH 36/84] Remove unused function --- src/fastSmoothV2.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 604ea673..0fa5f39d 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -8,10 +8,6 @@ using namespace Rcpp; -int signum(double val) { - return (0.0 < val) - (val < 0.0); -} - inline double bowtie_area(double length, double val1_start, double val1_end, double val2_start, double val2_end) { From 4af238d7c9a6280f907758ebeabd9596d35ba63e Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 17 Apr 2020 15:50:06 +0100 Subject: [PATCH 37/84] Rename loop counters --- src/fastSmoothV2.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 0fa5f39d..3598c0f5 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -230,21 +230,21 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, // Could 
store where we are upto from above to save time // Reset Val counter curSeg1Val2 = 0; - for (int index = 0; index < loc1.size(); index++) { + for (int index1 = 0; index1 < loc1.size(); index1++) { // Get the three relevant locations // Start; end of linear section; end of flat section - curSeg1Loc1 = loc1[index]; - curSeg1Loc2 = loc1[index] + binWidth1; + curSeg1Loc1 = loc1[index1]; + curSeg1Loc2 = loc1[index1] + binWidth1; // could pull this check outside of the loop with final case not sure if worth it - if (index == loc1.size() - 1) { + if (index1 == loc1.size() - 1) { curSeg1Loc3 = maxLoc; } else { - curSeg1Loc3 = loc1[index + 1]; + curSeg1Loc3 = loc1[index1 + 1]; } // Update value to the start and end of the current section curSeg1Val1 = curSeg1Val2; - curSeg1Val2 = val1[index]; + curSeg1Val2 = val1[index1]; // Setting up the previous value for the next loop // Could replace this loop with a while // but if so would need to be careful about overlaps @@ -254,23 +254,23 @@ double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, else { curSeg2Val2 = val2[secondStart - 1]; } - for (int index = secondStart; index < loc2.size(); index++) { + for (int index2 = secondStart; index2 < loc2.size(); index2++) { // Construct 3 sections for second seg. 
- curSeg2Loc1 = loc2[index]; - curSeg2Loc2 = loc2[index] + binWidth2; - if (index == loc2.size() - 1) { + curSeg2Loc1 = loc2[index2]; + curSeg2Loc2 = loc2[index2] + binWidth2; + if (index2 == loc2.size() - 1) { curSeg2Loc3 = maxLoc; } else { - curSeg2Loc3 = loc2[index + 1]; + curSeg2Loc3 = loc2[index2 + 1]; } //update values curSeg2Val1 = curSeg2Val2; - curSeg2Val2 = val2[index]; + curSeg2Val2 = val2[index2]; // If this section is behind Seg1 // Do not consider again if (curSeg2Loc3 < curSeg1Loc1) { - secondStart = index + 1; + secondStart = index2 + 1; continue; } // If current Seg2 is beyond Seg1 break out of loop From 3feb0ae0290de2bf6164bcd81fe31c29d2f2cfa7 Mon Sep 17 00:00:00 2001 From: ande Date: Fri, 17 Apr 2020 18:19:31 +0100 Subject: [PATCH 38/84] added tests --- src/RcppExports.cpp | 19 +++- src/fastSmooth.cpp | 57 +++++++--- src/fastSmoothV2.cpp | 8 +- src/test_emd_fast_smooth.cpp | 202 +++++++++++++++++++++++++++++++++++ 4 files changed, 265 insertions(+), 21 deletions(-) create mode 100644 src/test_emd_fast_smooth.cpp diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 99086611..933f6b71 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -62,6 +62,22 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// NetEmdSmoothV2_old +double NetEmdSmoothV2_old(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); +RcppExport SEXP _netdist_NetEmdSmoothV2_old(SEXP loc1SEXP, SEXP val1SEXP, SEXP binWidth1SEXP, SEXP loc2SEXP, SEXP val2SEXP, SEXP binWidth2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type loc1(loc1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type val1(val1SEXP); + Rcpp::traits::input_parameter< double >::type binWidth1(binWidth1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type loc2(loc2SEXP); + Rcpp::traits::input_parameter< NumericVector >::type 
val2(val2SEXP); + Rcpp::traits::input_parameter< double >::type binWidth2(binWidth2SEXP); + rcpp_result_gen = Rcpp::wrap(NetEmdSmoothV2_old(loc1, val1, binWidth1, loc2, val2, binWidth2)); + return rcpp_result_gen; +END_RCPP +} RcppExport SEXP run_testthat_tests(); @@ -70,7 +86,8 @@ static const R_CallMethodDef CallEntries[] = { {"_netdist_emd_fast_no_smoothing", (DL_FUNC) &_netdist_emd_fast_no_smoothing, 4}, {"_netdist_NetEmdSmooth", (DL_FUNC) &_netdist_NetEmdSmooth, 6}, {"_netdist_NetEmdSmoothV2", (DL_FUNC) &_netdist_NetEmdSmoothV2, 6}, - {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, + {"_netdist_NetEmdSmoothV2_old", (DL_FUNC) &_netdist_NetEmdSmoothV2_old, 6}, + {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, {NULL, NULL, 0} }; diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp index b2d43847..c927547b 100644 --- a/src/fastSmooth.cpp +++ b/src/fastSmooth.cpp @@ -59,7 +59,31 @@ double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,Numer res=0; // set as 0 as at bottom of hist curStartVal=0; - + + std::cout << "Loc1: "; + for (i=0;i val1_start and val2_end > val2_start -inline double get_segment(double start, double end, double val1_start, +double get_segment(double start, double end, double val1_start, double val1_end, double val2_start, double val2_end) { const double length = end - start; @@ -71,7 +71,7 @@ inline double get_segment(double start, double end, double val1_start, } // cut down and compute segment -inline double get_segment_constrained(double seg1L1, double seg1L2, +double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) @@ -127,7 +127,7 @@ double get_double_segment_constrained( // Dealing with segments which are to the left of the region covered by both // segs -inline double leftmost_segments(const NumericVector& loc1, +double leftmost_segments(const NumericVector& loc1, const NumericVector& loc2, const 
NumericVector& val1, const NumericVector& val2, diff --git a/src/test_emd_fast_smooth.cpp b/src/test_emd_fast_smooth.cpp new file mode 100644 index 00000000..508be859 --- /dev/null +++ b/src/test_emd_fast_smooth.cpp @@ -0,0 +1,202 @@ +/* + * This file uses the Catch unit testing library, alongside + * testthat's simple bindings, to test a C++ function. + * + * This file should begin with `test` and be in the `src/` folder. + * `LinkingTo: testthat` must also be within the DESCRIPTION file. + */ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] + +// All test files should include the +// header file. +#include "fastSmoothV2.h" +#include "emd_fast_no_smoothing.h" +#include +#include +#include + +// Helper function to test tolerance +bool within_toleranceV2(double actual, double expected, double tolerance) { + if(actual > expected) { + return ((actual - expected) <= tolerance); + } + else { + return ((expected - actual) <= tolerance); + } +} + +double simpleSlowArea(double startx,double endx,double starty1,double endy1,double starty2,double endy2) +{ + // Making this step size smaller + double step = (endx-startx)/10000000.0; + double curX; + double curY1; + double curY2; + double res = 0; + for (int i=0;i<10000000;i++) + { + curX = startx + i*step; + curY1 = starty1 +(endy1-starty1)*i/10000000.0; + curY2 = starty2 +(endy2-starty2)*i/10000000.0; + res += step*std::abs(curY1-curY2); + } + return res; +} + + +context("emd_fast_smoothing segment") { + test_that("Test Me") { + double start=0; + double end=1; + double val1_start=0; + double val1_end=1; + double val2_start=1; + double val2_end=0; + double tempVal1; + double tempVal2; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + within_toleranceV2(tempVal1,tempVal2,0.0001); + val1_start=2; + val1_end=1; + val2_start=2; + val2_end=0; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + 
tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + within_toleranceV2(tempVal1,tempVal2,0.0001); + val1_start=2; + val1_end=0; + val2_start=2; + val2_end=1; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + within_toleranceV2(tempVal1,tempVal2,0.0001); + val1_start=0; + val1_end=0; + val2_start=1; + val2_end=1; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + within_toleranceV2(tempVal1,tempVal2,0.0001); + val1_start=1; + val1_end=1; + val2_start=0; + val2_end=0; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + within_toleranceV2(tempVal1,tempVal2,0.0001); + // for (int startI = 0; startI<10;startI++) + // { + // start = (double)startI/10.0; + // for (int endI = startI+1; endI<10;endI++) + // { + // end = (double)endI/10.0; + // for (int val1_startI = 0; val1_startI<10;val1_startI++) + // { + // val1_start = (double)val1_startI/10.0; + // for (int val2_startI = 0; val2_startI<10;val2_startI++) + // { + // val2_start = (double)val2_startI/10.0; + // for (int val1_endI = 0; val1_endI<10;val1_endI++) + // { + // val1_end = (double)val1_endI/10.0; + // for (int val2_endI = 0; val2_endI<10;val2_endI++) + // { + // val2_end = (double)val2_endI/10.0; + // tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + // tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + // expect_true(tempVal1==tempVal2); + // } + // } + // } + // } + // } + // } + + +} +} + + +// Initialize a unit test context. This is similar to how you +// might begin an R test file with 'context()', expect the +// associated context should be wrapped in braced. 
+context("emd_fast_no_smoothing v2") { + + // Test Kahan summation helper function + // We also verify that the example data used to test the Kahan summation + // results in an unacceptably large error when using naiive summation. + test_that("mk2 Kahan summation works when naiive summation fails") { + // NOTE: If this test fails when the `add_element_kahan()` function has not + // been changed, there is a small chance that it may be that the package is + // being compiled with the `-ffast-math` compiler flag set (or an aggressive + // optimisation level that sets the fast math flag). If this flag is set, + // the Kahan compensation being optimised away. + + // Set up suitable test data + // ========================= + // NOTE: It is surprisingly easy to come up with test data where the naiive + // summation works fine, so the test to validate that the naiive sum results + // in an unnacceptably high error is important. I think this is for the + // following reasons. + // (A) If the start number is too large in comparison to the expected sum of + // the elements in the small number vector, then the floating point + // representation of the expected total will be the same as that of the + // start number. + // (B) If the start number is too small in comparison to the elements in the + // small number vector, we don't run into the floating point + // representation issue when we add each element to the running total. + // NOTE: Appveyor tests with both 32 and 64 bit builds of R (i386 and ix64 + // respectively). Therefore values have been chosen that all fall + // within a 32-bit floating point range (including the expected total) + // while still causing the naiive summation to fail with 64-bit R. + // + // ========================= + // 1. 
Define all test data components as powers of a common base to make + // it easy to accurately calculate the expected sum without adding any small + // numbers together + double start_num = 1125899906842624.0; // 2^50 = 1125899906842624 + double element_value = 0.03125; // 2^-5 = 0.03125 + double num_elements = 4096; // 2^12 = 4096 + double expected_total = 1125899906842752.0; // 2^50 + 2^7 + + std::vector input(num_elements, element_value); + + // Uncomment me if debugging test failure + Rcerr << std::fixed << "Num elements: " << num_elements << "; Element value: " + << element_value << "; Exp. total: " << expected_total + << "; Sizeof(double): " << sizeof(double) << "\n"; + + + // Define acceptable tolerance + double tolerance = element_value; + + // Do naiive and Kahan summation + double naiive_total = start_num; + double kahan_total = start_num; + double kahan_compensation = 0.0; + for(auto const& value: input) { + naiive_total += value; + add_element_kahan(kahan_total, value, kahan_compensation); + } + + double naiive_diff = (expected_total - naiive_total); + double kahan_diff = (expected_total - kahan_total); + + // Uncomment me if debugging test failure + Rcerr << std::fixed << "Expected: " << expected_total << "; " << "Naiive: " + << naiive_total + << "; Kahan: " << kahan_total << "; Naiive diff: " << naiive_diff + << "; Kahan diff: " << kahan_diff << "; Tolerance: " << tolerance + << "\n"; + + + // Check naiive summation has unacceptable error + expect_false(within_toleranceV2(naiive_total, expected_total, tolerance)); + // Check Kahan summation has acceptable error + expect_true(within_toleranceV2(kahan_total, expected_total, tolerance)); + + } + +} From a39892029127b01568a2bf5721e8376a8cf28986 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Mon, 20 Apr 2020 09:42:34 +0100 Subject: [PATCH 39/84] Iterable class to interleave two sequences in the required way --- src/interleaved_iterator.h | 159 +++++++++++++++++++++++++++++++++++++ 1 file changed, 159 
insertions(+) create mode 100644 src/interleaved_iterator.h diff --git a/src/interleaved_iterator.h b/src/interleaved_iterator.h new file mode 100644 index 00000000..d81e205b --- /dev/null +++ b/src/interleaved_iterator.h @@ -0,0 +1,159 @@ +#ifndef INTERLEAVED_ITERATOR_H +#define INTERLEAVED_ITERATOR_H + +#include +#include +#include +#include +#include + +// Interleave two containers of ordered elements in a particular way. +// +// Supports iteration with Interleave::iterator +template +class Interleaved { + // short alias for the value type of the container + typedef typename Container::value_type CvalT; + typedef long int IndexT; + + Container& xs; + Container& ys; + +public: + Interleaved(Container& xs_, Container& ys_) : xs(xs_), ys(ys_) { } + + class iterator { + const Interleaved * const xys; + + IndexT Nx, Ny; + + // the minimum and maximum contained values; + CvalT minloc, maxloc; + + // the current iteration state + std::pair idx; + + public: + typedef std::pair value_type; + typedef void difference_type; + typedef value_type* pointer; + typedef value_type& reference; + typedef std::input_iterator_tag iterator_category; + + explicit iterator(Interleaved *xys_) + : xys(xys_), Nx(xys_->xs.size()), Ny(xys_->ys.size()) + { + //// this slightly simpler version assumes both xs and ys are + //// nonempty: + + // if (xys->xs[0] <= xys->ys[0]) { + // minloc = xys->xs[0]; + // idx.first = 0; + // idx.second = -1; + // } + // else { + // minloc = xys->ys[0]; + // idx.first = -1; + // idx.second = 0; + // } + // maxloc = std::max(xys->xs.back(), xys->ys.back()); + + //// no such assumption: + + if (Nx != 0 && Ny != 0) { + maxloc = std::max(xys->xs.back(), xys->ys.back()); + } + else if (Nx != 0) { + maxloc = xys->xs.back(); + } + else if (Ny != 0) { + maxloc = xys->ys.back(); + } + else { + maxloc = 0; + minloc = 0; + idx.first = -1; + idx.second = -1; + } + + if (Nx != 0 && (Ny == 0 || Ny != 0 && xys->xs[0] <= xys->ys[0])) { + minloc = xys->xs[0]; + idx.first = 0; + 
idx.second = -1; + } + else if (Ny != 0) { + minloc = xys->ys[0]; + idx.first = -1; + idx.second = 0; + } + } + + CvalT get_x(IndexT i) const { + if (i < 0) return minloc; + else if (i >= Nx) return maxloc; + else return xys->xs[i]; + } + + CvalT get_y(IndexT i) const { + if (i < 0) return minloc; + else if (i >= Ny) return maxloc; + else return xys->ys[i]; + } + + std::pair get_loc(const std::pair& i) const { + CvalT x(get_x(i.first)); + CvalT y(get_y(i.second)); + + return std::make_pair(x, y); + } + + std::pair get_current_loc() const { + return get_loc(idx); + } + + iterator& advance_to_end() { + idx.first = Nx; + idx.second = Ny; + return *this; + } + + value_type operator*() const { + return idx; + } + + const value_type *operator->() const { + return &idx; + } + + iterator& operator++() { + std::pair loc(get_current_loc()); + while (idx.second < Ny && get_y(++idx.second) < loc.first) { } + + if (get_y(idx.second + 1) > get_x(idx.first + 1) || idx.second >= Ny) { + if (idx.first < Nx) idx.first++; + } else { + if (idx.second < Ny) idx.second++; + } + + } + + iterator& operator++(int) { + iterator res = *this; + operator++(); + return res; + } + + friend bool operator==(const iterator& lhs, const iterator& rhs) { + return lhs.idx == rhs.idx; + } + + friend bool operator!=(const iterator& lhs, const iterator& rhs) { + return !(lhs == rhs); + } + }; + + iterator begin() { return iterator(this); }; + iterator end() { return iterator(this).advance_to_end(); }; +}; + +#endif // INTERLEAVED_ITERATOR_H From c302c43627f24de8f7d3e906f1302b29e3dbaa21 Mon Sep 17 00:00:00 2001 From: ande Date: Tue, 21 Apr 2020 09:19:29 +0100 Subject: [PATCH 40/84] added fast smooth header --- src/fastSmoothV2.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/fastSmoothV2.h diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h new file mode 100644 index 00000000..d822d846 --- /dev/null +++ b/src/fastSmoothV2.h @@ -0,0 +1,34 @@ +// Enable C++11 
+// [[Rcpp::plugins(cpp11)]] +#include +#include +#include +#include +#include + +using namespace Rcpp; + +inline double bowtie_area(double length, double val1_start, double val1_end, + double val2_start, double val2_end); +inline double get_segment(double start, double end, double val1_start, + double val1_end, double val2_start, double val2_end); +inline double get_segment_constrained(double seg1L1, double seg1L2, + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2); + +double get_double_segment_constrained( + double seg1Loc1, double seg1Loc2, double seg1Loc3, + double seg1Val1, double seg1Val2, + double seg2Loc1, double seg2Loc2, double seg2Loc3, + double seg2Val1, double seg2Val2); + +inline double leftmost_segments(const NumericVector& loc1, + const NumericVector& loc2, + const NumericVector& val1, + const NumericVector& val2, + double binWidth1, + double maxLoc); + +double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, + NumericVector loc2, NumericVector val2, double binWidth2); From 89bba30534af8a291d8dac5ba8d006072bd2ca50 Mon Sep 17 00:00:00 2001 From: ande Date: Tue, 21 Apr 2020 10:55:32 +0100 Subject: [PATCH 41/84] updated tests --- src/fastSmoothV2.cpp | 5 +- src/fastSmoothV2.h | 6 +- src/test_emd_fast_smooth.cpp | 217 +++++++++++------------------------ 3 files changed, 72 insertions(+), 156 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index e61f84d2..1b544a2d 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -79,6 +79,7 @@ double get_segment_constrained(double seg1L1, double seg1L2, //We have a valid range double valStart1, valEnd1, valStart2, valEnd2; double start,end; + double result; start = std::max(seg1L1,seg2L1); end = std::min(seg1L2,seg2L2); if (start < end) { @@ -86,8 +87,8 @@ double get_segment_constrained(double seg1L1, double seg1L2, valEnd1 = seg1V1 + (seg1V2 - seg1V1)*(end - seg1L1)/(seg1L2 - seg1L1); valStart2 = seg2V1 + (seg2V2 - 
seg2V1)*(start - seg2L1)/(seg2L2 - seg2L1); valEnd2 = seg2V1 + (seg2V2 - seg2V1)*(end - seg2L1)/(seg2L2 - seg2L1); - - return get_segment(start, end, valStart1, valEnd1, valStart2, valEnd2); + result = get_segment(start, end, valStart1, valEnd1, valStart2, valEnd2); + return result; } else { return 0; diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h index d822d846..ac3b1b11 100644 --- a/src/fastSmoothV2.h +++ b/src/fastSmoothV2.h @@ -8,11 +8,11 @@ using namespace Rcpp; -inline double bowtie_area(double length, double val1_start, double val1_end, +double bowtie_area(double length, double val1_start, double val1_end, double val2_start, double val2_end); -inline double get_segment(double start, double end, double val1_start, +double get_segment(double start, double end, double val1_start, double val1_end, double val2_start, double val2_end); -inline double get_segment_constrained(double seg1L1, double seg1L2, +double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2); diff --git a/src/test_emd_fast_smooth.cpp b/src/test_emd_fast_smooth.cpp index 508be859..798e0c84 100644 --- a/src/test_emd_fast_smooth.cpp +++ b/src/test_emd_fast_smooth.cpp @@ -29,174 +29,89 @@ bool within_toleranceV2(double actual, double expected, double tolerance) { double simpleSlowArea(double startx,double endx,double starty1,double endy1,double starty2,double endy2) { // Making this step size smaller - double step = (endx-startx)/10000000.0; + double step = (endx-startx)/100000000.0; double curX; double curY1; double curY2; double res = 0; - for (int i=0;i<10000000;i++) + for (int i=0;i<100000000;i++) { curX = startx + i*step; - curY1 = starty1 +(endy1-starty1)*i/10000000.0; - curY2 = starty2 +(endy2-starty2)*i/10000000.0; + curY1 = starty1 +(endy1-starty1)*i/100000000.0; + curY2 = starty2 +(endy2-starty2)*i/100000000.0; res += step*std::abs(curY1-curY2); } return res; } - -context("emd_fast_smoothing 
segment") { - test_that("Test Me") { - double start=0; - double end=1; - double val1_start=0; - double val1_end=1; - double val2_start=1; - double val2_end=0; +void runSegmentConstraintTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ double tempVal1; double tempVal2; - tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal1 = get_segment_constrained(start,end,start,end,val1_start,val1_end,val2_start,val2_end); tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - within_toleranceV2(tempVal1,tempVal2,0.0001); - val1_start=2; - val1_end=1; - val2_start=2; - val2_end=0; - tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); - tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - within_toleranceV2(tempVal1,tempVal2,0.0001); - val1_start=2; - val1_end=0; - val2_start=2; - val2_end=1; - tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); - tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - within_toleranceV2(tempVal1,tempVal2,0.0001); - val1_start=0; - val1_end=0; - val2_start=1; - val2_end=1; - tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); - tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - within_toleranceV2(tempVal1,tempVal2,0.0001); - val1_start=1; - val1_end=1; - val2_start=0; - val2_end=0; + std::cout << "\n"; + std::cout << "segment constrained " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); +} + + +void runSegmentTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double tempVal1; + double tempVal2; tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - 
within_toleranceV2(tempVal1,tempVal2,0.0001); - // for (int startI = 0; startI<10;startI++) - // { - // start = (double)startI/10.0; - // for (int endI = startI+1; endI<10;endI++) - // { - // end = (double)endI/10.0; - // for (int val1_startI = 0; val1_startI<10;val1_startI++) - // { - // val1_start = (double)val1_startI/10.0; - // for (int val2_startI = 0; val2_startI<10;val2_startI++) - // { - // val2_start = (double)val2_startI/10.0; - // for (int val1_endI = 0; val1_endI<10;val1_endI++) - // { - // val1_end = (double)val1_endI/10.0; - // for (int val2_endI = 0; val2_endI<10;val2_endI++) - // { - // val2_end = (double)val2_endI/10.0; - // tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); - // tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - // expect_true(tempVal1==tempVal2); - // } - // } - // } - // } - // } - // } - - -} + std::cout << "\n"; + std::cout << "segment test " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); } -// Initialize a unit test context. This is similar to how you -// might begin an R test file with 'context()', expect the -// associated context should be wrapped in braced. -context("emd_fast_no_smoothing v2") { - - // Test Kahan summation helper function - // We also verify that the example data used to test the Kahan summation - // results in an unacceptably large error when using naiive summation. - test_that("mk2 Kahan summation works when naiive summation fails") { - // NOTE: If this test fails when the `add_element_kahan()` function has not - // been changed, there is a small chance that it may be that the package is - // being compiled with the `-ffast-math` compiler flag set (or an aggressive - // optimisation level that sets the fast math flag). If this flag is set, - // the Kahan compensation being optimised away. 
- - // Set up suitable test data - // ========================= - // NOTE: It is surprisingly easy to come up with test data where the naiive - // summation works fine, so the test to validate that the naiive sum results - // in an unnacceptably high error is important. I think this is for the - // following reasons. - // (A) If the start number is too large in comparison to the expected sum of - // the elements in the small number vector, then the floating point - // representation of the expected total will be the same as that of the - // start number. - // (B) If the start number is too small in comparison to the elements in the - // small number vector, we don't run into the floating point - // representation issue when we add each element to the running total. - // NOTE: Appveyor tests with both 32 and 64 bit builds of R (i386 and ix64 - // respectively). Therefore values have been chosen that all fall - // within a 32-bit floating point range (including the expected total) - // while still causing the naiive summation to fail with 64-bit R. - // - // ========================= - // 1. Define all test data components as powers of a common base to make - // it easy to accurately calculate the expected sum without adding any small - // numbers together - double start_num = 1125899906842624.0; // 2^50 = 1125899906842624 - double element_value = 0.03125; // 2^-5 = 0.03125 - double num_elements = 4096; // 2^12 = 4096 - double expected_total = 1125899906842752.0; // 2^50 + 2^7 - - std::vector input(num_elements, element_value); - - // Uncomment me if debugging test failure - Rcerr << std::fixed << "Num elements: " << num_elements << "; Element value: " - << element_value << "; Exp. 
total: " << expected_total - << "; Sizeof(double): " << sizeof(double) << "\n"; - - - // Define acceptable tolerance - double tolerance = element_value; +context("emd_fast_smoothing segment constrain simple") { + test_that("emd_fast_smoothing segment constrain simple") { + // Two upward linear segments + runSegmentConstraintTest(0.0,1.0,0.0,1.0,0.0,1.0); + // One upward one down linear segments + runSegmentConstraintTest(0.0,1.0,0.0,1.0,1.0,0.0); + runSegmentConstraintTest(0.0,1.0,1.0,0.0,0.0,1.0); + // Two down linear segments + runSegmentConstraintTest(0.0,1.0,1.0,0.0,1.0,0.0); + // One flat one up segments + runSegmentConstraintTest(0.0,1.0,0.0,0.0,0.0,1.0); + runSegmentConstraintTest(0.0,1.0,0.0,1.0,0.0,0.0); + // One flat one down segments + runSegmentConstraintTest(0.0,1.0,1.0,0.0,0.0,0.0); + runSegmentConstraintTest(0.0,1.0,0.0,0.0,1.0,0.0); + // Different gradients segments + runSegmentConstraintTest(0.0,1.0,0.0,3.0,0.0,0.0); + runSegmentConstraintTest(0.0,1.0,0.0,0.0,0.0,3.0); + // Different gradients segments + runSegmentConstraintTest(0.0,1.0,2.0,4.0,1.0,2.0); + runSegmentConstraintTest(0.0,1.0,1.0,2.0,2.0,3.0); +}} + +context("emd_fast_smoothing segment full") { + test_that("emd_fast_smoothing segment full") { + // Two upward linear segments + runSegmentTest(0.0,1.0,0.0,1.0,0.0,1.0); + // One upward one down linear segments + runSegmentTest(0.0,1.0,0.0,1.0,1.0,0.0); + runSegmentTest(0.0,1.0,1.0,0.0,0.0,1.0); + // Two down linear segments + runSegmentTest(0.0,1.0,1.0,0.0,1.0,0.0); + // One flat one up segments + runSegmentTest(0.0,1.0,0.0,0.0,0.0,1.0); + runSegmentTest(0.0,1.0,0.0,1.0,0.0,0.0); + // One flat one down segments + runSegmentTest(0.0,1.0,1.0,0.0,0.0,0.0); + runSegmentTest(0.0,1.0,0.0,0.0,1.0,0.0); + // Different gradients segments + runSegmentTest(0.0,1.0,0.0,3.0,0.0,0.0); + runSegmentTest(0.0,1.0,0.0,0.0,0.0,3.0); + // Different gradients segments + runSegmentTest(0.0,1.0,2.0,4.0,1.0,2.0); + runSegmentTest(0.0,1.0,1.0,2.0,2.0,3.0); +}} + - 
// Do naiive and Kahan summation - double naiive_total = start_num; - double kahan_total = start_num; - double kahan_compensation = 0.0; - for(auto const& value: input) { - naiive_total += value; - add_element_kahan(kahan_total, value, kahan_compensation); - } - - double naiive_diff = (expected_total - naiive_total); - double kahan_diff = (expected_total - kahan_total); - - // Uncomment me if debugging test failure - Rcerr << std::fixed << "Expected: " << expected_total << "; " << "Naiive: " - << naiive_total - << "; Kahan: " << kahan_total << "; Naiive diff: " << naiive_diff - << "; Kahan diff: " << kahan_diff << "; Tolerance: " << tolerance - << "\n"; - - - // Check naiive summation has unacceptable error - expect_false(within_toleranceV2(naiive_total, expected_total, tolerance)); - // Check Kahan summation has acceptable error - expect_true(within_toleranceV2(kahan_total, expected_total, tolerance)); - - } - -} From 54363104d9efcacd444ce51b00148c4c83eaea45 Mon Sep 17 00:00:00 2001 From: ande Date: Tue, 21 Apr 2020 16:27:19 +0100 Subject: [PATCH 42/84] remove browser --- R/emd.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/emd.R b/R/emd.R index e9eee6f3..95d4fcc4 100644 --- a/R/emd.R +++ b/R/emd.R @@ -167,7 +167,6 @@ min_emd_optimise <- function(dhist1, dhist2) { soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, tol = .Machine$double.eps*1000) - browser() # Return mnimum EMD and associated offset min_emd <- soln$objective min_offset <- soln$minimum From 5c4dd42d93033567ab61ff6487a909589870ed3b Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Wed, 22 Apr 2020 14:23:14 +0100 Subject: [PATCH 43/84] Iterate over overlapping intervals --- src/interleaved_iterator.h | 197 +++++++++++++++++++++-------------- src/test_emd_fast_smooth.cpp | 152 +++++++++++++++++++++------ 2 files changed, 243 insertions(+), 106 deletions(-) diff --git a/src/interleaved_iterator.h b/src/interleaved_iterator.h index d81e205b..7451a9fb 100644 --- 
a/src/interleaved_iterator.h +++ b/src/interleaved_iterator.h @@ -1,29 +1,48 @@ -#ifndef INTERLEAVED_ITERATOR_H -#define INTERLEAVED_ITERATOR_H +#ifndef OVERLAPPING_SEGMENTS_H +#define OVERLAPPING_SEGMENTS_H #include +#include #include #include #include #include +#include +#include -// Interleave two containers of ordered elements in a particular way. +// Class used to support iteration (with OverlappingSegments::iterator) +// over pairs of overlapping segments within two sequences. // -// Supports iteration with Interleave::iterator template -class Interleaved { +class OverlappingSegments { // short alias for the value type of the container - typedef typename Container::value_type CvalT; + typedef typename std::remove_reference::type CvalT; typedef long int IndexT; - Container& xs; - Container& ys; + // references the two sequences + const Container& loc1; + const Container& loc2; public: - Interleaved(Container& xs_, Container& ys_) : xs(xs_), ys(ys_) { } - + OverlappingSegments(Container& loc1_, Container& loc2_) + : loc1(loc1_), loc2(loc2_) { + // check the requirement that loc1 and loc2 are nonempty and + // (strictly) sorted + + if (loc1.size() == 0 || loc2.size() == 0) + throw std::invalid_argument("Input vectors must be nonempty"); + + for (int i = 0; i < loc1.size() - 1; i++) + if (loc1[i] > loc1[i + 1]) + throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); + + for (int i = 0; i < loc2.size() - 1; i++) + if (loc2[i] > loc2[i + 1]) + throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); + } + class iterator { - const Interleaved * const xys; + const OverlappingSegments * const segs; IndexT Nx, Ny; @@ -39,102 +58,126 @@ class Interleaved { typedef value_type* pointer; typedef value_type& reference; typedef std::input_iterator_tag iterator_category; - - explicit iterator(Interleaved *xys_) - : xys(xys_), Nx(xys_->xs.size()), Ny(xys_->ys.size()) + + // Iterate over pairs of indices (i,j) into 
the sequences loc1 and + // loc2, where the intervals [loc1[i], loc1[i+1]] and [loc2[j], + // loc2[j+1]] overlap. + // + // These indices are returned from the iterator as + // std::pair. + // + // A sequence has an implicit segment from minloc (with index -1) + // to its zeroth element. The elements loc1[0] and loc2[0] are + // compared to determine whether, for either sequence, this + // initial implicit segment overlaps the zeroth segment of the + // other one. If both sequences start with the same value, the + // iteration starts at (0,0). + // + explicit iterator(OverlappingSegments *segs_) + : segs(segs_), Nx(segs_->loc1.size()), Ny(segs_->loc2.size()) { - //// this slightly simpler version assumes both xs and ys are - //// nonempty: - - // if (xys->xs[0] <= xys->ys[0]) { - // minloc = xys->xs[0]; - // idx.first = 0; - // idx.second = -1; - // } - // else { - // minloc = xys->ys[0]; - // idx.first = -1; - // idx.second = 0; - // } - // maxloc = std::max(xys->xs.back(), xys->ys.back()); - - //// no such assumption: + // The initial state of the iterator - if (Nx != 0 && Ny != 0) { - maxloc = std::max(xys->xs.back(), xys->ys.back()); - } - else if (Nx != 0) { - maxloc = xys->xs.back(); - } - else if (Ny != 0) { - maxloc = xys->ys.back(); - } - else { - maxloc = 0; - minloc = 0; - idx.first = -1; + if (segs->loc1[0] < segs->loc2[0]) { + minloc = segs->loc1[0]; + idx.first = 0; idx.second = -1; } - - if (Nx != 0 && (Ny == 0 || Ny != 0 && xys->xs[0] <= xys->ys[0])) { - minloc = xys->xs[0]; + else if (segs->loc1[0] == segs->loc2[0]) { + minloc = segs->loc1[0]; // == segs->loc2[0] idx.first = 0; - idx.second = -1; + idx.second = 0; } - else if (Ny != 0) { - minloc = xys->ys[0]; + else { + minloc = segs->loc2[0]; idx.first = -1; idx.second = 0; } + + // maxloc is used to signal the end of the current segment: + // since we consider only left-hand endpoints, the only + // requirement is that it compares greater than any real loc in + // either sequence - in reality it 
will be the largest loc plus + // the corresponding binwidth + maxloc = std::numeric_limits::infinity(); } - CvalT get_x(IndexT i) const { + CvalT get_loc1(IndexT i) const { if (i < 0) return minloc; else if (i >= Nx) return maxloc; - else return xys->xs[i]; + else return segs->loc1[i]; } - CvalT get_y(IndexT i) const { + CvalT get_loc2(IndexT i) const { if (i < 0) return minloc; else if (i >= Ny) return maxloc; - else return xys->ys[i]; + else return segs->loc2[i]; } - - std::pair get_loc(const std::pair& i) const { - CvalT x(get_x(i.first)); - CvalT y(get_y(i.second)); - return std::make_pair(x, y); + // Does interval i (from the first collection of segments) overlap + // interval j (from the second)? + bool intervals_overlap(IndexT i, IndexT j) const { + return (get_loc1(i) < get_loc2(j + 1) && get_loc2(j) < get_loc1(i + 1)); } - std::pair get_current_loc() const { - return get_loc(idx); - } + bool at_end() const { return idx.first == Nx && idx.second == Ny - 1; } iterator& advance_to_end() { idx.first = Nx; - idx.second = Ny; + idx.second = Ny - 1; return *this; } + + value_type operator*() const { return idx; } - value_type operator*() const { - return idx; - } - - const value_type *operator->() const { - return &idx; - } + const value_type *operator->() const { return &idx; } iterator& operator++() { - std::pair loc(get_current_loc()); - while (idx.second < Ny && get_y(++idx.second) < loc.first) { } - if (get_y(idx.second + 1) > get_x(idx.first + 1) || idx.second >= Ny) { - if (idx.first < Nx) idx.first++; - } else { - if (idx.second < Ny) idx.second++; +#if !NDEBUG + // Verify precondition + if (!intervals_overlap(idx.first, idx.second)) { + throw std::logic_error("Iterator precondition not satisfied: " + "current intervals do not overlap"); } - +#endif + + // Advance the second segment if it would still overlap the first + // + // The condition below is equivalent to + // idx.second < Ny - 1 && intervals_overlap(idx.first, idx.second + 1) + // + // For the 
right-hand condition, we know that (by precondition) + // get_loc1(idx.first) < get_loc2(idx.second + 1) + // + // and therefore that + // get_loc1(idx.first) < get_loc2(idx.second + 2), + // + // so this inequality is all that remains to be checked. + // + if (idx.second < Ny - 1 + && get_loc2(idx.second + 1) < get_loc1(idx.first + 1)) { + idx.second++; + } + // Could not advance the second segment above: advance the first instead, + // and the second as well if they share an endpoint + else { + if (idx.second < Ny - 1 + && get_loc1(idx.first + 1) == get_loc2(idx.second + 1)) { + idx.second++; + } + idx.first++; + } + +#if !NDEBUG + // Verify postcondition + if (!(at_end() || intervals_overlap(idx.first, idx.second))) { + throw std::logic_error("Iterator postcondition not satisfied: " + "current intervals do not overlap (not at end)"); + } +#endif + + return *this; } iterator& operator++(int) { @@ -156,4 +199,4 @@ class Interleaved { iterator end() { return iterator(this).advance_to_end(); }; }; -#endif // INTERLEAVED_ITERATOR_H +#endif // OVERLAPPING_SEGMENTS_H diff --git a/src/test_emd_fast_smooth.cpp b/src/test_emd_fast_smooth.cpp index 798e0c84..ac0a65b7 100644 --- a/src/test_emd_fast_smooth.cpp +++ b/src/test_emd_fast_smooth.cpp @@ -12,6 +12,8 @@ // header file. 
#include "fastSmoothV2.h" #include "emd_fast_no_smoothing.h" +#include "interleaved_iterator.h" + #include #include #include @@ -28,46 +30,45 @@ bool within_toleranceV2(double actual, double expected, double tolerance) { double simpleSlowArea(double startx,double endx,double starty1,double endy1,double starty2,double endy2) { - // Making this step size smaller - double step = (endx-startx)/100000000.0; - double curX; - double curY1; - double curY2; - double res = 0; - for (int i=0;i<100000000;i++) - { - curX = startx + i*step; - curY1 = starty1 +(endy1-starty1)*i/100000000.0; - curY2 = starty2 +(endy2-starty2)*i/100000000.0; - res += step*std::abs(curY1-curY2); - } - return res; + // Making this step size smaller + double step = (endx-startx)/100000000.0; + double curX; + double curY1; + double curY2; + double res = 0; + for (int i=0;i<100000000;i++) + { + curX = startx + i*step; + curY1 = starty1 +(endy1-starty1)*i/100000000.0; + curY2 = starty2 +(endy2-starty2)*i/100000000.0; + res += step*std::abs(curY1-curY2); + } + return res; } void runSegmentConstraintTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) { - double tempVal1; - double tempVal2; - tempVal1 = get_segment_constrained(start,end,start,end,val1_start,val1_end,val2_start,val2_end); - tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - std::cout << "\n"; - std::cout << "segment constrained " << tempVal1 << " simpleResult " << tempVal2 << "\n"; - expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); + double tempVal1; + double tempVal2; + tempVal1 = get_segment_constrained(start,end,start,end,val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + std::cout << "\n"; + std::cout << "segment constrained " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); } void runSegmentTest(double start,double 
end,double val1_start,double val1_end,double val2_start,double val2_end) { - double tempVal1; - double tempVal2; - tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); - tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); - std::cout << "\n"; - std::cout << "segment test " << tempVal1 << " simpleResult " << tempVal2 << "\n"; - expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); + double tempVal1; + double tempVal2; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + std::cout << "\n"; + std::cout << "segment test " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); } - context("emd_fast_smoothing segment constrain simple") { test_that("emd_fast_smoothing segment constrain simple") { // Two upward linear segments @@ -114,4 +115,97 @@ context("emd_fast_smoothing segment full") { runSegmentTest(0.0,1.0,1.0,2.0,2.0,3.0); }} +template +void runIntervalOverlapTest(Container1T& actual, Container2T& expected) +{ + std::cout << "Left endpoints of overlapping intervals:" << std::endl; + for (std::pair p : actual) + std::cout << p.first << ", " << p.second << std::endl; + + std::cout << "Expected:" << std::endl; + for (std::pair p : expected) + std::cout << p.first << ", " << p.second << std::endl; + + bool result = std::equal(actual.begin(), actual.end(), expected.begin()); + + std::cout << "Same? 
" << std::boolalpha << result << std::endl; + std::cout << "~~~~~~~~~~\n"; + + expect_true(result); +} + +context("emd_fast_smooth overlapping interval iterator") { + test_that("emd_fast_smooth overlapping interval iterator") { + { + std::vector xs {1.0, 3.0, 5.0}; + std::vector ys {2.0, 4.0, 6.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 0}, {1, 1}, {2, 1}, {2, 2}}; + runIntervalOverlapTest(actual, expected); + } + { + std::vector xs {1.0, 3.0}; + std::vector ys {4.0, 6.0, 8.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, -1}, {1, -1}, {1, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + std::vector xs {5.0, 5.5}; + std::vector ys {4.0, 6.0, 8.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {-1, 0}, {0, 0}, {1, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + std::vector xs {1.0, 2.0}; + std::vector ys {1.0, 3.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, 0}, {1, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + std::vector xs {1.0, 3.0}; + std::vector ys {1.0, 2.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, 0}, {0, 1}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + std::vector xs {1.0, 2.0}; + std::vector ys {1.5, 2.0, 3.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + std::vector xs {1.0, 2.0}; + std::vector ys {1.0, 2.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + std::vector xs {1.0, 2.0}; + std::vector ys {1.5, 2.0}; + OverlappingSegments > actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + } +} From 
77b9338be707b1e3e436384825c06b92a143fc26 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Wed, 22 Apr 2020 21:04:55 +0100 Subject: [PATCH 44/84] Move overlapping segment iterator into fastSmoothV2.h --- src/fastSmoothV2.h | 201 ++++++++++++++++++++++++++++++++++++ src/interleaved_iterator.h | 202 ------------------------------------- 2 files changed, 201 insertions(+), 202 deletions(-) delete mode 100644 src/interleaved_iterator.h diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h index ac3b1b11..130f8b52 100644 --- a/src/fastSmoothV2.h +++ b/src/fastSmoothV2.h @@ -1,6 +1,18 @@ // Enable C++11 // [[Rcpp::plugins(cpp11)]] + +#ifndef FASTSMOOTHV2_H +#define FASTSMOOTHV2_H + #include +#include +#include +#include +#include +// #include +// #include +#include +#include #include #include #include @@ -8,6 +20,193 @@ using namespace Rcpp; +class OverlappingSegments { + // references the two sequences + const NumericVector& loc1; + const NumericVector& loc2; + const double binWidth1, binWidth2; + +public: + OverlappingSegments(NumericVector& loc1_, NumericVector& loc2_, + double binWidth1_ = 1.0, double binWidth2_ = 1.0) + : loc1(loc1_), loc2(loc2_), binWidth1(binWidth1_), binWidth2(binWidth2_) + { + // check the requirement that loc1 and loc2 are nonempty and + // (strictly) sorted + + if (loc1.size() == 0 || loc2.size() == 0) + throw std::invalid_argument("Input vectors must be nonempty"); + + for (int i = 0; i < loc1.size() - 1; i++) + if (loc1[i] > loc1[i + 1]) + throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); + + for (int i = 0; i < loc2.size() - 1; i++) + if (loc2[i] > loc2[i + 1]) + throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); + } + + class iterator { + const OverlappingSegments * const segs; + + long Nx, Ny; + + // the minimum and maximum contained values; + double minloc, maxloc; + + // the current iteration state + std::pair idx; + + public: + typedef std::pair value_type; + 
typedef void difference_type; + typedef value_type* pointer; + typedef value_type& reference; + typedef std::input_iterator_tag iterator_category; + + // Iterate over pairs of indices (i,j) into the sequences loc1 and + // loc2, where the intervals [loc1[i], loc1[i+1]] and [loc2[j], + // loc2[j+1]] overlap. + // + // These indices are returned from the iterator as + // std::pair. + // + // A sequence has an implicit segment from minloc (with index -1) + // to its zeroth element. The elements loc1[0] and loc2[0] are + // compared to determine whether, for either sequence, this + // initial implicit segment overlaps the zeroth segment of the + // other one. If both sequences start with the same value, the + // iteration starts at (0,0). + // + explicit iterator(OverlappingSegments *segs_) + : segs(segs_), Nx(segs_->loc1.size()), Ny(segs_->loc2.size()) + { + // The initial state of the iterator + + if (segs->loc1[0] < segs->loc2[0]) { + minloc = segs->loc1[0]; + idx.first = 0; + idx.second = -1; + } + else if (segs->loc1[0] == segs->loc2[0]) { + minloc = segs->loc1[0]; // == segs->loc2[0] + idx.first = 0; + idx.second = 0; + } + else { + minloc = segs->loc2[0]; + idx.first = -1; + idx.second = 0; + } + + // maxloc is used to signal the end of the current segment: + // since we consider only left-hand endpoints, the only + // requirement is that it compares greater than any real loc in + // either sequence - in reality it will be the largest loc plus + // the corresponding bin width + // maxloc = std::numeric_limits::infinity(); + maxloc = std::max(segs->loc1[segs->loc1.size() - 1] + segs->binWidth1, + segs->loc2[segs->loc2.size() - 1] + segs->binWidth2); + } + + double get_loc1(long i) const { + if (i < 0) return minloc; + else if (i >= Nx) return maxloc; + else return segs->loc1[i]; + } + + double get_loc2(long i) const { + if (i < 0) return minloc; + else if (i >= Ny) return maxloc; + else return segs->loc2[i]; + } + + // Does interval i (from the first collection 
of segments) overlap + // interval j (from the second)? + bool intervals_overlap(long i, long j) const { + return (get_loc1(i) < get_loc2(j + 1) && get_loc2(j) < get_loc1(i + 1)); + } + + bool at_end() const { return idx.first == Nx && idx.second == Ny - 1; } + + iterator& advance_to_end() { + idx.first = Nx; + idx.second = Ny - 1; + return *this; + } + + value_type operator*() const { return idx; } + + const value_type *operator->() const { return &idx; } + + iterator& operator++() { + +#if !NDEBUG + // Verify precondition + if (!intervals_overlap(idx.first, idx.second)) { + throw std::logic_error("Iterator precondition not satisfied: " + "current intervals do not overlap"); + } +#endif + + // Advance the second segment if it would still overlap the first + // + // The condition below is equivalent to + // idx.second < Ny - 1 && intervals_overlap(idx.first, idx.second + 1) + // + // For the right-hand condition, we know that (by precondition) + // get_loc1(idx.first) < get_loc2(idx.second + 1) + // + // and therefore that + // get_loc1(idx.first) < get_loc2(idx.second + 2), + // + // so this inequality is all that remains to be checked. 
+ // + if (idx.second < Ny - 1 + && get_loc2(idx.second + 1) < get_loc1(idx.first + 1)) { + idx.second++; + } + // Could not advance the second segment above: advance the first instead, + // and the second as well if they share an endpoint + else { + if (idx.second < Ny - 1 + && get_loc1(idx.first + 1) == get_loc2(idx.second + 1)) { + idx.second++; + } + idx.first++; + } + +#if !NDEBUG + // Verify postcondition + if (!(at_end() || intervals_overlap(idx.first, idx.second))) { + throw std::logic_error("Iterator postcondition not satisfied: " + "current intervals do not overlap (not at end)"); + } +#endif + + return *this; + } + + iterator operator++(int) { + iterator res = *this; + operator++(); + return res; + } + + friend bool operator==(const iterator& lhs, const iterator& rhs) { + return lhs.idx == rhs.idx; + } + + friend bool operator!=(const iterator& lhs, const iterator& rhs) { + return !(lhs == rhs); + } + }; + + iterator begin() { return iterator(this); }; + iterator end() { return iterator(this).advance_to_end(); }; +}; + + double bowtie_area(double length, double val1_start, double val1_end, double val2_start, double val2_end); double get_segment(double start, double end, double val1_start, @@ -32,3 +231,5 @@ inline double leftmost_segments(const NumericVector& loc1, double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); + +#endif // FASTSMOOTHV2_H diff --git a/src/interleaved_iterator.h b/src/interleaved_iterator.h deleted file mode 100644 index 7451a9fb..00000000 --- a/src/interleaved_iterator.h +++ /dev/null @@ -1,202 +0,0 @@ -#ifndef OVERLAPPING_SEGMENTS_H -#define OVERLAPPING_SEGMENTS_H - -#include -#include -#include -#include -#include -#include -#include -#include - -// Class used to support iteration (with OverlappingSegments::iterator) -// over pairs of overlapping segments within two sequences. 
-// -template -class OverlappingSegments { - // short alias for the value type of the container - typedef typename std::remove_reference::type CvalT; - typedef long int IndexT; - - // references the two sequences - const Container& loc1; - const Container& loc2; - -public: - OverlappingSegments(Container& loc1_, Container& loc2_) - : loc1(loc1_), loc2(loc2_) { - // check the requirement that loc1 and loc2 are nonempty and - // (strictly) sorted - - if (loc1.size() == 0 || loc2.size() == 0) - throw std::invalid_argument("Input vectors must be nonempty"); - - for (int i = 0; i < loc1.size() - 1; i++) - if (loc1[i] > loc1[i + 1]) - throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); - - for (int i = 0; i < loc2.size() - 1; i++) - if (loc2[i] > loc2[i + 1]) - throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); - } - - class iterator { - const OverlappingSegments * const segs; - - IndexT Nx, Ny; - - // the minimum and maximum contained values; - CvalT minloc, maxloc; - - // the current iteration state - std::pair idx; - - public: - typedef std::pair value_type; - typedef void difference_type; - typedef value_type* pointer; - typedef value_type& reference; - typedef std::input_iterator_tag iterator_category; - - // Iterate over pairs of indices (i,j) into the sequences loc1 and - // loc2, where the intervals [loc1[i], loc1[i+1]] and [loc2[j], - // loc2[j+1]] overlap. - // - // These indices are returned from the iterator as - // std::pair. - // - // A sequence has an implicit segment from minloc (with index -1) - // to its zeroth element. The elements loc1[0] and loc2[0] are - // compared to determine whether, for either sequence, this - // initial implicit segment overlaps the zeroth segment of the - // other one. If both sequences start with the same value, the - // iteration starts at (0,0). 
- // - explicit iterator(OverlappingSegments *segs_) - : segs(segs_), Nx(segs_->loc1.size()), Ny(segs_->loc2.size()) - { - // The initial state of the iterator - - if (segs->loc1[0] < segs->loc2[0]) { - minloc = segs->loc1[0]; - idx.first = 0; - idx.second = -1; - } - else if (segs->loc1[0] == segs->loc2[0]) { - minloc = segs->loc1[0]; // == segs->loc2[0] - idx.first = 0; - idx.second = 0; - } - else { - minloc = segs->loc2[0]; - idx.first = -1; - idx.second = 0; - } - - // maxloc is used to signal the end of the current segment: - // since we consider only left-hand endpoints, the only - // requirement is that it compares greater than any real loc in - // either sequence - in reality it will be the largest loc plus - // the corresponding binwidth - maxloc = std::numeric_limits::infinity(); - } - - CvalT get_loc1(IndexT i) const { - if (i < 0) return minloc; - else if (i >= Nx) return maxloc; - else return segs->loc1[i]; - } - - CvalT get_loc2(IndexT i) const { - if (i < 0) return minloc; - else if (i >= Ny) return maxloc; - else return segs->loc2[i]; - } - - // Does interval i (from the first collection of segments) overlap - // interval j (from the second)? 
- bool intervals_overlap(IndexT i, IndexT j) const { - return (get_loc1(i) < get_loc2(j + 1) && get_loc2(j) < get_loc1(i + 1)); - } - - bool at_end() const { return idx.first == Nx && idx.second == Ny - 1; } - - iterator& advance_to_end() { - idx.first = Nx; - idx.second = Ny - 1; - return *this; - } - - value_type operator*() const { return idx; } - - const value_type *operator->() const { return &idx; } - - iterator& operator++() { - -#if !NDEBUG - // Verify precondition - if (!intervals_overlap(idx.first, idx.second)) { - throw std::logic_error("Iterator precondition not satisfied: " - "current intervals do not overlap"); - } -#endif - - // Advance the second segment if it would still overlap the first - // - // The condition below is equivalent to - // idx.second < Ny - 1 && intervals_overlap(idx.first, idx.second + 1) - // - // For the right-hand condition, we know that (by precondition) - // get_loc1(idx.first) < get_loc2(idx.second + 1) - // - // and therefore that - // get_loc1(idx.first) < get_loc2(idx.second + 2), - // - // so this inequality is all that remains to be checked. 
- // - if (idx.second < Ny - 1 - && get_loc2(idx.second + 1) < get_loc1(idx.first + 1)) { - idx.second++; - } - // Could not advance the second segment above: advance the first instead, - // and the second as well if they share an endpoint - else { - if (idx.second < Ny - 1 - && get_loc1(idx.first + 1) == get_loc2(idx.second + 1)) { - idx.second++; - } - idx.first++; - } - -#if !NDEBUG - // Verify postcondition - if (!(at_end() || intervals_overlap(idx.first, idx.second))) { - throw std::logic_error("Iterator postcondition not satisfied: " - "current intervals do not overlap (not at end)"); - } -#endif - - return *this; - } - - iterator& operator++(int) { - iterator res = *this; - operator++(); - return res; - } - - friend bool operator==(const iterator& lhs, const iterator& rhs) { - return lhs.idx == rhs.idx; - } - - friend bool operator!=(const iterator& lhs, const iterator& rhs) { - return !(lhs == rhs); - } - }; - - iterator begin() { return iterator(this); }; - iterator end() { return iterator(this).advance_to_end(); }; -}; - -#endif // OVERLAPPING_SEGMENTS_H From 72de9eafb26705389fcfaff450cf0aae3b3c279b Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Wed, 22 Apr 2020 21:05:43 +0100 Subject: [PATCH 45/84] Use NumericVectors in tests --- src/test_emd_fast_smooth.cpp | 49 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/src/test_emd_fast_smooth.cpp b/src/test_emd_fast_smooth.cpp index ac0a65b7..8d9f453a 100644 --- a/src/test_emd_fast_smooth.cpp +++ b/src/test_emd_fast_smooth.cpp @@ -12,7 +12,6 @@ // header file. 
#include "fastSmoothV2.h" #include "emd_fast_no_smoothing.h" -#include "interleaved_iterator.h" #include #include @@ -137,72 +136,72 @@ void runIntervalOverlapTest(Container1T& actual, Container2T& expected) context("emd_fast_smooth overlapping interval iterator") { test_that("emd_fast_smooth overlapping interval iterator") { { - std::vector xs {1.0, 3.0, 5.0}; - std::vector ys {2.0, 4.0, 6.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {1.0, 3.0, 5.0}; + NumericVector ys {2.0, 4.0, 6.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, -1}, {0, 0}, {1, 0}, {1, 1}, {2, 1}, {2, 2}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {1.0, 3.0}; - std::vector ys {4.0, 6.0, 8.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {1.0, 3.0}; + NumericVector ys {4.0, 6.0, 8.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, -1}, {1, -1}, {1, 0}, {1, 1}, {1, 2}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {5.0, 5.5}; - std::vector ys {4.0, 6.0, 8.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {5.0, 5.5}; + NumericVector ys {4.0, 6.0, 8.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {-1, 0}, {0, 0}, {1, 0}, {1, 1}, {1, 2}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {1.0, 2.0}; - std::vector ys {1.0, 3.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.0, 3.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, 0}, {1, 0}, {1, 1}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {1.0, 3.0}; - std::vector ys {1.0, 2.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {1.0, 3.0}; + NumericVector ys {1.0, 2.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, 0}, {0, 1}, {1, 1}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {1.0, 2.0}; - std::vector ys {1.5, 2.0, 3.0}; - OverlappingSegments > actual(xs, 
ys); + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.5, 2.0, 3.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, -1}, {0, 0}, {1, 1}, {1, 2}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {1.0, 2.0}; - std::vector ys {1.0, 2.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.0, 2.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, 0}, {1, 1}}; runIntervalOverlapTest(actual, expected); } { - std::vector xs {1.0, 2.0}; - std::vector ys {1.5, 2.0}; - OverlappingSegments > actual(xs, ys); + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.5, 2.0}; + OverlappingSegments actual(xs, ys); std::vector > expected { {0, -1}, {0, 0}, {1, 1}}; runIntervalOverlapTest(actual, expected); From 8e12200baef07c3955fe6deb1e920ecc24ecabda Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Wed, 22 Apr 2020 21:07:04 +0100 Subject: [PATCH 46/84] Use overlapping interval iterator in NetEmdSmoothV2 --- src/fastSmoothV2.cpp | 228 ++++++++++++++++++++++++++++--------------- 1 file changed, 147 insertions(+), 81 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 1b544a2d..90f02074 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -5,6 +5,7 @@ #include #include #include +#include "fastSmoothV2.h" using namespace Rcpp; @@ -168,6 +169,10 @@ double leftmost_segments(const NumericVector& loc1, curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + std::cerr << "%% " << index << " : " << curSeg1Loc1 << " " << curSeg1Loc2 << " " << curSeg1Loc3 << " " + << curSeg1Val1 << " " << curSeg1Val2 << " " << curSeg2Loc1 << " " << curSeg2Loc2 + << " " << curSeg2Loc3 << " " << curSeg2Val1 << " " << curSeg2Val2 << std::endl; + if (curSeg2Loc1 > curSeg1Loc3) { break; } @@ -190,99 +195,160 @@ double leftmost_segments(const NumericVector& loc1, double NetEmdSmoothV2(NumericVector loc1, 
NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2) { - double res = 0; + // double res = 0; - // Hist 1 - double curSeg1Loc1; // Start of the gradient section in Seg1 - double curSeg1Loc2; // End of the gradient section in Seg1 - double curSeg1Loc3; // End of the flat section in Seg1 - double curSeg1Val1; // Start value in Seg1 - double curSeg1Val2; // End value in Seg1 + // // Hist 1 + // double curSeg1Loc1; // Start of the gradient section in Seg1 + // double curSeg1Loc2; // End of the gradient section in Seg1 + // double curSeg1Loc3; // End of the flat section in Seg1 + // double curSeg1Val1; // Start value in Seg1 + // double curSeg1Val2; // End value in Seg1 - // Hist 2 - double curSeg2Loc1; // Start of the gradient section in Seg2 - double curSeg2Loc2; // End of the gradient section in Seg2 - double curSeg2Loc3; // End of the flat section in Seg2 - double curSeg2Val1; // Start value in Seg2 - double curSeg2Val2; // End value in Seg2 + // // Hist 2 + // double curSeg2Loc1; // Start of the gradient section in Seg2 + // double curSeg2Loc2; // End of the gradient section in Seg2 + // double curSeg2Loc3; // End of the flat section in Seg2 + // double curSeg2Val1; // Start value in Seg2 + // double curSeg2Val2; // End value in Seg2 - // Starting index for the second histogram - double secondStart = 0; + // // Starting index for the second histogram + // double secondStart = 0; - // Smallest and largest location values - double maxLoc = std::max(loc1[loc1.size()-1] + binWidth1, - loc2[loc2.size()-1] + binWidth2); + // // Smallest and largest location values + // double maxLoc = std::max(loc1[loc1.size()-1] + binWidth1, + // loc2[loc2.size()-1] + binWidth2); - // warning area before loc2[0] is not well tested - // As values outside of the range appear to be zero + // // warning area before loc2[0] is not well tested + // // As values outside of the range appear to be zero - if (loc2[0] < loc1[0]) { - res += 
leftmost_segments(loc2, loc1, val2, val1, binWidth2, maxLoc); - } - else + // std::cerr << "===============\n"; + + // if (loc2[0] < loc1[0]) { + // res += leftmost_segments(loc2, loc1, val2, val1, binWidth2, maxLoc); + // } + // else + // { + // res += leftmost_segments(loc1, loc2, val1, val2, binWidth1, maxLoc); + // } + + // // Add both the overlapping sections and the non overlapping section on the right + // // Note we reiterate over the first few sections loc1 + // // Could store where we are upto from above to save time + // // Reset Val counter + // curSeg1Val2 = 0; + // for (int index1 = 0; index1 < loc1.size(); index1++) { + // // Get the three relevant locations + // // Start; end of linear section; end of flat section + // curSeg1Loc1 = loc1[index1]; + // curSeg1Loc2 = loc1[index1] + binWidth1; + // // could pull this check outside of the loop with final case not sure if worth it + // if (index1 == loc1.size() - 1) { + // curSeg1Loc3 = maxLoc; + // } + // else { + // curSeg1Loc3 = loc1[index1 + 1]; + // } + // // Update value to the start and end of the current section + // curSeg1Val1 = curSeg1Val2; + // curSeg1Val2 = val1[index1]; + // // Setting up the previous value for the next loop + // // Could replace this loop with a while + // // but if so would need to be careful about overlaps + // if (secondStart == 0) { + // curSeg2Val2 = 0; + // } + // else { + // curSeg2Val2 = val2[secondStart - 1]; + // } + // for (int index2 = secondStart; index2 < loc2.size(); index2++) { + // // Construct 3 sections for second seg. 
+ // curSeg2Loc1 = loc2[index2]; + // curSeg2Loc2 = loc2[index2] + binWidth2; + // if (index2 == loc2.size() - 1) { + // curSeg2Loc3 = maxLoc; + // } + // else { + // curSeg2Loc3 = loc2[index2 + 1]; + // } + // //update values + // curSeg2Val1 = curSeg2Val2; + // curSeg2Val2 = val2[index2]; + // // If this section is behind Seg1 + // // Do not consider again + // if (curSeg2Loc3 < curSeg1Loc1) { + // secondStart = index2 + 1; + // continue; + // } + // // If current Seg2 is beyond Seg1 break out of loop + // res += get_double_segment_constrained( + // curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + // curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + + // std::cerr << "%% " << index1 << " " << index2 << " : " << curSeg1Loc1 << " " << curSeg1Loc2 << " " << curSeg1Loc3 << " " + // << curSeg1Val1 << " " << curSeg1Val2 << " " << curSeg2Loc1 << " " << curSeg2Loc2 + // << " " << curSeg2Loc3 << " " << curSeg2Val1 << " " << curSeg2Val2 << std::endl; + + // if (curSeg2Loc3 > curSeg1Loc3) { + // break; + // } + // } + // } + + // std::cerr << "---------------\n"; + { - res += leftmost_segments(loc1, loc2, val1, val2, binWidth1, maxLoc); - } + OverlappingSegments segs(loc1, loc2, binWidth1, binWidth2); + + double res = 0.0; + + for (OverlappingSegments::iterator it = segs.begin(), endIt = segs.end(); + it != endIt; ++it) + { + // Hist 1 + // Start of the gradient section in Seg1 + double curSeg1Loc1 = it.get_loc1(it->first); + + // End of the gradient section in Seg1 + double curSeg1Loc2 = (it->first < 0) ? curSeg1Loc1 : (curSeg1Loc1 + binWidth1); + + // End of the flat section in Seg1 + double curSeg1Loc3 = it.get_loc1(it->first + 1); + + // Start value in Seg1 + double curSeg1Val1 = (it->first <= 0) ? 0.0 : val1[it->first - 1]; + + // End value in Seg1 + double curSeg1Val2 = (it->first < 0) ? 
0.0 : val1[it->first]; + + // Hist 2 + // Start of the gradient section in Seg2 + double curSeg2Loc1 = it.get_loc2(it->second); + + // End of the gradient section in Seg2 + double curSeg2Loc2 = (it->second < 0) ? curSeg2Loc1 : (curSeg2Loc1 + binWidth2); + + // End of the flat section in Seg2 + double curSeg2Loc3 = it.get_loc2(it->second + 1); + + // Start value in Seg2 + double curSeg2Val1 = (it->second <= 0) ? 0.0 : val2[it->second - 1]; + + // End value in Seg2 + double curSeg2Val2 = (it->second < 0) ? 0.0 : val2[it->second]; + + // std::cerr << "%% " << it->first << " " << it->second << " : " << curSeg1Loc1 << " " << curSeg1Loc2 << " " << curSeg1Loc3 << " " + // << curSeg1Val1 << " " << curSeg1Val2 << " " << curSeg2Loc1 << " " << curSeg2Loc2 + // << " " << curSeg2Loc3 << " " << curSeg2Val1 << " " << curSeg2Val2 << std::endl; - // Add both the overlapping sections and the non overlapping section on the right - // Note we reiterate over the first few sections loc1 - // Could store where we are upto from above to save time - // Reset Val counter - curSeg1Val2 = 0; - for (int index1 = 0; index1 < loc1.size(); index1++) { - // Get the three relevant locations - // Start; end of linear section; end of flat section - curSeg1Loc1 = loc1[index1]; - curSeg1Loc2 = loc1[index1] + binWidth1; - // could pull this check outside of the loop with final case not sure if worth it - if (index1 == loc1.size() - 1) { - curSeg1Loc3 = maxLoc; - } - else { - curSeg1Loc3 = loc1[index1 + 1]; - } - // Update value to the start and end of the current section - curSeg1Val1 = curSeg1Val2; - curSeg1Val2 = val1[index1]; - // Setting up the previous value for the next loop - // Could replace this loop with a while - // but if so would need to be careful about overlaps - if (secondStart == 0) { - curSeg2Val2 = 0; - } - else { - curSeg2Val2 = val2[secondStart - 1]; - } - for (int index2 = secondStart; index2 < loc2.size(); index2++) { - // Construct 3 sections for second seg. 
- curSeg2Loc1 = loc2[index2]; - curSeg2Loc2 = loc2[index2] + binWidth2; - if (index2 == loc2.size() - 1) { - curSeg2Loc3 = maxLoc; - } - else { - curSeg2Loc3 = loc2[index2 + 1]; - } - //update values - curSeg2Val1 = curSeg2Val2; - curSeg2Val2 = val2[index2]; - // If this section is behind Seg1 - // Do not consider again - if (curSeg2Loc3 < curSeg1Loc1) { - secondStart = index2 + 1; - continue; - } - // If current Seg2 is beyond Seg1 break out of loop res += get_double_segment_constrained( curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, - curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); - - if (curSeg2Loc3 > curSeg1Loc3) { - break; - } + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); } + + return res; } - return res; + + // std::cerr << "===============\n"; } From 857c91ce2b459467b850716fd00b09c7a66d1b6b Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Thu, 23 Apr 2020 11:04:43 +0100 Subject: [PATCH 47/84] Small refactor of OverlappingSegments, and use for NetEmdSmoothV2 --- src/fastSmoothV2.cpp | 289 ++++++++++--------------------------------- src/fastSmoothV2.h | 189 ++++++++++++++-------------- 2 files changed, 157 insertions(+), 321 deletions(-) diff --git a/src/fastSmoothV2.cpp b/src/fastSmoothV2.cpp index 90f02074..0c2ffb5c 100644 --- a/src/fastSmoothV2.cpp +++ b/src/fastSmoothV2.cpp @@ -10,21 +10,21 @@ using namespace Rcpp; double bowtie_area(double length, double val1_start, double val1_end, - double val2_start, double val2_end) + double val2_start, double val2_end) { double midPoint = (val1_start - val2_start) / ((val2_end - val2_start) - (val1_end - val1_start)); const double midValue = val1_start + midPoint * (val1_end - val1_start); - + midPoint = midPoint * length; - + double topTriangle = 0.5 * midPoint * (midValue - val1_start); double topRectangle = midPoint * (val1_start - val2_start); double bottomTriangle = 0.5 * midPoint * (midValue - val2_start); - + double res = topTriangle + 
topRectangle - bottomTriangle; - + topTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); topRectangle = 0; // midPoint*(val1_start-val2_start); bottomTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); @@ -36,7 +36,7 @@ double bowtie_area(double length, double val1_start, double val1_end, // Compute the unsigned area between two line segments // assumes that val1_end > val1_start and val2_end > val2_start double get_segment(double start, double end, double val1_start, - double val1_end, double val2_start, double val2_end) + double val1_end, double val2_start, double val2_end) { const double length = end - start; @@ -49,7 +49,7 @@ double get_segment(double start, double end, double val1_start, bool both_differences_positive = val1_start > val2_start && val1_end >= val2_end; bool both_differences_negative = val1_start <= val2_start && val1_end <= val2_end; - + if (both_differences_positive || both_differences_negative) { // They are in the same order: no bowtie @@ -73,9 +73,9 @@ double get_segment(double start, double end, double val1_start, // cut down and compute segment double get_segment_constrained(double seg1L1, double seg1L2, - double seg2L1, double seg2L2, - double seg1V1, double seg1V2, - double seg2V1, double seg2V2) + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2) { //We have a valid range double valStart1, valEnd1, valStart2, valEnd2; @@ -102,86 +102,30 @@ double get_double_segment_constrained( double seg2Loc1, double seg2Loc2, double seg2Loc3, double seg2Val1, double seg2Val2) { - double res = 0; - - // compare the linear section with the linear section - res += get_segment_constrained(seg1Loc1, seg1Loc2, seg2Loc1, seg2Loc2, - seg1Val1, seg1Val2, seg2Val1, seg2Val2); - - // compare the linear section with the flat section - // This could be easily special cased (saving ~1 if statements ). 
- res += get_segment_constrained(seg1Loc1, seg1Loc2, seg2Loc2, seg2Loc3, - seg1Val1, seg1Val2, seg2Val2, seg2Val2); - - // compare the flat section with the linear section - // This could be easily special cased (saving ~1 if statements ). - res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc1, seg2Loc2, - seg1Val2, seg1Val2, seg2Val1, seg2Val2); - - // compare the flat section with the flat section - // This could be easily special cased (saving ~2 if statements ). - res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc2, seg2Loc3, - seg1Val2, seg1Val2, seg2Val2, seg2Val2); - - return res; -} + double res = 0; + // compare the linear section with the linear section + res += get_segment_constrained(seg1Loc1, seg1Loc2, seg2Loc1, seg2Loc2, + seg1Val1, seg1Val2, seg2Val1, seg2Val2); -// Dealing with segments which are to the left of the region covered by both -// segs -double leftmost_segments(const NumericVector& loc1, - const NumericVector& loc2, - const NumericVector& val1, - const NumericVector& val2, - double binWidth1, - double maxLoc) -{ - double res = 0.0; + // compare the linear section with the flat section + // This could be easily special cased (saving ~1 if statements ). + res += get_segment_constrained(seg1Loc1, seg1Loc2, seg2Loc2, seg2Loc3, + seg1Val1, seg1Val2, seg2Val2, seg2Val2); - assert(loc1[0] < loc2[0]); - - // Fix the position of Seg2 and then interate over Seg1 until we have all - // of the segments of Seg1 before Seg2 starts. - - // are these all used? 
- - double curSeg2Loc1 = loc1[0]; - double curSeg2Loc2 = loc1[0]; - double curSeg2Loc3 = loc2[0]; - double curSeg2Val1 = 0; - double curSeg2Val2 = 0; - - // Set this value so we can update in the lopp - double curSeg1Val2 = 0; - for (int index = 0; index < loc1.size(); index++) { - double curSeg1Loc1 = loc1[index]; - double curSeg1Loc2 = loc1[index] + binWidth1; - double curSeg1Loc3; - if (index == loc1.size() - 1) { - curSeg1Loc3 = maxLoc; - } - else { - curSeg1Loc3 = loc1[index + 1]; - } - double curSeg1Val1 = curSeg1Val2; - curSeg1Val2 = val1[index]; - res += get_double_segment_constrained( - curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, - curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + // compare the flat section with the linear section + // This could be easily special cased (saving ~1 if statements ). + res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc1, seg2Loc2, + seg1Val2, seg1Val2, seg2Val1, seg2Val2); - std::cerr << "%% " << index << " : " << curSeg1Loc1 << " " << curSeg1Loc2 << " " << curSeg1Loc3 << " " - << curSeg1Val1 << " " << curSeg1Val2 << " " << curSeg2Loc1 << " " << curSeg2Loc2 - << " " << curSeg2Loc3 << " " << curSeg2Val1 << " " << curSeg2Val2 << std::endl; - - if (curSeg2Loc1 > curSeg1Loc3) { - break; - } - } + // compare the flat section with the flat section + // This could be easily special cased (saving ~2 if statements ). 
+ res += get_segment_constrained(seg1Loc2, seg1Loc3, seg2Loc2, seg2Loc3, + seg1Val2, seg1Val2, seg2Val2, seg2Val2); return res; } - //' @title //' Compute EMD ////' @@ -195,160 +139,55 @@ double leftmost_segments(const NumericVector& loc1, double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2) { - // double res = 0; - - // // Hist 1 - // double curSeg1Loc1; // Start of the gradient section in Seg1 - // double curSeg1Loc2; // End of the gradient section in Seg1 - // double curSeg1Loc3; // End of the flat section in Seg1 - // double curSeg1Val1; // Start value in Seg1 - // double curSeg1Val2; // End value in Seg1 - - // // Hist 2 - // double curSeg2Loc1; // Start of the gradient section in Seg2 - // double curSeg2Loc2; // End of the gradient section in Seg2 - // double curSeg2Loc3; // End of the flat section in Seg2 - // double curSeg2Val1; // Start value in Seg2 - // double curSeg2Val2; // End value in Seg2 - - - // // Starting index for the second histogram - // double secondStart = 0; - - // // Smallest and largest location values - // double maxLoc = std::max(loc1[loc1.size()-1] + binWidth1, - // loc2[loc2.size()-1] + binWidth2); - - - // // warning area before loc2[0] is not well tested - // // As values outside of the range appear to be zero - - // std::cerr << "===============\n"; - - // if (loc2[0] < loc1[0]) { - // res += leftmost_segments(loc2, loc1, val2, val1, binWidth2, maxLoc); - // } - // else - // { - // res += leftmost_segments(loc1, loc2, val1, val2, binWidth1, maxLoc); - // } - - // // Add both the overlapping sections and the non overlapping section on the right - // // Note we reiterate over the first few sections loc1 - // // Could store where we are upto from above to save time - // // Reset Val counter - // curSeg1Val2 = 0; - // for (int index1 = 0; index1 < loc1.size(); index1++) { - // // Get the three relevant locations - // // Start; end of linear section; 
end of flat section - // curSeg1Loc1 = loc1[index1]; - // curSeg1Loc2 = loc1[index1] + binWidth1; - // // could pull this check outside of the loop with final case not sure if worth it - // if (index1 == loc1.size() - 1) { - // curSeg1Loc3 = maxLoc; - // } - // else { - // curSeg1Loc3 = loc1[index1 + 1]; - // } - // // Update value to the start and end of the current section - // curSeg1Val1 = curSeg1Val2; - // curSeg1Val2 = val1[index1]; - // // Setting up the previous value for the next loop - // // Could replace this loop with a while - // // but if so would need to be careful about overlaps - // if (secondStart == 0) { - // curSeg2Val2 = 0; - // } - // else { - // curSeg2Val2 = val2[secondStart - 1]; - // } - // for (int index2 = secondStart; index2 < loc2.size(); index2++) { - // // Construct 3 sections for second seg. - // curSeg2Loc1 = loc2[index2]; - // curSeg2Loc2 = loc2[index2] + binWidth2; - // if (index2 == loc2.size() - 1) { - // curSeg2Loc3 = maxLoc; - // } - // else { - // curSeg2Loc3 = loc2[index2 + 1]; - // } - // //update values - // curSeg2Val1 = curSeg2Val2; - // curSeg2Val2 = val2[index2]; - // // If this section is behind Seg1 - // // Do not consider again - // if (curSeg2Loc3 < curSeg1Loc1) { - // secondStart = index2 + 1; - // continue; - // } - // // If current Seg2 is beyond Seg1 break out of loop - // res += get_double_segment_constrained( - // curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, - // curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); - - // std::cerr << "%% " << index1 << " " << index2 << " : " << curSeg1Loc1 << " " << curSeg1Loc2 << " " << curSeg1Loc3 << " " - // << curSeg1Val1 << " " << curSeg1Val2 << " " << curSeg2Loc1 << " " << curSeg2Loc2 - // << " " << curSeg2Loc3 << " " << curSeg2Val1 << " " << curSeg2Val2 << std::endl; - - // if (curSeg2Loc3 > curSeg1Loc3) { - // break; - // } - // } - // } - - // std::cerr << "---------------\n"; - - { - OverlappingSegments segs(loc1, loc2, binWidth1, 
binWidth2); - - double res = 0.0; - - for (OverlappingSegments::iterator it = segs.begin(), endIt = segs.end(); - it != endIt; ++it) - { - // Hist 1 - // Start of the gradient section in Seg1 - double curSeg1Loc1 = it.get_loc1(it->first); + OverlappingSegments segs(loc1, loc2, binWidth1, binWidth2); - // End of the gradient section in Seg1 - double curSeg1Loc2 = (it->first < 0) ? curSeg1Loc1 : (curSeg1Loc1 + binWidth1); - - // End of the flat section in Seg1 - double curSeg1Loc3 = it.get_loc1(it->first + 1); + double res = 0.0; - // Start value in Seg1 - double curSeg1Val1 = (it->first <= 0) ? 0.0 : val1[it->first - 1]; + for (OverlappingSegments::iterator it = segs.begin(), endIt = segs.end(); + it != endIt; ++it) + { + // The OverlappingSegments iterator returns pairs of the left + // indices of overlapping endpoints: it->first is the index into + // loc1, it->second is the index into loc2. When the smallest + // element in one of these vectors is larger than the current + // element of the other, an 'index' of -1 is returned. - // End value in Seg1 - double curSeg1Val2 = (it->first < 0) ? 0.0 : val1[it->first]; + // Hist 1 + // Start of the gradient section in Seg1 + double curSeg1Loc1 = segs.loc1_left(it->first); - // Hist 2 - // Start of the gradient section in Seg2 - double curSeg2Loc1 = it.get_loc2(it->second); + // End of the gradient section in Seg1 + double curSeg1Loc2 = segs.loc1_mid(it->first); - // End of the gradient section in Seg2 - double curSeg2Loc2 = (it->second < 0) ? curSeg2Loc1 : (curSeg2Loc1 + binWidth2); + // End of the flat section in Seg1 + double curSeg1Loc3 = segs.loc1_right(it->first); - // End of the flat section in Seg2 - double curSeg2Loc3 = it.get_loc2(it->second + 1); + // Start and end values in Seg1: val1 gives the values at *right* + // endpoints of the segments. A value of 0.0 is used before the + // first segment. + double curSeg1Val1 = (it->first > 0) ? val1[it->first - 1] : 0.0; + double curSeg1Val2 = (it->first >= 0) ? 
val1[it->first] : 0.0; - // Start value in Seg2 - double curSeg2Val1 = (it->second <= 0) ? 0.0 : val2[it->second - 1]; + // Hist 2 + // Start of the gradient section in Seg1 + double curSeg2Loc1 = segs.loc2_left(it->second); - // End value in Seg2 - double curSeg2Val2 = (it->second < 0) ? 0.0 : val2[it->second]; + // End of the gradient section in Seg1 + double curSeg2Loc2 = segs.loc2_mid(it->second); - // std::cerr << "%% " << it->first << " " << it->second << " : " << curSeg1Loc1 << " " << curSeg1Loc2 << " " << curSeg1Loc3 << " " - // << curSeg1Val1 << " " << curSeg1Val2 << " " << curSeg2Loc1 << " " << curSeg2Loc2 - // << " " << curSeg2Loc3 << " " << curSeg2Val1 << " " << curSeg2Val2 << std::endl; + // End of the flat section in Seg1 + double curSeg2Loc3 = segs.loc2_right(it->second); - res += get_double_segment_constrained( - curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, - curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); - } + // Start and end values in Seg2: val2 gives the values at *right* + // endpoints of the segments. A value of 0.0 is used before the + // first segment. + double curSeg2Val1 = (it->second > 0) ? val2[it->second - 1] : 0.0; + double curSeg2Val2 = (it->second >= 0) ? 
val2[it->second] : 0.0; - return res; + res += get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); } - // std::cerr << "===============\n"; + return res; } diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h index 130f8b52..6f4036c1 100644 --- a/src/fastSmoothV2.h +++ b/src/fastSmoothV2.h @@ -9,8 +9,6 @@ #include #include #include -// #include -// #include #include #include #include @@ -21,38 +19,81 @@ using namespace Rcpp; class OverlappingSegments { - // references the two sequences + + // The two sequences of left-hand segment endpoints const NumericVector& loc1; const NumericVector& loc2; const double binWidth1, binWidth2; + // shorter names for loc1.size() and loc2.size() + const long N1, N2; + + // the minimum and maximum values over both sequences; + double minloc, maxloc; + public: OverlappingSegments(NumericVector& loc1_, NumericVector& loc2_, double binWidth1_ = 1.0, double binWidth2_ = 1.0) - : loc1(loc1_), loc2(loc2_), binWidth1(binWidth1_), binWidth2(binWidth2_) + : loc1(loc1_), loc2(loc2_), + binWidth1(binWidth1_), binWidth2(binWidth2_), + N1(loc1.size()), N2(loc2.size()) { - // check the requirement that loc1 and loc2 are nonempty and - // (strictly) sorted - - if (loc1.size() == 0 || loc2.size() == 0) + if (N1 == 0 || N2 == 0) throw std::invalid_argument("Input vectors must be nonempty"); - - for (int i = 0; i < loc1.size() - 1; i++) - if (loc1[i] > loc1[i + 1]) - throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); - - for (int i = 0; i < loc2.size() - 1; i++) - if (loc2[i] > loc2[i + 1]) - throw std::invalid_argument("Input vectors must be sorted in strict ascending order"); + + for (int i = 0; i < N1 - 1; i++) + if (loc1[i] > loc1[i + 1] + binWidth1) + throw std::invalid_argument( + "Elements of loc1 must be sorted in ascending order, " + "with elements separated by at least binWidth1"); + + for 
(int i = 0; i < N2 - 1; i++) + if (loc2[i] > loc2[i + 1] + binWidth2) + throw std::invalid_argument( + "Elements of loc2 must be sorted in ascending order, " + "with elements separated by at least binWidth2"); + + minloc = std::min(loc1[0], loc2[0]); + maxloc = std::max(loc1[N1 - 1] + binWidth1, loc2[N2 - 1] + binWidth2); } - class iterator { - const OverlappingSegments * const segs; + // left, mid and right locations of the segments, for loc1 and loc2 + double loc1_left(long i) const { + return (i >= 0) ? loc1[i] : minloc; + } - long Nx, Ny; + double loc1_mid(long i) const { + return (i >= 0) ? (loc1[i] + binWidth1) : minloc; + } - // the minimum and maximum contained values; - double minloc, maxloc; + double loc1_right(long i) const { + return (i + 1 < N1) ? loc1[i + 1] : maxloc; + } + + double loc2_left(long i) const { + return (i >= 0) ? loc2[i] : minloc; + } + + double loc2_mid(long i) const { + return (i >= 0) ? (loc2[i] + binWidth2) : minloc; + } + + double loc2_right(long i) const { + return (i + 1 < N2) ? loc2[i + 1] : maxloc; + } + + // Does interval i (from the first collection of segments) overlap + // interval j (from the second)? + bool intervals_overlap(long i, long j) const { + return (loc1_left(i) < loc2_right(j) && loc2_left(j) < loc1_right(i)); + } + + iterator begin() { return iterator(*this); }; + iterator end() { return iterator(*this).advance_to_end(); }; + + // OverlappingSegments iterator + class iterator { + const OverlappingSegments& segs; // the current iteration state std::pair idx; @@ -78,99 +119,59 @@ class OverlappingSegments { // other one. If both sequences start with the same value, the // iteration starts at (0,0). 
// - explicit iterator(OverlappingSegments *segs_) - : segs(segs_), Nx(segs_->loc1.size()), Ny(segs_->loc2.size()) - { - // The initial state of the iterator - - if (segs->loc1[0] < segs->loc2[0]) { - minloc = segs->loc1[0]; + explicit iterator(const OverlappingSegments& segs_) : segs(segs_) { + if (segs.loc1[0] < segs.loc2[0]) { idx.first = 0; idx.second = -1; } - else if (segs->loc1[0] == segs->loc2[0]) { - minloc = segs->loc1[0]; // == segs->loc2[0] + else if (segs.loc1[0] == segs.loc2[0]) { idx.first = 0; idx.second = 0; } else { - minloc = segs->loc2[0]; idx.first = -1; idx.second = 0; } - - // maxloc is used to signal the end of the current segment: - // since we consider only left-hand endpoints, the only - // requirement is that it compares greater than any real loc in - // either sequence - in reality it will be the largest loc plus - // the corresponding bin width - // maxloc = std::numeric_limits::infinity(); - maxloc = std::max(segs->loc1[segs->loc1.size() - 1] + segs->binWidth1, - segs->loc2[segs->loc2.size() - 1] + segs->binWidth2); } - double get_loc1(long i) const { - if (i < 0) return minloc; - else if (i >= Nx) return maxloc; - else return segs->loc1[i]; - } - - double get_loc2(long i) const { - if (i < 0) return minloc; - else if (i >= Ny) return maxloc; - else return segs->loc2[i]; - } - - // Does interval i (from the first collection of segments) overlap - // interval j (from the second)? - bool intervals_overlap(long i, long j) const { - return (get_loc1(i) < get_loc2(j + 1) && get_loc2(j) < get_loc1(i + 1)); - } - - bool at_end() const { return idx.first == Nx && idx.second == Ny - 1; } + // Is the current iterator at one-past-the-end? Equivalent to an + // equality comparison with segs.end(). 
+ bool at_end() const { return idx.first == segs.N1 && idx.second == segs.N2 - 1; } + // Update the current iterator to point to one-past-the-end iterator& advance_to_end() { - idx.first = Nx; - idx.second = Ny - 1; + idx.first = segs.N1; + idx.second = segs.N2 - 1; return *this; } - - value_type operator*() const { return idx; } - - const value_type *operator->() const { return &idx; } iterator& operator++() { - #if !NDEBUG // Verify precondition - if (!intervals_overlap(idx.first, idx.second)) { + if (!segs.intervals_overlap(idx.first, idx.second)) { throw std::logic_error("Iterator precondition not satisfied: " "current intervals do not overlap"); } #endif // Advance the second segment if it would still overlap the first - // - // The condition below is equivalent to - // idx.second < Ny - 1 && intervals_overlap(idx.first, idx.second + 1) - // - // For the right-hand condition, we know that (by precondition) - // get_loc1(idx.first) < get_loc2(idx.second + 1) // + // The condition below is equivalent to + // idx.second < N2 - 1 && intervals_overlap(idx.first, idx.second + 1) + // given that we know (by the precondition) that + // loc1_left(idx.first) < loc2_right(idx.second) // and therefore that - // get_loc1(idx.first) < get_loc2(idx.second + 2), - // - // so this inequality is all that remains to be checked. 
+ // loc1_left(idx.first) < loc2_right(idx.second + 1), // - if (idx.second < Ny - 1 - && get_loc2(idx.second + 1) < get_loc1(idx.first + 1)) { + if (idx.second < segs.N2 - 1 + && segs.loc2_left(idx.second + 1) < segs.loc1_right(idx.first)) { idx.second++; } // Could not advance the second segment above: advance the first instead, // and the second as well if they share an endpoint else { - if (idx.second < Ny - 1 - && get_loc1(idx.first + 1) == get_loc2(idx.second + 1)) { + if (idx.second < segs.N2 - 1 + && segs.loc2_left(idx.second + 1) == segs.loc1_right(idx.first)) { idx.second++; } idx.first++; @@ -178,7 +179,7 @@ class OverlappingSegments { #if !NDEBUG // Verify postcondition - if (!(at_end() || intervals_overlap(idx.first, idx.second))) { + if (!(at_end() || segs.intervals_overlap(idx.first, idx.second))) { throw std::logic_error("Iterator postcondition not satisfied: " "current intervals do not overlap (not at end)"); } @@ -193,6 +194,10 @@ class OverlappingSegments { return res; } + value_type operator*() const { return idx; } + + const value_type *operator->() const { return &idx; } + friend bool operator==(const iterator& lhs, const iterator& rhs) { return lhs.idx == rhs.idx; } @@ -201,20 +206,19 @@ class OverlappingSegments { return !(lhs == rhs); } }; - - iterator begin() { return iterator(this); }; - iterator end() { return iterator(this).advance_to_end(); }; }; double bowtie_area(double length, double val1_start, double val1_end, - double val2_start, double val2_end); + double val2_start, double val2_end); + double get_segment(double start, double end, double val1_start, - double val1_end, double val2_start, double val2_end); + double val1_end, double val2_start, double val2_end); + double get_segment_constrained(double seg1L1, double seg1L2, - double seg2L1, double seg2L2, - double seg1V1, double seg1V2, - double seg2V1, double seg2V2); + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2); double 
get_double_segment_constrained( double seg1Loc1, double seg1Loc2, double seg1Loc3, @@ -222,13 +226,6 @@ double get_double_segment_constrained( double seg2Loc1, double seg2Loc2, double seg2Loc3, double seg2Val1, double seg2Val2); -inline double leftmost_segments(const NumericVector& loc1, - const NumericVector& loc2, - const NumericVector& val1, - const NumericVector& val2, - double binWidth1, - double maxLoc); - double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); From 17ef75e815c11966704eb0330eaf202b83c7f09e Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Thu, 23 Apr 2020 11:27:53 +0100 Subject: [PATCH 48/84] Small formatting changes --- src/fastSmoothV2.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h index 6f4036c1..98a9672b 100644 --- a/src/fastSmoothV2.h +++ b/src/fastSmoothV2.h @@ -119,7 +119,9 @@ class OverlappingSegments { // other one. If both sequences start with the same value, the // iteration starts at (0,0). // - explicit iterator(const OverlappingSegments& segs_) : segs(segs_) { + explicit iterator(const OverlappingSegments& segs_) + : segs(segs_) + { if (segs.loc1[0] < segs.loc2[0]) { idx.first = 0; idx.second = -1; @@ -136,7 +138,9 @@ class OverlappingSegments { // Is the current iterator at one-past-the-end? Equivalent to an // equality comparison with segs.end(). 
- bool at_end() const { return idx.first == segs.N1 && idx.second == segs.N2 - 1; } + bool at_end() const { + return idx.first == segs.N1 && idx.second == segs.N2 - 1; + } // Update the current iterator to point to one-past-the-end iterator& advance_to_end() { From a67c2cebaa66a3a87fb3ccc21eb87e9697f2c29c Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Thu, 23 Apr 2020 11:30:11 +0100 Subject: [PATCH 49/84] Move begin() and end() after iterator class (fix breaking build) --- src/fastSmoothV2.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h index 98a9672b..77e3d461 100644 --- a/src/fastSmoothV2.h +++ b/src/fastSmoothV2.h @@ -88,9 +88,6 @@ class OverlappingSegments { return (loc1_left(i) < loc2_right(j) && loc2_left(j) < loc1_right(i)); } - iterator begin() { return iterator(*this); }; - iterator end() { return iterator(*this).advance_to_end(); }; - // OverlappingSegments iterator class iterator { const OverlappingSegments& segs; @@ -210,6 +207,9 @@ class OverlappingSegments { return !(lhs == rhs); } }; + + iterator begin() { return iterator(*this); } + iterator end() { return iterator(*this).advance_to_end(); } }; From a59cc6f8ddadd4f6ab1b43181f571f6ddd0b62f7 Mon Sep 17 00:00:00 2001 From: leospinaf Date: Tue, 23 Jun 2020 10:49:05 +0100 Subject: [PATCH 50/84] test --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 43b3164f..26da5aaa 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Network Comparison -An R package implementing the Netdis and NetEMD alignment-free network comparison measures. +An R package implementing the Netdis and NetEMD alignment-free network comparison measures. + ### :warning: BETA: Package under construction (pre-release) :warning: Until this package hits release 1.0 anything can change with no notice. 
From 1fd6fb7df02feec9d597f430ed8681a081e54839 Mon Sep 17 00:00:00 2001 From: andeElliott Date: Wed, 18 May 2022 10:26:56 +0100 Subject: [PATCH 51/84] removed the flag for sse to test if this will work on M1 --- src/Makevars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars b/src/Makevars index 25761e11..07ae5f45 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ CXX_STD = CXX11 -PKG_CPPFLAGS += -fno-fast-math -msse2 -mfpmath=sse -mstackrealign +PKG_CPPFLAGS += -fno-fast-math -msse2 -mstackrealign From 66f9f8f42503cfed15d41311c253c80f5c681385 Mon Sep 17 00:00:00 2001 From: andeElliott Date: Wed, 18 May 2022 10:29:49 +0100 Subject: [PATCH 52/84] attempt2 to fix the M1 issues --- src/Makevars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars b/src/Makevars index 07ae5f45..abab4e6b 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ CXX_STD = CXX11 -PKG_CPPFLAGS += -fno-fast-math -msse2 -mstackrealign +PKG_CPPFLAGS += -fno-fast-math -mstackrealign From dcff9a3558a4dd6276aaad12c590031b40a8b28d Mon Sep 17 00:00:00 2001 From: andeElliott Date: Wed, 18 May 2022 10:43:19 +0100 Subject: [PATCH 53/84] adding arg back in to see if it is needed --- src/Makevars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars b/src/Makevars index abab4e6b..fe240994 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ CXX_STD = CXX11 -PKG_CPPFLAGS += -fno-fast-math -mstackrealign +PKG_CPPFLAGS += -fno-fast-math -msse2 -mstackrealign From 0a04e3bdacb5108071c9f445d5013f9bcf492cb4 Mon Sep 17 00:00:00 2001 From: andeElliott Date: Wed, 18 May 2022 11:13:10 +0100 Subject: [PATCH 54/84] updated Rcpp Exports --- src/RcppExports.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 933f6b71..3779ab6d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM 
+Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // counts_from_observations NumericMatrix counts_from_observations(NumericMatrix features); RcppExport SEXP _netdist_counts_from_observations(SEXP featuresSEXP) { From 474d8952c4adceaee0d43a799d1397bff5b35f5e Mon Sep 17 00:00:00 2001 From: andeElliott Date: Wed, 18 May 2022 11:27:48 +0100 Subject: [PATCH 55/84] update namespace to test if functions will export --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index f428db48..ca0bca0e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -79,3 +79,4 @@ import(Rcpp) importFrom(Rcpp,evalCpp) importFrom(Rcpp,sourceCpp) useDynLib(netdist, .registration=TRUE) +exportPattern("^[[:alpha:]]+") From ebc0873766b7fb41a3f436e850dccdcd89a656b3 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 21:07:39 +0100 Subject: [PATCH 56/84] setup linting actions --- .github/workflows/build.yml | 22 +- R/PlottingFunctions.R | 35 ++- R/data.R | 7 +- R/dhist.R | 12 +- R/emd.R | 3 +- R/graph_binning.R | 2 +- R/measures_net_dis.R | 342 ++++++++++++------------- R/measures_net_emd.R | 220 ++++++++-------- R/orca_interface.R | 7 +- tests/testthat/test_emd.R | 20 +- tests/testthat/test_measures_net_emd.R | 26 +- 11 files changed, 359 insertions(+), 337 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46523f94..e38577d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: config: - - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'release'} # run coverage, linting, and update docs on this platform only - {os: ubuntu-latest, r: 'oldrel'} - {os: ubuntu-latest, r: 'devel'} - {os: macOS-latest, r: 'release'} @@ -50,7 +50,14 @@ jobs: with: extra-packages: | any::devtools + + - uses: r-lib/actions/setup-r-dependencies@v2 + if: ${{ (matrix.config.os == 'ubuntu-latest') && 
(matrix.config.r == 'release') }} + with: + extra-packages: | any::covr + any::lintr + any::styler - name: Document run: devtools::document() @@ -66,10 +73,21 @@ jobs: git pull --ff-only git push origin - - name: Check package + - name: Check style with styler + if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} + run: styler::style_pkg(dry='fail') + shell: Rscript {0} + + - name: Check style with lintr + if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} + run: lintr::lint_package() + shell: Rscript {0} + + - name: Check package with devtools run: devtools::check() shell: Rscript {0} - name: Check coverage + if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} run: covr::codecov(quiet = FALSE) shell: Rscript {0} diff --git a/R/PlottingFunctions.R b/R/PlottingFunctions.R index e525dabc..b4f9dbf8 100644 --- a/R/PlottingFunctions.R +++ b/R/PlottingFunctions.R @@ -2,7 +2,7 @@ #' Heatmap of Netdis many-to-many comparisons #' #' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. -#' +#' #' @param netdislist Default output of \code{netdis_many_to_many}. #' #' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. @@ -10,19 +10,19 @@ #' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). #' #' @param main Title of the plot. -#' +#' #' @param docluster controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. -#' +#' #' @return Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. 
#' @export -netdis.plot <- function(netdislist,whatrow=c(1,2)[2],clustering_method="ward.D",main="Nedis",docluster=TRUE){ - adjmat <- cross_comp_to_matrix(measure = netdislist$netdis[whatrow,], cross_comparison_spec = netdislist$comp_spec) +netdis.plot <- function(netdislist, whatrow = c(1, 2)[2], clustering_method = "ward.D", main = "Nedis", docluster = TRUE) { + adjmat <- cross_comp_to_matrix(measure = netdislist$netdis[whatrow, ], cross_comparison_spec = netdislist$comp_spec) vnames <- rownames(adjmat) - - legend1 <- seq(min(adjmat),max(adjmat),length.out = 5) - levels1 <- round(legend1,digits = 2) - pheatmap::pheatmap(mat = as.dist(adjmat),cluster_rows = docluster,cluster_cols = docluster,clustering_method = clustering_method,angle_col=45,main = main,treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) + + legend1 <- seq(min(adjmat), max(adjmat), length.out = 5) + levels1 <- round(legend1, digits = 2) + pheatmap::pheatmap(mat = as.dist(adjmat), cluster_rows = docluster, cluster_cols = docluster, clustering_method = clustering_method, angle_col = 45, main = main, treeheight_row = 80, labels_row = vnames, labels_col = vnames, display_numbers = TRUE, legend_breaks = legend1, legend_labels = levels1) } @@ -31,7 +31,7 @@ netdis.plot <- function(netdislist,whatrow=c(1,2)[2],clustering_method="ward.D", #' Heatmap of NetEmd many-to-many comparisons #' #' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. -#' +#' #' @param netdislist Default output of \code{netdis_many_to_many}. #' #' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. @@ -39,18 +39,17 @@ netdis.plot <- function(netdislist,whatrow=c(1,2)[2],clustering_method="ward.D", #' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. 
The dendrogram will appear if \code{docluster} is TRUE (default). #' #' @param main Title of the plot. -#' +#' #' @param docluster controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. -#' +#' #' @return Provides a heat map and dendrogram for the network comparisons via \code{pheatmap}. #' @export -netemd.plot <- function(netemdlist,clustering_method="ward.D",main="NetEmd",docluster=TRUE){ +netemd.plot <- function(netemdlist, clustering_method = "ward.D", main = "NetEmd", docluster = TRUE) { adjmat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) vnames <- rownames(adjmat) - - legend1 <- seq(min(adjmat),max(adjmat),length.out = 5) - levels1 <- round(legend1,digits = 2) - pheatmap::pheatmap(mat = as.dist(adjmat),cluster_rows = docluster,cluster_cols = docluster,clustering_method = clustering_method,angle_col=45,main = main,treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) - + + legend1 <- seq(min(adjmat), max(adjmat), length.out = 5) + levels1 <- round(legend1, digits = 2) + pheatmap::pheatmap(mat = as.dist(adjmat), cluster_rows = docluster, cluster_cols = docluster, clustering_method = clustering_method, angle_col = 45, main = main, treeheight_row = 80, labels_row = vnames, labels_col = vnames, display_numbers = TRUE, legend_breaks = legend1, legend_labels = levels1) } diff --git a/R/data.R b/R/data.R index cf5c5dff..4753ed66 100644 --- a/R/data.R +++ b/R/data.R @@ -47,18 +47,17 @@ #' World trade networks from 1985–2014 -#' +#' #' The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). 
The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the United Nations division COMTRADE [Division, 2015] for the years 2001-2014. #' #' \itemize{ #' \item wtnets: List of \code{igraph} objects providing the world trade networks from 2001–2014. #' \item Counts: Pre-computed graphlet counts for the world trade networks in the years 1985-2014. #' } -#' +#' #' @format A list of two elements. The first element, 'wtnets', is a list of \code{igraph} objects providing a small sample of world trade networks from 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph counts of world trade networks in the years 1985-2014. #' @source \strong{World trade networks:}. United nations commodity trade statistics database (UN comtrade). http://comtrade.un.org/, 2015. #' @source \strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). 
-#' +#' #' @encoding UTF-8 "worldtradesub" - diff --git a/R/dhist.R b/R/dhist.R index a5261f76..862afccd 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -123,12 +123,12 @@ is_dhist <- function(x, fast_check = TRUE) { has_masses <- purrr::has_element(attr(x, "name"), "masses") # Require list with correct class and presence of 1D numeric vector named # elements "locations" and "masses" - return(has_class_attr - && purrr::is_list(x) - && has_locations - && has_masses - && is_numeric_vector_1d(x$locations) - && is_numeric_vector_1d(x$masses)) + return(has_class_attr && + purrr::is_list(x) && + has_locations && + has_masses && + is_numeric_vector_1d(x$locations) && + is_numeric_vector_1d(x$masses)) } #' Discrete histogram from observations (Pure R slow version) diff --git a/R/emd.R b/R/emd.R index 86575521..da99cb36 100644 --- a/R/emd.R +++ b/R/emd.R @@ -80,8 +80,7 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { min_emd <- soln$objective min_offset <- soln$minimum return(list(min_emd = min_emd, min_offset = min_offset)) - } - else { + } else { # Fall back on other version if either dhist is smoothed return(min_emd_optimise(dhist1, dhist2)) } diff --git a/R/graph_binning.R b/R/graph_binning.R index 3f8aa21b..482fc524 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -10,7 +10,7 @@ binned_densities_adaptive <- function(densities, min_counts_per_interval, num_intervals) { - if( length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + if (length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index d1120d03..2b0bbc21 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,12 +1,12 @@ #' Netdis between two graphs #' #' Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. 
The variants currently supported are Netdis using a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). -#' -#' +#' +#' #' @param graph_1 A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered. #' #' @param graph_2 A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered. -#' +#' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. Matrix containing counts of each graphlet (columns) for #' each ego-network (rows) in the input graph. Columns are labelled with @@ -16,7 +16,7 @@ #' each ego network. (default: NULL). #' If the \code{graphlet_counts_1} argument is defined then #' \code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}. -#' +#' #' #' @param graphlet_counts_2 Pre-generated graphlet counts for the second query #' graph. Matrix containing counts of each graphlet (columns) for @@ -35,7 +35,7 @@ #' expected counts are calculated for all query graphs. #' 3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the #' query graphs themselves. (Geometric-Poisson approximation). -#' +#' #' @param graphlet_counts_ref Pre-generated reference graphlet counts. #' Matrix containing counts of each graphlet (columns) for #' each ego-network (rows) in the reference graph. 
Columns are labelled with @@ -46,7 +46,7 @@ #' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not #' be used. #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. #' #' @param neighbourhood_size Ego network neighborhood size (default: 2). #' @@ -75,50 +75,50 @@ #' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply #' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the #' values of \code{ref_graph} and \code{graphlet_counts_ref}. -#' +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size. #' #' @examples #' require(netdist) #' require(igraph) -#' #Set source directory for Virus PPI graph edge files stored in the netdist package. +#' # Set source directory for Virus PPI graph edge files stored in the netdist package. #' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") #' # Load query graphs as igraph objects -#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -#' graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") -#' -#' #Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) #This option will focus on detecting more general and global discrepancies between the ego-network structures. -#' -#' #Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. 
-#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) -#' +#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +#' graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") +#' +#' # Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. +#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # This option will focus on detecting more general and global discrepancies between the ego-network structures. +#' +#' # Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option will focus on detecting small discrepancies. +#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) +#' #' # Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. -#' # Two lattice networks of different sizes are used for this example. -#' goldstd_1 <- graph.lattice(c(8,8)) #A reference net -#' goldstd_2 <- graph.lattice(c(44,44)) #A reference net -#' -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) -#' -#' -#' #Providing pre-calculated subgraph counts. -#' -#' props_1 <- count_graphlets_ego(graph = graph_1) -#' props_2 <- count_graphlets_ego(graph = graph_2) -#' props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) -#' props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) -#' -#' #Netdis Geometric-Poisson. -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) -#' -#' #Netdis Zero Expectation.
-#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) -#' -#' #Netdis using gold-standard network -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) -#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +#' # Two lattice networks of different sizes are used for this example. +#' goldstd_1 <- graph.lattice(c(8, 8)) # A reference net +#' goldstd_2 <- graph.lattice(c(44, 44)) # A reference net +#' +#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) +#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) +#' +#' +#' # Providing pre-calculated subgraph counts. +#' +#' props_1 <- count_graphlets_ego(graph = graph_1) +#' props_2 <- count_graphlets_ego(graph = graph_2) +#' props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +#' props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) +#' +#' # Netdis Geometric-Poisson. +#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL) +#' +#' # Netdis Zero Expectation. 
+#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0) +#' +#' # Netdis using gold-standard network +#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_1) +#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_2) #' @export netdis_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, @@ -132,8 +132,8 @@ netdis_one_to_one <- function(graph_1 = NULL, exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_2 = NULL, - graphlet_counts_ref= NULL) { - + graphlet_counts_ref = NULL) { + ## ------------------------------------------------------------------------ # Check arguments if (is.null(graph_1) && is.null(graphlet_counts_1)) { @@ -145,7 +145,7 @@ netdis_one_to_one <- function(graph_1 = NULL, ## ------------------------------------------------------------------------ # Generate graphlet counts and bundle them into named list with format needed # for netdis_many_to_many. 
- + if (is.null(graphlet_counts_1)) { graphlet_counts_1 <- count_graphlets_ego( graph_1, @@ -157,7 +157,7 @@ netdis_one_to_one <- function(graph_1 = NULL, ) } rm(graph_1) - + if (is.null(graphlet_counts_2)) { graphlet_counts_2 <- count_graphlets_ego( graph_2, @@ -169,13 +169,13 @@ netdis_one_to_one <- function(graph_1 = NULL, ) } rm(graph_2) - + graphlet_counts <- list( graph_1 = graphlet_counts_1, graph_2 = graphlet_counts_2 ) - - if(!is.null(ref_graph)){ + + if (!is.null(ref_graph)) { if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { graphlet_counts_ref <- count_graphlets_ego( ref_graph, @@ -203,7 +203,7 @@ netdis_one_to_one <- function(graph_1 = NULL, graphlet_counts = graphlet_counts, graphlet_counts_ref = graphlet_counts_ref ) - + ## ------------------------------------------------------------------------ # extract netdis statistics from list returned by netdis_many_to_many result$netdis[, 1] @@ -226,7 +226,7 @@ netdis_one_to_one <- function(graph_1 = NULL, #' 3) NULL - Expected counts will be calculated based on the properties of the #' query graphs themselves. #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 and 5 are supported. #' #' @param neighbourhood_size Ego network neighbourhood size. #' @@ -255,7 +255,7 @@ netdis_one_to_one <- function(graph_1 = NULL, #' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply #' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the #' values of \code{ref_graph} and \code{graphlet_counts_ref}. -#' +#' #' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. If the \code{graphlet_counts_1} argument is defined then @@ -268,7 +268,7 @@ netdis_one_to_one <- function(graph_1 = NULL, #' @param graphlet_counts_ref Pre-generated reference graphlet counts. 
If the #' \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not #' be used. -#' +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export @@ -284,7 +284,7 @@ netdis_one_to_many <- function(graph_1 = NULL, exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_compare = NULL, - graphlet_counts_ref= NULL) { + graphlet_counts_ref = NULL) { ## ------------------------------------------------------------------------ # Check arguments if (is.null(graph_1) && is.null(graphlet_counts_1)) { @@ -293,11 +293,11 @@ netdis_one_to_many <- function(graph_1 = NULL, if (is.null(graphs_compare) && is.null(graphlet_counts_compare)) { stop("One of graph_2 and graphlet_counts_2 must be supplied.") } - + ## ------------------------------------------------------------------------ # Generate graphlet counts and bundle them into named list with format needed # for netdis_many_to_many. - + if (is.null(graphlet_counts_1)) { graphlet_counts_1 <- count_graphlets_ego( graph_1, @@ -309,7 +309,7 @@ netdis_one_to_many <- function(graph_1 = NULL, ) } rm(graph_1) - + if (is.null(graphlet_counts_compare)) { graphlet_counts_compare <- purrr::map( graphs_compare, @@ -322,13 +322,13 @@ netdis_one_to_many <- function(graph_1 = NULL, ) } rm(graphs_compare) - + graphlet_counts <- append(graphlet_counts_compare, - list(graph_1 = graphlet_counts_1), - after = 0 + list(graph_1 = graphlet_counts_1), + after = 0 ) - - if(!is.null(ref_graph)){ + + if (!is.null(ref_graph)) { if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { graphlet_counts_ref <- count_graphlets_ego( ref_graph, @@ -341,7 +341,7 @@ netdis_one_to_many <- function(graph_1 = NULL, ref_graph <- NULL } } - + ## ------------------------------------------------------------------------ # calculate netdis result <- netdis_many_to_many( @@ -358,7 +358,7 @@ netdis_one_to_many <- function(graph_1 = NULL, graphlet_counts = graphlet_counts, 
graphlet_counts_ref = graphlet_counts_ref ) - + ## ------------------------------------------------------------------------ # restructure netdis_many_to_many output colnames(result$netdis) <- result$comp_spec$name_b @@ -384,7 +384,7 @@ netdis_one_to_many <- function(graph_1 = NULL, #' Can be "many-to-many" (all pairwise combinations) or "one-to-many" #' (compare first graph in graphs to all other graphs.) #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. #' #' @param neighbourhood_size Ego network neighbourhood size (default 2). #' @@ -393,7 +393,7 @@ netdis_one_to_many <- function(graph_1 = NULL, #' #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges (default 1). -#' +#' #' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} #' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. #' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with @@ -423,15 +423,15 @@ netdis_one_to_many <- function(graph_1 = NULL, #' additional column labelled "N" including the node count for #' each ego network. #' -#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: NULL). Matrix containing counts +#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: NULL). Matrix containing counts #' of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with #' graphlet IDs and rows are labelled with the ID of the central node in each #' ego-network. 
As well as graphlet counts, each matrix must contain an #' additional column labelled "N" including the node count for -#' each ego network. +#' each ego network. #' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not #' be used. -#' +#' #' @return Netdis statistics between query graphs for graphlet sizes #' up to and including max_graphlet_size. #' @@ -448,14 +448,14 @@ netdis_many_to_many <- function(graphs = NULL, exp_counts_fn = NULL, graphlet_counts = NULL, graphlet_counts_ref = NULL) { - + ## ------------------------------------------------------------------------ # Check arguments and set functions appropriately if (is.null(graphs) && is.null(graphlet_counts)) { stop("One of graphs and graphlet_counts must be supplied.") } - - + + # Set default binning_fn if none supplied if (is.null(binning_fn)) { binning_fn <- purrr::partial( @@ -464,7 +464,7 @@ netdis_many_to_many <- function(graphs = NULL, num_intervals = 100 ) } - + # If no ref_graph supplied, default to geometric poisson unless user-defined # functions have been provided. if (is.null(ref_graph) && is.null(graphlet_counts_ref)) { @@ -476,9 +476,9 @@ netdis_many_to_many <- function(graphs = NULL, netdis_expected_counts, scale_fn = NULL ) - } - # If a ref_graph value supplied (including a constant), default to approach - # from original netdis paper, unless user-defined functions provided. + } + # If a ref_graph value supplied (including a constant), default to approach + # from original netdis paper, unless user-defined functions provided. } else { if (is.null(bin_counts_fn)) { bin_counts_fn <- purrr::partial( @@ -492,9 +492,9 @@ netdis_many_to_many <- function(graphs = NULL, netdis_expected_counts, scale_fn = count_graphlet_tuples ) - } + } } - + ## ------------------------------------------------------------------------ # Generate ego networks and count graphlets for query graphs. # But if graphlet counts have already been provided we can skip this step. 
@@ -510,7 +510,7 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(graphs) - + ## ------------------------------------------------------------------------ # Centred counts # If there are no graphlet_counts_ref, and a number has been passed as ref_graph, treat it as a constant expected @@ -526,12 +526,12 @@ netdis_many_to_many <- function(graphs = NULL, exp_counts_fn = NULL, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # If there are no graphlet_counts_ref, and If a reference graph passed, use it to calculate expected counts for all # query graphs. } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { - + # Generate ego networks and calculate graphlet counts # But if graphlet_counts_ref provided can skip this step if (is.null(graphlet_counts_ref)) { @@ -545,22 +545,22 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(ref_graph) - + # Get ego-network densities densities_ref <- ego_network_density(graphlet_counts_ref) - + # bin ref ego-network densities binned_densities <- binning_fn(densities_ref) - + ref_ego_density_bins <- binned_densities$breaks - + # Average ref graphlet counts across density bins ref_binned_graphlet_counts <- bin_counts_fn( graphlet_counts_ref, binned_densities$interval_indexes, max_graphlet_size = max_graphlet_size ) - + # Calculate centred counts using ref graph centred_graphlet_counts <- purrr::map( graphlet_counts, @@ -572,7 +572,7 @@ netdis_many_to_many <- function(graphs = NULL, exp_counts_fn = exp_counts_fn, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # If no reference passed, calculate expected counts using query networks # themselves. Geometric-Poisson GP #This is the function that creates an error for a graph with three connected nodes. 
@@ -589,17 +589,17 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(graphlet_counts) - + ## ------------------------------------------------------------------------ # Sum centred graphlet counts across all ego networks sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) - + rm(centred_graphlet_counts) - + ## ------------------------------------------------------------------------ # Generate pairwise comparisons comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = comparisons) - + ## ------------------------------------------------------------------------ # Calculate netdis statistics results <- parallel::mcmapply( @@ -614,7 +614,7 @@ netdis_many_to_many <- function(graphs = NULL, comp_spec$index_b, SIMPLIFY = TRUE ) - + list(netdis = results, comp_spec = comp_spec) } @@ -638,14 +638,14 @@ netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vecto ids <- graphlet_ids_for_size(graphlet_size) counts1 <- centred_graphlet_count_vector_1[ids] counts2 <- centred_graphlet_count_vector_2[ids] - + # Calculate normalising constant norm_const <- sum(counts1^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) * sum(counts2^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate intermediate "netD" statistic that falls within range -1..1 netds2 <- (1 / sqrt(norm_const)) * sum((counts1 * counts2) / - sqrt(counts1^2 + counts2^2), na.rm = TRUE) + sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate corresponding "netd" Netdis statistic that falls within range 0..1 0.5 * (1 - netds2) } @@ -659,7 +659,7 @@ netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vecto #' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 #' @param max_graphlet_size max graphlet size to calculate Netdis for. #' The size of a graphlet is the number of nodes it contains. Netdis is -#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported. 
+#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported. #' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export @@ -668,15 +668,15 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count if ((max_graphlet_size > 5) | (max_graphlet_size < 3)) { stop("max_graphlet_size must be 3, 4 or 5.") } - + netdis_statistics <- purrr::map(3:max_graphlet_size, - netdis, - centred_graphlet_count_vector_1 = centred_graphlet_count_vector_1, - centred_graphlet_count_vector_2 = centred_graphlet_count_vector_2 + netdis, + centred_graphlet_count_vector_1 = centred_graphlet_count_vector_1, + centred_graphlet_count_vector_2 = centred_graphlet_count_vector_2 ) - + netdis_statistics <- simplify2array(netdis_statistics) - + names(netdis_statistics) <- sapply( "netdis", @@ -684,7 +684,7 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count 3:max_graphlet_size, sep = "" ) - + netdis_statistics } @@ -728,31 +728,30 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count #' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size #' max_graphlet_size. #' @export -netdis_centred_graphlet_counts <- function( - graphlet_counts, - ref_ego_density_bins, - ref_binned_graphlet_counts, - binning_fn, - bin_counts_fn, - exp_counts_fn, - max_graphlet_size) { - +netdis_centred_graphlet_counts <- function(graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + binning_fn, + bin_counts_fn, + exp_counts_fn, + max_graphlet_size) { + ## ------------------------------------------------------------------------ # If a number has been passed as ref_binned_graphlet_counts, treat it as a # constant expected counts value (e.g. if ref_binned_graphlet_counts = 0 # then no centring of counts). 
if (is.numeric(ref_binned_graphlet_counts) && - length(ref_binned_graphlet_counts) == 1) { + length(ref_binned_graphlet_counts) == 1) { exp_graphlet_counts <- netdis_const_expected_counts( graphlet_counts, const = ref_binned_graphlet_counts ) - + ## ------------------------------------------------------------------------ # If reference bins and counts passed, use them to calculate # expected counts } else if (!is.null(ref_ego_density_bins) && - !is.null(ref_binned_graphlet_counts)) { + !is.null(ref_binned_graphlet_counts)) { # Calculate expected graphlet counts (using ref # graph ego network density bins) exp_graphlet_counts <- exp_counts_fn( @@ -761,29 +760,29 @@ netdis_centred_graphlet_counts <- function( ref_binned_graphlet_counts, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # If NULL passed as ref bins and counts, calculate expected counts using # query network itself. This should be GP. } else if (is.null(ref_ego_density_bins) && - is.null(ref_binned_graphlet_counts)) { + is.null(ref_binned_graphlet_counts)) { # Get ego-network densities densities <- ego_network_density(graphlet_counts) - + # bin ref ego-network densities - binned_densities <- binning_fn(densities) - + binned_densities <- binning_fn(densities) + # extract bin breaks and indexes from binning results ego_density_bin_breaks <- binned_densities$breaks ego_density_bin_indexes <- binned_densities$interval_indexes - + # Calculate expected counts in each bin binned_graphlet_counts <- bin_counts_fn( graphlet_counts, ego_density_bin_indexes, max_graphlet_size = max_graphlet_size ) - + # Calculate expected graphlet counts for each ego network exp_graphlet_counts <- exp_counts_fn( graphlet_counts, @@ -791,7 +790,7 @@ netdis_centred_graphlet_counts <- function( binned_graphlet_counts, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # Invalid combination of 
ref_ego_density_bins and ref_binned_graphlet_counts } else { @@ -803,7 +802,7 @@ netdis_centred_graphlet_counts <- function( - Constant numeric ref_binned_graphlet_counts: Use as constant expected counts value.") } - + ## ------------------------------------------------------------------------ # Subtract expected counts from actual graphlet counts netdis_subtract_exp_counts( @@ -824,16 +823,15 @@ netdis_centred_graphlet_counts <- function( #' nummber of ego networks (rows). #' @param max_graphlet_size Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported. #' @export -netdis_subtract_exp_counts <- function( - graphlet_counts, - exp_graphlet_counts, - max_graphlet_size) { - +netdis_subtract_exp_counts <- function(graphlet_counts, + exp_graphlet_counts, + max_graphlet_size) { + # select columns for graphlets up to size max_graphlet_size id <- graphlet_key(max_graphlet_size)$id graphlet_counts <- graphlet_counts[, id] exp_graphlet_counts <- exp_graphlet_counts[, id] - + # Subtract expected counts from actual graphlet counts graphlet_counts - exp_graphlet_counts } @@ -856,14 +854,13 @@ netdis_subtract_exp_counts <- function( #' \code{density_binned_reference_counts} values will be multiplied by. #' #' @export -netdis_expected_counts <- function( - graphlet_counts, - density_breaks, - density_binned_reference_counts, - max_graphlet_size, - scale_fn = NULL) { - - +netdis_expected_counts <- function(graphlet_counts, + density_breaks, + density_binned_reference_counts, + max_graphlet_size, + scale_fn = NULL) { + + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. 
expected_graphlet_counts <- t(apply( @@ -873,7 +870,7 @@ netdis_expected_counts <- function( density_binned_reference_counts = density_binned_reference_counts, scale_fn = scale_fn )) - + expected_graphlet_counts } @@ -899,15 +896,15 @@ netdis_expected_counts_ego <- function(graphlet_counts, density_breaks, density_binned_reference_counts, scale_fn = NULL) { - + # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- density_from_counts(graphlet_counts) matched_density_index <- interval_index(query_density, density_breaks) - + matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - + if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the # reference count for each graphlet by the number @@ -943,17 +940,17 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, apply(graphlet_counts, MARGIN = 2, function(gc) { tapply(gc, INDEX = density_interval_indexes, FUN = agg_fn) }) - + # if only 1 bin (i.e. no binning) will be left with a 1D list. # convert it into a 2D list. 
if (is.null(dim(mean_density_binned_graphlet_counts))) { dim(mean_density_binned_graphlet_counts) <- c(1, length(mean_density_binned_graphlet_counts)) - + colnames(mean_density_binned_graphlet_counts) <- colnames(graphlet_counts) } - + mean_density_binned_graphlet_counts } @@ -996,13 +993,13 @@ density_binned_counts <- function(graphlet_counts, # by dividing by total number of k-tuples in # ego-network (where k is graphlet size) graphlet_counts <- scale_fn(graphlet_counts, - max_graphlet_size = max_graphlet_size + max_graphlet_size = max_graphlet_size ) } - + mean_density_binned_graphlet_counts(graphlet_counts, - density_interval_indexes, - agg_fn = agg_fn + density_interval_indexes, + agg_fn = agg_fn ) } @@ -1021,46 +1018,45 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, max_graphlet_size) { # extract ego networks belonging to input density bin index counts <- graphlet_counts[density_interval_indexes == bin_idx, ] - + # mean graphlet counts in this density bin means <- colMeans(counts) - + # subtract mean graphlet counts from actual graphlet counts mean_sub_counts <- sweep(counts, 2, means) - + # variance in graphlet counts across ego networks in this density bin Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) - + # Dealing with zero variance HERE ind_zerovar <- (Vd_sq < .00000001) - if(sum(ind_zerovar) > 0) Vd_sq[ind_zerovar] <- 0.1 - + if (sum(ind_zerovar) > 0) Vd_sq[ind_zerovar] <- 0.1 + # GP theta parameter for each graphlet id in this density bin theta_d <- 2 * means / (Vd_sq + means) - + exp_counts_dk <- vector() for (k in 2:max_graphlet_size) { graphlet_idx <- graphlet_ids_for_size(k) - + # GP lambda parameter for graphlet size k in this density bin lambda_dk <- mean(2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]), - na.rm = TRUE + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE ) - + # Expected counts for graphlet size k in this density bin exp_counts_dk <- append( exp_counts_dk, lambda_dk 
/ theta_d[graphlet_idx] ) - } - + # Dealing with divisions by zero. ind <- is.na(exp_counts_dk) ind <- ind | is.infinite(exp_counts_dk) - if(sum(ind) > 0) exp_counts_dk[ind & ind_zerovar[-1]] <- 0 - + if (sum(ind) > 0) exp_counts_dk[ind & ind_zerovar[-1]] <- 0 + exp_counts_dk } @@ -1076,7 +1072,7 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes, max_graphlet_size) { - + # calculate expected counts for each density bin index nbins <- length(unique(density_interval_indexes)) expected_counts_bin <- t(sapply( @@ -1086,10 +1082,10 @@ density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes = density_interval_indexes, max_graphlet_size = max_graphlet_size )) - + # remove NAs caused by bins with zero counts for a graphlet expected_counts_bin[is.nan(expected_counts_bin)] <- 0 - + expected_counts_bin } @@ -1137,10 +1133,10 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { count_graphlet_tuples_ego <- function(graphlet_counts, max_graphlet_size) { graphlet_tuple_counts <- t(apply(graphlet_counts, 1, - count_graphlet_tuples, - max_graphlet_size = max_graphlet_size + count_graphlet_tuples, + max_graphlet_size = max_graphlet_size )) - + graphlet_tuple_counts } @@ -1177,12 +1173,12 @@ scale_graphlet_counts_ego <- function(graphlet_counts, graphlet_counts, max_graphlet_size = max_graphlet_size ) - + scaled_graphlet_counts <- scale_graphlet_count( graphlet_counts, ego_graphlet_tuples ) - + return(scaled_graphlet_counts) } @@ -1197,12 +1193,12 @@ scale_graphlet_counts_ego <- function(graphlet_counts, count_graphlet_tuples <- function(graph_graphlet_counts, max_graphlet_size) { # extract node counts from graph_graphlet_counts N <- graph_graphlet_counts["N"] - + graphlet_key <- graphlet_key(max_graphlet_size) graphlet_node_counts <- graphlet_key$node_count - + graphlet_tuple_counts <- choose(N, graphlet_node_counts) - + graphlet_tuple_counts <- 
stats::setNames( graphlet_tuple_counts, graphlet_key$id diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index 8246dfeb..ddcd1592 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -1,6 +1,6 @@ #' NetEMD Network Earth Mover's Distance between a pair of networks. #' -#' Calculates the network Earth Mover's Distance (EMD) between +#' Calculates the network Earth Mover's Distance (EMD) between #' two sets of network features. This is done by individually normalising the distribution #' of each feature so that they have unit mass and unit variance. Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. #' This is calculated as follows: @@ -39,42 +39,42 @@ #' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated #' offsets giving the minimal EMD for each pair of histograms #' @examples -#' require(igraph) -#' graph_1 <- graph.lattice(c(8,8)) -#' graph_2 <- graph.lattice(c(44,44)) -#' netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) -#' -#' #Providing a matrix of network features -#' props_a= count_orbits_per_node(graph = graph_1,max_graphlet_size = 5) -#' props_b= count_orbits_per_node(graph = graph_2,max_graphlet_size = 5) -#' -#' netemd_one_to_one(dhists_1=props_a, dhists_2=props_b,smoothing_window_width = 1) -#' -#' #Providing the network features as lists of dhist objects -#' dhists_1<- graph_features_to_histograms(props_a) -#' dhists_2<- graph_features_to_histograms(props_b) -#' -#' netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) -#' -#' -#' # A variation of NetEmd: Using the Laplacian spectrum -#' #Laplacian -#' Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) -#' Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) -#' -#' 
#Normalized Laplacian -#' NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) -#' NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) -#' -#' #Spectra (This may take a couple of minutes). -#' props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -#' props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) -#' -#' netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. -#' +#' require(igraph) +#' graph_1 <- graph.lattice(c(8, 8)) +#' graph_2 <- graph.lattice(c(44, 44)) +#' netemd_one_to_one(graph_1 = graph_1, graph_2 = graph_2, feature_type = "orbit", max_graphlet_size = 5) +#' +#' # Providing a matrix of network features +#' props_a <- count_orbits_per_node(graph = graph_1, max_graphlet_size = 5) +#' props_b <- count_orbits_per_node(graph = graph_2, max_graphlet_size = 5) +#' +#' netemd_one_to_one(dhists_1 = props_a, dhists_2 = props_b, smoothing_window_width = 1) +#' +#' # Providing the network features as lists of dhist objects +#' dhists_1 <- graph_features_to_histograms(props_a) +#' dhists_2 <- graph_features_to_histograms(props_b) +#' +#' netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2) +#' +#' +#' # A variation of NetEmd: Using the Laplacian spectrum +#' # Laplacian +#' Lapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = FALSE, sparse = FALSE) +#' Lapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = FALSE, sparse = FALSE) +#' +#' # Normalized Laplacian +#' NLapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = TRUE, sparse = FALSE) +#' NLapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = TRUE, sparse = FALSE) +#' +#' # Spectra (This may take a couple of minutes). 
+#' props_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) +#' props_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) +#' +#' netemd_one_to_one(dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +#' #' @export -netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2=NULL, method = "optimise", - return_details = FALSE, smoothing_window_width = 0,feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { +netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, dhists_2 = NULL, method = "optimise", + return_details = FALSE, smoothing_window_width = 0, feature_type = "orbit", max_graphlet_size = 5, ego_neighbourhood_size = 0) { ## ------------------------------------------------------------------------ # Check arguments 1 if (!igraph::is.igraph(graph_1) & is.null(dhists_1)) { @@ -86,44 +86,50 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= ## ------------------------------------------------------------------------ # Check arguments 2 # If dhists_1 is a matrix of network features then transform them to dhist objects. 
- if(is.matrix(dhists_1)){ + if (is.matrix(dhists_1)) { dhists_1 <- graph_features_to_histograms(dhists_1) } - if(is.matrix(dhists_2)){ + if (is.matrix(dhists_2)) { dhists_2 <- graph_features_to_histograms(dhists_2) } ## ------------------------------------------------------------------------ # Check arguments 3 - #If input is graph then get graphlet counts - if(igraph::is.igraph(graph_1)){ - if(!is.null(dhists_1)){warning("dhists_1 will be calculated from graph_1.")} - dhists_1 <- gdd(graph = graph_1, feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size + # If input is graph then get graphlet counts + if (igraph::is.igraph(graph_1)) { + if (!is.null(dhists_1)) { + warning("dhists_1 will be calculated from graph_1.") + } + dhists_1 <- gdd( + graph = graph_1, feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size ) } - if(igraph::is.igraph(graph_2)){ - if(!is.null(dhists_2)){warning("dhists_2 will be calculated from graph_2.")} - dhists_2 <- gdd(graph = graph_2, feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size + if (igraph::is.igraph(graph_2)) { + if (!is.null(dhists_2)) { + warning("dhists_2 will be calculated from graph_2.") + } + dhists_2 <- gdd( + graph = graph_2, feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size ) } - - rm(graph_1,graph_2) + + rm(graph_1, graph_2) ## ------------------------------------------------------------------------ # Require either a pair of "dhist" discrete histograms or two lists of "dhist" # discrete histograms pair_of_dhist_lists <- all(purrr::map_lgl(dhists_1, is_dhist)) && all(purrr::map_lgl(dhists_2, is_dhist)) - + # If input is two lists of "dhist" discrete histograms, determine the minimum # EMD and associated offset for pairs of histograms taken from the same 
# position in each list if (pair_of_dhist_lists) { details <- purrr::map2(dhists_1, dhists_2, function(dhist1, dhist2) { netemd_single_pair(dhist1, dhist2, - method = method, - smoothing_window_width = smoothing_window_width + method = method, + smoothing_window_width = smoothing_window_width ) }) # Collect the minimum EMDs and associated offsets for all histogram pairs @@ -143,14 +149,14 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= } else { return(arithmetic_mean) } - } - else { + } else { # Wrap each member of a single pair of histograms is a list and recursively # call this net_emd function. This ensures they are treated the same. - return(netemd_one_to_one(dhists_1 = list(dhists_1), dhists_2 = list(dhists_2), - method = method, - return_details = return_details, - smoothing_window_width = smoothing_window_width + return(netemd_one_to_one( + dhists_1 = list(dhists_1), dhists_2 = list(dhists_2), + method = method, + return_details = return_details, + smoothing_window_width = smoothing_window_width )) } } @@ -195,10 +201,10 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= #' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving #' the minimal EMD for each GDD #' @export -netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smoothing_window_width = 0, - return_details = FALSE, mc.cores = getOption("mc.cores", 2L),feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { - if(max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores," cores. 
Depending on the density and size of the graphs, this may lead to a large compsumption of RAM.")) - +netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise", smoothing_window_width = 0, + return_details = FALSE, mc.cores = getOption("mc.cores", 2L), feature_type = "orbit", max_graphlet_size = 5, ego_neighbourhood_size = 0) { + if (max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores, " cores. Depending on the density and size of the graphs, this may lead to a large compsumption of RAM.")) + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows @@ -211,59 +217,63 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo ## ------------------------------------------------------------------------ # Check arguments 1 which_imput_type <- NULL - if(!is.null(graphs) & is.null(dhists)){ - if ( !all(( unlist(sapply(X = graphs, FUN = igraph::is.igraph)) ) ) ) { + if (!is.null(graphs) & is.null(dhists)) { + if (!all((unlist(sapply(X = graphs, FUN = igraph::is.igraph))))) { stop("Graphs need to be igraph graph objects, or a list of dhists network features should be supplied.") } which_imput_type <- "Graphs" } - if (!is.null(dhists) ) { - if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) { + if (!is.null(dhists)) { + if (all((unlist(sapply(X = dhists, FUN = is.matrix))))) { which_imput_type <- "Matrix" - } - if ( all(( unlist(sapply(X = dhists, FUN = - function(l){ all(( unlist(sapply(X = l, FUN = is_dhist)) ) ) } - )) ) ) ) { + } + if (all((unlist(sapply( + X = dhists, FUN = + function(l) { + all((unlist(sapply(X = l, FUN = is_dhist)))) + } + ))))) { which_imput_type <- "dhist" } - if(is.null(which_imput_type)){ + if (is.null(which_imput_type)) { warning("dhists does not conform to a 
Matrix or dhist class for all elmenents/netwroks in the list.") } } ## ------------------------------------------------------------------------ # Check arguments 2 # If dhists is a list of matrices of network features then transform them to dhist objects. - if(which_imput_type == "Matrix"){ - dhists <- sapply(X = dhists,FUN = graph_features_to_histograms, simplify = FALSE ) + if (which_imput_type == "Matrix") { + dhists <- sapply(X = dhists, FUN = graph_features_to_histograms, simplify = FALSE) } ## ------------------------------------------------------------------------ # Check arguments 3 - #If input is graph then get graphlet counts - if(which_imput_type == "Graphs"){ + # If input is graph then get graphlet counts + if (which_imput_type == "Graphs") { dhists <- parallel::mcmapply(gdd, graphs, - MoreArgs = - list( - feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size - ), - SIMPLIFY = FALSE, mc.cores = mc.cores + MoreArgs = + list( + feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ), + SIMPLIFY = FALSE, mc.cores = mc.cores ) } rm(graphs) ## ------------------------------------------------------------------------ # Check arguments 4 - #cross_comparison_spec was coded to require names! - if(is.null(names(dhists))){ - names(dhists) <- paste("Net",1:length(dhists),sep = "") + # cross_comparison_spec was coded to require names! 
+ if (is.null(names(dhists))) { + names(dhists) <- paste("Net", 1:length(dhists), sep = "") } ## ------------------------------------------------------------------------ comp_spec <- cross_comparison_spec(dhists) num_features <- length(dhists[[1]]) out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { - netemd_one_to_one(dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], - method = method, return_details = return_details, - smoothing_window_width = smoothing_window_width + netemd_one_to_one( + dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], + method = method, return_details = return_details, + smoothing_window_width = smoothing_window_width ) }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) if (return_details) { @@ -306,16 +316,16 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo #' @return A list with the following named elements #' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated #' offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. 
-#' @examples -#' require(igraph) -#' goldstd_1 <- graph.lattice(c(8,8)) -#' goldstd_2 <- graph.lattice(c(44,44)) -#' props_1 <- count_orbits_per_node(graph = goldstd_1,max_graphlet_size = 5) -#' props_2 <- count_orbits_per_node(graph = goldstd_2,max_graphlet_size = 5) -#' dhists_1<- graph_features_to_histograms(props_1) -#' dhists_2<- graph_features_to_histograms(props_2) -#' # Obtain the minimum NetEMD_edges between the histograms -#' netemd_single_pair(dhists_1[[1]],dhists_2[[1]],method = "optimise",smoothing_window_width = 0) +#' @examples +#' require(igraph) +#' goldstd_1 <- graph.lattice(c(8, 8)) +#' goldstd_2 <- graph.lattice(c(44, 44)) +#' props_1 <- count_orbits_per_node(graph = goldstd_1, max_graphlet_size = 5) +#' props_2 <- count_orbits_per_node(graph = goldstd_2, max_graphlet_size = 5) +#' dhists_1 <- graph_features_to_histograms(props_1) +#' dhists_2 <- graph_features_to_histograms(props_2) +#' # Obtain the minimum NetEMD_edges between the histograms +#' netemd_single_pair(dhists_1[[1]], dhists_2[[1]], method = "optimise", smoothing_window_width = 0) #' @export netemd_single_pair <- function(dhist1, dhist2, method = "optimise", smoothing_window_width = 0) { @@ -335,14 +345,14 @@ netemd_single_pair <- function(dhist1, dhist2, method = "optimise", dhist1 <- as_smoothed_dhist(dhist1, smoothing_window_width) dhist2 <- as_smoothed_dhist(dhist2, smoothing_window_width) } - + # Store means and variances to calculate offset later mean1 <- dhist_mean_location(dhist1) mean2 <- dhist_mean_location(dhist2) - + var1 <- dhist_variance(dhist1) var2 <- dhist_variance(dhist2) - + # Mean centre histograms. This helps with numerical stability as, after # variance normalisation, the differences between locations are often small. # We want to avoid calculating small differences between large numbers as @@ -351,11 +361,11 @@ netemd_single_pair <- function(dhist1, dhist2, method = "optimise", # clustered around zero, rather than some potentially large mean location. 
dhist1 <- mean_centre_dhist(dhist1) dhist2 <- mean_centre_dhist(dhist2) - + # Normalise histogram to unit mass and unit variance dhist1_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist1)) dhist2_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist2)) - + # Calculate minimal EMD result <- min_emd(dhist1_norm, dhist2_norm, method = method) # As we mean-centred the histograms prior to passing to min_emd(), the offset diff --git a/R/orca_interface.R b/R/orca_interface.R index 130595a9..51614203 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -67,10 +67,10 @@ indexed_edges_to_graph <- function(indexed_edges) { #' previous alterations have been made #' @return A named list of simplified igraph graph object, with the name of each #' graph set to the name of the file it was read from. -#' @examples +#' @examples #' # Set source directory for Virus protein-protein interaction edge files stored in the netdist package. #' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -#' print(source_dir) +#' print(source_dir) #' # Load query graphs as igraph objects #' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") #' graph_1 @@ -256,8 +256,7 @@ gdd <- function(graph, feature_type = "orbit", max_graphlet_size = 4, out <- count_graphlets_per_node(graph, max_graphlet_size = max_graphlet_size ) - } - else { + } else { stop("gdd: unrecognised feature_type") } graph_features_to_histograms(out) diff --git a/tests/testthat/test_emd.R b/tests/testthat/test_emd.R index 960a00c0..9db131ee 100644 --- a/tests/testthat/test_emd.R +++ b/tests/testthat/test_emd.R @@ -10,7 +10,7 @@ test_that("cost_matrix returns all zeros when all bin locations are identical", expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) }) -test_that("cost_matrix returns zeros along diagonal when both sets of bin +test_that("cost_matrix returns zeros along diagonal when both sets of bin locations are the same", { bin_centres1 
<- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) bin_centres2 <- bin_centres1 @@ -18,8 +18,8 @@ test_that("cost_matrix returns zeros along diagonal when both sets of bin expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) }) -test_that("cost_matrix returns zeros along diagonal and taxicab distance from - all zeros for all other elements when both sets of bin locations are +test_that("cost_matrix returns zeros along diagonal and taxicab distance from + all zeros for all other elements when both sets of bin locations are the same and are a sequence of consecutive integers", { bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) bin_centres2 <- bin_centres1 @@ -28,7 +28,7 @@ test_that("cost_matrix returns zeros along diagonal and taxicab distance from expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) }) -test_that("cost_matrix is correct size when the two histograms are of different +test_that("cost_matrix is correct size when the two histograms are of different lengths", { bin_centres1 <- c(1, 2, 3, 4, 5, 6, 7) bin_centres2 <- c(8, 9, 10) @@ -41,7 +41,7 @@ test_that("cost_matrix is correct size when the two histograms are of different context("EMD: EMD") # EMD: Property-based tests -test_that("EMD methods return 0 when comparing a 1D feature distribution to +test_that("EMD methods return 0 when comparing a 1D feature distribution to itself", { bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) bin_masses2 <- bin_masses1 @@ -59,7 +59,7 @@ test_that("EMD methods return 0 when comparing a 1D feature distribution to expect_equal(emd(histogram1, histogram2), expected) }) -test_that("EMD methods return numBins/2 when offsetting a symmetric discrete +test_that("EMD methods return numBins/2 when offsetting a symmetric discrete triangle distribution by 1", { cost_fn <- function(triangle_width) { move_dist <- ceiling((triangle_width + 1) / 2) @@ -191,7 +191,7 @@ test_that("EMD methods return numBins/2 when offsetting a symmetric discrete 
expect_equal(emd(histogram1, histogram2), expected) }) -test_that("EMD methods return same result for densely and sparsely specified +test_that("EMD methods return same result for densely and sparsely specified bins", { sparse_bin_masses1 <- c(1, 1, 1, 1, 1, 1) sparse_bin_masses2 <- c(1, 1, 1, 1, 1, 1) @@ -242,7 +242,7 @@ test_that("EMD methods return same result for densely and sparsely specified ) }) -test_that("EMD methods return same result when order of densely specified bins +test_that("EMD methods return same result when order of densely specified bins is changed", { bin_masses1 <- c(1, 1, 1, 1, 0, 0, 0, 0, 0) bin_masses2 <- c(0, 0, 0, 0, 0, 1, 1, 1, 1) @@ -284,7 +284,7 @@ test_that("EMD methods return same result when order of densely specified bins ) }) -test_that("EMD methods return same result when order of sparsely specified bins +test_that("EMD methods return same result when order of sparsely specified bins is changed", { bin_masses1 <- c(1, 1, 1, 1, 1, 1) bin_masses2 <- c(1, 1, 1, 1, 1, 1) @@ -392,7 +392,7 @@ test_that("min_emd_ methods correctly compare a non-offset 1D feature expect_equal(actual_optimise, expected) }) -test_that("min_emd_ methods correctly compare an offset 1D feature +test_that("min_emd_ methods correctly compare an offset 1D feature distribution to itself", { offset <- 10 bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index 6a99c71b..d7aeb68c 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -22,8 +22,9 @@ expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) expect_self_netemd_correct <- function(histogram, shift, method, - return_details = FALSE) { - self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), + return_details = FALSE) { + self_net_emd <- 
netemd_one_to_one( + dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method, return_details = return_details ) loc <- histogram$locations @@ -199,8 +200,7 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any expect_self_netemd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift),method = method, return_details = return_details - ) + self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method, return_details = return_details) loc <- histogram$locations mass <- histogram$masses var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 @@ -292,7 +292,8 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over virus PPI networks purrr::walk(virus_gdd, function(gdd) { purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + expect_equalish(netemd_one_to_one( + dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, method = "optimise", smoothing_window_width = 0 ), 0) @@ -346,7 +347,8 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over random graphs purrr::walk(random_gdd, function(gdd) { purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + expect_equalish(netemd_one_to_one( + dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, method = "optimise", smoothing_window_width = 0 ), 0) @@ -403,11 +405,11 @@ test_that("netemd_many_to_many works", { expected_netemd_fn <- function(gdds) { list( netemds = c( - netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$HSV), - netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$VZV), - netemd_one_to_one(dhists_1 = 
gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 =gdds$ECL, dhists_2 = gdds$KSHV), - netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$HSV, dhists_2 = gdds$KSHV), - netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$KSHV, dhists_2 = gdds$VZV) + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 = gdds$KSHV, dhists_2 = gdds$VZV) ), comp_spec = cross_comparison_spec(gdds) ) @@ -415,7 +417,7 @@ test_that("netemd_many_to_many works", { # Comparison function for clarity compare_fn <- function(gdds) { - expect_equal(netemd_many_to_many(dhists=gdds), expected_netemd_fn(gdds)) + expect_equal(netemd_many_to_many(dhists = gdds), expected_netemd_fn(gdds)) } # Map over test parameters, comparing actual gdds to expected From 1082d295b33fb91db46160455e05fd124922754b Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 20:11:47 +0000 Subject: [PATCH 57/84] Update documentation --- man/netdis_many_to_many.Rd | 4 +-- man/netdis_one_to_one.Rd | 62 +++++++++++++++++----------------- man/netemd_one_to_one.Rd | 68 +++++++++++++++++++------------------- man/netemd_single_pair.Rd | 18 +++++----- man/read_simple_graphs.Rd | 2 +- 5 files changed, 77 insertions(+), 77 deletions(-) diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index ef60e0f6..12f5c057 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -75,12 
+75,12 @@ ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for each ego network.} -\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: NULL). Matrix containing counts +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: NULL). Matrix containing counts of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for -each ego network. +each ego network. If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not be used.} } diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 28b8a259..8f3b5208 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -103,41 +103,41 @@ Calculates the different variants of the network dissimilarity statistic Netdis \examples{ require(netdist) require(igraph) -#Set source directory for Virus PPI graph edge files stored in the netdist package. +# Set source directory for Virus PPI graph edge files stored in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -#Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. 
-netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) #This option will focus on detecting more general and global discrepancies between the ego-network structures. +# Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # This option will focus on detecting more general and global discrepancies between the ego-network structures. -#Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) +# Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) # Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. -# Two lattice networks of different sizes are used for this example. - goldstd_1 <- graph.lattice(c(8,8)) #A reference net - goldstd_2 <- graph.lattice(c(44,44)) #A reference net - - netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) - netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) - - - #Providing pre-calculated subgraph counts. - - props_1 <- count_graphlets_ego(graph = graph_1) - props_2 <- count_graphlets_ego(graph = graph_2) - props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) - props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) - -#Netdis Geometric-Poisson. -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) - -#Netdis Zero Expectation. 
-netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) - -#Netdis using gold-standard network -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +# Two lattice networks of different sizes are used for this example. +goldstd_1 <- graph.lattice(c(8, 8)) # A reference net +goldstd_2 <- graph.lattice(c(44, 44)) # A reference net + +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) + + +# Providing pre-calculated subgraph counts. + +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) +props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) + +# Netdis Geometric-Poisson. +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL) + +# Netdis Zero Expectation. +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0) + +# Netdis using gold-standard network +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_1) +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_2) } diff --git a/man/netemd_one_to_one.Rd b/man/netemd_one_to_one.Rd index 59f4b04e..caebd1e7 100644 --- a/man/netemd_one_to_one.Rd +++ b/man/netemd_one_to_one.Rd @@ -62,7 +62,7 @@ the minimal EMD for each pair of histograms, \code{min_offsets}: the associated offsets giving the minimal EMD for each pair of histograms } \description{ -Calculates the network Earth Mover's Distance (EMD) between +Calculates the network Earth Mover's Distance (EMD) between two sets of network features. 
This is done by individually normalising the distribution of each feature so that they have unit mass and unit variance. Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. This is calculated as follows: @@ -71,37 +71,37 @@ This is calculated as follows: 3. Take the average minimum EMD across all features. } \examples{ - require(igraph) - graph_1 <- graph.lattice(c(8,8)) - graph_2 <- graph.lattice(c(44,44)) - netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) - - #Providing a matrix of network features - props_a= count_orbits_per_node(graph = graph_1,max_graphlet_size = 5) - props_b= count_orbits_per_node(graph = graph_2,max_graphlet_size = 5) - - netemd_one_to_one(dhists_1=props_a, dhists_2=props_b,smoothing_window_width = 1) - - #Providing the network features as lists of dhist objects - dhists_1<- graph_features_to_histograms(props_a) - dhists_2<- graph_features_to_histograms(props_b) - - netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) - - - # A variation of NetEmd: Using the Laplacian spectrum - #Laplacian - Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) - Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) - - #Normalized Laplacian - NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) - NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) - - #Spectra (This may take a couple of minutes). 
- props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) - props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) - - netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. - +require(igraph) +graph_1 <- graph.lattice(c(8, 8)) +graph_2 <- graph.lattice(c(44, 44)) +netemd_one_to_one(graph_1 = graph_1, graph_2 = graph_2, feature_type = "orbit", max_graphlet_size = 5) + +# Providing a matrix of network features +props_a <- count_orbits_per_node(graph = graph_1, max_graphlet_size = 5) +props_b <- count_orbits_per_node(graph = graph_2, max_graphlet_size = 5) + +netemd_one_to_one(dhists_1 = props_a, dhists_2 = props_b, smoothing_window_width = 1) + +# Providing the network features as lists of dhist objects +dhists_1 <- graph_features_to_histograms(props_a) +dhists_2 <- graph_features_to_histograms(props_b) + +netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2) + + +# A variation of NetEmd: Using the Laplacian spectrum +# Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = FALSE, sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = FALSE, sparse = FALSE) + +# Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = TRUE, sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = TRUE, sparse = FALSE) + +# Spectra (This may take a couple of minutes). +props_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0) # Use of smoothing window 1 is given for discrete integer distributions. 
If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. + } diff --git a/man/netemd_single_pair.Rd b/man/netemd_single_pair.Rd index 1158d2f5..5cf3b3ed 100644 --- a/man/netemd_single_pair.Rd +++ b/man/netemd_single_pair.Rd @@ -43,13 +43,13 @@ This is calculated as follows: 2. Find the minimum EMD between the histograms } \examples{ - require(igraph) - goldstd_1 <- graph.lattice(c(8,8)) - goldstd_2 <- graph.lattice(c(44,44)) - props_1 <- count_orbits_per_node(graph = goldstd_1,max_graphlet_size = 5) - props_2 <- count_orbits_per_node(graph = goldstd_2,max_graphlet_size = 5) - dhists_1<- graph_features_to_histograms(props_1) - dhists_2<- graph_features_to_histograms(props_2) - # Obtain the minimum NetEMD_edges between the histograms - netemd_single_pair(dhists_1[[1]],dhists_2[[1]],method = "optimise",smoothing_window_width = 0) +require(igraph) +goldstd_1 <- graph.lattice(c(8, 8)) +goldstd_2 <- graph.lattice(c(44, 44)) +props_1 <- count_orbits_per_node(graph = goldstd_1, max_graphlet_size = 5) +props_2 <- count_orbits_per_node(graph = goldstd_2, max_graphlet_size = 5) +dhists_1 <- graph_features_to_histograms(props_1) +dhists_2 <- graph_features_to_histograms(props_2) +# Obtain the minimum NetEMD_edges between the histograms +netemd_single_pair(dhists_1[[1]], dhists_2[[1]], method = "optimise", smoothing_window_width = 0) } diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index 57d45b86..95416894 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -52,7 +52,7 @@ following order: \examples{ # Set source directory for Virus protein-protein interaction edge files stored in the netdist package. 
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -print(source_dir) +print(source_dir) # Load query graphs as igraph objects graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") graph_1 From f44076ca751ac3806f764a33db77fda59c5ca81c Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 21:35:15 +0100 Subject: [PATCH 58/84] move quality checks into separate workflow --- .Rbuildignore | 1 + .github/workflows/build.yml | 25 +-------------------- .github/workflows/quality.yaml | 40 ++++++++++++++++++++++++++++++++++ .lintr | 1 + 4 files changed, 43 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/quality.yaml create mode 100644 .lintr diff --git a/.Rbuildignore b/.Rbuildignore index aa09cc1d..6555a0db 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,4 @@ ^appveyor\.yml$ ^doc$ ^Meta$ +^\.github$ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e38577d3..160d59eb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: config: - - {os: ubuntu-latest, r: 'release'} # run coverage, linting, and update docs on this platform only + - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel'} - {os: ubuntu-latest, r: 'devel'} - {os: macOS-latest, r: 'release'} @@ -51,14 +51,6 @@ jobs: extra-packages: | any::devtools - - uses: r-lib/actions/setup-r-dependencies@v2 - if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} - with: - extra-packages: | - any::covr - any::lintr - any::styler - - name: Document run: devtools::document() shell: Rscript {0} @@ -73,21 +65,6 @@ jobs: git pull --ff-only git push origin - - name: Check style with styler - if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} - run: styler::style_pkg(dry='fail') - shell: Rscript {0} - - - name: Check style with lintr - if: ${{ (matrix.config.os == 
'ubuntu-latest') && (matrix.config.r == 'release') }} - run: lintr::lint_package() - shell: Rscript {0} - - name: Check package with devtools run: devtools::check() shell: Rscript {0} - - - name: Check coverage - if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} - run: covr::codecov(quiet = FALSE) - shell: Rscript {0} diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml new file mode 100644 index 00000000..7a53a048 --- /dev/null +++ b/.github/workflows/quality.yaml @@ -0,0 +1,40 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: Quality checks + +jobs: + lint-project: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: | + any::covr + any::lintr + any::styler + + - name: Styler + run: styler::style_pkg(dry='fail') + shell: Rscript {0} + + - name: Lintr + run: lintr::lint_package() + shell: Rscript {0} + + - name: Coverage + run: covr::codecov(quiet = FALSE) + shell: Rscript {0} diff --git a/.lintr b/.lintr new file mode 100644 index 00000000..f2c1b9b3 --- /dev/null +++ b/.lintr @@ -0,0 +1 @@ +error_on_lint: TRUE From caff555a6043958e093288ffe14b50d29e461a00 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 21:47:35 +0100 Subject: [PATCH 59/84] quality checks badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 380c78b6..f2ecc2bf 100755 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Until this package hits release 1.0 anything can change with no notice. 
[![GitHub release](https://img.shields.io/github/release/alan-turing-institute/network-comparison.svg)](https://github.com/alan-turing-institute/network-comparison/releases/latest) [![Build](https://github.com/alan-turing-institute/network-comparison/actions/workflows/main.yml/badge.svg)](https://github.com/alan-turing-institute/network-comparison/actions/workflows/main.yml) +[![Quality checks](https://github.com/alan-turing-institute/network-comparison/actions/workflows/quality.yaml/badge.svg)](https://github.com/alan-turing-institute/network-comparison/actions/workflows/quality.yaml) [![Codecov](https://img.shields.io/codecov/c/github/alan-turing-institute/network-comparison/master.svg)](https://codecov.io/gh/alan-turing-institute/network-comparison?branch=master) [![license](https://img.shields.io/github/license/alan-turing-institute/network-comparison.svg)](https://github.com/alan-turing-institute/network-comparison/edit/master/LICENSE) [![Github All Releases](https://img.shields.io/github/downloads/alan-turing-institute/network-comparison/total.svg)](https://github.com/alan-turing-institute/network-comparison/releases/latest) From 7123f0330367a824b3e7f19f4717458415890b28 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 22:29:16 +0100 Subject: [PATCH 60/84] start fixing line lengths to 80 chars --- R/dhist.R | 82 ++++++++----- R/measures_net_dis.R | 276 ++++++++++++++++++++++++++++--------------- 2 files changed, 234 insertions(+), 124 deletions(-) diff --git a/R/dhist.R b/R/dhist.R index 862afccd..4ea7a237 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -9,23 +9,27 @@ #' location #' @param smoothing_window_width If greater than 0, the discrete histogram will #' be treated as having the mass at each location "smoothed" uniformly across -#' a bin centred on the location and having width = \code{smoothing_window_width} -#' (default = \code{0} - no smoothing) +#' a bin centred on the location and having +#' width = \code{smoothing_window_width} 
(default = \code{0} - no smoothing) #' @param sorted Whether or not to return a discrete histogram with locations #' and masses sorted by ascending mass (default = \code{TRUE}) #' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: #' \itemize{ #' \item \code{locations}: A 1D numeric vector of discrete locations -#' \item \code{masses}: A 1D numeric vector of the mass present at each location +#' \item \code{masses}: A 1D numeric vector of the mass present at each +#' location #' } #' Note that locations where no mass is present are not included in the returned #' \code{dhist} object. Mass in these discrete histograms is treated as being -#' present precisely at the specified location. Discrete histograms should not be used -#' for data where observations have been grouped into bins representing ranges -#' of observation values. +#' present precisely at the specified location. Discrete histograms should not +#' be used for data where observations have been grouped into bins representing +#' ranges of observation values. #' @export -dhist <- function(locations, masses, smoothing_window_width = 0, sorted = TRUE) { +dhist <- function(locations, + masses, + smoothing_window_width = 0, + sorted = TRUE) { if (!is_numeric_vector_1d(locations)) { stop("Bin locations must be provided as a 1D numeric vector") } @@ -76,9 +80,11 @@ update_dhist <- #' @param dhist A discrete histogram as a \code{dhist} object #' @param smoothing_window_width If greater than 0, the discrete histogram will #' be treated as having the mass at each location "smoothed" uniformly across -#' a bin centred on the location and having width = \code{smoothing_window_width} -#' @return A copy of a \code{dhist} object with its \code{smoothing_window_width} -#' attribute set to the value provided \code{smoothing_window_width} parameter. 
+#' a bin centred on the location and having +#' width = \code{smoothing_window_width} +#' @return A copy of a \code{dhist} object with its +#' \code{smoothing_window_width} attribute set to the value provided +#' \code{smoothing_window_width} parameter. #' @export as_smoothed_dhist <- function(dhist, smoothing_window_width) { dhist <- update_dhist(dhist, smoothing_window_width = smoothing_window_width) @@ -90,8 +96,8 @@ as_smoothed_dhist <- function(dhist, smoothing_window_width) { #' Returns an "unsmoothed" copy of a \code{dhist} object with its #' \code{smoothing_window_width} attribute set to 0. #' @param dhist A discrete histogram as a \code{dhist} object -#' @return A copy of a \code{dhist} object with its \code{smoothing_window_width} -#' attribute set to 0. +#' @return A copy of a \code{dhist} object with its +#' \code{smoothing_window_width} attribute set to 0. #' @export as_unsmoothed_dhist <- function(dhist) { dhist <- update_dhist(dhist, smoothing_window_width = 0) @@ -101,8 +107,8 @@ as_unsmoothed_dhist <- function(dhist) { #' Check if an object is a \code{dhist} discrete histogram #' #' Checks if the input object is of class \code{dhist}. If \code{fast_check} is -#' \code{TRUE} then the only check is whether the object has a class attribute of -#' \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks +#' \code{TRUE} then the only check is whether the object has a class attribute +#' of \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks #' are also made to ensure that the object has the structure required of a #' \code{dhist} object. 
#' @param x An arbitrary object @@ -111,8 +117,8 @@ as_unsmoothed_dhist <- function(dhist) { #' is set to \code{dhist} (default = \code{TRUE}) #' @export is_dhist <- function(x, fast_check = TRUE) { - # Quick check that relies on user not to construct variables with "dhist" class - # that do not have the required elements + # Quick check that relies on user not to construct variables with "dhist" + # class that do not have the required elements has_class_attr <- (class(x) == "dhist") if (fast_check) { # Early return if fast check requested @@ -133,13 +139,15 @@ is_dhist <- function(x, fast_check = TRUE) { #' Discrete histogram from observations (Pure R slow version) #' -#' Generate a sparse discrete histogram from a set of discrete numeric observations +#' Generate a sparse discrete histogram from a set of discrete numeric +#' observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: #' \itemize{ #' \item \code{locations}: A 1D numeric vector of discrete locations -#' \item \code{masses}: A 1D numeric vector of the mass present at each location +#' \item \code{masses}: A 1D numeric vector of the mass present at each +#' location #' } #' @export dhist_from_obs_slow <- function(observations) { @@ -162,13 +170,15 @@ dhist_from_obs_slow <- function(observations) { #' Discrete histogram from observations #' -#' Generate a sparse discrete histogram from a set of discrete numeric observations +#' Generate a sparse discrete histogram from a set of discrete numeric +#' observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. 
Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: #' \itemize{ #' \item \code{locations}: A 1D numeric vector of discrete locations -#' \item \code{masses}: A 1D numeric vector of the mass present at each location +#' \item \code{masses}: A 1D numeric vector of the mass present at each +#' location #' } #' @export dhist_from_obs <- function(observations) { @@ -299,7 +309,8 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { stop("ECMFs must have the same type") } ecmf_type <- ecmf_type1 - # Determine all possible locations where either ECMF changes gradient ("knots") + # Determine all possible locations where either ECMF changes gradient + # ("knots") x1 <- ecmf_knots(dhist_ecmf1) x2 <- ecmf_knots(dhist_ecmf2) x <- sort(union(x1, x2)) @@ -413,7 +424,11 @@ area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { #' increasing (default) or decreasing order of location #' @export sort_dhist <- function(dhist, decreasing = FALSE) { - sorted_indexes <- sort(dhist$locations, decreasing = decreasing, index.return = TRUE)$ix + sorted_indexes <- sort( + dhist$locations, + decreasing = decreasing, + index.return = TRUE + )$ix dhist$masses <- dhist$masses[sorted_indexes] dhist$locations <- dhist$locations[sorted_indexes] return(dhist) @@ -463,12 +478,14 @@ dhist_variance <- function(dhist) { # For unsmoothed discrete histograms, the mass associated with each location # is located precisely at the lcoation. Therefore cariance (i.e. E[X^2]) # is the mass-weighted sum of the mean-centred locations - variance <- sum(dhist$masses * (mean_centred_locations)^2) / sum(dhist$masses) + variance <- sum(dhist$masses * (mean_centred_locations)^2) / + sum(dhist$masses) } else { - # For smoothed histograms, the mass associated with each location is "smoothed" - # uniformly across a bin centred on the location with width = smoothing_window_width - # Variance (i.e. 
E[X^2]) is therefore the mass-weighted sum of the integrals - # of x^2 over the mean-centred bins at each location. + # For smoothed histograms, the mass associated with each location is + # "smoothed" uniformly across a bin centred on the location with + # width = smoothing_window_width Variance (i.e. E[X^2]) is therefore the + # mass-weighted sum of the integrals of x^2 over the mean-centred bins at + # each location. hw <- dhist$smoothing_window_width / 2 bin_lowers <- mean_centred_locations - hw bin_uppers <- mean_centred_locations + hw @@ -540,13 +557,17 @@ normalise_dhist_variance <- function(dhist) { std_dev <- dhist_std(dhist) centred_locations <- (dhist$locations - dhist_mean_location(dhist)) normalised_centred_locations <- centred_locations / std_dev - normalised_locations <- normalised_centred_locations + dhist_mean_location(dhist) + normalised_locations <- normalised_centred_locations + + dhist_mean_location(dhist) dhist <- update_dhist(dhist, locations = normalised_locations) # If smoothing_window_width not zero, then update it to reflect the variance # normalisation if (dhist$smoothing_window_width != 0) { normalised_smoothing_window_width <- dhist$smoothing_window_width / std_dev - dhist <- update_dhist(dhist, smoothing_window_width = normalised_smoothing_window_width) + dhist <- update_dhist( + dhist, + smoothing_window_width = normalised_smoothing_window_width + ) } } return(dhist) @@ -584,7 +605,8 @@ harmonise_dhist_locations <- function(dhist1, dhist2) { #' #' Check if a variable is a 1D numeric vector by checking that: #' \itemize{ -#' \item \code{is_numeric(input)}: Input is vector, matrix, array or list of numbers +#' \item \code{is_numeric(input)}: Input is vector, matrix, array or list of +#' numbers #' \item \code{is_null(dim(input))}: Input is not a matrix or array #' } #' @param input Arbitrary object diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 2b0bbc21..0ae66fd3 100644 --- a/R/measures_net_dis.R +++ 
b/R/measures_net_dis.R @@ -1,11 +1,23 @@ #' Netdis between two graphs #' -#' Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. The variants currently supported are Netdis using a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). +#' Calculates the different variants of the network dissimilarity statistic +#' Netdis between two graphs. The variants currently supported are Netdis using +#' a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), +#' and Netdis using a Geometric Poisson approximation for the expectation +#' (\code{ref_graph = NULL}). #' #' -#' @param graph_1 A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered. +#' @param graph_1 A simple graph object from the \code{igraph} package. +#' \code{graph_1} can be set to \code{NULL} (default) if +#' \code{graphlet_counts_1} is provided. If both \code{graph_1} and +#' \code{graphlet_counts_1} are not \code{NULL}, then only +#' \code{graphlet_counts_1} will be considered. #' -#' @param graph_2 A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered. +#' @param graph_2 A simple graph object from the \code{igraph} package. +#' \code{graph_2} can be set to \code{NULL} (default) if +#' \code{graphlet_counts_2} is provided. If both \code{graph_2} and +#' \code{graphlet_counts_2} are not \code{NULL}, then only +#' \code{graphlet_counts_2} will be considered. 
#' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. Matrix containing counts of each graphlet (columns) for @@ -13,9 +25,9 @@ #' graphlet IDs and rows are labelled with the ID of the central node in each #' ego-network. As well as graphlet counts, each matrix must contain an #' additional column labelled "N" including the node count for -#' each ego network. (default: NULL). -#' If the \code{graphlet_counts_1} argument is defined then -#' \code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}. +#' each ego network. If the \code{graphlet_counts_1} argument is defined then +#' \code{graph_1} will not be used. These counts can be obtained with +#' \code{count_graphlets_ego}. (default: NULL). #' #' #' @param graphlet_counts_2 Pre-generated graphlet counts for the second query @@ -26,15 +38,17 @@ #' additional column labelled "N" including the node count for #' each ego network. (default: NULL). #' If the \code{graphlet_counts_2} argument is defined then -#' \code{graph_2} will not be used. These counts can be obtained with \code{count_graphlets_ego}. +#' \code{graph_2} will not be used. These counts can be obtained with +#' \code{count_graphlets_ego}. #' #' @param ref_graph Controls how expected counts are calculated. Either: #' 1) A numeric value - used as a constant expected counts value for all query #' graphs . #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the -#' query graphs themselves. (Geometric-Poisson approximation). +#' 3) NULL (Default) - Used for Netdis-GP, where the expected counts will be +#' calculated based on the properties of the query graphs themselves +#' (Geometric-Poisson approximation). #' #' @param graphlet_counts_ref Pre-generated reference graphlet counts. 
#' Matrix containing counts of each graphlet (columns) for @@ -42,11 +56,11 @@ #' graphlet IDs and rows are labelled with the ID of the central node in each #' ego-network. As well as graphlet counts, each matrix must contain an #' additional column labelled "N" including the node count for -#' each ego network. (default: NULL). -#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -#' be used. +#' each ego network. If the \code{graphlet_counts_ref} argument is defined then +#' \code{ref_graph} will not be used. (default: NULL). #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only +#' 4 (default) and 5 are supported. #' #' @param neighbourhood_size Ego network neighborhood size (default: 2). #' @@ -56,25 +70,31 @@ #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges (default: 1). #' -#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} -#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -#' (Default: NULL). +#' @param binning_fn Function used to bin ego network densities. Takes edge +#' \code{densities} as its single argument, and returns a named list including, +#' the input \code{densities}, the resulting bin \code{breaks} (vector of +#' density bin limits), and the vector \code{interval_indexes} which states to +#' what bin each of the individual elements in \code{densities} belongs to. +#' ego network). 
If \code{NULL}, then the method +#' \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} +#' and \code{num_intervals = 100} is used (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' (bin indexes) and \code{max_graphlet_size} as arguments. If +#' \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +#' approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and +#' \code{graphlet_counts_ref}. #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. #' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' either the approach from the original Netdis paper, or the respective +#' Geometric-Poisson approximation; depending on the values of \code{ref_graph} +#' and \code{graphlet_counts_ref}. #' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size. @@ -82,25 +102,49 @@ #' @examples #' require(netdist) #' require(igraph) -#' # Set source directory for Virus PPI graph edge files stored in the netdist package. 
-#' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +#' # Set source directory for Virus PPI graph edge files stored in the +#' # netdist package. +#' source_dir <- system.file( +#' file.path("extdata", "VRPINS"), +#' package = "netdist" +#' ) #' # Load query graphs as igraph objects -#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") -#' graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -#' -#' # Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. -#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # This option will focus on detecting more general and global discrepancies between the ego-network structures. -#' -#' # Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. +#' graph_1 <- read_simple_graph( +#' file.path(source_dir, "EBV.txt"), +#' format = "ncol" +#' ) +#' graph_2 <- read_simple_graph( +#' file.path(source_dir, "ECL.txt"), +#' format = "ncol" +#' ) +#' +#' # Netdis variant using the Geometric Poisson approximation to remove the +#' # background expectation of each network. This option will focus on detecting +#' # more general and global discrepancies between the ego-network structures. +#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) +#' +#' # Comparing the networks via their observed ego counts without centering them +#' # (equivalent to using expectation equal to zero). This option, will focus on +#' # detecting small discrepancies. #' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) #' -#' # Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. 
+#' # Example of the use of netdis with a reference graph.This option will focus +#' # on detecting discrepancies between the networks relative to the ego-network +#' # structure of the reference network / gold-standard. #' # Two lattice networks of different sizes are used for this example. #' goldstd_1 <- graph.lattice(c(8, 8)) # A reference net #' goldstd_2 <- graph.lattice(c(44, 44)) # A reference net #' -#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) -#' netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) +#' netdis_one_to_one( +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' ref_graph = goldstd_1 +#' ) +#' netdis_one_to_one( +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' ref_graph = goldstd_2 +#' ) #' #' #' # Providing pre-calculated subgraph counts. @@ -111,14 +155,30 @@ #' props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) #' #' # Netdis Geometric-Poisson. -#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL) +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' ref_graph = NULL +#' ) #' #' # Netdis Zero Expectation. 
-#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0) +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' ref_graph = 0 +#' ) #' #' # Netdis using gold-standard network -#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_1) -#' netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_2) +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' graphlet_counts_ref = props_goldstd_1 +#' ) +#' netdis_one_to_one( +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' graphlet_counts_ref = props_goldstd_2 +#' ) #' @export netdis_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, @@ -226,7 +286,8 @@ netdis_one_to_one <- function(graph_1 = NULL, #' 3) NULL - Expected counts will be calculated based on the properties of the #' query graphs themselves. #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 +#' and 5 are supported. #' #' @param neighbourhood_size Ego network neighbourhood size. #' @@ -236,25 +297,31 @@ netdis_one_to_one <- function(graph_1 = NULL, #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges. #' -#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} -#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -#' ego network). 
If \code{NULL}, then the method \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -#' (Default: NULL). +#' @param binning_fn Function used to bin ego network densities. Takes edge +#' \code{densities} as its single argument, and returns a named list including, +#' the input \code{densities}, the resulting bin \code{breaks} (vector of +#' density bin limits), and the vector \code{interval_indexes} which states to +#' what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method +#' \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +#' \code{num_intervals = 100} is used (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), -#' it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson -#' approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' (bin indexes) and \code{max_graphlet_size} as arguments. If +#' \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +#' approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and +#' \code{graphlet_counts_ref}. #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
#' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' either the approach from the original Netdis paper, or the respective +#' Geometric-Poisson approximation; depending on the values of \code{ref_graph} +#' and \code{graphlet_counts_ref}. #' #' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query @@ -377,14 +444,15 @@ netdis_one_to_many <- function(graph_1 = NULL, #' graphs. #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL (default) - Expected counts will be calculated based on the properties of the -#' query graphs themselves. (Geometric-Poisson approximation). +#' 3) NULL (default) - Expected counts will be calculated based on the +#' properties of the query graphs themselves. (Geometric-Poisson approximation). #' #' @param comparisons Which comparisons to perform between graphs. #' Can be "many-to-many" (all pairwise combinations) or "one-to-many" #' (compare first graph in graphs to all other graphs.) #' -#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 +#' (default) and 5 are supported. #' #' @param neighbourhood_size Ego network neighbourhood size (default 2). #' @@ -394,24 +462,31 @@ netdis_one_to_many <- function(graph_1 = NULL, #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges (default 1). #' -#' @param binning_fn Function used to bin ego network densities. 
Takes edge \code{densities} -#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used (default: NULL). +#' @param binning_fn Function used to bin ego network densities. Takes edge +#' \code{densities} as its single argument, and returns a named list including, +#' the input \code{densities}, the resulting bin \code{breaks} (vector of +#' density bin limits), and the vector \code{interval_indexes} which states to +#' what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method +#' \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +#' \code{num_intervals = 100} is used (default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), -#' it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson -#' approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' (bin indexes) and \code{max_graphlet_size} as arguments. +#' If \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +#' approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and +#' \code{graphlet_counts_ref}. 
#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. #' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' either the approach from the original Netdis paper, or the respective +#' Geometric-Poisson approximation; depending on the values of \code{ref_graph} +#' and \code{graphlet_counts_ref}. #' #' @param graphlet_counts Pre-generated graphlet counts (default: NULL). If the #' \code{graphlet_counts} argument is defined then \code{graphs} will not be @@ -423,14 +498,14 @@ netdis_one_to_many <- function(graph_1 = NULL, #' additional column labelled "N" including the node count for #' each ego network. #' -#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: NULL). Matrix containing counts -#' of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with -#' graphlet IDs and rows are labelled with the ID of the central node in each -#' ego-network. As well as graphlet counts, each matrix must contain an -#' additional column labelled "N" including the node count for -#' each ego network. -#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -#' be used. +#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: +#' NULL). Matrix containing counts of each graphlet (columns) for each +#' ego-network (rows) in the input graph. Columns are labelled with graphlet IDs +#' and rows are labelled with the ID of the central node in each ego-network. 
As +#' well as graphlet counts, each matrix must contain an additional column +#' labelled "N" including the node count for each ego network. +#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} +#' will not be used. #' #' @return Netdis statistics between query graphs for graphlet sizes #' up to and including max_graphlet_size. @@ -513,8 +588,9 @@ netdis_many_to_many <- function(graphs = NULL, ## ------------------------------------------------------------------------ # Centred counts - # If there are no graphlet_counts_ref, and a number has been passed as ref_graph, treat it as a constant expected - # counts value (e.g. if ref_graph = 0 then no centring of counts). + # If there are no graphlet_counts_ref, and a number has been passed as + # ref_graph, treat it as a constant expected counts value (e.g. if + # ref_graph = 0 then no centring of counts). if (is.numeric(ref_graph) && length(ref_graph) == 1 && is.null(graphlet_counts_ref)) { centred_graphlet_counts <- purrr::map( graphlet_counts, @@ -528,8 +604,8 @@ netdis_many_to_many <- function(graphs = NULL, ) ## ------------------------------------------------------------------------ - # If there are no graphlet_counts_ref, and If a reference graph passed, use it to calculate expected counts for all - # query graphs. + # If there are no graphlet_counts_ref, and If a reference graph passed, use + # it to calculate expected counts for all query graphs. } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { # Generate ego networks and calculate graphlet counts @@ -575,7 +651,8 @@ netdis_many_to_many <- function(graphs = NULL, ## ------------------------------------------------------------------------ # If no reference passed, calculate expected counts using query networks - # themselves. Geometric-Poisson GP #This is the function that creates an error for a graph with three connected nodes. + # themselves. 
Geometric-Poisson GP #This is the function that creates an + # error for a graph with three connected nodes. } else { centred_graphlet_counts <- purrr::map( graphlet_counts, @@ -623,15 +700,18 @@ netdis_many_to_many <- function(graphs = NULL, #' Calculate Netdis statistic between two graphs from their Centred Graphlet #' Counts (generated using \code{netdis_centred_graphlet_counts}) for graphlets #' of size \code{graphlet_size}. -#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for graph 1 -#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 +#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for +#' graph 1 +#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for +#' graph 2 #' @param graphlet_size The size of graphlets to use for the Netdis calculation #' (only counts for graphlets of the specified size will be used). The size of #' a graphlet is the number of nodes it contains. #' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vector_2, +netdis <- function(centred_graphlet_count_vector_1, + centred_graphlet_count_vector_2, graphlet_size) { # Select subset of centred counts corresponding to graphlets of the # specified size @@ -655,15 +735,19 @@ netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vecto #' Calculate Netdis statistic between two graphs from their Centred Graphlet #' Counts (generated using \code{netdis_centred_graphlet_counts}) for all #' graphlet sizes up to \code{max_graphlet_size}. 
-#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for graph 1 -#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 +#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for +#' graph 1 +#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for +#' graph 2 #' @param max_graphlet_size max graphlet size to calculate Netdis for. #' The size of a graphlet is the number of nodes it contains. Netdis is -#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported. +#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently +#' only 4 and 5 are supported. #' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vector_2, +netdis_uptok <- function(centred_graphlet_count_vector_1, + centred_graphlet_count_vector_2, max_graphlet_size) { if ((max_graphlet_size > 5) | (max_graphlet_size < 3)) { stop("max_graphlet_size must be 3, 4 or 5.") @@ -709,8 +793,8 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count #' @param binning_fn Function used to bin ego network densities. Only needed if #' \code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are #' \code{NULL}. Takes densities as its single argument, and returns a named list -#' including keys \code{breaks} (vector of bin edges) and \code{interval_indexes} -#' (density bin index for each ego network). +#' including keys \code{breaks} (vector of bin edges) and +#' \code{interval_indexes} (density bin index for each ego network). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. 
Only needed if \code{ref_ego_density_bins} and @@ -723,7 +807,8 @@ netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. #' -#' @param max_graphlet_size max graphlet size to calculate centred counts for. Currently only size 4 and 5 are supported. +#' @param max_graphlet_size max graphlet size to calculate centred counts for. +#' Currently only size 4 and 5 are supported. #' #' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size #' max_graphlet_size. @@ -797,8 +882,8 @@ netdis_centred_graphlet_counts <- function(graphlet_counts, stop("Invalid combination of ref_ego_density_bins and ref_binned_graphlet_counts. Options are: - Both NULL: calculate expected counts using query network. - - Vector of bin edges and matrix of binned counts: Reference graph values - for calculating expected counts. + - Vector of bin edges and matrix of binned counts: Reference graph + values for calculating expected counts. - Constant numeric ref_binned_graphlet_counts: Use as constant expected counts value.") } @@ -821,7 +906,8 @@ netdis_centred_graphlet_counts <- function(graphlet_counts, #' nummber of ego networks (rows). #' @param exp_graphlet_counts Matrix of expected graphlet counts (columns) for a #' nummber of ego networks (rows). -#' @param max_graphlet_size Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported. +#' @param max_graphlet_size Do the subtraction for graphlets up to this size. +#' Currently only size 4 and 5 are supported. #' @export netdis_subtract_exp_counts <- function(graphlet_counts, exp_graphlet_counts, @@ -847,7 +933,8 @@ netdis_subtract_exp_counts <- function(graphlet_counts, #' @param density_binned_reference_counts Reference network graphlet counts for #' each density bin. 
#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' Currently only size 4 and 5 are supported. #' @param scale_fn Optional function to scale calculated expected counts, taking #' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, #' and returning a scale factor that the looked up @@ -882,7 +969,8 @@ netdis_expected_counts <- function(graphlet_counts, #' #' @param graphlet_counts Node and graphlet counts for an ego network. #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' Currently only size 4 and 5 are supported. #' @param density_breaks Density values defining bin edges. #' @param density_binned_reference_counts Reference network graphlet counts for #' each density bin. @@ -1011,8 +1099,8 @@ density_binned_counts <- function(graphlet_counts, #' @param graphlet_counts Graphlet counts for a number of ego_networks. #' @param density_interval_indexes Density bin indexes for each ego network in #' \code{graphlet_counts}. -#' @param max_graphlet_size Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -#' included in graphlet_counts. +#' @param max_graphlet_size Determines the maximum size of graphlets. Currently +#' only size 4 and 5 are supported. included in graphlet_counts. exp_counts_bin_gp <- function(bin_idx, graphlet_counts, density_interval_indexes, max_graphlet_size) { @@ -1066,8 +1154,8 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, #' @param graphlet_counts Graphlet counts for a number of ego_networks. 
#' @param density_interval_indexes Density bin index for #' each ego network. -#' @param max_graphlet_size Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -#' included in graphlet_counts. +#' @param max_graphlet_size Determines the maximum size of graphlets. Currently +#' only size 4 and 5 are supported. included in graphlet_counts. #' @export density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes, From ff2e549e41439ec74ab43e9c4eb778ba05e90d19 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 22:31:28 +0100 Subject: [PATCH 61/84] styler --- R/dhist.R | 4 ++-- R/measures_net_dis.R | 52 +++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/R/dhist.R b/R/dhist.R index 4ea7a237..0ff24b9a 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -479,7 +479,7 @@ dhist_variance <- function(dhist) { # is located precisely at the lcoation. Therefore cariance (i.e. E[X^2]) # is the mass-weighted sum of the mean-centred locations variance <- sum(dhist$masses * (mean_centred_locations)^2) / - sum(dhist$masses) + sum(dhist$masses) } else { # For smoothed histograms, the mass associated with each location is # "smoothed" uniformly across a bin centred on the location with @@ -558,7 +558,7 @@ normalise_dhist_variance <- function(dhist) { centred_locations <- (dhist$locations - dhist_mean_location(dhist)) normalised_centred_locations <- centred_locations / std_dev normalised_locations <- normalised_centred_locations + - dhist_mean_location(dhist) + dhist_mean_location(dhist) dhist <- update_dhist(dhist, locations = normalised_locations) # If smoothing_window_width not zero, then update it to reflect the variance # normalisation diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 0ae66fd3..51c689b3 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -105,17 +105,17 @@ #' # Set source directory for Virus PPI graph edge files stored in the #' # 
netdist package. #' source_dir <- system.file( -#' file.path("extdata", "VRPINS"), -#' package = "netdist" +#' file.path("extdata", "VRPINS"), +#' package = "netdist" #' ) #' # Load query graphs as igraph objects #' graph_1 <- read_simple_graph( -#' file.path(source_dir, "EBV.txt"), -#' format = "ncol" +#' file.path(source_dir, "EBV.txt"), +#' format = "ncol" #' ) #' graph_2 <- read_simple_graph( -#' file.path(source_dir, "ECL.txt"), -#' format = "ncol" +#' file.path(source_dir, "ECL.txt"), +#' format = "ncol" #' ) #' #' # Netdis variant using the Geometric Poisson approximation to remove the @@ -136,14 +136,14 @@ #' goldstd_2 <- graph.lattice(c(44, 44)) # A reference net #' #' netdis_one_to_one( -#' graph_1 = graph_1, -#' graph_2 = graph_2, -#' ref_graph = goldstd_1 +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' ref_graph = goldstd_1 #' ) #' netdis_one_to_one( -#' graph_1 = graph_1, -#' graph_2 = graph_2, -#' ref_graph = goldstd_2 +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' ref_graph = goldstd_2 #' ) #' #' @@ -156,28 +156,28 @@ #' #' # Netdis Geometric-Poisson. #' netdis_one_to_one( -#' graphlet_counts_1 = props_1, -#' graphlet_counts_2 = props_2, -#' ref_graph = NULL +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' ref_graph = NULL #' ) #' #' # Netdis Zero Expectation. 
#' netdis_one_to_one( -#' graphlet_counts_1 = props_1, -#' graphlet_counts_2 = props_2, -#' ref_graph = 0 +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' ref_graph = 0 #' ) #' #' # Netdis using gold-standard network #' netdis_one_to_one( -#' graphlet_counts_1 = props_1, -#' graphlet_counts_2 = props_2, -#' graphlet_counts_ref = props_goldstd_1 +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' graphlet_counts_ref = props_goldstd_1 #' ) #' netdis_one_to_one( -#' graphlet_counts_1 = props_1, -#' graphlet_counts_2 = props_2, -#' graphlet_counts_ref = props_goldstd_2 +#' graphlet_counts_1 = props_1, +#' graphlet_counts_2 = props_2, +#' graphlet_counts_ref = props_goldstd_2 #' ) #' @export netdis_one_to_one <- function(graph_1 = NULL, @@ -591,7 +591,9 @@ netdis_many_to_many <- function(graphs = NULL, # If there are no graphlet_counts_ref, and a number has been passed as # ref_graph, treat it as a constant expected counts value (e.g. if # ref_graph = 0 then no centring of counts). 
- if (is.numeric(ref_graph) && length(ref_graph) == 1 && is.null(graphlet_counts_ref)) { + if (is.numeric(ref_graph) && + length(ref_graph) == 1 && + is.null(graphlet_counts_ref)) { centred_graphlet_counts <- purrr::map( graphlet_counts, netdis_centred_graphlet_counts, From 8726fe10b8c9f9d90ed626409d2026aa79468eb3 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Mon, 6 Jun 2022 21:34:34 +0000 Subject: [PATCH 62/84] Update documentation --- man/as_smoothed_dhist.Rd | 8 +- man/as_unsmoothed_dhist.Rd | 4 +- man/density_binned_counts_gp.Rd | 4 +- man/dhist.Rd | 13 +-- man/dhist_from_obs.Rd | 6 +- man/dhist_from_obs_slow.Rd | 6 +- man/exp_counts_bin_gp.Rd | 4 +- man/is_dhist.Rd | 4 +- man/is_numeric_vector_1d.Rd | 3 +- man/netdis.Rd | 6 +- man/netdis_centred_graphlet_counts.Rd | 7 +- man/netdis_expected_counts.Rd | 3 +- man/netdis_expected_counts_ego.Rd | 3 +- man/netdis_many_to_many.Rd | 48 +++++---- man/netdis_one_to_many.Rd | 29 +++--- man/netdis_one_to_one.Rd | 136 +++++++++++++++++++------- man/netdis_subtract_exp_counts.Rd | 3 +- man/netdis_uptok.Rd | 9 +- 18 files changed, 194 insertions(+), 102 deletions(-) diff --git a/man/as_smoothed_dhist.Rd b/man/as_smoothed_dhist.Rd index 1dfc0c14..f81847dd 100644 --- a/man/as_smoothed_dhist.Rd +++ b/man/as_smoothed_dhist.Rd @@ -11,11 +11,13 @@ as_smoothed_dhist(dhist, smoothing_window_width) \item{smoothing_window_width}{If greater than 0, the discrete histogram will be treated as having the mass at each location "smoothed" uniformly across -a bin centred on the location and having width = \code{smoothing_window_width}} +a bin centred on the location and having +width = \code{smoothing_window_width}} } \value{ -A copy of a \code{dhist} object with its \code{smoothing_window_width} -attribute set to the value provided \code{smoothing_window_width} parameter. +A copy of a \code{dhist} object with its +\code{smoothing_window_width} attribute set to the value provided +\code{smoothing_window_width} parameter. 
} \description{ Returns a "smoothed" copy of a \code{dhist} object with its diff --git a/man/as_unsmoothed_dhist.Rd b/man/as_unsmoothed_dhist.Rd index d4304810..4d2c250c 100644 --- a/man/as_unsmoothed_dhist.Rd +++ b/man/as_unsmoothed_dhist.Rd @@ -10,8 +10,8 @@ as_unsmoothed_dhist(dhist) \item{dhist}{A discrete histogram as a \code{dhist} object} } \value{ -A copy of a \code{dhist} object with its \code{smoothing_window_width} -attribute set to 0. +A copy of a \code{dhist} object with its +\code{smoothing_window_width} attribute set to 0. } \description{ Returns an "unsmoothed" copy of a \code{dhist} object with its diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd index 0dcd867e..cf722709 100644 --- a/man/density_binned_counts_gp.Rd +++ b/man/density_binned_counts_gp.Rd @@ -17,8 +17,8 @@ density_binned_counts_gp( \item{density_interval_indexes}{Density bin index for each ego network.} -\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -included in graphlet_counts.} +\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently +only size 4 and 5 are supported. included in graphlet_counts.} } \description{ Calculate expected counts in density bins using the diff --git a/man/dhist.Rd b/man/dhist.Rd index 4566df12..701f3385 100644 --- a/man/dhist.Rd +++ b/man/dhist.Rd @@ -15,8 +15,8 @@ location} \item{smoothing_window_width}{If greater than 0, the discrete histogram will be treated as having the mass at each location "smoothed" uniformly across -a bin centred on the location and having width = \code{smoothing_window_width} -(default = \code{0} - no smoothing)} +a bin centred on the location and having +width = \code{smoothing_window_width} (default = \code{0} - no smoothing)} \item{sorted}{Whether or not to return a discrete histogram with locations and masses sorted by ascending mass (default = \code{TRUE})} @@ -26,13 +26,14 @@ A sparse discrete histogram. 
Format is a \code{dhist} object, which is a list of class \code{dhist} with the following named elements: \itemize{ \item \code{locations}: A 1D numeric vector of discrete locations - \item \code{masses}: A 1D numeric vector of the mass present at each location + \item \code{masses}: A 1D numeric vector of the mass present at each + location } Note that locations where no mass is present are not included in the returned \code{dhist} object. Mass in these discrete histograms is treated as being -present precisely at the specified location. Discrete histograms should not be used -for data where observations have been grouped into bins representing ranges -of observation values. +present precisely at the specified location. Discrete histograms should not +be used for data where observations have been grouped into bins representing +ranges of observation values. } \description{ Creates a discrete histogram object of class \code{dhist}, with bin diff --git a/man/dhist_from_obs.Rd b/man/dhist_from_obs.Rd index 584da0b8..f2df2ffd 100644 --- a/man/dhist_from_obs.Rd +++ b/man/dhist_from_obs.Rd @@ -14,9 +14,11 @@ A sparse discrete histogram. Format is a \code{dhist} object, which is a list of class \code{dhist} with the following named elements: \itemize{ \item \code{locations}: A 1D numeric vector of discrete locations - \item \code{masses}: A 1D numeric vector of the mass present at each location + \item \code{masses}: A 1D numeric vector of the mass present at each + location } } \description{ -Generate a sparse discrete histogram from a set of discrete numeric observations +Generate a sparse discrete histogram from a set of discrete numeric +observations } diff --git a/man/dhist_from_obs_slow.Rd b/man/dhist_from_obs_slow.Rd index 39afc81e..e40b78fb 100644 --- a/man/dhist_from_obs_slow.Rd +++ b/man/dhist_from_obs_slow.Rd @@ -14,9 +14,11 @@ A sparse discrete histogram. 
Format is a \code{dhist} object, which is a list of class \code{dhist} with the following named elements: \itemize{ \item \code{locations}: A 1D numeric vector of discrete locations - \item \code{masses}: A 1D numeric vector of the mass present at each location + \item \code{masses}: A 1D numeric vector of the mass present at each + location } } \description{ -Generate a sparse discrete histogram from a set of discrete numeric observations +Generate a sparse discrete histogram from a set of discrete numeric +observations } diff --git a/man/exp_counts_bin_gp.Rd b/man/exp_counts_bin_gp.Rd index dc3bc3c5..4ed9ea90 100644 --- a/man/exp_counts_bin_gp.Rd +++ b/man/exp_counts_bin_gp.Rd @@ -22,8 +22,8 @@ exp_counts_bin_gp( \item{density_interval_indexes}{Density bin indexes for each ego network in \code{graphlet_counts}.} -\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. -included in graphlet_counts.} +\item{max_graphlet_size}{Determines the maximum size of graphlets. Currently +only size 4 and 5 are supported. included in graphlet_counts.} } \description{ INTERNAL FUNCTION - DO NOT CALL DIRECTLY diff --git a/man/is_dhist.Rd b/man/is_dhist.Rd index 1f550d6f..114e35e0 100644 --- a/man/is_dhist.Rd +++ b/man/is_dhist.Rd @@ -15,8 +15,8 @@ is set to \code{dhist} (default = \code{TRUE})} } \description{ Checks if the input object is of class \code{dhist}. If \code{fast_check} is -\code{TRUE} then the only check is whether the object has a class attribute of -\code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks +\code{TRUE} then the only check is whether the object has a class attribute +of \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks are also made to ensure that the object has the structure required of a \code{dhist} object. 
} diff --git a/man/is_numeric_vector_1d.Rd b/man/is_numeric_vector_1d.Rd index 02f41145..9ad1cdd5 100644 --- a/man/is_numeric_vector_1d.Rd +++ b/man/is_numeric_vector_1d.Rd @@ -15,7 +15,8 @@ TRUE if input is a 1D numeric vector. FALSE otherwise. \description{ Check if a variable is a 1D numeric vector by checking that: \itemize{ - \item \code{is_numeric(input)}: Input is vector, matrix, array or list of numbers + \item \code{is_numeric(input)}: Input is vector, matrix, array or list of + numbers \item \code{is_null(dim(input))}: Input is not a matrix or array } } diff --git a/man/netdis.Rd b/man/netdis.Rd index cc091357..6f71662a 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -11,9 +11,11 @@ netdis( ) } \arguments{ -\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for graph 1} +\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for +graph 1} -\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for graph 2} +\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for +graph 2} \item{graphlet_size}{The size of graphlets to use for the Netdis calculation (only counts for graphlets of the specified size will be used). The size of diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index eb3779cd..91b0ac40 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -30,8 +30,8 @@ or a constant numeric value to subtract from all graphlet counts.} \item{binning_fn}{Function used to bin ego network densities. Only needed if \code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are \code{NULL}. 
Takes densities as its single argument, and returns a named list -including keys \code{breaks} (vector of bin edges) and \code{interval_indexes} -(density bin index for each ego network).} +including keys \code{breaks} (vector of bin edges) and +\code{interval_indexes} (density bin index for each ego network).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Only needed if \code{ref_ego_density_bins} and @@ -44,7 +44,8 @@ expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments.} -\item{max_graphlet_size}{max graphlet size to calculate centred counts for. Currently only size 4 and 5 are supported.} +\item{max_graphlet_size}{max graphlet size to calculate centred counts for. +Currently only size 4 and 5 are supported.} } \value{ graphlet_counts minus exp_graphlet_counts for graphlets up to size diff --git a/man/netdis_expected_counts.Rd b/man/netdis_expected_counts.Rd index f9ee0968..eed67af7 100644 --- a/man/netdis_expected_counts.Rd +++ b/man/netdis_expected_counts.Rd @@ -22,7 +22,8 @@ nummber of ego networks (rows).} each density bin.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.} +Only graphlets containing up to \code{max_graphlet_size} nodes are counted. 
+Currently only size 4 and 5 are supported.} \item{scale_fn}{Optional function to scale calculated expected counts, taking \code{graphlet_counts} and \code{max_graphlet_size} as arguments, diff --git a/man/netdis_expected_counts_ego.Rd b/man/netdis_expected_counts_ego.Rd index b565ecab..d2935c27 100644 --- a/man/netdis_expected_counts_ego.Rd +++ b/man/netdis_expected_counts_ego.Rd @@ -17,7 +17,8 @@ netdis_expected_counts_ego( \item{graphlet_counts}{Node and graphlet counts for an ego network.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.} +Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +Currently only size 4 and 5 are supported.} \item{density_breaks}{Density values defining bin edges.} diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index 12f5c057..9313d7e5 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -29,14 +29,15 @@ obtained by using \code{read_simple_graphs}.} graphs. 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL (default) - Expected counts will be calculated based on the properties of the -query graphs themselves. (Geometric-Poisson approximation).} +3) NULL (default) - Expected counts will be calculated based on the +properties of the query graphs themselves. (Geometric-Poisson approximation).} \item{comparisons}{Which comparisons to perform between graphs. Can be "many-to-many" (all pairwise combinations) or "one-to-many" (compare first graph in graphs to all other graphs.)} -\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 (default) and 5 are supported.} +\item{max_graphlet_size}{Generate graphlets up to this size. 
Currently only 4 +(default) and 5 are supported.} \item{neighbourhood_size}{Ego network neighbourhood size (default 2).} @@ -46,24 +47,31 @@ than min_ego_nodes nodes (default 3).} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges (default 1).} -\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} -as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used (default: NULL).} +\item{binning_fn}{Function used to bin ego network densities. Takes edge +\code{densities} as its single argument, and returns a named list including, +the input \code{densities}, the resulting bin \code{breaks} (vector of +density bin limits), and the vector \code{interval_indexes} which states to +what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method +\code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +\code{num_intervals = 100} is used (default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), -it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson -approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}.} +(bin indexes) and \code{max_graphlet_size} as arguments. 
+If \code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and +\code{graphlet_counts_ref}.} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +either the approach from the original Netdis paper, or the respective +Geometric-Poisson approximation; depending on the values of \code{ref_graph} +and \code{graphlet_counts_ref}.} \item{graphlet_counts}{Pre-generated graphlet counts (default: NULL). If the \code{graphlet_counts} argument is defined then \code{graphs} will not be @@ -75,14 +83,14 @@ ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for each ego network.} -\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: NULL). Matrix containing counts -of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with -graphlet IDs and rows are labelled with the ID of the central node in each -ego-network. As well as graphlet counts, each matrix must contain an -additional column labelled "N" including the node count for -each ego network. -If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -be used.} +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: +NULL). Matrix containing counts of each graphlet (columns) for each +ego-network (rows) in the input graph. 
Columns are labelled with graphlet IDs +and rows are labelled with the ID of the central node in each ego-network. As +well as graphlet counts, each matrix must contain an additional column +labelled "N" including the node count for each ego network. +If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} +will not be used.} } \value{ Netdis statistics between query graphs for graphlet sizes diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index c50cd4f4..bfc2344a 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -35,7 +35,8 @@ expected counts are calculated for all query graphs. 3) NULL - Expected counts will be calculated based on the properties of the query graphs themselves.} -\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 and 5 are supported.} +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 +and 5 are supported.} \item{neighbourhood_size}{Ego network neighbourhood size.} @@ -45,25 +46,31 @@ than min_ego_nodes nodes.} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges.} -\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} -as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -(Default: NULL).} +\item{binning_fn}{Function used to bin ego network densities. 
Takes edge +\code{densities} as its single argument, and returns a named list including, +the input \code{densities}, the resulting bin \code{breaks} (vector of +density bin limits), and the vector \code{interval_indexes} which states to +what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method +\code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and +\code{num_intervals = 100} is used (Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), - it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson - approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}.} +(bin indexes) and \code{max_graphlet_size} as arguments. If +\code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and +\code{graphlet_counts_ref}.} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +either the approach from the original Netdis paper, or the respective +Geometric-Poisson approximation; depending on the values of \code{ref_graph} +and \code{graphlet_counts_ref}.} \item{graphlet_counts_1}{Pre-generated graphlet counts for the first query graph. If the \code{graphlet_counts_1} argument is defined then diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 8f3b5208..63474c02 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -21,19 +21,29 @@ netdis_one_to_one( ) } \arguments{ -\item{graph_1}{A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered.} - -\item{graph_2}{A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered.} +\item{graph_1}{A simple graph object from the \code{igraph} package. +\code{graph_1} can be set to \code{NULL} (default) if +\code{graphlet_counts_1} is provided. If both \code{graph_1} and +\code{graphlet_counts_1} are not \code{NULL}, then only +\code{graphlet_counts_1} will be considered.} + +\item{graph_2}{A simple graph object from the \code{igraph} package. +\code{graph_2} can be set to \code{NULL} (default) if +\code{graphlet_counts_2} is provided. 
If both \code{graph_2} and +\code{graphlet_counts_2} are not \code{NULL}, then only +\code{graphlet_counts_2} will be considered.} \item{ref_graph}{Controls how expected counts are calculated. Either: 1) A numeric value - used as a constant expected counts value for all query graphs . 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the -query graphs themselves. (Geometric-Poisson approximation).} +3) NULL (Default) - Used for Netdis-GP, where the expected counts will be +calculated based on the properties of the query graphs themselves +(Geometric-Poisson approximation).} -\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 (default) and 5 are supported.} +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only +4 (default) and 5 are supported.} \item{neighbourhood_size}{Ego network neighborhood size (default: 2).} @@ -43,25 +53,31 @@ than min_ego_nodes nodes (default: 3).} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges (default: 1).} -\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} -as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. -ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used -(Default: NULL).} +\item{binning_fn}{Function used to bin ego network densities. 
Takes edge +\code{densities} as its single argument, and returns a named list including, +the input \code{densities}, the resulting bin \code{breaks} (vector of +density bin limits), and the vector \code{interval_indexes} which states to +what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method +\code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} +and \code{num_intervals = 100} is used (Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +(bin indexes) and \code{max_graphlet_size} as arguments. If +\code{bin_counts_fn} is \code{NULL}, (default), it will apply either the +approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and +\code{graphlet_counts_ref}.} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
If \code{exp_counts_fn} is \code{NULL}, (default), it will apply -either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the -values of \code{ref_graph} and \code{graphlet_counts_ref}.} +either the approach from the original Netdis paper, or the respective +Geometric-Poisson approximation; depending on the values of \code{ref_graph} +and \code{graphlet_counts_ref}.} \item{graphlet_counts_1}{Pre-generated graphlet counts for the first query graph. Matrix containing counts of each graphlet (columns) for @@ -69,9 +85,9 @@ each ego-network (rows) in the input graph. Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for -each ego network. (default: NULL). -If the \code{graphlet_counts_1} argument is defined then -\code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}.} +each ego network. If the \code{graphlet_counts_1} argument is defined then +\code{graph_1} will not be used. These counts can be obtained with +\code{count_graphlets_ego}. (default: NULL).} \item{graphlet_counts_2}{Pre-generated graphlet counts for the second query graph. Matrix containing counts of each graphlet (columns) for @@ -81,7 +97,8 @@ ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for each ego network. (default: NULL). If the \code{graphlet_counts_2} argument is defined then -\code{graph_2} will not be used. These counts can be obtained with \code{count_graphlets_ego}.} +\code{graph_2} will not be used. These counts can be obtained with +\code{count_graphlets_ego}.} \item{graphlet_counts_ref}{Pre-generated reference graphlet counts. 
Matrix containing counts of each graphlet (columns) for @@ -89,39 +106,66 @@ each ego-network (rows) in the reference graph. Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. As well as graphlet counts, each matrix must contain an additional column labelled "N" including the node count for -each ego network. (default: NULL). -If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not -be used.} +each ego network. If the \code{graphlet_counts_ref} argument is defined then +\code{ref_graph} will not be used. (default: NULL).} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes up to and including max_graphlet_size. } \description{ -Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. The variants currently supported are Netdis using a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). +Calculates the different variants of the network dissimilarity statistic +Netdis between two graphs. The variants currently supported are Netdis using +a gold-standard network, Netdis using no expecations (\code{ref_graph = 0}), +and Netdis using a Geometric Poisson approximation for the expectation +(\code{ref_graph = NULL}). } \examples{ require(netdist) require(igraph) -# Set source directory for Virus PPI graph edge files stored in the netdist package. -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +# Set source directory for Virus PPI graph edge files stored in the +# netdist package. 
+source_dir <- system.file( + file.path("extdata", "VRPINS"), + package = "netdist" +) # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") +graph_1 <- read_simple_graph( + file.path(source_dir, "EBV.txt"), + format = "ncol" +) +graph_2 <- read_simple_graph( + file.path(source_dir, "ECL.txt"), + format = "ncol" +) -# Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. -netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # This option will focus on detecting more general and global discrepancies between the ego-network structures. +# Netdis variant using the Geometric Poisson approximation to remove the +# background expectation of each network. This option will focus on detecting +# more general and global discrepancies between the ego-network structures. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) -# Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. +# Comparing the networks via their observed ego counts without centering them +# (equivalent to using expectation equal to zero). This option, will focus on +# detecting small discrepancies. netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) -# Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. +# Example of the use of netdis with a reference graph.This option will focus +# on detecting discrepancies between the networks relative to the ego-network +# structure of the reference network / gold-standard. 
# Two lattice networks of different sizes are used for this example. goldstd_1 <- graph.lattice(c(8, 8)) # A reference net goldstd_2 <- graph.lattice(c(44, 44)) # A reference net -netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) -netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) +netdis_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + ref_graph = goldstd_1 +) +netdis_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + ref_graph = goldstd_2 +) # Providing pre-calculated subgraph counts. @@ -132,12 +176,28 @@ props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) # Netdis Geometric-Poisson. -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + ref_graph = NULL +) # Netdis Zero Expectation. -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + ref_graph = 0 +) # Netdis using gold-standard network -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_1) -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_2) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_1 +) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_2 +) } diff --git a/man/netdis_subtract_exp_counts.Rd b/man/netdis_subtract_exp_counts.Rd index c03aa42f..3540c12f 100644 --- a/man/netdis_subtract_exp_counts.Rd +++ b/man/netdis_subtract_exp_counts.Rd @@ -17,7 +17,8 @@ nummber of ego networks (rows).} \item{exp_graphlet_counts}{Matrix of expected graphlet counts (columns) for a nummber 
of ego networks (rows).} -\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported.} +\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. +Currently only size 4 and 5 are supported.} } \description{ Subtract expected graphlet counts from actual graphlet counts. diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd index 3ab881c9..98ddf371 100644 --- a/man/netdis_uptok.Rd +++ b/man/netdis_uptok.Rd @@ -11,13 +11,16 @@ netdis_uptok( ) } \arguments{ -\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for graph 1} +\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for +graph 1} -\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for graph 2} +\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for +graph 2} \item{max_graphlet_size}{max graphlet size to calculate Netdis for. The size of a graphlet is the number of nodes it contains. Netdis is -calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported.} +calculated for all graphlets from size 3 to size max_graphlet_size. 
Currently +only 4 and 5 are supported.} } \value{ Netdis statistic calculated using centred counts for graphlets of From 644ef1110d70978a0be0702331d6b1363cc54f52 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Tue, 7 Jun 2022 12:17:01 +0100 Subject: [PATCH 63/84] fix line length R and tests directories --- R/PlottingFunctions.R | 85 +- R/RcppExports.R | 8 +- R/emd.R | 58 +- R/graph_binning.R | 4 +- R/measures_net_emd.R | 289 +++-- R/net_emd_speed_benchmark.R | 14 +- R/orca_interface.R | 52 +- tests/testthat/test_dhist.R | 1165 ++++++++++-------- tests/testthat/test_emd.R | 21 +- tests/testthat/test_graph_binning.R | 185 ++- tests/testthat/test_measures_net_dis.R | 1157 ++++++++++-------- tests/testthat/test_measures_net_emd.R | 167 ++- tests/testthat/test_orca_interface.R | 1484 +++++++++++------------ tests/testthat/test_utility_functions.R | 216 +++- 14 files changed, 2824 insertions(+), 2081 deletions(-) diff --git a/R/PlottingFunctions.R b/R/PlottingFunctions.R index b4f9dbf8..5219dee3 100644 --- a/R/PlottingFunctions.R +++ b/R/PlottingFunctions.R @@ -1,28 +1,55 @@ #' Heatmap of Netdis many-to-many comparisons #' -#' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' Provides a heatmap and dendrogram for the network comparisons via +#' \code{pheatmap}. #' #' @param netdislist Default output of \code{netdis_many_to_many}. #' -#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. +#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} +#' to be used for plotting. #' -#' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). +#' @param clustering_method Clustering method as allowed in the \code{pheatmap} +#' function from the \code{pheatmap} package. The dendrogram will appear if +#' \code{docluster} is TRUE (default). 
#' #' @param main Title of the plot. #' -#' @param docluster controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. +#' @param docluster controls the order of the rows and columns. If TRUE +#' (default) the rows and columns will be reordered to create the dendrogram. If +#' FALSE, then only the heatmap is drawn. #' -#' @return Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' @return Provides a heatmap and dendrogram for the network comparisons via +#' \code{pheatmap}. #' @export -netdis.plot <- function(netdislist, whatrow = c(1, 2)[2], clustering_method = "ward.D", main = "Nedis", docluster = TRUE) { - adjmat <- cross_comp_to_matrix(measure = netdislist$netdis[whatrow, ], cross_comparison_spec = netdislist$comp_spec) +netdis.plot <- function(netdislist, + whatrow = c(1, 2)[2], + clustering_method = "ward.D", + main = "Nedis", + docluster = TRUE) { + adjmat <- cross_comp_to_matrix( + measure = netdislist$netdis[whatrow, ], + cross_comparison_spec = netdislist$comp_spec + ) vnames <- rownames(adjmat) legend1 <- seq(min(adjmat), max(adjmat), length.out = 5) levels1 <- round(legend1, digits = 2) - pheatmap::pheatmap(mat = as.dist(adjmat), cluster_rows = docluster, cluster_cols = docluster, clustering_method = clustering_method, angle_col = 45, main = main, treeheight_row = 80, labels_row = vnames, labels_col = vnames, display_numbers = TRUE, legend_breaks = legend1, legend_labels = levels1) + pheatmap::pheatmap( + mat = as.dist(adjmat), + cluster_rows = docluster, + cluster_cols = docluster, + clustering_method = clustering_method, + angle_col = 45, + main = main, + treeheight_row = 80, + labels_row = vnames, + labels_col = vnames, + display_numbers = TRUE, + legend_breaks = legend1, + legend_labels = levels1 + ) } @@ -30,26 +57,52 @@ netdis.plot <- function(netdislist, whatrow = c(1, 2)[2], clustering_method = "w #' 
Heatmap of NetEmd many-to-many comparisons #' -#' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' Provides a heatmap and dendrogram for the network comparisons via +#' \code{pheatmap}. #' #' @param netdislist Default output of \code{netdis_many_to_many}. #' -#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. +#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} +#' to be used for plotting. #' -#' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). +#' @param clustering_method Clustering method as allowed in the \code{pheatmap} +#' function from the \code{pheatmap} package. The dendrogram will appear if +#' \code{docluster} is TRUE (default). #' #' @param main Title of the plot. #' -#' @param docluster controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. +#' @param docluster controls the order of the rows and columns. If TRUE +#' (default) the rows and columns will be reordered to create the dendrogram. If +#' FALSE, then only the heatmap is drawn. #' -#' @return Provides a heat map and dendrogram for the network comparisons via \code{pheatmap}. +#' @return Provides a heat map and dendrogram for the network comparisons via +#' \code{pheatmap}. 
#' @export -netemd.plot <- function(netemdlist, clustering_method = "ward.D", main = "NetEmd", docluster = TRUE) { - adjmat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) +netemd.plot <- function(netemdlist, + clustering_method = "ward.D", + main = "NetEmd", + docluster = TRUE) { + adjmat <- cross_comp_to_matrix( + measure = netemdlist$netemds, + cross_comparison_spec = netemdlist$comp_spec + ) vnames <- rownames(adjmat) legend1 <- seq(min(adjmat), max(adjmat), length.out = 5) levels1 <- round(legend1, digits = 2) - pheatmap::pheatmap(mat = as.dist(adjmat), cluster_rows = docluster, cluster_cols = docluster, clustering_method = clustering_method, angle_col = 45, main = main, treeheight_row = 80, labels_row = vnames, labels_col = vnames, display_numbers = TRUE, legend_breaks = legend1, legend_labels = levels1) + pheatmap::pheatmap( + mat = as.dist(adjmat), + cluster_rows = docluster, + cluster_cols = docluster, + clustering_method = clustering_method, + angle_col = 45, + main = main, + treeheight_row = 80, + labels_row = vnames, + labels_col = vnames, + display_numbers = TRUE, + legend_breaks = legend1, + legend_labels = levels1 + ) } diff --git a/R/RcppExports.R b/R/RcppExports.R index b7d0c2dc..f1932442 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -22,6 +22,12 @@ counts_from_observations <- function(features) { #' #' @export emd_fast_no_smoothing <- function(locations1, values1, locations2, values2) { - .Call(`_netdist_emd_fast_no_smoothing`, locations1, values1, locations2, values2) + .Call( + `_netdist_emd_fast_no_smoothing`, + locations1, + values1, + locations2, + values2 + ) } diff --git a/R/emd.R b/R/emd.R index da99cb36..02208756 100644 --- a/R/emd.R +++ b/R/emd.R @@ -9,8 +9,8 @@ #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. 
However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @return Earth Mover's Distance between the two discrete histograms #' @export @@ -43,7 +43,8 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' @export min_emd_optimise_fast <- function(dhist1, dhist2) { # Can we run the C++ fast implementation (only works with no smoothing)? - if ((dhist1$smoothing_window_width == 0) && (dhist2$smoothing_window_width == 0)) { + if ((dhist1$smoothing_window_width == 0) && + (dhist2$smoothing_window_width == 0)) { # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) @@ -147,10 +148,10 @@ min_emd_optimise <- function(dhist1, dhist2) { #' to ensure that the offset with the global minimum EMD is found. #' #' This is because of the piece-wise linear nature of the two ECMFs. Between any -#' two offsets where knots from the two ECMFs align, EMD will be either constant, -#' or uniformly increasing or decreasing. Therefore, there the EMD between two -#' sets of aligned knots cannot be smaller than the EMD at one or other of the -#' bounding offsets. +#' two offsets where knots from the two ECMFs align, EMD will be either +#' constant, or uniformly increasing or decreasing. Therefore, there the EMD +#' between two sets of aligned knots cannot be smaller than the EMD at one or +#' other of the bounding offsets. 
#' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object #' @return Earth Mover's Distance between the two discrete histograms @@ -240,15 +241,16 @@ emd <- function(dhist1, dhist2) { #' Distance between the two histograms by summing the absolute difference #' between the two cumulative histograms. #' @references -#' Calculation of the Wasserstein Distance Between Probability Distributions on the Line -#' S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, 784-786 -#' \url{http://dx.doi.org/10.1137/1118101} +#' Calculation of the Wasserstein Distance Between Probability Distributions on +#' the Line S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, +#' 784-786 \url{http://dx.doi.org/10.1137/1118101} #' @param dhist1 A discrete histogram as a \code{dhist} object #' @param dhist2 A discrete histogram as a \code{dhist} object #' @return Earth Mover's Distance between the two input histograms #' @export emd_cs <- function(dhist1, dhist2) { - # Generate Empirical Cumulative Mass Functions (ECMFs) for each discrete histogram + # Generate Empirical Cumulative Mass Functions (ECMFs) for each discrete + # histogram ecmf1 <- dhist_ecmf(dhist1) ecmf2 <- dhist_ecmf(dhist2) # Calculate the area between the two ECMFs @@ -278,10 +280,16 @@ emd_lp <- function(bin_masses1, bin_masses2, bin_centres1, bin_centres2) { # the bin_mass and bin_centre vectors for each histogram must have the same # length. 
if (length(bin_centres1) != num_bins1) { - stop("Number of bin masses and bin centres provided for histogram 1 must be equal") + stop( + "Number of bin masses and bin centres provided for histogram 1 must ", + "be equal" + ) } if (length(bin_centres2) != num_bins2) { - stop("Number of bin masses and bin centres provided for histogram 2 must be equal") + stop( + "Number of bin masses and bin centres provided for histogram 2 must ", + "be equal" + ) } # Generate cost matrix @@ -290,10 +298,16 @@ emd_lp <- function(bin_masses1, bin_masses2, bin_centres1, bin_centres2) { # Linear Programming solver requires all bin masses and transportation costs # to be integers to generate correct answer if (!isTRUE(all.equal(bin_masses1, floor(bin_masses1)))) { - stop("All bin masses for histogram 1 must be integers for accurate Linear Programming solution") + stop( + "All bin masses for histogram 1 must be integers for accurate Linear ", + "Programming solution" + ) } if (!isTRUE(all.equal(bin_masses2, floor(bin_masses2)))) { - stop("All bin masses for histogram 2 must be integers for accurate Linear Programming solution") + stop( + "All bin masses for histogram 2 must be integers for accurate ", + "Linear Programming solution" + ) } if (!isTRUE(all.equal(cost_mat, floor(cost_mat)))) { stop("All costs must be integers for accurate Linear Programming solution") @@ -319,8 +333,18 @@ cost_matrix <- function(bin_centres1, bin_centres2) { # Calculate distances between all bins in network 1 and all bins in network 2 num_bins1 <- length(bin_centres1) num_bins2 <- length(bin_centres2) - loc_mat1 <- matrix(bin_centres1, nrow = num_bins1, ncol = num_bins2, byrow = FALSE) - loc_mat2 <- matrix(bin_centres2, nrow = num_bins1, ncol = num_bins2, byrow = TRUE) + loc_mat1 <- matrix( + bin_centres1, + nrow = num_bins1, + ncol = num_bins2, + byrow = FALSE + ) + loc_mat2 <- matrix( + bin_centres2, + nrow = num_bins1, + ncol = num_bins2, + byrow = TRUE + ) cost_mat <- abs(loc_mat1 - loc_mat2) 
return(cost_mat) } diff --git a/R/graph_binning.R b/R/graph_binning.R index 482fc524..e75da79b 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -10,7 +10,9 @@ binned_densities_adaptive <- function(densities, min_counts_per_interval, num_intervals) { - if (length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + if (length(densities) < min_counts_per_interval) { + min_counts_per_interval <- length(densities) + } breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index ddcd1592..e1688ec1 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -1,23 +1,37 @@ #' NetEMD Network Earth Mover's Distance between a pair of networks. #' #' Calculates the network Earth Mover's Distance (EMD) between -#' two sets of network features. This is done by individually normalising the distribution -#' of each feature so that they have unit mass and unit variance. Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. +#' two sets of network features. This is done by individually normalising the +#' distribution of each feature so that they have unit mass and unit variance. +#' Then the minimun EMD between the same pair of features (one for each +#' corresponding graph) is calculated by considering all possible translations +#' of the feature distributions. Finally the average over all features is +#' reported. #' This is calculated as follows: #' 1. Normalise each feature histogram to have unit mass and unit variance. -#' 2. For each feature, find the minimum EMD between each pair of histograms considering all possible histogram translations. +#' 2. 
For each feature, find the minimum EMD between each pair of histograms +#' considering all possible histogram translations. #' 3. Take the average minimum EMD across all features. -#' @param graph_1 A network/graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is provided. -#' @param graph_2 A network/graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is provided. -#' @param dhists_1 Either, a \code{dhist} discrete histogram object, or list of such objects, or a matrix of network features (each column representing a feature). \code{dhists_1} can be set to \code{NULL} (default) if \code{graph_1} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}. +#' @param graph_1 A network/graph object from the \code{igraph} package. +#' \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is +#' provided. +#' @param graph_2 A network/graph object from the \code{igraph} package. +#' \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is +#' provided. +#' @param dhists_1 Either, a \code{dhist} discrete histogram object, or list of +#' such objects, or a matrix of network features (each column representing a +#' feature). \code{dhists_1} can be set to \code{NULL} (default) if +#' \code{graph_1} is provided. A \code{dhist} object can be obtained from +#' \code{graph_features_to_histograms}. #' @param dhists_2 Same as \code{dhists_1}. -#' @param method The method to be used to find the minimum EMD across all potential -#' offsets for each pair of histograms. Default is "optimise" to use +#' @param method The method to be used to find the minimum EMD across all +#' potential offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. 
However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the -#' histograms at all offsets that are candidates for the minimal EMD at the cost of computational time. +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the +#' histograms at all offsets that are candidates for the minimal EMD at the cost +#' of computational time. #' @param return_details Logical indicating whether to return the individual #' minimal EMDs and associated offsets for all pairs of histograms. #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to @@ -26,29 +40,40 @@ #' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice). #' @param feature_type Type of graphlet-based feature to count: "graphlet" -#' counts the number of graphlets each node participates in; "orbit" (default) calculates -#' the number of graphlet orbits each node participates in. +#' counts the number of graphlets each node participates in; "orbit" (default) +#' calculates the number of graphlet orbits each node participates in. #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Possible values are 4, and 5 (default). #' @param ego_neighbourhood_size The number of steps from the source node to -#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. -#' @return NetEMD measure for the two sets of discrete histograms (or graphs). 
If -#' (\code{return_details = FALSE}) then a list with the following named elements is returned -#' \code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: -#' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated -#' offsets giving the minimal EMD for each pair of histograms +#' include nodes for each ego-network. NetEmd was proposed for individual nodes +#' alone, hence the default value is 0. +#' @return NetEMD measure for the two sets of discrete histograms (or graphs). +#' If (\code{return_details = FALSE}) then a list with the following named +#' elements is returned \code{net_emd}: the NetEMD for the set of histogram +#' pairs (or graphs), \code{min_emds}: the minimal EMD for each pair of +#' histograms, \code{min_offsets}: the associated offsets giving the minimal EMD +#' for each pair of histograms #' @examples #' require(igraph) #' graph_1 <- graph.lattice(c(8, 8)) #' graph_2 <- graph.lattice(c(44, 44)) -#' netemd_one_to_one(graph_1 = graph_1, graph_2 = graph_2, feature_type = "orbit", max_graphlet_size = 5) +#' netemd_one_to_one( +#' graph_1 = graph_1, +#' graph_2 = graph_2, +#' feature_type = "orbit", +#' max_graphlet_size = 5 +#' ) #' #' # Providing a matrix of network features #' props_a <- count_orbits_per_node(graph = graph_1, max_graphlet_size = 5) #' props_b <- count_orbits_per_node(graph = graph_2, max_graphlet_size = 5) #' -#' netemd_one_to_one(dhists_1 = props_a, dhists_2 = props_b, smoothing_window_width = 1) +#' netemd_one_to_one( +#' dhists_1 = props_a, +#' dhists_2 = props_b, +#' smoothing_window_width = 1 +#' ) #' #' # Providing the network features as lists of dhist objects #' dhists_1 <- graph_features_to_histograms(props_a) @@ -59,22 +84,59 @@ #' #' # A variation of NetEmd: Using the Laplacian spectrum #' # Laplacian -#' Lapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = FALSE, sparse = FALSE) -#' Lapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = 
FALSE, sparse = FALSE) +#' Lapg_1 <- igraph::laplacian_matrix( +#' graph = graph_1, +#' normalized = FALSE, +#' sparse = FALSE +#' ) +#' Lapg_2 <- igraph::laplacian_matrix( +#' graph = graph_2, +#' normalized = FALSE, +#' sparse = FALSE +#' ) #' #' # Normalized Laplacian -#' NLapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = TRUE, sparse = FALSE) -#' NLapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = TRUE, sparse = FALSE) +#' NLapg_1 <- igraph::laplacian_matrix( +#' graph = graph_1, +#' normalized = TRUE, +#' sparse = FALSE +#' ) +#' NLapg_2 <- igraph::laplacian_matrix( +#' graph = graph_2, +#' normalized = TRUE, +#' sparse = FALSE +#' ) #' #' # Spectra (This may take a couple of minutes). -#' props_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) -#' props_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) +#' props_1 <- cbind( +#' L.Spectra = eigen(Lapg_1)$values, +#' NL.Spectra = eigen(NLapg_1)$values +#' ) +#' props_2 <- cbind( +#' L.Spectra = eigen(Lapg_2)$values, +#' NL.Spectra = eigen(NLapg_2)$values +#' ) #' -#' netemd_one_to_one(dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +#' # Use of smoothing window 1 is given for discrete integer distributions. If +#' # the network features are considered continuous variables +#' # smoothing_window_width equal to zero is recommended. 
+#' netemd_one_to_one( +#' dhists_1 = props_1, +#' dhists_2 = props_2, +#' smoothing_window_width = 0 +#' ) #' #' @export -netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, dhists_2 = NULL, method = "optimise", - return_details = FALSE, smoothing_window_width = 0, feature_type = "orbit", max_graphlet_size = 5, ego_neighbourhood_size = 0) { +netemd_one_to_one <- function(graph_1 = NULL, + graph_2 = NULL, + dhists_1 = NULL, + dhists_2 = NULL, + method = "optimise", + return_details = FALSE, + smoothing_window_width = 0, + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0) { ## ------------------------------------------------------------------------ # Check arguments 1 if (!igraph::is.igraph(graph_1) & is.null(dhists_1)) { @@ -85,7 +147,8 @@ netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, d } ## ------------------------------------------------------------------------ # Check arguments 2 - # If dhists_1 is a matrix of network features then transform them to dhist objects. + # If dhists_1 is a matrix of network features then transform them to dhist + # objects. 
if (is.matrix(dhists_1)) { dhists_1 <- graph_features_to_histograms(dhists_1) } @@ -120,7 +183,10 @@ netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, d ## ------------------------------------------------------------------------ # Require either a pair of "dhist" discrete histograms or two lists of "dhist" # discrete histograms - pair_of_dhist_lists <- all(purrr::map_lgl(dhists_1, is_dhist)) && all(purrr::map_lgl(dhists_2, is_dhist)) + pair_of_dhist_lists <- all( + purrr::map_lgl(dhists_1, is_dhist) + ) && + all(purrr::map_lgl(dhists_2, is_dhist)) # If input is two lists of "dhist" discrete histograms, determine the minimum # EMD and associated offset for pairs of histograms taken from the same @@ -145,7 +211,12 @@ netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, d # Note that the offsets represent shifts after the histograms have been # scaled to unit variance if (return_details) { - return(list(net_emd = net_emd, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std)) + return(list( + net_emd = net_emd, + min_emds = min_emds, + min_offsets = min_offsets, + min_offsets_std = min_offsets_std + )) } else { return(arithmetic_mean) } @@ -164,14 +235,20 @@ netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, d #' NetEMDs between all graph pairs using provided Graphlet-based Degree #' Distributions -#' @param graphs A list of network/graph objects from the \code{igraph} package. \code{graphs} can be set to \code{NULL} (default) if \code{dhists} is provided. -#' @param dhists A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}. 
+#' @param graphs A list of network/graph objects from the \code{igraph} package. +#' \code{graphs} can be set to \code{NULL} (default) if \code{dhists} is +#' provided. +#' @param dhists A list whose elements contain either: A list of \code{dhist} +#' discrete histogram objects for each graph, or a list a matrix of network +#' features (each column representing a feature). \code{dhists} can be set to +#' \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object +#' can be obtained from \code{graph_features_to_histograms}. #' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @param return_details Logical indicating whether to return the individual #' minimal EMDs and associated offsets for all pairs of histograms @@ -183,27 +260,42 @@ netemd_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, dhists_1 = NULL, d #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. #' @param feature_type Type of graphlet-based feature to count: "graphlet" -#' counts the number of graphlets each node participates in; "orbit" (default) calculates -#' the number of graphlet orbits each node participates in. +#' counts the number of graphlets each node participates in; "orbit" (default) +#' calculates the number of graphlet orbits each node participates in. 
#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Possible values are 4, and 5 (default). #' @param ego_neighbourhood_size The number of steps from the source node to -#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. +#' include nodes for each ego-network. NetEmd was proposed for individual nodes +#' alone, hence the default value is 0. #' @return NetEMD measures between all pairs of graphs for which features #' were provided. Format of returned data depends on the \code{return_details} #' parameter. If set to FALSE, a list is returned with the following named #' elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, #' \code{comp_spec}: a comparison specification table containing the graph names #' and indices within the input GDD list for each pair of graphs compared. -#' If \code{return_details} is set to FALSE, the list also contains the following -#' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD -#' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving -#' the minimal EMD for each GDD +#' If \code{return_details} is set to FALSE, the list also contains the +#' following matrices for each graph pair: \code{min_emds}: the minimal EMD for +#' each GDD used to compute the NetEMD, \code{min_offsets}: the associated +#' offsets giving the minimal EMD for each GDD #' @export -netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise", smoothing_window_width = 0, - return_details = FALSE, mc.cores = getOption("mc.cores", 2L), feature_type = "orbit", max_graphlet_size = 5, ego_neighbourhood_size = 0) { - if (max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores, " cores. 
Depending on the density and size of the graphs, this may lead to a large compsumption of RAM.")) +netemd_many_to_many <- function(graphs = NULL, + dhists = NULL, + method = "optimise", + smoothing_window_width = 0, + return_details = FALSE, + mc.cores = getOption("mc.cores", 2L), + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0) { + if (max_graphlet_size > 4 & mc.cores > 1) { + print(paste( + "This function will compute orbits of graphlets up to size 5 using ", + mc.cores, + " cores. Depending on the density and size of the graphs, this may lead ", + "to a large compsumption of RAM." + )) + } # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. @@ -219,7 +311,10 @@ netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise which_imput_type <- NULL if (!is.null(graphs) & is.null(dhists)) { if (!all((unlist(sapply(X = graphs, FUN = igraph::is.igraph))))) { - stop("Graphs need to be igraph graph objects, or a list of dhists network features should be supplied.") + stop( + "Graphs need to be igraph graph objects, or a list of dhists network ", + "features should be supplied." + ) } which_imput_type <- "Graphs" } @@ -236,14 +331,22 @@ netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise which_imput_type <- "dhist" } if (is.null(which_imput_type)) { - warning("dhists does not conform to a Matrix or dhist class for all elmenents/netwroks in the list.") + warning( + "dhists does not conform to a Matrix or dhist class for all ", + "elmenents/netwroks in the list." + ) } } ## ------------------------------------------------------------------------ # Check arguments 2 - # If dhists is a list of matrices of network features then transform them to dhist objects. + # If dhists is a list of matrices of network features then transform them to + # dhist objects. 
if (which_imput_type == "Matrix") { - dhists <- sapply(X = dhists, FUN = graph_features_to_histograms, simplify = FALSE) + dhists <- sapply( + X = dhists, + FUN = graph_features_to_histograms, + simplify = FALSE + ) } ## ------------------------------------------------------------------------ # Check arguments 3 @@ -269,22 +372,51 @@ netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise ## ------------------------------------------------------------------------ comp_spec <- cross_comparison_spec(dhists) num_features <- length(dhists[[1]]) - out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { - netemd_one_to_one( - dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], - method = method, return_details = return_details, - smoothing_window_width = smoothing_window_width + out <- purrr::simplify( + parallel::mcmapply(function(index_a, index_b) { + netemd_one_to_one( + dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], + method = method, return_details = return_details, + smoothing_window_width = smoothing_window_width + ) + }, + comp_spec$index_a, comp_spec$index_b, + SIMPLIFY = FALSE, mc.cores = mc.cores ) - }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) + ) if (return_details) { net_emds <- purrr::simplify(purrr::map(out, ~ .$net_emd)) - min_emds <- matrix(purrr::simplify(purrr::map(out, ~ .$min_emds)), ncol = num_features, byrow = TRUE) - colnames(min_emds) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinEMD_O", . - 1, sep = ""))) - min_offsets <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets)), ncol = num_features, byrow = TRUE) - colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsets_O", . 
- 1, sep = ""))) - min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), ncol = num_features, byrow = TRUE) - colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . - 1, sep = ""))) - ret <- list(netemds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std) + min_emds <- matrix( + purrr::simplify(purrr::map(out, ~ .$min_emds)), + ncol = num_features, + byrow = TRUE + ) + colnames(min_emds) <- purrr::simplify( + purrr::map(1:num_features, ~ paste("MinEMD_O", . - 1, sep = "")) + ) + min_offsets <- matrix( + purrr::simplify(purrr::map(out, ~ .$min_offsets)), + ncol = num_features, + byrow = TRUE + ) + colnames(min_offsets) <- purrr::simplify( + purrr::map(1:num_features, ~ paste("MinOffsets_O", . - 1, sep = "")) + ) + min_offsets_std <- matrix( + purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), + ncol = num_features, + byrow = TRUE + ) + colnames(min_offsets_std) <- purrr::simplify( + purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . - 1, sep = "")) + ) + ret <- list( + netemds = net_emds, + comp_spec = comp_spec, + min_emds = min_emds, + min_offsets = min_offsets, + min_offsets_std = min_offsets_std + ) } else { net_emds <- out ret <- list(netemds = net_emds, comp_spec = comp_spec) @@ -292,21 +424,25 @@ netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise return(ret) } -#' Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms +#' Internal function to compute the minimum Earth Mover's Distance between +#' standarized and translated histograms #' #' Calculates the minimum Earth Mover's Distance (EMD) between two -#' discrete histograms after normalising each histogram to unit mass and variance. +#' discrete histograms after normalising each histogram to unit mass and +#' variance. #' This is calculated as follows: #' 1. 
Normalise each histogram to have unit mass and unit variance #' 2. Find the minimum EMD between the histograms -#' @param dhists_1 A \code{dhist} discrete histogram object or a list of such objects -#' @param dhists_2 A \code{dhist} discrete histogram object or a list of such objects +#' @param dhists_1 A \code{dhist} discrete histogram object or a list of such +#' objects +#' @param dhists_2 A \code{dhist} discrete histogram object or a list of such +#' objects #' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset #' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' minimum if multiple local minima EMDs exist. You can alternatively specify +#' the "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to #' "smear" point masses across a finite width in the real domain. Default is 0, @@ -314,8 +450,10 @@ netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise #' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice) #' @return A list with the following named elements -#' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated -#' offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. 
+#' \code{net_emd}: the NetEMD for the set of histogram pairs, +#' \code{min_offsets}: the associated offsets giving the minimal EMD for each +#' pair of histograms and \code{min_offset_std}: Offset used in the standardised +#' histograms. #' @examples #' require(igraph) #' goldstd_1 <- graph.lattice(c(8, 8)) @@ -325,7 +463,12 @@ netemd_many_to_many <- function(graphs = NULL, dhists = NULL, method = "optimise #' dhists_1 <- graph_features_to_histograms(props_1) #' dhists_2 <- graph_features_to_histograms(props_2) #' # Obtain the minimum NetEMD_edges between the histograms -#' netemd_single_pair(dhists_1[[1]], dhists_2[[1]], method = "optimise", smoothing_window_width = 0) +#' netemd_single_pair( +#' dhists_1[[1]], +#' dhists_2[[1]], +#' method = "optimise", +#' smoothing_window_width = 0 +#' ) #' @export netemd_single_pair <- function(dhist1, dhist2, method = "optimise", smoothing_window_width = 0) { diff --git a/R/net_emd_speed_benchmark.R b/R/net_emd_speed_benchmark.R index 62c97611..3f37ccb9 100644 --- a/R/net_emd_speed_benchmark.R +++ b/R/net_emd_speed_benchmark.R @@ -4,10 +4,11 @@ netEMDSpeedTest <- function() { print(source_dir) edge_format <- "ncol" file_pattern <- "" - # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - # edge_format = "ncol" - # file_pattern = ".txt" - graphs <- read_simple_graphs(source_dir = source_dir, format = edge_format, pattern = file_pattern) + graphs <- read_simple_graphs( + source_dir = source_dir, + format = edge_format, + pattern = file_pattern + ) n1 <- names(graphs) lab1 <- c() gddBuildTime <- c() @@ -26,7 +27,10 @@ netEMDSpeedTest <- function() { netEMDStart <- Sys.time() net_emd(gdd1, gdd2) endTime <- Sys.time() - gddBuildTime <- append(gddBuildTime, as.double(netEMDStart - fulltimeStart)) + gddBuildTime <- append( + gddBuildTime, + as.double(netEMDStart - fulltimeStart) + ) netEMDtime <- append(netEMDtime, as.double(endTime - netEMDStart)) } } diff --git a/R/orca_interface.R 
b/R/orca_interface.R index 51614203..10065126 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -68,11 +68,18 @@ indexed_edges_to_graph <- function(indexed_edges) { #' @return A named list of simplified igraph graph object, with the name of each #' graph set to the name of the file it was read from. #' @examples -#' # Set source directory for Virus protein-protein interaction edge files stored in the netdist package. -#' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +#' # Set source directory for Virus protein-protein interaction edge files +#' # stored in the netdist package. +#' source_dir <- system.file( +#' file.path("extdata", "VRPINS"), +#' package = "netdist" +#' ) #' print(source_dir) #' # Load query graphs as igraph objects -#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +#' graph_1 <- read_simple_graph( +#' file.path(source_dir, "EBV.txt"), +#' format = "ncol" +#' ) #' graph_1 #' @export read_simple_graphs <- function(source_dir, @@ -208,10 +215,12 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, #' #' Converts a matrix of node level features (e.g. for example counts #' of multiple graphlets or orbits at each node) to -#' a set of histogram like objects (observed frequency distribution of each feature/column) -#' @param features_matrix A matrix whose rows represent nodes and whose columns represent different node level features. This means that entry ij provides the value of feature j for node i. -#' @return Feature histograms: List of "discrete histograms" for each -#' feature +#' a set of histogram like objects (observed frequency distribution of each +#' feature/column) +#' @param features_matrix A matrix whose rows represent nodes and whose columns +#' represent different node level features. This means that entry ij provides +#' the value of feature j for node i. 
+#' @return Feature histograms: List of "discrete histograms" for each feature #' @export graph_features_to_histograms <- function(features_matrix) { apply(features_matrix, 2, dhist_from_obs) @@ -224,8 +233,9 @@ graph_features_to_histogramsSLOW <- function(features_matrix) { #' Graphlet-based degree distributions (GDDs) #' -#' Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object -#' using the ORCA fast graphlet orbit counting package. +#' Short-cut function to create graphlet-based degree distributions from +#' \code{igraph} graph object using the ORCA fast graphlet orbit counting +#' package. #' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param feature_type Type of graphlet-based feature to count: "graphlet" #' counts the number of graphlets each node participates in; "orbit" calculates @@ -233,7 +243,8 @@ graph_features_to_histogramsSLOW <- function(features_matrix) { #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Currently only size 4 and 5 are supported. -#' @param ego_neighbourhood_size The number of steps from the source node used to select the +#' @param ego_neighbourhood_size The number of steps from the source node used +#' to select the #' neighboring nodes to be included in the source node ego-network. #' @return List of graphlet-based degree distributions, with each distribution #' represented as a \code{dhist} discrete histogram object. @@ -353,8 +364,9 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be #' counted. Currently only size 4 (default) and 5 are supported. 
-#' @param neighbourhood_size The number of steps from the source node used to select the -#' neighboring nodes to be included in the source node ego-network. (Default 2). +#' @param neighbourhood_size The number of steps from the source node used to +#' select the neighboring nodes to be included in the source node ego-network. +#' (Default 2). #' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} #' nodes are returned. (Default 3). #' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} @@ -517,7 +529,8 @@ orbit_to_graphlet_counts <- function(orbit_counts) { #' Graphlet key #' #' Metdata about graphlet groups. -#' @param max_graphlet_size Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported. +#' @param max_graphlet_size Maximum number of nodes graphlets can contain. +#' Currently only size 2 to 5 are supported. #' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain @@ -553,7 +566,8 @@ graphlet_key <- function(max_graphlet_size) { #' Orbit key #' #' Metdata about orbit groups. -#' @param max_graphlet_size Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported. +#' @param max_graphlet_size Maximum number of nodes graphlets can contain. +#' Currently only size 2 to 5 are supported. #' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain @@ -611,11 +625,11 @@ graphlet_ids_for_size <- function(graphlet_size) { #' @param feature_type Type of graphlet-based degree distributions. Can be #' \code{graphlet} to count graphlets or \code{orbit} to count orbits. #' @return A named list where each element contains a set of GDDs for a single -#' @param max_graphlet_size Maximum size of graphlets to use when generating GDD. -#' Currently only size 4 and 5 are supported. 
-#' @param ego_neighbourhood_size The number of steps from the source node used to select the -#' neighboring nodes to be included in the source node ego-network. If set to 0, ego-networks will not be -#' used. +#' @param max_graphlet_size Maximum size of graphlets to use when generating +#' GDD. Currently only size 4 and 5 are supported. +#' @param ego_neighbourhood_size The number of steps from the source node used +#' to select the neighboring nodes to be included in the source node +#' ego-network. If set to 0, ego-networks will not be used. #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. #' @return A named list where each element contains a set of GDDs for a single diff --git a/tests/testthat/test_dhist.R b/tests/testthat/test_dhist.R index 37dc59b2..66ce6574 100644 --- a/tests/testthat/test_dhist.R +++ b/tests/testthat/test_dhist.R @@ -1,253 +1,301 @@ context("dhist: Discrete histogram from observations") -test_that("discrete_hist generates correct discrete histograms for random integer observations", { - # Method for generating random observations containing specific locations a - # specific number of times - random_observations <- function(locations, counts) { - # Construct vector containing each location replicated "count" times - observations <- purrr::simplify(purrr::map2(locations, counts, rep)) - # Randomise the order of the observations - sample(observations, size = length(observations), replace = FALSE) - } +test_that( + paste( + "discrete_hist generates correct discrete histograms for random integer", + "observations" + ), + { + # Method for generating random observations containing specific locations a + # specific number of times + random_observations <- function(locations, counts) { + # Construct vector containing each location replicated "count" times + observations <- purrr::simplify(purrr::map2(locations, counts, rep)) + # Randomise the order of the 
observations + sample(observations, size = length(observations), replace = FALSE) + } - set.seed(2684) - num_tests <- 100 - - run_test <- function() { - # Set parameters for generation of random observation sets - num_observations <- 100 - location_range <- -(num_observations * 3):(num_observations * 3) - # Do not allow zero counts as these locations will not be present in the - # observations generated from the locations and counts - count_range <- 1:10 - - # Generate random observation sets - locations <- sample(location_range, num_observations, replace = FALSE) - counts <- sample(count_range, num_observations, replace = TRUE) - - # Construct vector containing each location replicated "count" times - observations_orig <- purrr::simplify(purrr::map2(locations, counts, rep)) - # Randomise the order of the observations - observations <- sample(observations_orig, size = length(observations_orig), replace = FALSE) - - # Generate discrete histograms - hist <- dhist_from_obs(observations) - - # discrete_hist will drop bins with zero counts, so remove these from the - # expected data (not necessary now we've restricted counts to be >= 1, but - # the bug where we generated test locations with zero counts was so annoying - # to identify that we're going with a belt and braces approach) - non_zero_count_indexes <- counts != 0 - expected_locations <- locations[non_zero_count_indexes] - expected_counts <- counts[non_zero_count_indexes] - # dhist_from_obs will return results with bins ordered by ascending location, - # so sort expected data to match - sorted_locations <- sort(expected_locations, index.return = TRUE) - sorted_location_indexes <- sorted_locations$ix - expected_locations <- expected_locations[sorted_location_indexes] - expected_counts <- expected_counts[sorted_location_indexes] - - # Check that histogram locations and counts match those used to generate the - # observations - expect_true(all.equal(hist$locations, expected_locations)) - 
expect_true(all.equal(hist$masses, expected_counts)) - } + set.seed(2684) + num_tests <- 100 + + run_test <- function() { + # Set parameters for generation of random observation sets + num_observations <- 100 + location_range <- -(num_observations * 3):(num_observations * 3) + # Do not allow zero counts as these locations will not be present in the + # observations generated from the locations and counts + count_range <- 1:10 + + # Generate random observation sets + locations <- sample(location_range, num_observations, replace = FALSE) + counts <- sample(count_range, num_observations, replace = TRUE) + + # Construct vector containing each location replicated "count" times + observations_orig <- purrr::simplify(purrr::map2(locations, counts, rep)) + # Randomise the order of the observations + observations <- sample( + observations_orig, + size = length(observations_orig), + replace = FALSE + ) + + # Generate discrete histograms + hist <- dhist_from_obs(observations) + + # discrete_hist will drop bins with zero counts, so remove these from the + # expected data (not necessary now we've restricted counts to be >= 1, but + # the bug where we generated test locations with zero counts was so + # annoying to identify that we're going with a belt and braces approach) + non_zero_count_indexes <- counts != 0 + expected_locations <- locations[non_zero_count_indexes] + expected_counts <- counts[non_zero_count_indexes] + # dhist_from_obs will return results with bins ordered by ascending + # location, so sort expected data to match + sorted_locations <- sort(expected_locations, index.return = TRUE) + sorted_location_indexes <- sorted_locations$ix + expected_locations <- expected_locations[sorted_location_indexes] + expected_counts <- expected_counts[sorted_location_indexes] + + # Check that histogram locations and counts match those used to generate + # the observations + expect_true(all.equal(hist$locations, expected_locations)) + expect_true(all.equal(hist$masses, 
expected_counts)) + } - for (i in 1:num_tests) { - run_test() + for (i in 1:num_tests) { + run_test() + } } -}) - -context("dhist: constructor, equality operator and as_* transformation functions") -test_that("dhist constuctor has correct locations and masses (default smoothing, unsorted)", { - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, sorted = FALSE) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, sorted = FALSE) - - expected_class <- "dhist" - expected_smoothing_window_width <- 0 - - expected1 <- list( - locations = locations1, masses = masses1, - smoothing_window_width = expected_smoothing_window_width - ) - class(expected1) <- expected_class - - expected2 <- list( - locations = locations2, masses = masses2, - smoothing_window_width = expected_smoothing_window_width - ) - class(expected2) <- expected_class - - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) - -test_that("dhist constuctor has correct locations and masses (default smoothing, sorted)", { - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, sorted = TRUE) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, sorted = TRUE) - - expected_class <- "dhist" - expected_smoothing_window_width <- 0 - - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected1) <- expected_class - - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected2) <- expected_class - - expect_equal(actual1, expected1) 
- expect_equal(actual2, expected2) -}) - -test_that("dhist constuctor has correct locations and masses (default smoothing, default sorting)", { - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2) - - expected_class <- "dhist" - expected_smoothing_window_width <- 0 +) + +context( + "dhist: constructor, equality operator and as_* transformation functions" +) +test_that( + paste( + "dhist constuctor has correct locations and masses (default smoothing,", + "unsorted)" + ), + { + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist(locations = locations1, masses = masses1, sorted = FALSE) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist(locations = locations2, masses = masses2, sorted = FALSE) + + expected_class <- "dhist" + expected_smoothing_window_width <- 0 + + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = expected_smoothing_window_width + ) + class(expected1) <- expected_class - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected1) <- expected_class + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = expected_smoothing_window_width + ) + class(expected2) <- expected_class - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width - ) - class(expected2) <- expected_class + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (default smoothing,", + 
"sorted)" + ), + { + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist(locations = locations1, masses = masses1, sorted = TRUE) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist(locations = locations2, masses = masses2, sorted = TRUE) + + expected_class <- "dhist" + expected_smoothing_window_width <- 0 + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = expected_smoothing_window_width + ) + class(expected1) <- expected_class - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = expected_smoothing_window_width + ) + class(expected2) <- expected_class -test_that("dhist constuctor has correct locations and masses (specified smoothing, unsorted)", { - smoothing_window_width <- 1 + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (default smoothing,", + "default sorting)" + ), + { + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist(locations = locations1, masses = masses1) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist(locations = locations2, masses = masses2) + + expected_class <- "dhist" + expected_smoothing_window_width <- 0 + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = expected_smoothing_window_width + ) + class(expected1) <- expected_class - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = FALSE - ) - locations2 <- c(3, 0, -62, 7, 
16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = FALSE - ) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = expected_smoothing_window_width + ) + class(expected2) <- expected_class - expected_class <- "dhist" + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (specified smoothing,", + "unsorted)" + ), + { + smoothing_window_width <- 1 - expected1 <- list( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width - ) - class(expected1) <- expected_class + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) - expected2 <- list( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width - ) - class(expected2) <- expected_class + expected_class <- "dhist" - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) + class(expected1) <- expected_class -test_that("dhist constuctor has correct locations and masses (specified smoothing, sorted)", { - smoothing_window_width <- 1 + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) + class(expected2) <- expected_class - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - 
actual1 <- dhist( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = TRUE - ) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = TRUE - ) + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (specified smoothing,", + "sorted)" + ), + { + smoothing_window_width <- 1 - expected_class <- "dhist" + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = smoothing_window_width - ) - class(expected1) <- expected_class + expected_class <- "dhist" - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width - ) - class(expected2) <- expected_class + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) + class(expected1) <- expected_class - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) + class(expected2) <- expected_class -test_that("dhist constuctor has correct locations and masses (specified smoothing, default sorting)", { - 
smoothing_window_width <- 1 + expect_equal(actual1, expected1) + expect_equal(actual2, expected2) + } +) + +test_that( + paste( + "dhist constuctor has correct locations and masses (specified smoothing,", + "default sorting)" + ), + { + smoothing_window_width <- 1 - locations1 <- c(7, 42, 1, 21, 101, 9) - masses1 <- c(15, 12, 16, 13, 11, 14) - actual1 <- dhist( - locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width - ) - locations2 <- c(3, 0, -62, 7, 16, -58) - masses2 <- c(23, 24, 26, 22, 21, 25) - actual2 <- dhist( - locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width - ) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) - expected_class <- "dhist" + expected_class <- "dhist" - expected1 <- list( - locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13, 12, 11), - smoothing_window_width = smoothing_window_width - ) - class(expected1) <- expected_class + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) + class(expected1) <- expected_class - expected2 <- list( - locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width - ) - class(expected2) <- expected_class + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) + class(expected2) <- expected_class - expect_equal(actual1, expected1) - expect_equal(actual2, expected2) -}) + expect_equal(actual1, expected1) + 
expect_equal(actual2, expected2) + } +) test_that("as_smoothed_dhist sets smoothing_window_width correctly", { dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), @@ -260,7 +308,10 @@ test_that("as_smoothed_dhist sets smoothing_window_width correctly", { dhist_pre$smoothing_window_width, expected_smoothing_window_width_pre ) - dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) + dhist_post <- as_smoothed_dhist( + dhist_pre, + expected_smoothing_window_width_post + ) expect_equal( dhist_post$smoothing_window_width, expected_smoothing_window_width_post @@ -279,7 +330,10 @@ test_that("as_unsmoothed_dhist sets smoothing_window_width correctly", { dhist_pre$smoothing_window_width, expected_smoothing_window_width_pre ) - dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) + dhist_post <- as_smoothed_dhist( + dhist_pre, + expected_smoothing_window_width_post + ) expect_equal( dhist_post$smoothing_window_width, expected_smoothing_window_width_post @@ -303,7 +357,8 @@ test_that("Non-identical dhists are NOT considered equal", { # Change a single element of the locations field dhist2_one_location_mismatch <- dhist1 - dhist2_one_location_mismatch$locations[3] <- dhist2_one_location_mismatch$locations[1] + 1 + dhist2_one_location_mismatch$locations[3] <- + dhist2_one_location_mismatch$locations[1] + 1 expect_false(dhist1 == dhist2_one_location_mismatch) # Change a single element of the masses field @@ -323,25 +378,37 @@ test_that("Non-identical dhists are NOT considered equal", { }) context("dhist: Discrete histogram variance") -test_that("dhist_variance difference for smoothed and unsmoothed dhists is smoothing_window_width^2 / 12", { - dhist <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) - # Be careful: ensure that no smoothing window width results in overlapping bins - smoothing_window_width_A <- 1 - smoothing_window_width_B <- 2 - dhist_unsmoothed <- 
as_unsmoothed_dhist(dhist) - dhist_smoothed_A <- as_smoothed_dhist(dhist, smoothing_window_width_A) - dhist_smoothed_B <- as_smoothed_dhist(dhist, smoothing_window_width_B) - - var_unsmoothed <- dhist_variance(dhist_unsmoothed) - var_smoothed_A <- dhist_variance(dhist_smoothed_A) - var_smoothed_B <- dhist_variance(dhist_smoothed_B) - - expected_var_smoothed_A <- var_unsmoothed + ((smoothing_window_width_A^2) / 12) - expected_var_smoothed_B <- var_unsmoothed + ((smoothing_window_width_B^2) / 12) - - expect_equal(var_smoothed_A, expected_var_smoothed_A) - expect_equal(var_smoothed_B, expected_var_smoothed_B) -}) +test_that( + paste( + "dhist_variance difference for smoothed and unsmoothed dhists is", + "smoothing_window_width^2 / 12" + ), + { + dhist <- dhist( + locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14) + ) + # Be careful: ensure that no smoothing window width results in overlapping + # bins + smoothing_window_width_A <- 1 + smoothing_window_width_B <- 2 + dhist_unsmoothed <- as_unsmoothed_dhist(dhist) + dhist_smoothed_A <- as_smoothed_dhist(dhist, smoothing_window_width_A) + dhist_smoothed_B <- as_smoothed_dhist(dhist, smoothing_window_width_B) + + var_unsmoothed <- dhist_variance(dhist_unsmoothed) + var_smoothed_A <- dhist_variance(dhist_smoothed_A) + var_smoothed_B <- dhist_variance(dhist_smoothed_B) + + expected_var_smoothed_A <- var_unsmoothed + + ((smoothing_window_width_A^2) / 12) + expected_var_smoothed_B <- var_unsmoothed + + ((smoothing_window_width_B^2) / 12) + + expect_equal(var_smoothed_A, expected_var_smoothed_A) + expect_equal(var_smoothed_B, expected_var_smoothed_B) + } +) test_that("dhist_variance returns sigma^2 for unsmoothed normal histograms", { num_hists <- 5 @@ -410,110 +477,149 @@ test_that("normalise_dhist_mass output sums to 1", { }) context("dhist: Discrete histogram variance normalisation") -test_that("normalise_histogram_variance output has variance of 1 for random integer histograms", { - # Generate 
histograms with random masses and random centres - num_hists <- 10 - num_bins <- 70 - - mass_min <- 0 - mass_max <- 100 - rand_masses <- function() { - return(runif(num_bins, mass_min, mass_max)) - } +test_that( + paste( + "normalise_histogram_variance output has variance of 1 for random integer", + "histograms" + ), + { + # Generate histograms with random masses and random centres + num_hists <- 10 + num_bins <- 70 + + mass_min <- 0 + mass_max <- 100 + rand_masses <- function() { + return(runif(num_bins, mass_min, mass_max)) + } - centre_min <- -30 - centre_max <- 70 - rand_locations <- function() { - return(round(sample(centre_min:centre_max, num_bins), digits = 0)) - } + centre_min <- -30 + centre_max <- 70 + rand_locations <- function() { + return(round(sample(centre_min:centre_max, num_bins), digits = 0)) + } - rand_dhists <- replicate(num_hists, dhist(masses = rand_masses(), locations = rand_locations()), simplify = FALSE) + rand_dhists <- replicate( + num_hists, + dhist(masses = rand_masses(), locations = rand_locations()), + simplify = FALSE + ) - smoothing_window_width <- 1 - rand_dhists_unsmoothed <- purrr::map(rand_dhists, as_unsmoothed_dhist) - rand_dhists_smoothed <- purrr::map(rand_dhists, as_smoothed_dhist, smoothing_window_width = smoothing_window_width) + smoothing_window_width <- 1 + rand_dhists_unsmoothed <- purrr::map(rand_dhists, as_unsmoothed_dhist) + rand_dhists_smoothed <- purrr::map( + rand_dhists, + as_smoothed_dhist, + smoothing_window_width = smoothing_window_width + ) - expected_post_norm_smoothing_windows <- purrr::map_dbl(rand_dhists_smoothed, function(dhist) { - smoothing_window_width / dhist_std(dhist) - }) + expected_post_norm_smoothing_windows <- purrr::map_dbl( + rand_dhists_smoothed, + function(dhist) { + smoothing_window_width / dhist_std(dhist) + } + ) - actual_dhist_unsmoothed <- purrr::map(rand_dhists_unsmoothed, function(dhist) { - normalise_dhist_variance(dhist) - }) - actual_dhist_smoothed <- 
purrr::map(rand_dhists_smoothed, function(dhist) { - normalise_dhist_variance(dhist) - }) - expected_variance <- 1 - # Check variance of normalised hostograms is as expected - purrr::walk(actual_dhist_unsmoothed, function(dhist) { - expect_equal(dhist_variance(dhist), expected_variance) - }) - purrr::walk(actual_dhist_smoothed, function(dhist) { - expect_equal(dhist_variance(dhist), expected_variance) - }) - # Check smoothing window is as expected (0 for unsmoothe; smoothing_window_width/sigma for smoothed) - purrr::walk(actual_dhist_unsmoothed, function(dhist) { - expect_equal(dhist$smoothing_window_width, 0) - }) - purrr::walk2( - actual_dhist_smoothed, expected_post_norm_smoothing_windows, - function(dhist, sww) { - expect_equal(dhist$smoothing_window_width, sww) - } - ) - # Check masses unaltered - purrr::walk2( - actual_dhist_unsmoothed, rand_dhists_unsmoothed, - function(actual, expected) { - expect_equal(actual$masses, expected$masses) - } - ) - purrr::walk2( - actual_dhist_smoothed, rand_dhists_smoothed, - function(actual, expected) { - expect_equal(actual$masses, expected$masses) + actual_dhist_unsmoothed <- purrr::map( + rand_dhists_unsmoothed, + function(dhist) { + normalise_dhist_variance(dhist) + } + ) + actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) { + normalise_dhist_variance(dhist) + }) + expected_variance <- 1 + # Check variance of normalised hostograms is as expected + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist_variance(dhist), expected_variance) + }) + purrr::walk(actual_dhist_smoothed, function(dhist) { + expect_equal(dhist_variance(dhist), expected_variance) + }) + # Check smoothing window is as expected (0 for unsmoothe; smoothing_window + # width/sigma for smoothed) + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist$smoothing_window_width, 0) + }) + purrr::walk2( + actual_dhist_smoothed, expected_post_norm_smoothing_windows, + function(dhist, sww) { + 
expect_equal(dhist$smoothing_window_width, sww) + } + ) + # Check masses unaltered + purrr::walk2( + actual_dhist_unsmoothed, rand_dhists_unsmoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) + purrr::walk2( + actual_dhist_smoothed, rand_dhists_smoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) + } +) + +test_that( + paste( + "normalise_histogram_variance output has variance of 1 for normal", + "histograms" + ), + { + num_hists <- 5 + num_bins <- 100001 + + mus <- runif(num_hists, -10, 10) + sigmas <- runif(num_hists, 0, 10) + + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) } - ) -}) -test_that("normalise_histogram_variance output has variance of 1 for normal histograms", { - num_hists <- 5 - num_bins <- 100001 - - mus <- runif(num_hists, -10, 10) - sigmas <- runif(num_hists, 0, 10) - - rand_locations <- function(mu, sigma) { - return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { + locations <- rand_locations(mu, sigma) + masses <- dnorm(locations, mean = mu, sd = sigma) + return(dhist(masses = masses, locations = locations)) + }) + + actuals <- purrr::map(rand_dhists, function(dhist) { + dhist_variance(normalise_dhist_variance(dhist)) + }) + expected <- 1 + purrr::map_dbl(actuals, function(actual) { + expect_equal(actual, expected) + }) } - - rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { - locations <- rand_locations(mu, sigma) - masses <- dnorm(locations, mean = mu, sd = sigma) - return(dhist(masses = masses, locations = locations)) - }) - - actuals <- purrr::map(rand_dhists, function(dhist) { - dhist_variance(normalise_dhist_variance(dhist)) - }) - expected <- 1 - purrr::map_dbl(actuals, function(actual) { - expect_equal(actual, expected) - }) -}) +) context("dhist: Sort dhist") test_that("sort_dhist works", { # NOTE: 
Need to construct dhist objects explicitly as the dhist constructor # now returns a sorted dhist and we want to be independent of this - dhist1 <- list(locations = c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) + dhist1 <- list( + locations = c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14) + ) class(dhist1) <- "dhist" - dhist2 <- list(locations = c(3, 0, -62, 7, 16, -58), masses = c(23, 24, 26, 22, 21, 25)) + dhist2 <- list( + locations = c(3, 0, -62, 7, 16, -58), + masses = c(23, 24, 26, 22, 21, 25) + ) class(dhist2) <- "dhist" - expected1 <- list(locations = c(1, 7, 9, 21, 42, 101), masses = c(16, 15, 14, 13, 12, 11)) + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11) + ) class(expected1) <- "dhist" - expected2 <- list(locations = c(-62, -58, 0, 3, 7, 16), masses = c(26, 25, 24, 23, 22, 21)) + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21) + ) class(expected2) <- "dhist" actual1 <- sort_dhist(dhist1) @@ -524,174 +630,269 @@ test_that("sort_dhist works", { }) context("dhist: ECMF") -test_that("dhist_ecmf returns correct step function when smoothing_window_width is zero", { - dhist1 <- dhist(locations = c(1, 2, 4, 7, 11, 16, 22), masses = c(21, 22, 23, 27, 31, 36, 42)) - dhist1_unsmoothed <- as_unsmoothed_dhist(dhist1) - - ecmf1 <- dhist_ecmf(dhist1) - actual_knots1 <- ecmf_knots(ecmf1) - actual_knots_ecds1 <- ecmf1(actual_knots1) - inter_knots_x <- head(actual_knots1, length(actual_knots1) - 1) - actual_inter_knots_ecds1 <- ecmf1(inter_knots_x) - extra_knots <- c(actual_knots1[1] - 1, actual_knots1[length(actual_knots1)] + 1) - actual_extra_knots_ecds1 <- ecmf1(extra_knots) - - cum_masses1 <- cumsum(dhist1$masses) - max_cum_mass <- cum_masses1[length(cum_masses1)] - expected_knots_ecds1 <- cum_masses1 - expected_inter_knots_ecds1 <- head(expected_knots_ecds1, length(expected_knots_ecds1) - 1) - expected_extra_knots_ecds1 <- c(0, max_cum_mass) 
- expected_knots1 <- dhist1$locations - - expect_equal(actual_knots1, expected_knots1) - expect_equal(actual_knots_ecds1, expected_knots_ecds1) - expect_equal(actual_inter_knots_ecds1, expected_inter_knots_ecds1) - expect_equal(actual_extra_knots_ecds1, expected_extra_knots_ecds1) -}) +test_that( + paste( + "dhist_ecmf returns correct step function when smoothing_window_width is", + "zero" + ), + { + dhist1 <- dhist( + locations = c(1, 2, 4, 7, 11, 16, 22), + masses = c(21, 22, 23, 27, 31, 36, 42) + ) + dhist1_unsmoothed <- as_unsmoothed_dhist(dhist1) + + ecmf1 <- dhist_ecmf(dhist1) + actual_knots1 <- ecmf_knots(ecmf1) + actual_knots_ecds1 <- ecmf1(actual_knots1) + inter_knots_x <- head(actual_knots1, length(actual_knots1) - 1) + actual_inter_knots_ecds1 <- ecmf1(inter_knots_x) + extra_knots <- c( + actual_knots1[1] - 1, + actual_knots1[length(actual_knots1)] + 1 + ) + actual_extra_knots_ecds1 <- ecmf1(extra_knots) + + cum_masses1 <- cumsum(dhist1$masses) + max_cum_mass <- cum_masses1[length(cum_masses1)] + expected_knots_ecds1 <- cum_masses1 + expected_inter_knots_ecds1 <- head( + expected_knots_ecds1, + length(expected_knots_ecds1) - 1 + ) + expected_extra_knots_ecds1 <- c(0, max_cum_mass) + expected_knots1 <- dhist1$locations + + expect_equal(actual_knots1, expected_knots1) + expect_equal(actual_knots_ecds1, expected_knots_ecds1) + expect_equal(actual_inter_knots_ecds1, expected_inter_knots_ecds1) + expect_equal(actual_extra_knots_ecds1, expected_extra_knots_ecds1) + } +) context("dhist: Area between ECMFs (simple integer dhists)") -test_that("area_between_dhist_ecmfs returns correct value for simple integer dhists", { - # Example dhists constructed by hand to result in lots of "bowtie" segments - # for smoothed ECMFs and to allow expected areas to be calculated by hand - # Unsmoothed locations are on an integer grid, smoothed bin edges are on a - # half-integer grid - # Smoothed and unsmoothed ECMF cumulative masses are on integer grid - # Smoothed ECMF 
crossing points are on a quarter-integer grid - dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) - dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) - - # Set up smoothed and unsmoothed versions of histograms - smoothing_window_width <- 1 - dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) - dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) - dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) - dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - - # Set expected area - expected_area_unsmoothed <- 4 - expected_area_smoothed <- 3 - - # Generate ecmfs - ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) - ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) - ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) - ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) - - # Calculate area between ECMFs - actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) - actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - - # Compare caculated areas with expected areas - expect_equal(actual_area_unsmoothed, expected_area_unsmoothed) - expect_equal(actual_area_smoothed, expected_area_smoothed) -}) +test_that( + paste( + "area_between_dhist_ecmfs returns correct value for simple integer dhists" + ), + { + # Example dhists constructed by hand to result in lots of "bowtie" segments + # for smoothed ECMFs and to allow expected areas to be calculated by hand + # Unsmoothed locations are on an integer grid, smoothed bin edges are on a + # half-integer grid + # Smoothed and unsmoothed ECMF cumulative masses are on integer grid + # Smoothed ECMF crossing points are on a quarter-integer grid + dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) + dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) + + # Set up smoothed and unsmoothed versions of histograms + smoothing_window_width <- 1 + dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) + 
dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) + dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) + dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) + + # Set expected area + expected_area_unsmoothed <- 4 + expected_area_smoothed <- 3 + + # Generate ecmfs + ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) + ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) + ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) + ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) + + # Calculate area between ECMFs + actual_area_unsmoothed <- area_between_dhist_ecmfs( + ecmfA_unsmoothed, + ecmfB_unsmoothed + ) + actual_area_smoothed <- area_between_dhist_ecmfs( + ecmfA_smoothed, + ecmfB_smoothed + ) -context("dhist: Area between ECMFs (non-integer normalised dhists)") -test_that("area_between_dhist_ecmfs returns correct value for non-integer normalised dhists", { + # Compare caculated areas with expected areas + expect_equal(actual_area_unsmoothed, expected_area_unsmoothed) + expect_equal(actual_area_smoothed, expected_area_smoothed) + } +) - # Previous simple integer grid where both histograms have been separately - # normalised to unit mass and variance. Has locations and masses at a range - # of floating point locations. Has bowties, triangles and trapeziums. - dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) - dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) +context("dhist: Area between ECMFs (non-integer normalised dhists)") +test_that( + paste( + "area_between_dhist_ecmfs returns correct value for non-integer normalised", + "dhists" + ), + { + + # Previous simple integer grid where both histograms have been separately + # normalised to unit mass and variance. Has locations and masses at a range + # of floating point locations. Has bowties, triangles and trapeziums. 
+ dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) + dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) + + dhistA <- normalise_dhist_mass(normalise_dhist_variance(dhistA)) + dhistB <- normalise_dhist_mass(normalise_dhist_variance(dhistB)) + + # Set up smoothed and unsmoothed versions of histograms + smoothing_window_width <- 1 + dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) + dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) + dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) + dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) + + # Generate ecmfs + ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) + ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) + ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) + ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) + + # Define some functions to make calculation of manually measured areas + # easier + rectangle_area <- function(width, height) { + return(width * height) + } + triangle_area <- function(base, height) { + return(0.5 * base * height) + } + trapezium_area <- function(side_a, side_b, height) { + return(0.5 * (side_a + side_b) * height) + } + # Measurements of expected area between ECMFs done by hand by printing + # normalised ECMFs on a grid with x-spacing of 0.02 and y-spacing of 0.01) + # Actual grid counts preserved in data to facilitate less tedious manual + # checking if required + # --- Unsmoothed --- + area_A_unsmoothed <- rectangle_area(width = 10 * 0.02, height = 12.5 * 0.01) + area_B_unsmoothed <- rectangle_area(width = 50.5 * 0.02, height = 37.5 * 0.01) + area_C_unsmoothed <- rectangle_area(width = 26 * 0.02, height = 12.5 * 0.01) + area_D_unsmoothed <- rectangle_area(width = 34.5 * 0.02, height = 12.5 * 0.01) + area_E_unsmoothed <- rectangle_area(width = 26.5 * 0.02, height = 25 * 0.01) + expected_area_unsmoothed <- + sum( + area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, + area_D_unsmoothed, area_E_unsmoothed + ) + # --- Smoothed --- 
+ area_A_smoothed <- triangle_area(base = 2.75 * 0.01, height = 6.5 * 0.02) + area_B_smoothed <- triangle_area(base = 2.75 * 0.01, height = 3 * 0.02) + area_C_smoothed <- triangle_area(base = 18.5 * 0.01, height = 21 * 0.02) + area_D_smoothed <- trapezium_area( + side_a = 18.5 * 0.01, + side_b = 37.5 * 0.01, + height = 14.5 * 0.02 + ) + area_E_smoothed <- trapezium_area( + side_a = 37.5 * 0.01, + side_b = 37.5 * 0.01, + height = 16 * 0.02 + ) + area_F_smoothed <- triangle_area(base = 37.5 * 0.01, height = 22.5 * 0.02) + area_G_smoothed <- triangle_area(base = 7.5 * 0.01, height = 8 * 0.02) + area_H_smoothed <- triangle_area(base = 7.5 * 0.01, height = 11 * 0.02) + area_I_smoothed <- triangle_area(base = 12.5 * 0.01, height = 19.5 * 0.02) + area_J_smoothed <- trapezium_area( + side_a = 12.5 * 0.01, + side_b = 20 * 0.01, + height = 30.5 * 0.02 + ) + area_K_smoothed <- trapezium_area( + side_a = 20 * 0.01, + side_b = 18 * 0.01, + height = 8 * 0.02 + ) + area_L_smoothed <- triangle_area(base = 18 * 0.01, height = 22 * 0.02) + expected_area_smoothed <- + sum( + area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, + area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, + area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed + ) + + # Calculate area between ECMFs + actual_area_unsmoothed <- area_between_dhist_ecmfs( + ecmfA_unsmoothed, + ecmfB_unsmoothed + ) + actual_area_smoothed <- area_between_dhist_ecmfs( + ecmfA_smoothed, + ecmfB_smoothed + ) - dhistA <- normalise_dhist_mass(normalise_dhist_variance(dhistA)) - dhistB <- normalise_dhist_mass(normalise_dhist_variance(dhistB)) + # Compare caculated areas with expected areas + expect_equalish_manual <- function(actual, expected, relative_tolerance) { + relative_diff <- abs(actual - expected) / expected + expect_lte(relative_diff, relative_tolerance) + } - # Set up smoothed and unsmoothed versions of histograms - smoothing_window_width <- 1 - dhistA_unsmoothed <- 
as_unsmoothed_dhist(dhistA) - dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) - dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) - dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - - # Generate ecmfs - ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) - ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) - ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) - ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) - - # Define some functions to make calculation of manually measured areas easier - rectangle_area <- function(width, height) { - return(width * height) - } - triangle_area <- function(base, height) { - return(0.5 * base * height) - } - trapezium_area <- function(side_a, side_b, height) { - return(0.5 * (side_a + side_b) * height) + # Given manual measurement of areas between curves, consider area correct + # if actual and expected areas are within 1% of each other + expect_equalish_manual(actual_area_unsmoothed, expected_area_unsmoothed, 0.01) + expect_equalish_manual(actual_area_smoothed, expected_area_smoothed, 0.01) } - # Measurements of expected area between ECMFs done by hand by printing - # normalised ECMFs on a grid with x-spacing of 0.02 and y-spacing of 0.01) - # Actual grid counts preserved in data to facilitate less tedious manual - # checking if required - # --- Unsmoothed --- - area_A_unsmoothed <- rectangle_area(width = 10 * 0.02, height = 12.5 * 0.01) - area_B_unsmoothed <- rectangle_area(width = 50.5 * 0.02, height = 37.5 * 0.01) - area_C_unsmoothed <- rectangle_area(width = 26 * 0.02, height = 12.5 * 0.01) - area_D_unsmoothed <- rectangle_area(width = 34.5 * 0.02, height = 12.5 * 0.01) - area_E_unsmoothed <- rectangle_area(width = 26.5 * 0.02, height = 25 * 0.01) - expected_area_unsmoothed <- - sum( - area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, - area_D_unsmoothed, area_E_unsmoothed - ) - # --- Smoothed --- - area_A_smoothed <- triangle_area(base = 2.75 * 0.01, height = 6.5 * 0.02) - area_B_smoothed 
<- triangle_area(base = 2.75 * 0.01, height = 3 * 0.02) - area_C_smoothed <- triangle_area(base = 18.5 * 0.01, height = 21 * 0.02) - area_D_smoothed <- trapezium_area(side_a = 18.5 * 0.01, side_b = 37.5 * 0.01, height = 14.5 * 0.02) - area_E_smoothed <- trapezium_area(side_a = 37.5 * 0.01, side_b = 37.5 * 0.01, height = 16 * 0.02) - area_F_smoothed <- triangle_area(base = 37.5 * 0.01, height = 22.5 * 0.02) - area_G_smoothed <- triangle_area(base = 7.5 * 0.01, height = 8 * 0.02) - area_H_smoothed <- triangle_area(base = 7.5 * 0.01, height = 11 * 0.02) - area_I_smoothed <- triangle_area(base = 12.5 * 0.01, height = 19.5 * 0.02) - area_J_smoothed <- trapezium_area(side_a = 12.5 * 0.01, side_b = 20 * 0.01, height = 30.5 * 0.02) - area_K_smoothed <- trapezium_area(side_a = 20 * 0.01, side_b = 18 * 0.01, height = 8 * 0.02) - area_L_smoothed <- triangle_area(base = 18 * 0.01, height = 22 * 0.02) - expected_area_smoothed <- - sum( - area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, - area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, - area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed - ) - - # Calculate area between ECMFs - actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) - actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - - # Compare caculated areas with expected areas - expect_equalish_manual <- function(actual, expected, relative_tolerance) { - relative_diff <- abs(actual - expected) / expected - expect_lte(relative_diff, relative_tolerance) - } - - # Given manual measurement of areas between curves, consider area correct - # if actual and expected areas are within 1% of each other - expect_equalish_manual(actual_area_unsmoothed, expected_area_unsmoothed, 0.01) - expect_equalish_manual(actual_area_smoothed, expected_area_smoothed, 0.01) -}) +) context("dhist: Harmonise dhist locations") test_that("harmonise_dhist_locations works A", { - dhist1 <- 
dhist(masses = c(11, 12, 13), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) - dhist2 <- dhist(masses = c(21, 22, 23), locations = c(2, 4, 6), smoothing_window_width = 1, sorted = FALSE) + dhist1 <- dhist( + masses = c(11, 12, 13), + locations = c(1, 3, 5), + smoothing_window_width = 1, + sorted = FALSE + ) + dhist2 <- dhist( + masses = c(21, 22, 23), + locations = c(2, 4, 6), + smoothing_window_width = 1, + sorted = FALSE + ) expected <- list( - dhist1 = dhist(masses = c(11, 12, 13, 0, 0, 0), locations = c(1, 3, 5, 2, 4, 6), smoothing_window_width = 1, sorted = FALSE), - dhist2 = dhist(masses = c(21, 22, 23, 0, 0, 0), locations = c(2, 4, 6, 1, 3, 5), smoothing_window_width = 1, sorted = FALSE) + dhist1 = dhist( + masses = c(11, 12, 13, 0, 0, 0), + locations = c(1, 3, 5, 2, 4, 6), + smoothing_window_width = 1, + sorted = FALSE + ), + dhist2 = dhist( + masses = c(21, 22, 23, 0, 0, 0), + locations = c(2, 4, 6, 1, 3, 5), + smoothing_window_width = 1, + sorted = FALSE + ) ) actual <- harmonise_dhist_locations(dhist1, dhist2) expect_equal(actual, expected) }) test_that("harmonise_dhist_locations works B", { - dhist1 <- dhist(masses = c(1, 1, 1), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) - dhist2 <- dhist(masses = c(1, 1, 1), locations = c(4, 5, 6), smoothing_window_width = 1, sorted = FALSE) + dhist1 <- dhist( + masses = c(1, 1, 1), + locations = c(1, 3, 5), + smoothing_window_width = 1, + sorted = FALSE + ) + dhist2 <- dhist( + masses = c(1, 1, 1), + locations = c(4, 5, 6), + smoothing_window_width = 1, + sorted = FALSE + ) expected <- list( - dhist1 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(1, 3, 5, 4, 6), smoothing_window_width = 1, sorted = FALSE), - dhist2 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(4, 5, 6, 1, 3), smoothing_window_width = 1, sorted = FALSE) + dhist1 = dhist( + masses = c(1, 1, 1, 0, 0), + locations = c(1, 3, 5, 4, 6), + smoothing_window_width = 1, + sorted = FALSE + ), + dhist2 = dhist( + 
masses = c(1, 1, 1, 0, 0), + locations = c(4, 5, 6, 1, 3), + smoothing_window_width = 1, + sorted = FALSE + ) ) actual <- harmonise_dhist_locations(dhist1, dhist2) diff --git a/tests/testthat/test_emd.R b/tests/testthat/test_emd.R index 9db131ee..3c782aec 100644 --- a/tests/testthat/test_emd.R +++ b/tests/testthat/test_emd.R @@ -1,14 +1,17 @@ context("EMD: Cost matrix") # COST_MATRIX: Property-based tests -test_that("cost_matrix returns all zeros when all bin locations are identical", { - bin_centres1 <- c(1, 1, 1, 1, 1, 1, 1) - bin_centres2 <- bin_centres1 - expected <- matrix(0, - nrow = length(bin_centres1), - ncol = length(bin_centres2) - ) - expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) -}) +test_that( + "cost_matrix returns all zeros when all bin locations are identical", + { + bin_centres1 <- c(1, 1, 1, 1, 1, 1, 1) + bin_centres2 <- bin_centres1 + expected <- matrix(0, + nrow = length(bin_centres1), + ncol = length(bin_centres2) + ) + expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) + } +) test_that("cost_matrix returns zeros along diagonal when both sets of bin locations are the same", { diff --git a/tests/testthat/test_graph_binning.R b/tests/testthat/test_graph_binning.R index b54d4796..f2bbf28d 100644 --- a/tests/testthat/test_graph_binning.R +++ b/tests/testthat/test_graph_binning.R @@ -1,29 +1,46 @@ context("Graph binning: Adaptive binning") -test_that("adaptive_breaks merges 2 lowest bins where only first bin is below minimum", { - min_count <- 5 - x <- c( - 1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 3, 4, 5, 6, 7) - - expect_equal(final_breaks_actual, final_breaks_expected) -}) +test_that( + "adaptive_breaks merges 2 lowest bins where only first bin is below minimum", + { + min_count <- 5 + x <- c( 
+ 1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 3, 4, 5, 6, 7) -test_that("adaptive_breaks merges 3 lowest bins where lowest 2 combined are below minimum", { - min_count <- 5 - x <- c( - 1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 4, 5, 6, 7) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) + +test_that( + paste( + "adaptive_breaks merges 3 lowest bins where lowest 2 combined are below", + "minimum" + ), + { + min_count <- 5 + x <- c( + 1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 4, 5, 6, 7) - expect_equal(final_breaks_actual, final_breaks_expected) -}) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) test_that("adaptive_breaks merges pair of bins in middle", { min_count <- 5 @@ -32,7 +49,11 @@ test_that("adaptive_breaks merges pair of bins in middle", { rep(5.5, min_count), rep(6.5, min_count + 1) ) initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) final_breaks_expected <- c(1, 2, 3, 5, 6, 7) expect_equal(final_breaks_actual, final_breaks_expected) @@ -45,57 +66,92 @@ test_that("adaptive_breaks merges two spearated pairs of bins in middle", { rep(5.5, 3), rep(6.5, 2), rep(7.8, min_count) ) initial_breaks <- 
1:8 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) final_breaks_expected <- c(1, 2, 4, 5, 7, 8) expect_equal(final_breaks_actual, final_breaks_expected) }) -test_that("adaptive_breaks merges 2 uppermost bins where both are below minimum", { - min_count <- 5 - x <- c( - rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2, 3, 4, 5, 7) - - expect_equal(final_breaks_actual, final_breaks_expected) -}) +test_that( + "adaptive_breaks merges 2 uppermost bins where both are below minimum", + { + min_count <- 5 + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 2, 3, 4, 5, 7) -test_that("adaptive_breaks merges 2 uppermost bins where only last bin is below minimum", { - min_count <- 5 - x <- c( - rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3) - ) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2, 3, 4, 5, 7) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) + +test_that( + paste( + "adaptive_breaks merges 2 uppermost bins where only last bin is below", + "minimum" + ), + { + min_count <- 5 + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3) + ) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = 
min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 2, 3, 4, 5, 7) - expect_equal(final_breaks_actual, final_breaks_expected) -}) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) test_that("adaptive_breaks merges bins with no members with the next bin", { min_count <- 5 x <- c(rep(1.5, min_count), rep(5.5, min_count), rep(6.5, min_count)) initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) final_breaks_expected <- c(1, 2, 6, 7) expect_equal(final_breaks_actual, final_breaks_expected) }) -test_that("adaptive_breaks merges 2 bins below minimum, plus the empty bins between them", { - min_count <- 5 - x <- c(rep(1.5, min_count), rep(2.3, 1), rep(5.5, 4), rep(6.5, min_count)) - initial_breaks <- 1:7 - final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2, 6, 7) +test_that( + paste( + "adaptive_breaks merges 2 bins below minimum, plus the empty bins between", + "them" + ), + { + min_count <- 5 + x <- c(rep(1.5, min_count), rep(2.3, 1), rep(5.5, 4), rep(6.5, min_count)) + initial_breaks <- 1:7 + final_breaks_actual <- adaptive_breaks( + x, + min_count = min_count, + breaks = initial_breaks + ) + final_breaks_expected <- c(1, 2, 6, 7) - expect_equal(final_breaks_actual, final_breaks_expected) -}) + expect_equal(final_breaks_actual, final_breaks_expected) + } +) context("Graph binning: Adaptively binned densities") test_that("binned_densities_adaptive works", { @@ -129,7 +185,20 @@ test_that("binned_densities_adaptive works", { expected_interval_indexes = expected_interval_indexes ) # Test 2: - densities <- c(0, 0.012, 0.099, 0.201, 0.299, 0.402, 0.49, 0.596, 0.699, 0.803, 0.899, 1.0) + densities <- c( + 0, + 0.012, + 0.099, + 0.201, + 0.299, + 0.402, + 0.49, + 0.596, + 0.699, + 0.803, + 0.899, + 1.0 + ) 
min_counts_per_interval <- 2 num_intervals <- 100 expected_breaks <- c(0, 0.02, 0.21, 0.41, 0.6, 0.81, 1.0) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index ac1a68c2..39cc1992 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -65,7 +65,8 @@ test_that(test_message, { # === TEST count_graphlet_tuples_ego === # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar # to the method under test. However, it's a simple method so maybe that's ok? - graphlet_tuple_counts_ego <- function(graphlet_counts_ego, max_graphlet_size) { + graphlet_tuple_counts_ego <- function(graphlet_counts_ego, + max_graphlet_size) { t(apply(graphlet_counts_ego, 1, count_graphlet_tuples, max_graphlet_size = max_graphlet_size @@ -81,53 +82,130 @@ test_that(test_message, { # Generate expected tuple counts for graphlets up to size 4 and 5 # 1. For ego-networks of order 1 - expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) - expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego1, 4) - expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego1, 4) - expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego1, 5) - expected_tuple_count_n37_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego1, 5) - expected_tuple_count_n73_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego1, 5) + expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego( + graph_n11_ego1, 4 + ) + expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego( + graph_n37_ego1, 4 + ) + expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego( + graph_n73_ego1, 4 + ) + expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego( + graph_n11_ego1, 5 + ) + expected_tuple_count_n37_ego1_gs5 <- graphlet_tuple_counts_ego( + graph_n37_ego1, 5 + ) + expected_tuple_count_n73_ego1_gs5 <- graphlet_tuple_counts_ego( + 
graph_n73_ego1, 5 + ) # 2. For ego-networks of order 2 - expected_tuple_count_n11_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego2, 4) - expected_tuple_count_n37_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego2, 4) - expected_tuple_count_n73_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego2, 4) - expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego2, 5) - expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego2, 5) - expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego2, 5) + expected_tuple_count_n11_ego2_gs4 <- graphlet_tuple_counts_ego( + graph_n11_ego2, 4 + ) + expected_tuple_count_n37_ego2_gs4 <- graphlet_tuple_counts_ego( + graph_n37_ego2, 4 + ) + expected_tuple_count_n73_ego2_gs4 <- graphlet_tuple_counts_ego( + graph_n73_ego2, 4 + ) + expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego( + graph_n11_ego2, 5 + ) + expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego( + graph_n37_ego2, 5 + ) + expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego( + graph_n73_ego2, 5 + ) # Calculate actual tuple counts # 1. 
For ego-networks of order 1 - actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego(graph_n11_ego1, 4) - actual_tuple_count_n37_ego1_gs4 <- count_graphlet_tuples_ego(graph_n37_ego1, 4) - actual_tuple_count_n73_ego1_gs4 <- count_graphlet_tuples_ego(graph_n73_ego1, 4) - actual_tuple_count_n11_ego1_gs5 <- count_graphlet_tuples_ego(graph_n11_ego1, 5) - actual_tuple_count_n37_ego1_gs5 <- count_graphlet_tuples_ego(graph_n37_ego1, 5) - actual_tuple_count_n73_ego1_gs5 <- count_graphlet_tuples_ego(graph_n73_ego1, 5) + actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego( + graph_n11_ego1, 4 + ) + actual_tuple_count_n37_ego1_gs4 <- count_graphlet_tuples_ego( + graph_n37_ego1, 4 + ) + actual_tuple_count_n73_ego1_gs4 <- count_graphlet_tuples_ego( + graph_n73_ego1, 4 + ) + actual_tuple_count_n11_ego1_gs5 <- count_graphlet_tuples_ego( + graph_n11_ego1, 5 + ) + actual_tuple_count_n37_ego1_gs5 <- count_graphlet_tuples_ego( + graph_n37_ego1, 5 + ) + actual_tuple_count_n73_ego1_gs5 <- count_graphlet_tuples_ego( + graph_n73_ego1, 5 + ) # 2. 
For ego-networks of order 2 - actual_tuple_count_n11_ego2_gs4 <- count_graphlet_tuples_ego(graph_n11_ego2, 4) - actual_tuple_count_n37_ego2_gs4 <- count_graphlet_tuples_ego(graph_n37_ego2, 4) - actual_tuple_count_n73_ego2_gs4 <- count_graphlet_tuples_ego(graph_n73_ego2, 4) - actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego(graph_n11_ego2, 5) - actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego(graph_n37_ego2, 5) - actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) + actual_tuple_count_n11_ego2_gs4 <- count_graphlet_tuples_ego( + graph_n11_ego2, 4 + ) + actual_tuple_count_n37_ego2_gs4 <- count_graphlet_tuples_ego( + graph_n37_ego2, 4 + ) + actual_tuple_count_n73_ego2_gs4 <- count_graphlet_tuples_ego( + graph_n73_ego2, 4 + ) + actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego( + graph_n11_ego2, 5 + ) + actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego( + graph_n37_ego2, 5 + ) + actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego( + graph_n73_ego2, 5 + ) # Compare expected with actual - expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) - expect_equal(expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4) - expect_equal(expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4) - expect_equal(expected_tuple_count_n11_ego1_gs5, actual_tuple_count_n11_ego1_gs5) - expect_equal(expected_tuple_count_n37_ego1_gs5, actual_tuple_count_n37_ego1_gs5) - expect_equal(expected_tuple_count_n73_ego1_gs5, actual_tuple_count_n73_ego1_gs5) + expect_equal( + expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4 + ) + expect_equal( + expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4 + ) + expect_equal( + expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4 + ) + expect_equal( + expected_tuple_count_n11_ego1_gs5, actual_tuple_count_n11_ego1_gs5 + ) + expect_equal( + expected_tuple_count_n37_ego1_gs5, 
actual_tuple_count_n37_ego1_gs5 + ) + expect_equal( + expected_tuple_count_n73_ego1_gs5, actual_tuple_count_n73_ego1_gs5 + ) # 2. For ego-networks of order 2 - expect_equal(expected_tuple_count_n11_ego2_gs4, actual_tuple_count_n11_ego2_gs4) - expect_equal(expected_tuple_count_n37_ego2_gs4, actual_tuple_count_n37_ego2_gs4) - expect_equal(expected_tuple_count_n73_ego2_gs4, actual_tuple_count_n73_ego2_gs4) - expect_equal(expected_tuple_count_n11_ego2_gs5, actual_tuple_count_n11_ego2_gs5) - expect_equal(expected_tuple_count_n37_ego2_gs5, actual_tuple_count_n37_ego2_gs5) - expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) + expect_equal( + expected_tuple_count_n11_ego2_gs4, actual_tuple_count_n11_ego2_gs4 + ) + expect_equal( + expected_tuple_count_n37_ego2_gs4, actual_tuple_count_n37_ego2_gs4 + ) + expect_equal( + expected_tuple_count_n73_ego2_gs4, actual_tuple_count_n73_ego2_gs4 + ) + expect_equal( + expected_tuple_count_n11_ego2_gs5, actual_tuple_count_n11_ego2_gs5 + ) + expect_equal( + expected_tuple_count_n37_ego2_gs5, actual_tuple_count_n37_ego2_gs5 + ) + expect_equal( + expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5 + ) }) -context("Measures Netdis: Ego-network density values match those for manually verified networks") +context( + paste( + "Measures Netdis: Ego-network density values match those for manually", + "verified networks" + ) +) test_that("Ego-network 4-node density values match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets @@ -201,170 +279,188 @@ test_that("Ego-network 4-node density values match manually verified totals", { expect_equal(actual_densities_o2, expected_densities_o2) }) -context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") -test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { - # Set up a small sample 
network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set parameters for test - max_graphlet_size <- 4 - min_counts_per_interval <- 2 - num_intervals <- 100 - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - - # Set manually verified ego-network node counts and edge densities - # 1 . Ego-networks of order 1 - expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) - expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) - # Order 1 expected densities should be: - # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 - # 2. Ego-networks of order 2 - expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) - expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) - # Order 2 expected densities should be: - # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 - - # Set manually verified density bins for ego-networks - # 1. 
Ego-networks of order 1 - expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) - expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - expected_binned_densities_o1 <- list( - densities = expected_densities_o1, - interval_indexes = expected_interval_indexes_o1, - breaks = expected_breaks_o1 - ) - # Check binned densities are as expected - actual_binned_densities_o1 <- binned_densities_adaptive( - expected_densities_o1, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) - # 2. Ego-networks of order 2 - expected_min_break_o2 <- 1 / 3 - expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- - (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 - expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) - expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - expected_binned_densities_o2 <- list( - densities = expected_densities_o2, - interval_indexes = expected_interval_indexes_o2, - breaks = expected_breaks_o2 - ) - # Check binned densities are as expected - actual_binned_densities_o2 <- binned_densities_adaptive( - expected_densities_o2, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - expected_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 
3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_o1) <- node_labels - colnames(expected_counts_o1) <- graphlet_labels - # 2-step ego networks - expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_o2) <- node_labels - colnames(expected_counts_o2) <- graphlet_labels - - # Calculate binned average expected counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + - expected_counts_o1[8, ]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + - expected_counts_o1[5, ] + expected_counts_o1[9, ] + - expected_counts_o1[10, ]) / 5 - expected_mean_density_binned_counts_o1 <- rbind( - mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 - ) - rownames(expected_mean_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 - 
mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + - expected_counts_o2[7, ]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + - expected_counts_o2[10, ]) / 3 - expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, - mean_counts_bin4_o2 - ) - rownames(expected_mean_density_binned_counts_o2) <- 1:4 +context( + paste( + "Measures Netdis: Ego-network density-binned reference counts for manually", + "verified networks" + ) +) +test_that( + paste( + "Ego-network 4-node density-binned reference counts match manually", + "verified totals" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + + # Set node and graphlet labels to use for row and col names in expected + # counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. 
Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. Ego-networks of order 1 + expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + expected_binned_densities_o1 <- list( + densities = expected_densities_o1, + interval_indexes = expected_interval_indexes_o1, + breaks = expected_breaks_o1 + ) + # Check binned densities are as expected + actual_binned_densities_o1 <- binned_densities_adaptive( + expected_densities_o1, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) + # 2. 
Ego-networks of order 2 + expected_min_break_o2 <- 1 / 3 + expected_max_break_o2 <- 0.6 + expected_initial_interval_o2 <- + (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) + expected_breaks_o2 <- expected_min_break_o2 + + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) + expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + expected_binned_densities_o2 <- list( + densities = expected_densities_o2, + interval_indexes = expected_interval_indexes_o2, + breaks = expected_breaks_o2 + ) + # Check binned densities are as expected + actual_binned_densities_o2 <- binned_densities_adaptive( + expected_densities_o2, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) + + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + expected_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels + # 2-step ego networks + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 
1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 + expected_mean_density_binned_counts_o1 <- rbind( + mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(expected_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + + expected_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + + expected_counts_o2[9, ] + expected_counts_o2[10, ]) / 3 + expected_mean_density_binned_counts_o2 <- rbind( + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(expected_mean_density_binned_counts_o2) <- 1:4 - # Calculate actual output of function under test - actual_mean_density_binned_counts_o1 <- 
mean_density_binned_graphlet_counts( - expected_counts_o1, expected_interval_indexes_o1 - ) - actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( - expected_counts_o2, expected_interval_indexes_o2 - ) + # Calculate actual output of function under test + actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( + expected_counts_o1, expected_interval_indexes_o1 + ) + actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( + expected_counts_o2, expected_interval_indexes_o2 + ) - # Check actual output vs expected - expect_equal( - actual_mean_density_binned_counts_o1, - expected_mean_density_binned_counts_o1 - ) - expect_equal( - actual_mean_density_binned_counts_o2, - expected_mean_density_binned_counts_o2 - ) -}) + # Check actual output vs expected + expect_equal( + actual_mean_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_mean_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) + } +) -context("Measures Netdis: scale_graphlet_counts_ego for manually verified networks") +context( + "Measures Netdis: scale_graphlet_counts_ego for manually verified networks" +) test_that("Ego-network 4-node graphlet counts match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets @@ -391,13 +487,14 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 + # Count graphlets in each ego network of the graph with neighbourhood sizes of + # 1 and 2 max_graphlet_size <- 4 min_ego_edges <- 0 min_ego_nodes <- 0 - # Use previously tested functions to generate ego networks and calcualte graphlet - # counts. 
+ # Use previously tested functions to generate ego networks and calcualte + # graphlet counts. # ego nets ego_networks_o1 <- make_named_ego_graph(graph, order = 1, @@ -471,293 +568,328 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { expect_equal(actual_counts_o2, expected_counts_o2) }) -context("Measures Netdis: Ego-network density-binned counts for manually verified networks") -test_that("density_binned_counts output matches manually verified totals with different scaling and aggregation functions", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +context( + paste( + "Measures Netdis: Ego-network density-binned counts for manually", + "verified networks" + ) +) +test_that( + paste( + "density_binned_counts output matches manually verified totals with", + "different scaling and aggregation functions" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Set node and graphlet labels to use for row and col names in + # expected counts + node_labels <- 
igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. Ego-networks of order 1 + expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + expected_binned_densities_o1 <- list( + densities = expected_densities_o1, + interval_indexes = expected_interval_indexes_o1, + breaks = expected_breaks_o1 + ) + # 2. 
Ego-networks of order 2 + expected_min_break_o2 <- 1 / 3 + expected_max_break_o2 <- 0.6 + expected_initial_interval_o2 <- + (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) + expected_breaks_o2 <- expected_min_break_o2 + + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) + expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + expected_binned_densities_o2 <- list( + densities = expected_densities_o2, + interval_indexes = expected_interval_indexes_o2, + breaks = expected_breaks_o2 + ) - # Set parameters for test - max_graphlet_size <- 4 - min_counts_per_interval <- 2 - num_intervals <- 100 - min_ego_edges <- 0 - min_ego_nodes <- 0 + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + expected_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels + # 2-step ego networks + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / 
zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 + expected_mean_density_binned_counts_o1 <- rbind( + mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(expected_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + + expected_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + + expected_counts_o2[10, ]) / 3 + expected_mean_density_binned_counts_o2 <- rbind( + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(expected_mean_density_binned_counts_o2) <- 1:4 - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + # density_binned_counts with default arguments should give + 
# mean graphlet count in each density bin + actual_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1 + ) - # Set manually verified ego-network node counts and edge densities - # 1 . Ego-networks of order 1 - expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) - expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) - # Order 1 expected densities should be: - # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 - # 2. Ego-networks of order 2 - expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) - expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) - # Order 2 expected densities should be: - # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + actual_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2 + ) - # Set manually verified density bins for ego-networks - # 1. Ego-networks of order 1 - expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) - expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - expected_binned_densities_o1 <- list( - densities = expected_densities_o1, - interval_indexes = expected_interval_indexes_o1, - breaks = expected_breaks_o1 - ) - # 2. 
Ego-networks of order 2 - expected_min_break_o2 <- 1 / 3 - expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- - (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 - expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) - expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - expected_binned_densities_o2 <- list( - densities = expected_densities_o2, - interval_indexes = expected_interval_indexes_o2, - breaks = expected_breaks_o2 - ) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - expected_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_o1) <- node_labels - colnames(expected_counts_o1) <- graphlet_labels - # 2-step ego networks - expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / 
zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_o2) <- node_labels - colnames(expected_counts_o2) <- graphlet_labels + # Check actual output vs expected + expect_equal( + actual_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) - # Calculate binned average expected counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + - expected_counts_o1[8, ]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + - expected_counts_o1[5, ] + expected_counts_o1[9, ] + - expected_counts_o1[10, ]) / 5 - expected_mean_density_binned_counts_o1 <- rbind( - mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 - ) - rownames(expected_mean_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 - mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + - expected_counts_o2[7, ]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + - expected_counts_o2[10, ]) / 3 - expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, - mean_counts_bin4_o2 - ) - rownames(expected_mean_density_binned_counts_o2) <- 1:4 - - # density_binned_counts with default arguments should give - # mean graphlet count in each density bin - actual_density_binned_counts_o1 <- density_binned_counts( - 
expected_counts_o1, - expected_interval_indexes_o1 - ) - - actual_density_binned_counts_o2 <- density_binned_counts( - expected_counts_o2, - expected_interval_indexes_o2 - ) - - # Check actual output vs expected - expect_equal( - actual_density_binned_counts_o1, - expected_mean_density_binned_counts_o1 - ) - expect_equal( - actual_density_binned_counts_o2, - expected_mean_density_binned_counts_o2 - ) - - # Calculate max binned counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - # apply(x, 2, max): returns max of each column in x - max_counts_bin1_o1 <- apply(rbind(expected_counts_o1[1, ], expected_counts_o1[2, ]), 2, max) - max_counts_bin2_o1 <- apply(rbind( - expected_counts_o1[6, ], expected_counts_o1[7, ], - expected_counts_o1[8, ] - ), 2, max) - max_counts_bin3_o1 <- apply(rbind( - expected_counts_o1[3, ], expected_counts_o1[4, ], - expected_counts_o1[5, ], expected_counts_o1[9, ], - expected_counts_o1[10, ] - ), 2, max) - - expected_max_density_binned_counts_o1 <- rbind( - max_counts_bin1_o1, max_counts_bin2_o1, max_counts_bin3_o1 - ) - rownames(expected_max_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - max_counts_bin1_o2 <- apply(rbind(expected_counts_o2[1, ], expected_counts_o2[4, ]), 2, max) - max_counts_bin2_o2 <- apply(rbind( - expected_counts_o2[2, ], expected_counts_o2[6, ], - expected_counts_o2[7, ] - ), 2, max) - max_counts_bin3_o2 <- apply(rbind(expected_counts_o2[3, ], expected_counts_o2[5, ]), 2, max) - max_counts_bin4_o2 <- apply(rbind( - expected_counts_o2[8, ], expected_counts_o2[9, ], - expected_counts_o2[10, ] - ), 2, max) - - expected_max_density_binned_counts_o2 <- rbind( - max_counts_bin1_o2, max_counts_bin2_o2, max_counts_bin3_o2, - max_counts_bin4_o2 - ) - rownames(expected_max_density_binned_counts_o2) <- 1:4 - - # density_binned_counts with agg_fn = max should give - # max graphlet count in each 
density bin - agg_fn <- max - scale_fn <- NULL - - actual_max_density_binned_counts_o1 <- density_binned_counts( - expected_counts_o1, - expected_interval_indexes_o1, - agg_fn = agg_fn, - scale_fn = scale_fn - ) - - actual_max_density_binned_counts_o2 <- density_binned_counts( - expected_counts_o2, - expected_interval_indexes_o2, - agg_fn = agg_fn, - scale_fn = scale_fn - ) - - # Check actual output vs expected - expect_equal( - actual_max_density_binned_counts_o1, - expected_max_density_binned_counts_o1 - ) - expect_equal( - actual_max_density_binned_counts_o2, - expected_max_density_binned_counts_o2 - ) + # Calculate max binned counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + # apply(x, 2, max): returns max of each column in x + max_counts_bin1_o1 <- apply( + rbind(expected_counts_o1[1, ], expected_counts_o1[2, ]), 2, max + ) + max_counts_bin2_o1 <- apply(rbind( + expected_counts_o1[6, ], expected_counts_o1[7, ], + expected_counts_o1[8, ] + ), 2, max) + max_counts_bin3_o1 <- apply(rbind( + expected_counts_o1[3, ], expected_counts_o1[4, ], + expected_counts_o1[5, ], expected_counts_o1[9, ], + expected_counts_o1[10, ] + ), 2, max) + + expected_max_density_binned_counts_o1 <- rbind( + max_counts_bin1_o1, max_counts_bin2_o1, max_counts_bin3_o1 + ) + rownames(expected_max_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + max_counts_bin1_o2 <- apply( + rbind(expected_counts_o2[1, ], expected_counts_o2[4, ]), 2, max + ) + max_counts_bin2_o2 <- apply(rbind( + expected_counts_o2[2, ], expected_counts_o2[6, ], + expected_counts_o2[7, ] + ), 2, max) + max_counts_bin3_o2 <- apply( + rbind(expected_counts_o2[3, ], expected_counts_o2[5, ]), 2, max + ) + max_counts_bin4_o2 <- apply(rbind( + expected_counts_o2[8, ], expected_counts_o2[9, ], + expected_counts_o2[10, ] + ), 2, max) + + expected_max_density_binned_counts_o2 <- rbind( + 
max_counts_bin1_o2, max_counts_bin2_o2, max_counts_bin3_o2, + max_counts_bin4_o2 + ) + rownames(expected_max_density_binned_counts_o2) <- 1:4 + + # density_binned_counts with agg_fn = max should give + # max graphlet count in each density bin + agg_fn <- max + scale_fn <- NULL + + actual_max_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn + ) - # density_binned_counts with scale_fn = scale_graphlet_counts_ego - # should give mean graphlet counts in each density bin scaled by - # count_graphlet_tuples. - agg_fn <- mean - scale_fn <- scale_graphlet_counts_ego + actual_max_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn + ) - # calculate expected counts using previously tested function - expected_scaled_counts_o1 <- - scale_graphlet_counts_ego(expected_counts_o1, - max_graphlet_size = max_graphlet_size + # Check actual output vs expected + expect_equal( + actual_max_density_binned_counts_o1, + expected_max_density_binned_counts_o1 ) - expected_scaled_counts_o2 <- - scale_graphlet_counts_ego(expected_counts_o2, - max_graphlet_size = max_graphlet_size + expect_equal( + actual_max_density_binned_counts_o2, + expected_max_density_binned_counts_o2 ) - # calculate mean expected counts using expected density bins - mean_scaled_counts_bin1_o1 <- (expected_scaled_counts_o1[1, ] + expected_scaled_counts_o1[2, ]) / 2 - mean_scaled_counts_bin2_o1 <- (expected_scaled_counts_o1[6, ] + expected_scaled_counts_o1[7, ] + - expected_scaled_counts_o1[8, ]) / 3 - mean_scaled_counts_bin3_o1 <- (expected_scaled_counts_o1[3, ] + expected_scaled_counts_o1[4, ] + - expected_scaled_counts_o1[5, ] + expected_scaled_counts_o1[9, ] + - expected_scaled_counts_o1[10, ]) / 5 - expected_scaled_density_binned_counts_o1 <- rbind( - mean_scaled_counts_bin1_o1, mean_scaled_counts_bin2_o1, 
mean_scaled_counts_bin3_o1 - ) - rownames(expected_scaled_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_scaled_counts_bin1_o2 <- (expected_scaled_counts_o2[1, ] + expected_scaled_counts_o2[4, ]) / 2 - mean_scaled_counts_bin2_o2 <- (expected_scaled_counts_o2[2, ] + expected_scaled_counts_o2[6, ] + - expected_scaled_counts_o2[7, ]) / 3 - mean_scaled_counts_bin3_o2 <- (expected_scaled_counts_o2[3, ] + expected_scaled_counts_o2[5, ]) / 2 - mean_scaled_counts_bin4_o2 <- (expected_scaled_counts_o2[8, ] + expected_scaled_counts_o2[9, ] + - expected_scaled_counts_o2[10, ]) / 3 - expected_scaled_density_binned_counts_o2 <- rbind( - mean_scaled_counts_bin1_o2, mean_scaled_counts_bin2_o2, mean_scaled_counts_bin3_o2, - mean_scaled_counts_bin4_o2 - ) - rownames(expected_scaled_density_binned_counts_o2) <- 1:4 - - # Calculate scaled binned counts with density_binned_counts (function to test) - actual_scaled_density_binned_counts_o1 <- density_binned_counts( - expected_counts_o1, - expected_interval_indexes_o1, - agg_fn = agg_fn, - scale_fn = scale_fn, - max_graphlet_size = max_graphlet_size - ) + # density_binned_counts with scale_fn = scale_graphlet_counts_ego + # should give mean graphlet counts in each density bin scaled by + # count_graphlet_tuples. 
+ agg_fn <- mean + scale_fn <- scale_graphlet_counts_ego + + # calculate expected counts using previously tested function + expected_scaled_counts_o1 <- + scale_graphlet_counts_ego(expected_counts_o1, + max_graphlet_size = max_graphlet_size + ) + expected_scaled_counts_o2 <- + scale_graphlet_counts_ego(expected_counts_o2, + max_graphlet_size = max_graphlet_size + ) + + # calculate mean expected counts using expected density bins + mean_scaled_counts_bin1_o1 <- (expected_scaled_counts_o1[1, ] + + expected_scaled_counts_o1[2, ]) / 2 + mean_scaled_counts_bin2_o1 <- (expected_scaled_counts_o1[6, ] + + expected_scaled_counts_o1[7, ] + + expected_scaled_counts_o1[8, ]) / 3 + mean_scaled_counts_bin3_o1 <- (expected_scaled_counts_o1[3, ] + + expected_scaled_counts_o1[4, ] + + expected_scaled_counts_o1[5, ] + + expected_scaled_counts_o1[9, ] + + expected_scaled_counts_o1[10, ]) / 5 + expected_scaled_density_binned_counts_o1 <- rbind( + mean_scaled_counts_bin1_o1, + mean_scaled_counts_bin2_o1, + mean_scaled_counts_bin3_o1 + ) + rownames(expected_scaled_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_scaled_counts_bin1_o2 <- (expected_scaled_counts_o2[1, ] + + expected_scaled_counts_o2[4, ]) / 2 + mean_scaled_counts_bin2_o2 <- (expected_scaled_counts_o2[2, ] + + expected_scaled_counts_o2[6, ] + + expected_scaled_counts_o2[7, ]) / 3 + mean_scaled_counts_bin3_o2 <- (expected_scaled_counts_o2[3, ] + + expected_scaled_counts_o2[5, ]) / 2 + mean_scaled_counts_bin4_o2 <- (expected_scaled_counts_o2[8, ] + + expected_scaled_counts_o2[9, ] + + expected_scaled_counts_o2[10, ]) / 3 + expected_scaled_density_binned_counts_o2 <- rbind( + mean_scaled_counts_bin1_o2, + mean_scaled_counts_bin2_o2, + mean_scaled_counts_bin3_o2, + mean_scaled_counts_bin4_o2 + ) + rownames(expected_scaled_density_binned_counts_o2) <- 1:4 + + # Calculate scaled binned counts with density_binned_counts (function to + # test) + 
actual_scaled_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn, + max_graphlet_size = max_graphlet_size + ) - actual_scaled_density_binned_counts_o2 <- density_binned_counts( - expected_counts_o2, - expected_interval_indexes_o2, - agg_fn = agg_fn, - scale_fn = scale_fn, - max_graphlet_size = max_graphlet_size - ) + actual_scaled_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn, + max_graphlet_size = max_graphlet_size + ) - # Check actual output vs expected - expect_equal( - actual_scaled_density_binned_counts_o1, - expected_scaled_density_binned_counts_o1 - ) - expect_equal( - actual_scaled_density_binned_counts_o2, - expected_scaled_density_binned_counts_o2 - ) -}) + # Check actual output vs expected + expect_equal( + actual_scaled_density_binned_counts_o1, + expected_scaled_density_binned_counts_o1 + ) + expect_equal( + actual_scaled_density_binned_counts_o2, + expected_scaled_density_binned_counts_o2 + ) + } +) context("Measures Netdis: Expected graphlet counts") test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { @@ -834,7 +966,11 @@ test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { } # Determine expected and actual expected graphlet counts expected_expected_graphlet_counts <- - purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) + purrr::map2( + density_indexes, + num_nodes, + expected_expected_graphlet_counts_fn + ) actual_expected_graphlet_counts <- purrr::map(graphlet_counts, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, @@ -899,13 +1035,17 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) max_edges_o1 <- choose(num_nodes_o1, 2) densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should 
be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 + # Order 1 densities should be: + # 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 + # 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 # 2. Ego-networks of order 2 num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) max_edges_o2 <- choose(num_nodes_o2, 2) densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 + # Order 2 densities should be: + # 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 + # 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 # Set manually defined density breaks and indexes breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) @@ -952,8 +1092,10 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o1) <- + rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- + rownames(graphlet_counts_ego_o1) # Calculate actual output of function under test actual_expected_graphlet_counts_ego_o1 <- @@ -1006,8 +1148,10 @@ test_that("netdis_expected_counts works for graphlets up to 4 nodes", { colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- 
rownames(graphlet_counts_ego_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o2) + rownames(expected_expected_graphlet_counts_ego_o1) <- + rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- + rownames(graphlet_counts_ego_o2) # Calculate actual output of function under test actual_expected_graphlet_counts_ego_o1 <- @@ -1069,31 +1213,34 @@ test_that("netdis statistic function output matches manually verified result", { expect_equal(expected_netdis_4, actual_netdis_4) expect_equal(expected_netdis_5, actual_netdis_5) }) -test_that("netdis_uptok gives expected netdis result for graphlets up to size k", { - # arbitrary counts of correct size for graphlets up to size 5 - counts_1 <- c( - 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, - 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 - ) - counts_2 <- c( - 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, - 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 - ) +test_that( + "netdis_uptok gives expected netdis result for graphlets up to size k", + { + # arbitrary counts of correct size for graphlets up to size 5 + counts_1 <- c( + 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 + ) + counts_2 <- c( + 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 + ) - # add graphlet names - ids <- graphlet_key(5)$id - names(counts_1) <- ids - names(counts_2) <- ids + # add graphlet names + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids - # manually verified results - expected_netdis <- c(0.03418796, 0.02091792, 0.03826385) - names(expected_netdis) <- c("netdis3", "netdis4", "netdis5") + # manually verified results + expected_netdis <- c(0.03418796, 0.02091792, 0.03826385) + names(expected_netdis) <- c("netdis3", "netdis4", "netdis5") - # check function to test - actual_netdis <- 
netdis_uptok(counts_1, counts_2, 5) + # check function to test + actual_netdis <- netdis_uptok(counts_1, counts_2, 5) - expect_equal(expected_netdis, actual_netdis) -}) + expect_equal(expected_netdis, actual_netdis) + } +) context("Netdis: full calculation pipeline") test_that("netdis_many_to_many gives expected result", { diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index d7aeb68c..bf6bfef9 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -1,6 +1,10 @@ self_net_emd <- function(histogram, shift, method) { - netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method) + netemd_one_to_one( + dhists_1 = histogram, + dhists_2 = shift_dhist(histogram, shift), + method = method + ) } expected <- 0 @@ -89,38 +93,46 @@ expect_self_netemd_correct(histogram, return_details = TRUE ) -test_that("net_emd returns 0 when comparing any normal histogram against itself (no offset)", { - num_hists <- 5 - num_bins <- 101 +test_that( + paste( + "net_emd returns 0 when comparing any normal histogram against itself", + "(no offset)" + ), + { + num_hists <- 5 + num_bins <- 101 - mus <- runif(num_hists, -10, 10) - sigmas <- runif(num_hists, 0, 10) + mus <- runif(num_hists, -10, 10) + sigmas <- runif(num_hists, 0, 10) - rand_locations <- function(mu, sigma) { - return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) - } + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } - rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { - locations <- rand_locations(mu, sigma) - masses <- dnorm(locations, mean = mu, sd = sigma) - return(dhist(masses = masses, locations = locations)) - }) + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { + locations <- rand_locations(mu, sigma) + masses <- dnorm(locations, mean = mu, sd = sigma) + return(dhist(masses = masses, locations = 
locations)) + }) - expected <- 0 - actuals_opt <- purrr::map(rand_dhists, function(dhist) { - netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") - }) - purrr::walk(actuals_opt, function(actual) { - expect_equal(actual, expected) - }) + expected <- 0 + actuals_opt <- purrr::map(rand_dhists, function(dhist) { + netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") + }) + purrr::walk(actuals_opt, function(actual) { + expect_equal(actual, expected) + }) - actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { - netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive") - }) - purrr::walk(actuals_exhaustive_default, function(actual) { - expect_equal(actual, expected) - }) -}) + actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { + netemd_one_to_one( + dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive" + ) + }) + purrr::walk(actuals_exhaustive_default, function(actual) { + expect_equal(actual, expected) + }) + } +) test_that("net_emd returns 0 when comparing any normal histogram randomly offset against itself", { @@ -146,7 +158,11 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset netemd_offset_self <- function(dhist, offsets, method) { netemds <- purrr::map_dbl(offsets, function(offset) { - netemd_one_to_one(dhists_1 = dhist, dhists_2 = shift_dhist(dhist, offset), method = method) + netemd_one_to_one( + dhists_1 = dhist, + dhists_2 = shift_dhist(dhist, offset), + method = method + ) }) return(netemds) } @@ -200,7 +216,12 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any expect_self_netemd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method, return_details = return_details) + self_net_emd <- netemd_one_to_one( + dhists_1 = histogram, + dhists_2 = shift_dhist(histogram, 
shift), + method = method, + return_details = return_details + ) loc <- histogram$locations mass <- histogram$masses var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 @@ -228,31 +249,56 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any }) }) -test_that("net_emd returns analytically derived non-zero solutions for distributions - where the analytical solution is known", { - # Helper functions to create dhists for a given value of "p" - two_bin_dhist <- function(p) { - dhist(locations = c(0, 1), masses = c(p, 1 - p)) - } - three_bin_dhist <- function(p) { - dhist(locations = c(-1, 0, 1), masses = c(0.5 * p * (1 - p), 1 - (p * (1 - p)), 0.5 * p * (1 - p))) - } +test_that( + paste( + "net_emd returns analytically derived non-zero solutions for", + "distributions where the analytical solution is known" + ), + { + # Helper functions to create dhists for a given value of "p" + two_bin_dhist <- function(p) { + dhist(locations = c(0, 1), masses = c(p, 1 - p)) + } + three_bin_dhist <- function(p) { + dhist( + locations = c(-1, 0, 1), + masses = c(0.5 * p * (1 - p), 1 - (p * (1 - p)), 0.5 * p * (1 - p)) + ) + } - # Helper function to test actual vs expected - test_pair <- function(p, expected) { - dhistA <- two_bin_dhist(p) - dhistB <- three_bin_dhist(p) - expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "exhaustive"), expected, tolerance = 1e-12) - # Even setting the stats::optimise method tolerance to machine double precision, the - # optimised NetEMD is ~1e-09, so set a slightly looser tolerance here - expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "optimise"), expected, tolerance = 1e-08) - } + # Helper function to test actual vs expected + test_pair <- function(p, expected) { + dhistA <- two_bin_dhist(p) + dhistB <- three_bin_dhist(p) + expect_equal( + netemd_one_to_one( + dhists_1 = dhistA, + dhists_2 = dhistB, + method = "exhaustive" + ), + expected, + 
tolerance = 1e-12 + ) + # Even setting the stats::optimise method tolerance to machine double + # precision, the optimised NetEMD is ~1e-09, so set a slightly looser + # tolerance here + expect_equal( + netemd_one_to_one( + dhists_1 = dhistA, + dhists_2 = dhistB, + method = "optimise" + ), + expected, + tolerance = 1e-08 + ) + } - # Test for p values with analytically calculated NetEMD - test_pair(1 / 2, 1) - test_pair(1 / 3, 1 / sqrt(2)) - test_pair(1 / 5, 1 / 2) -}) + # Test for p values with analytically calculated NetEMD + test_pair(1 / 2, 1) + test_pair(1 / 3, 1 / sqrt(2)) + test_pair(1 / 5, 1 / 2) + } +) context("Measures NetEMD: Virus PPI (EMD)") # EMD and NET_EMD: Virus PPI datasets @@ -405,11 +451,16 @@ test_that("netemd_many_to_many works", { expected_netemd_fn <- function(gdds) { list( netemds = c( - netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$HSV), - netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$VZV), - netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$KSHV), - netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$KSHV), - netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 = gdds$KSHV, dhists_2 = gdds$VZV) + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$HSV, 
dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$KSHV, dhists_2 = gdds$VZV) ), comp_spec = cross_comparison_spec(gdds) ) diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index 5c48dbc4..070ef949 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -1,48 +1,60 @@ context("ORCA interface: Graph to ORCA edgelist round-trip") test_that("Graph to indexed edge list round trip conversion works", { data_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - g_orig <- igraph::read_graph(file = file.path(data_dir, "EBV.txt"), format = "ncol") + g_orig <- igraph::read_graph( + file = file.path(data_dir, "EBV.txt"), + format = "ncol" + ) g_rtrip <- netdist::indexed_edges_to_graph(graph_to_indexed_edges(g_orig)) - expect_true(all.equal(igraph::get.edgelist(g_orig), igraph::get.edgelist(g_orig))) + expect_true(all.equal( + igraph::get.edgelist(g_orig), + igraph::get.edgelist(g_orig) + )) }) context("ORCA interface: Graphlet key") -test_that("graphlet_key gives correct output for all supported max graphlet sizes", { - correct_graphlet_key_2 <- list(max_nodes = 2, id = c("G0"), node_count = c(2)) - correct_graphlet_key_3 <- list( - max_nodes = 3, id = c("G0", "G1", "G2"), - node_count = c(2, 3, 3) - ) - correct_graphlet_key_4 <- list( - max_nodes = 4, - id = c( - "G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8" - ), - node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4) - ) - correct_graphlet_key_5 <- list( - max_nodes = 5, - id = c( - "G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8", "G9", "G10", "G11", "G12", - "G13", "G14", "G15", "G16", "G17", - "G18", "G19", "G20", "G21", "G22", - "G23", "G24", "G25", "G26", "G27", - "G28", "G29" - ), - node_count = c( - 2, 3, 3, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5 +test_that( + "graphlet_key gives correct output for all supported max graphlet sizes", + { + correct_graphlet_key_2 <- 
list( + max_nodes = 2, id = c("G0"), + node_count = c(2) ) - ) - expect_equal(graphlet_key(2), correct_graphlet_key_2) - expect_equal(graphlet_key(3), correct_graphlet_key_3) - expect_equal(graphlet_key(4), correct_graphlet_key_4) - expect_equal(graphlet_key(5), correct_graphlet_key_5) -}) + correct_graphlet_key_3 <- list( + max_nodes = 3, id = c("G0", "G1", "G2"), + node_count = c(2, 3, 3) + ) + correct_graphlet_key_4 <- list( + max_nodes = 4, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8" + ), + node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4) + ) + correct_graphlet_key_5 <- list( + max_nodes = 5, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8", "G9", "G10", "G11", "G12", + "G13", "G14", "G15", "G16", "G17", + "G18", "G19", "G20", "G21", "G22", + "G23", "G24", "G25", "G26", "G27", + "G28", "G29" + ), + node_count = c( + 2, 3, 3, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5 + ) + ) + expect_equal(graphlet_key(2), correct_graphlet_key_2) + expect_equal(graphlet_key(3), correct_graphlet_key_3) + expect_equal(graphlet_key(4), correct_graphlet_key_4) + expect_equal(graphlet_key(5), correct_graphlet_key_5) + } +) test_that("graphlet_key gives error for unsupported max graphlet sizes", { max_size_too_low <- c(1, 0, -1, -2, -3, -4, -5, -6) @@ -60,53 +72,56 @@ test_that("graphlet_key gives error for unsupported max graphlet sizes", { }) context("ORCA interface: Orbit key") -test_that("orbit_key gives correct output for all supported max graphlet sizes", { - correct_orbit_key_2 <- list(max_nodes = 2, id = c("O0"), node_count = c(2)) - correct_orbit_key_3 <- list( - max_nodes = 3, id = c("O0", "O1", "O2", "O3"), - node_count = c(2, 3, 3, 3) - ) - correct_orbit_key_4 <- list( - max_nodes = 4, - id = c( - "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14" - ), - node_count = c( - 2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +test_that( + "orbit_key gives 
correct output for all supported max graphlet sizes", + { + correct_orbit_key_2 <- list(max_nodes = 2, id = c("O0"), node_count = c(2)) + correct_orbit_key_3 <- list( + max_nodes = 3, id = c("O0", "O1", "O2", "O3"), + node_count = c(2, 3, 3, 3) ) - ) - correct_orbit_key_5 <- list( - max_nodes = 5, - id = c( - "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", - "O18", "O19", "O20", "O21", "O22", - "O23", "O24", "O25", "O26", "O27", "O28", "O29", - "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", - "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", - "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", - "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", - "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", - "O70", "O71", "O72" - ), - node_count = c( - 2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5 + correct_orbit_key_4 <- list( + max_nodes = 4, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 + ) ) - ) - expect_equal(orbit_key(2), correct_orbit_key_2) - expect_equal(orbit_key(3), correct_orbit_key_3) - expect_equal(orbit_key(4), correct_orbit_key_4) - expect_equal(orbit_key(5), correct_orbit_key_5) -}) + correct_orbit_key_5 <- list( + max_nodes = 5, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", + "O18", "O19", "O20", "O21", "O22", + "O23", "O24", "O25", "O26", "O27", "O28", "O29", + "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", + "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", + "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", + "O54", "O55", "O56", "O57", 
"O58", "O59", "O60", "O61", + "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", + "O70", "O71", "O72" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5 + ) + ) + expect_equal(orbit_key(2), correct_orbit_key_2) + expect_equal(orbit_key(3), correct_orbit_key_3) + expect_equal(orbit_key(4), correct_orbit_key_4) + expect_equal(orbit_key(5), correct_orbit_key_5) + } +) context("ORCA interface: Graph cross comparison") test_that("cross_comparison_spec works for virus PPI data", { @@ -143,7 +158,10 @@ test_that("cross_comparison_spec works for virus PPI data", { context("ORCA interface: Orbit count wrapper") test_that("Single and zero node graphs are gracefully handled", { - single_node_graph <- igraph::graph_from_adjacency_matrix(0, mode = "undirected") + single_node_graph <- igraph::graph_from_adjacency_matrix( + 0, + mode = "undirected" + ) zero_node_graph <- igraph::delete.vertices(single_node_graph, 1) names4 <- c( "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", @@ -248,7 +266,10 @@ test_that("simplify_graph works", { # 1a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_loops(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -259,7 +280,10 @@ test_that("simplify_graph works", { # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -270,7 +294,10 @@ test_that("simplify_graph works", { # 1c. 
Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -281,7 +308,10 @@ test_that("simplify_graph works", { # 1ab. Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -292,7 +322,10 @@ test_that("simplify_graph works", { # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -303,7 +336,10 @@ test_that("simplify_graph works", { # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -314,7 +350,10 @@ test_that("simplify_graph works", { # 1abc. 
Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "directed" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -339,7 +378,10 @@ test_that("simplify_graph works", { # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -350,7 +392,10 @@ test_that("simplify_graph works", { # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "plus" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -358,11 +403,14 @@ test_that("simplify_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -373,7 +421,10 @@ test_that("simplify_graph works", { # 2ac. 
Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "plus" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -381,11 +432,14 @@ test_that("simplify_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -393,11 +447,15 @@ test_that("simplify_graph works", { remove_multiple = TRUE, remove_isolates = TRUE )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid + # generating multiple edges where nodes are mutually connected in adjacency + # matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "max" + ) ), igraph::as_adjacency_matrix(simplify_graph( graph, @@ -464,7 +522,10 @@ test_that("gdd simplifies works", { # 1a. 
Loop removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_loops(adj_mat), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -475,7 +536,10 @@ test_that("gdd simplifies works", { # 1b. Multiple edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -486,7 +550,10 @@ test_that("gdd simplifies works", { # 1c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -497,7 +564,10 @@ test_that("gdd simplifies works", { # 1ab. Loop + multiple edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -508,7 +578,10 @@ test_that("gdd simplifies works", { # 1ac. Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -519,7 +592,10 @@ test_that("gdd simplifies works", { # 1bc. Multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -530,7 +606,10 @@ test_that("gdd simplifies works", { # 1abc. 
Loop + multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "directed" + ) ), gdd(simplify_graph( graph, @@ -555,7 +634,10 @@ test_that("gdd simplifies works", { # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -566,7 +648,10 @@ test_that("gdd simplifies works", { # 2c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "plus" + ) ), gdd(simplify_graph( graph, @@ -574,11 +659,14 @@ test_that("gdd simplifies works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -589,7 +677,10 @@ test_that("gdd simplifies works", { # 2ac. 
Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "plus" + ) ), gdd(simplify_graph( graph, @@ -597,11 +688,14 @@ test_that("gdd simplifies works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -609,11 +703,15 @@ test_that("gdd simplifies works", { remove_multiple = TRUE, remove_isolates = TRUE )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid + # generating multiple edges where nodes are mutually connected in adjacency + # matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "max" + ) ), gdd(simplify_graph( graph, @@ -658,7 +756,10 @@ test_that("Features to Histograms Test", { expect_equal(res[[1]]$locations, c(10^-10, 10^-9, 10^-8, 10^-2, 10^3)) expect_equal(res[[1]]$masses, c(1, 1, 2, 1, 1)) # irrational - c1 <- matrix(c(pi, sqrt(2), sqrt(2) / pi, sqrt(3), sqrt(2), sqrt(2) / pi), nrow = 6) + c1 <- matrix( + c(pi, sqrt(2), sqrt(2) / pi, sqrt(3), sqrt(2), sqrt(2) / pi), + nrow = 6 + ) res <- graph_features_to_histograms(c1) expect_equal(res[[1]]$locations, c(sqrt(2) / pi, sqrt(2), sqrt(3), pi)) expect_equal(res[[1]]$masses, c(2, 2, 1, 1)) @@ -736,7 +837,10 @@ test_that("read_simple_graph works", { # 1a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_loops(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -747,7 +851,10 @@ test_that("read_simple_graph works", { # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -758,7 +865,10 @@ test_that("read_simple_graph works", { # 1c. 
Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -769,7 +879,10 @@ test_that("read_simple_graph works", { # 1ab. Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -780,7 +893,10 @@ test_that("read_simple_graph works", { # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -791,7 +907,10 @@ test_that("read_simple_graph works", { # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -802,7 +921,10 @@ test_that("read_simple_graph works", { # 1abc. 
Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "directed" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -827,7 +949,10 @@ test_that("read_simple_graph works", { # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(adj_mat), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -838,7 +963,10 @@ test_that("read_simple_graph works", { # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(adj_mat), + mode = "plus" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -846,11 +974,14 @@ test_that("read_simple_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_multiples(remove_loops(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -861,7 +992,10 @@ test_that("read_simple_graph works", { # 2ac. 
Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_loops(adj_mat)), + mode = "plus" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -869,11 +1003,14 @@ test_that("read_simple_graph works", { remove_multiple = FALSE, remove_isolates = TRUE )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating + # multiple edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(adj_mat)), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -881,11 +1018,15 @@ test_that("read_simple_graph works", { remove_multiple = TRUE, remove_isolates = TRUE )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple - # edges where nodes are mutually connected in adjacency matrix) + # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid + # generating multiple edges where nodes are mutually connected in adjacency + # matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + igraph::graph_from_adjacency_matrix( + remove_isolates(remove_multiples(remove_loops(adj_mat))), + mode = "max" + ) ), igraph::as_adjacency_matrix(read_simple_graph( file = path, format = format, @@ -912,10 +1053,26 @@ test_that("read_simple_files works (all files in a directory)", { # Save graphs to temp directory format <- "graphml" base_dir <- tempdir() - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_1.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_2.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_3.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_4.txt"), format = format) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_1.txt"), + format = format + ) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_2.txt"), + format = format + ) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_3.txt"), + format = format + ) + igraph::write_graph( + graph, + file = file.path(base_dir, "oltw54387eNS_4.txt"), + format = format + ) # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { @@ -967,8 +1124,9 @@ test_that("orbit_to_graphlet_counts summation works", { edges <- graph_to_indexed_edges(graph) orbit_counts_4 <- orca::count4(edges) orbit_counts_5 <- orca::count5(edges) - # Define orbit indexes belonging to each graphlet using the xero-based indexing - # from the journal papers, adding one to conver tot he one-based indexing of R + # Define orbit indexes belonging to each graphlet using 
the xero-based + # indexing from the journal papers, adding one to conver to the one-based + # indexing of R g0_indexes <- c(0) + 1 g1_indexes <- c(1:2) + 1 g2_indexes <- c(3) + 1 @@ -1047,8 +1205,8 @@ test_that("orbit_to_graphlet_counts summation works", { "G20", "G21", "G22", "G23", "G24", "G25", "G26", "G27", "G28", "G29" ) - # Define epected graphlet count matrix for graphlets up to 4 nodes by selecting - # a subset of the matrix for graphlets up to 5 nodes + # Define epected graphlet count matrix for graphlets up to 4 nodes by + # selecting a subset of the matrix for graphlets up to 5 nodes expected_graphlet_counts_4 <- expected_graphlet_counts_5[, 1:9] # Calculate actual graphlet counts from functions under test actual_graphlet_counts_4 <- orbit_to_graphlet_counts(orbit_counts_4) @@ -1059,137 +1217,142 @@ test_that("orbit_to_graphlet_counts summation works", { }) context("ORCA interface: Named ego networks") -test_that("make_named_ego_graph labels each ego-network with the correct node name", { - # Helper function to sort edgelists in consistent order - sort_edge_list <- function(edge_list) { - edge_list[order(edge_list[, 1], edge_list[, 2], decreasing = FALSE), ] - } - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - # The expectation below is based on igraph::graph_from_edgelist adding nodes - # in the order they appear in the edge list, and igraph::V returning them - # in this same order - expected_node_names <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10") +test_that( + "make_named_ego_graph labels each ego-network with the correct node 
name", + { + # Helper function to sort edgelists in consistent order + sort_edge_list <- function(edge_list) { + edge_list[order(edge_list[, 1], edge_list[, 2], decreasing = FALSE), ] + } + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + # The expectation below is based on igraph::graph_from_edgelist adding nodes + # in the order they appear in the edge list, and igraph::V returning them + # in this same order + expected_node_names <- c( + "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10" + ) - # Expected edgelists for ego networks of order 1 - expected_ego_elist_n1_o1 <- rbind( - c("n1", "n2"), - c("n1", "n4"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6") - ) - expected_ego_elist_n2_o1 <- rbind( - c("n1", "n2"), - c("n1", "n4"), - c("n2", "n3"), - c("n2", "n4"), - c("n2", "n5") - ) - expected_ego_elist_n3_o1 <- rbind( - c("n2", "n3") - ) - expected_ego_elist_n4_o1 <- rbind( - c("n1", "n2"), - c("n1", "n4"), - c("n1", "n6"), - c("n2", "n4"), - c("n4", "n6") - ) - expected_ego_elist_n5_o1 <- rbind( - c("n2", "n5") - ) - expected_ego_elist_n6_o1 <- rbind( - c("n1", "n4"), - c("n1", "n6"), - c("n4", "n6"), - c("n6", "n8") - ) - expected_ego_elist_n7_o1 <- rbind( - c("n1", "n7"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - expected_ego_elist_n8_o1 <- rbind( - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - expected_ego_elist_n9_o1 <- rbind( - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - 
c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - expected_ego_elist_n10_o1 <- rbind( - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) + # Expected edgelists for ego networks of order 1 + expected_ego_elist_n1_o1 <- rbind( + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6") + ) + expected_ego_elist_n2_o1 <- rbind( + c("n1", "n2"), + c("n1", "n4"), + c("n2", "n3"), + c("n2", "n4"), + c("n2", "n5") + ) + expected_ego_elist_n3_o1 <- rbind( + c("n2", "n3") + ) + expected_ego_elist_n4_o1 <- rbind( + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n2", "n4"), + c("n4", "n6") + ) + expected_ego_elist_n5_o1 <- rbind( + c("n2", "n5") + ) + expected_ego_elist_n6_o1 <- rbind( + c("n1", "n4"), + c("n1", "n6"), + c("n4", "n6"), + c("n6", "n8") + ) + expected_ego_elist_n7_o1 <- rbind( + c("n1", "n7"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + expected_ego_elist_n8_o1 <- rbind( + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + expected_ego_elist_n9_o1 <- rbind( + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + expected_ego_elist_n10_o1 <- rbind( + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) - # Test ego-networks of order 1. - # We compare edgelists as igraphs do not implement comparison - order <- 1 - min_ego_nodes <- 0 - min_ego_edges <- 0 + # Test ego-networks of order 1. 
+ # We compare edgelists as igraphs do not implement comparison + order <- 1 + min_ego_nodes <- 0 + min_ego_edges <- 0 - expected_ego_elists_o1 <- list( - n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), - n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), - n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), - n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), - n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), - n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), - n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), - n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), - n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), - n10 = dplyr::arrange(data.frame(expected_ego_elist_n10_o1), X1, X2) - ) - # Generate actual ego-networks and convert to edge lists for comparison - actual_ego_elists_o1 <- - purrr::map( - make_named_ego_graph(graph, order, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ), - function(g) { - dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) - } + expected_ego_elists_o1 <- list( + n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), + n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), + n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), + n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), + n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), + n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), + n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), + n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), + n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), + n10 = dplyr::arrange(data.frame(expected_ego_elist_n10_o1), X1, X2) ) - expect_equal(actual_ego_elists_o1, expected_ego_elists_o1) -}) + # Generate actual ego-networks and convert to edge 
lists for comparison + actual_ego_elists_o1 <- + purrr::map( + make_named_ego_graph(graph, order, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ), + function(g) { + dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) + } + ) + expect_equal(actual_ego_elists_o1, expected_ego_elists_o1) + } +) context("ORCA interface: Graphlet counts") test_that("count_graphlets_for_graph works", { @@ -1215,7 +1378,9 @@ test_that("count_graphlets_for_graph works", { graph <- igraph::graph_from_edgelist(elist, directed = FALSE) # Setgraphlet labels to use for names in expected counts - graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_labels <- c( + "N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8" + ) # Manually verified graphlet counts expected_counts <- c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1) @@ -1227,534 +1392,287 @@ test_that("count_graphlets_for_graph works", { }) context("ORCA interface: Ego-network graphlet counts") -test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manually verified totals for test graph", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +test_that( + paste( + "count_graphlets_ego: Ego-network 4-node graphlet counts match manually", + "verified totals for test graph" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + 
c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + # Set node and graphlet labels to use for row and col names in expected + # counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c( + "N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8" + ) - max_graphlet_size <- 4 - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # Set manually verified counts - # 1-step ego networks - expected_counts_order_1 <- rbind( - c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), - c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) - ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels - # 2-step ego networks - expected_counts_order_2 <- rbind( - c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), - c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels + max_graphlet_size <- 4 + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + expected_counts_order_1 <- rbind( + c(5, 6, 5, 2, 0, 1, 
0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) + ) + rownames(expected_counts_order_1) <- node_labels + colnames(expected_counts_order_1) <- graphlet_labels + # 2-step ego networks + expected_counts_order_2 <- rbind( + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) + ) + rownames(expected_counts_order_2) <- node_labels + colnames(expected_counts_order_2) <- graphlet_labels - # Count graphlets in each ego network of the graph with only counts requested - min_ego_nodes <- 0 - min_ego_edges <- 0 + # Count graphlets in each ego network of the graph with only counts + # requested + min_ego_nodes <- 0 + min_ego_edges <- 0 - actual_counts_order_1 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, + actual_counts_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + actual_counts_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # Test that actual counts match expected with only counts requested + # (default) + expect_equal(actual_counts_order_1, expected_counts_order_1) + expect_equal(actual_counts_order_2, expected_counts_order_2) + + # Test that actual and returned ego networks match 
expected + # 1. Define expected + expected_ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - actual_counts_order_2 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, + expected_ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - - # Test that actual counts match expected with only counts requested (default) - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) - - # Test that actual and returned ego networks match expected - # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, - order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, - order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - expected_counts_with_networks_order_1 <- - list( - graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1 - ) - expected_counts_with_networks_order_2 <- - list( - graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2 - ) - # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE + expected_counts_with_networks_order_1 <- + list( + graphlet_counts = expected_counts_order_1, + ego_networks = expected_ego_networks_order_1 + ) + expected_counts_with_networks_order_2 <- + list( + graphlet_counts = expected_counts_order_2, + ego_networks = expected_ego_networks_order_2 + ) + # 2. 
Calculate actual + actual_counts_with_networks_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + actual_counts_with_networks_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + # Test that actual counts match expected with ego-networks requested + expect_equal( + actual_counts_with_networks_order_1$graphlet_counts, + expected_counts_order_1 ) - actual_counts_with_networks_order_2 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE + expect_equal( + actual_counts_with_networks_order_2$graphlet_counts, + expected_counts_order_2 ) - # Test that actual counts match expected with ego-networks requested - expect_equal(actual_counts_with_networks_order_1$graphlet_counts, expected_counts_order_1) - expect_equal(actual_counts_with_networks_order_2$graphlet_counts, expected_counts_order_2) - # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to - # indexed edge list and then compare. Do in-situ replacement of igraphs with - # indexed edge lists to ensure we are checking full properties of returned - # objects (i.e. named lists with matching elements). - # 3a. 
Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map( - expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges - ) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map( - expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges - ) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map( - actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges + # 3. Compare + # Comparison is not implemented for igraph objects, so convert all igraphs + # to indexed edge list and then compare. Do in-situ replacement of igraphs + # with indexed edge lists to ensure we are checking full properties of + # returned objects (i.e. named lists with matching elements). + # 3a. Convert expected and actual ego networks from igraphs to indexed edges + expected_counts_with_networks_order_1$ego_networks <- + purrr::map( + expected_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + expected_counts_with_networks_order_2$ego_networks <- + purrr::map( + expected_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_1$ego_networks <- + purrr::map( + actual_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_2$ego_networks <- + purrr::map( + actual_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + # 3b. Do comparison + expect_equal( + actual_counts_with_networks_order_1, + expected_counts_with_networks_order_1 ) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map( - actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges + expect_equal( + actual_counts_with_networks_order_2, + expected_counts_with_networks_order_2 ) - # 3b. 
Do comparison - expect_equal( - actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1 - ) - expect_equal( - actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2 - ) -}) + } +) context("ORCA interface: Ego-network graphlet counts") -test_that("ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manually verified totals for test graph", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") +test_that( + paste( + "ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manually", + "verified totals for test graph" + ), + { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - max_graphlet_size <- 4 - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # Set manually verified counts - # 1-step ego networks - expected_counts_order_1 <- rbind( - c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), - 
c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), - c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) - ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels - # 2-step ego networks - expected_counts_order_2 <- rbind( - c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), - c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), - c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), - c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), - c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels + # Set node and graphlet labels to use for row and col names in expected + # counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c( + "N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8" + ) - # Count graphlets in each ego network of the graph with only counts requested - min_ego_nodes <- 0 - min_ego_edges <- 0 + max_graphlet_size <- 4 + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + expected_counts_order_1 <- rbind( + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) + ) + rownames(expected_counts_order_1) <- node_labels + colnames(expected_counts_order_1) <- graphlet_labels + # 2-step ego networks + expected_counts_order_2 <- rbind( + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 
0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) + ) + rownames(expected_counts_order_2) <- node_labels + colnames(expected_counts_order_2) <- graphlet_labels - # Test that actual and returned ego graphlet counts match - # 1. Generate ego networks with previously tested function. - ego_networks_order_1 <- make_named_ego_graph(graph, - order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - ego_networks_order_2 <- make_named_ego_graph(graph, - order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) + # Count graphlets in each ego network of the graph with only counts + # requested + min_ego_nodes <- 0 + min_ego_edges <- 0 - # 2. Calculate counts with ego_to_graphlet_counts. - actual_counts_order_1 <- - ego_to_graphlet_counts(ego_networks_order_1, - max_graphlet_size = max_graphlet_size + # Test that actual and returned ego graphlet counts match + # 1. Generate ego networks with previously tested function. + ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges ) - actual_counts_order_2 <- - ego_to_graphlet_counts(ego_networks_order_2, - max_graphlet_size = max_graphlet_size + ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges ) - # 3. Test that actual counts match expected - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) -}) + # 2. Calculate counts with ego_to_graphlet_counts. 
+ actual_counts_order_1 <- + ego_to_graphlet_counts(ego_networks_order_1, + max_graphlet_size = max_graphlet_size + ) + actual_counts_order_2 <- + ego_to_graphlet_counts(ego_networks_order_2, + max_graphlet_size = max_graphlet_size + ) -# context("ORCA interface: Graphlet-based degree distributions") -# test_that("gdd works", { -# graph <- netdist::virusppi$EBV -# edges <- graph_to_indexed_edges(graph) -# # Caclulate expected outputs (NOTE: relies on orbit_to_graphlet_counts and -# # orca_counts_to_graphlet_orbit_degree_distribution methods) -# orbit_counts_4 <- orca::count4(edges) -# orbit_counts_5 <- orca::count5(edges) -# graphlet_counts_4 <- orbit_to_graphlet_counts(orbit_counts_4) -# graphlet_counts_5 <- orbit_to_graphlet_counts(orbit_counts_5) -# gdd_orbit_4_expected <- orca_counts_to_graphlet_orbit_degree_distribution(orbit_counts_4) -# gdd_orbit_5_expected <- orca_counts_to_graphlet_orbit_degree_distribution(orbit_counts_5) -# gdd_graphlet_4_expected <- orca_counts_to_graphlet_orbit_degree_distribution(graphlet_counts_4) -# gdd_graphlet_5_expected <- orca_counts_to_graphlet_orbit_degree_distribution(graphlet_counts_5) -# # Calculate actual outputs from code under test -# gdd_orbit_4_actual <- gdd(graph, feature_type = "orbit", max_graphlet_size = 4) -# gdd_orbit_5_actual <- gdd(graph, feature_type = "orbit", max_graphlet_size = 5) -# gdd_graphlet_4_actual <- gdd(graph, feature_type = "graphlet", max_graphlet_size = 4) -# gdd_graphlet_5_actual <- gdd(graph, feature_type = "graphlet", max_graphlet_size = 5) -# gdd_default_4_actual <- gdd(graph, max_graphlet_size = 4) -# gdd_default_5_actual <- gdd(graph, max_graphlet_size = 5) -# gdd_orbit_default_actual <- gdd(graph, feature_type = "orbit") -# gdd_graphlet_default_actual <- gdd(graph, feature_type = "graphlet") -# gdd_default_default_actual <- gdd(graph) -# # Compare actual gdd with expected gdd -# expect_equal(gdd_orbit_4_actual, gdd_orbit_4_expected) -# expect_equal(gdd_orbit_5_actual, 
gdd_orbit_5_expected) -# expect_equal(gdd_graphlet_4_actual, gdd_graphlet_4_expected) -# expect_equal(gdd_graphlet_5_actual, gdd_graphlet_5_expected) -# expect_equal(gdd_default_4_actual, gdd_orbit_4_expected) -# expect_equal(gdd_default_5_actual, gdd_orbit_5_expected) -# expect_equal(gdd_orbit_default_actual, gdd_orbit_4_expected) -# expect_equal(gdd_graphlet_default_actual, gdd_graphlet_4_expected) -# expect_equal(gdd_default_default_actual, gdd_orbit_4_expected) -# -# # Check gdd throws error for invalid feature type -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5)) -# # Check gdd throws error for invalid maximum graphlet size -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 2)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 3)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 6)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6)) -# -# }) -# -# context("ORCA interface: Ego-network graphlet outputs for manually verified networks") -# test_that("Ego-network 4-node graphlet counts match manually verified totals -# and gdd gives expected discrete histograms",{ -# # Set up a small sample network with at least one ego-network that contains -# # at least one of each graphlets -# elist <- rbind( -# c("n1","n2"), -# c("n2","n3"), -# c("n1","n4"), -# c("n2","n5"), -# c("n1","n6"), -# c("n1","n7"), -# c("n2","n4"), -# c("n4","n6"), -# c("n6","n8"), -# c("n7","n8"), -# c("n7","n9"), -# c("n7","n10"), -# c("n8","n9"), -# c("n8","n10"), -# c("n9","n10") -# ) -# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) -# -# # Set node and graphlet labels to use for row and col names in expected counts -# node_labels <- 
igraph::V(graph)$name -# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# -# # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 -# max_graphlet_size <- 4 -# actual_counts_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1) -# actual_counts_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2) -# -# # Set manually verified ego-network graphlet counts -# # 1-step ego networks -# expected_counts_order_1 <- rbind( -# c(6, 5, 2, 0, 1, 0, 2, 1, 0), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0), -# c(5, 2, 2, 0, 0, 0, 0, 1, 0), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0), -# c(4, 2, 1, 0, 0, 0, 1, 0, 0), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1) -# ) -# rownames(expected_counts_order_1) <- node_labels -# colnames(expected_counts_order_1) <- graphlet_labels -# # 2-step ego networks -# expected_counts_order_2 <- rbind( -# c(15, 18, 6, 21, 3, 1, 11, 1, 1), -# c( 8, 10, 2, 6, 3, 0, 4, 1, 0), -# c( 5, 5, 1, 0, 2, 0, 2, 0, 0), -# c(10, 14, 2, 11, 3, 1, 5, 1, 0), -# c( 5, 5, 1, 0, 2, 0, 2, 0, 0), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1), -# c(11, 10, 5, 10 ,0 ,1, 8, 0, 1), -# c( 9, 8, 4, 4, 0, 1, 6, 0, 1), -# c( 9, 8, 4, 4, 0, 1, 6, 0, 1) -# ) -# rownames(expected_counts_order_2) <- node_labels -# colnames(expected_counts_order_2) <- graphlet_labels -# -# # Test that actual counts match expected with only counts requested (default) -# expect_equal(actual_counts_order_1, expected_counts_order_1) -# expect_equal(actual_counts_order_2, expected_counts_order_2) -# -# # Test that actual counts and returned ego networks match expected -# # 1. 
Define expected -# expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) -# expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) -# expected_counts_with_networks_order_1 <- -# list(graphlet_counts = expected_counts_order_1, -# ego_networks = expected_ego_networks_order_1) -# expected_counts_with_networks_order_2 <- -# list(graphlet_counts = expected_counts_order_2, -# ego_networks = expected_ego_networks_order_2) -# # 2. Calculate actual -# actual_counts_with_networks_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, return_ego_networks = TRUE) -# actual_counts_with_networks_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, return_ego_networks = TRUE) -# # 3. Compare -# # Comparison is not implemented for igraph objects, so convert all igraphs to -# # indexed edge list and then compare. Do in-situ replacement of igraphs with -# # indexed edge lists to ensure we are checking full properties of returned -# # objects (i.e. named lists with matching elements). -# # 3a. Convert expected and actual ego networks from igraphs to indexed edges -# expected_counts_with_networks_order_1$ego_networks <- -# purrr::map(expected_counts_with_networks_order_1$ego_networks, -# graph_to_indexed_edges) -# expected_counts_with_networks_order_2$ego_networks <- -# purrr::map(expected_counts_with_networks_order_2$ego_networks, -# graph_to_indexed_edges) -# actual_counts_with_networks_order_1$ego_networks <- -# purrr::map(actual_counts_with_networks_order_1$ego_networks, -# graph_to_indexed_edges) -# actual_counts_with_networks_order_2$ego_networks <- -# purrr::map(actual_counts_with_networks_order_2$ego_networks, -# graph_to_indexed_edges) -# # 3b. 
Do comparison -# expect_equal(actual_counts_with_networks_order_1, -# expected_counts_with_networks_order_1) -# expect_equal(actual_counts_with_networks_order_2, -# expected_counts_with_networks_order_2) -# -# # Test that gdd method gives the expected graphlet degree distributions -# # 1-step ego-networks -# actual_gdd_order_1 <- gdd(graph, feature_type = "graphlet", -# max_graphlet_size = 4, ego_neighbourhood_size = 1) -# expected_gdd_order_1 <- list( -# G0 = dhist(locations = c(1, 4, 5, 6, 7), masses = c(2, 1, 2, 3, 2)), -# G1 = dhist(locations = c(0, 2, 3, 5), masses = c(4, 2, 2, 2)), -# G2 = dhist(locations = c(0, 1, 2, 4), masses = c(2, 2, 2, 4)), -# G3 = dhist(locations = c(0), masses = c(10)), -# G4 = dhist(locations = c(0, 1, 2), masses = c(8, 1, 1)), -# G5 = dhist(locations = c(0), masses = c(10)), -# G6 = dhist(locations = c(0, 1, 2, 3), masses = c(5, 1, 2, 2)), -# G7 = dhist(locations = c(0, 1), masses = c(8, 2)), -# G8 = dhist(locations = c(0, 1), masses = c(6, 4)) -# ) -# expect_equal(actual_gdd_order_1, expected_gdd_order_1) -# # 2-step ego-networks -# actual_gdd_order_2 <- gdd(graph, feature_type = "graphlet", -# max_graphlet_size = 4, ego_neighbourhood_size = 2) -# expected_gdd_order_2 <- list( -# G0 = dhist(locations = c(5, 8, 9, 10, 11, 13, 15), masses = c(2, 1, 2, 1, 1, 2, 1)), -# G1 = dhist(locations = c(5, 8, 10, 13, 14, 18), masses = c(2, 2, 2, 2, 1, 1)), -# G2 = dhist(locations = c(1, 2, 4, 5, 6), masses = c(2, 2, 2, 1, 3)), -# G3 = dhist(locations = c(0, 4, 6, 10, 11, 15, 21), masses = c(2, 2, 1, 1, 1, 2, 1)), -# G4 = dhist(locations = c(0, 1, 2, 3), masses = c(3, 2, 2, 3)), -# G5 = dhist(locations = c(0, 1), masses = c(3, 7)), -# G6 = dhist(locations = c(2, 4, 5, 6, 8, 9, 11), masses = c(2, 1, 1, 2, 1, 2, 1)), -# G7 = dhist(locations = c(0, 1), masses = c(5, 5)), -# G8 = dhist(locations = c(0, 1), masses = c(4, 6)) -# ) -# expect_equal(actual_gdd_order_2, expected_gdd_order_2) -# -# # Check gdd throws error for invalid feature type -# 
expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, -# ego_neighbourhood_size = 1)) -# # We don't support orbit feature type for ego networks (i.e. neighbourhood > 0) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 4, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 5, -# ego_neighbourhood_size = 1)) -# # Check gdd throws error for invalid maximum graphlet size -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, -# ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, -# ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, -# ego_neighbourhood_size = 1)) -# }) -# -# context("ORCA interface: GDD for all graphs in a directory") -# test_that("gdd_for_all_graphs works", { -# # Set source directory and file properties for Virus PPI graph edge files -# source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -# edge_format = "ncol" -# file_pattern = ".txt" -# -# # Set number of threads to use at once for parallel processing. 
-# num_threads = getOption("mc.cores", 2L) -# -# # Use previously tested gdd code to calculate expected gdds -# expected_gdd_fn <- function(feature_type, max_graphlet_size, ego_neighbourhood_size) { -# gdds <- list( -# gdd(virusppi$EBV, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$ECL, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$HSV, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$KSHV, feature_type, max_graphlet_size, ego_neighbourhood_size), -# gdd(virusppi$VZV, feature_type, max_graphlet_size, ego_neighbourhood_size) -# ) -# names(gdds) <- c("EBV", "ECL", "HSV-1", "KSHV", "VZV") -# gdds -# } -# -# # Use code under test to generate actual gdds -# actual_gdd_fn <- function (feature_type, max_graphlet_size, ego_neighbourhood_size) { -# gdd_for_all_graphs(source_dir = source_dir, format = edge_format, -# pattern = file_pattern, feature_type = feature_type, -# max_graphlet_size = max_graphlet_size, -# ego_neighbourhood_size = ego_neighbourhood_size, -# mc.cores = num_threads) -# } -# # Helper function to make comparison code clearer -# compare_fn <- function(feature_type, max_graphlet_size, ego_neighbourhood_size) { -# actual_gdds <- actual_gdd_fn(feature_type, max_graphlet_size, ego_neighbourhood_size) -# expected_gdds <- expected_gdd_fn(feature_type, max_graphlet_size, ego_neighbourhood_size) -# expect_equal(actual_gdds, expected_gdds) -# } -# # Map over test parameters, comparing actual gdds to expected -# # No ego-networks -# compare_fn(feature_type = "orbit", max_graphlet_size = 4, ego_neighbourhood_size = 0) -# compare_fn(feature_type = "orbit", max_graphlet_size = 5, ego_neighbourhood_size = 0) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 0) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 0) -# # Ego networks of order 1 -# compare_fn(feature_type = "graphlet", max_graphlet_size = 4, 
ego_neighbourhood_size = 1) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 1) -# # Ego networks of order 2 -# compare_fn(feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 2) -# compare_fn(feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 2) -# }) + # 3. Test that actual counts match expected + expect_equal(actual_counts_order_1, expected_counts_order_1) + expect_equal(actual_counts_order_2, expected_counts_order_2) + } +) diff --git a/tests/testthat/test_utility_functions.R b/tests/testthat/test_utility_functions.R index 091e02af..370e6e8b 100755 --- a/tests/testthat/test_utility_functions.R +++ b/tests/testthat/test_utility_functions.R @@ -2,63 +2,171 @@ context("Utility Functions") test_that("rotl_vec rotates left by specified number of places", { test_vec <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) - expect_equal(rotl_vec(test_vec, -13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotl_vec(test_vec, -12), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotl_vec(test_vec, -11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotl_vec(test_vec, -10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotl_vec(test_vec, -9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotl_vec(test_vec, -8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotl_vec(test_vec, -7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotl_vec(test_vec, -6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotl_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotl_vec(test_vec, -4), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotl_vec(test_vec, -3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotl_vec(test_vec, -2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotl_vec(test_vec, -1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - 
expect_equal(rotl_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotl_vec(test_vec, 1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotl_vec(test_vec, 2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotl_vec(test_vec, 3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotl_vec(test_vec, 4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotl_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotl_vec(test_vec, 6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotl_vec(test_vec, 7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotl_vec(test_vec, 8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotl_vec(test_vec, 9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotl_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotl_vec(test_vec, 11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotl_vec(test_vec, 12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotl_vec(test_vec, 13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) + expect_equal( + rotl_vec(test_vec, -13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal(rotl_vec( + test_vec, -12 + ), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) + expect_equal( + rotl_vec(test_vec, -11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotl_vec(test_vec, -10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotl_vec(test_vec, -9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotl_vec(test_vec, -8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotl_vec(test_vec, -7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotl_vec(test_vec, -6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotl_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotl_vec(test_vec, -4), 
c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotl_vec(test_vec, -3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotl_vec(test_vec, -2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotl_vec(test_vec, -1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotl_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotl_vec(test_vec, 1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotl_vec(test_vec, 2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotl_vec(test_vec, 3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotl_vec(test_vec, 4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotl_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotl_vec(test_vec, 6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotl_vec(test_vec, 7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotl_vec(test_vec, 8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotl_vec(test_vec, 9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotl_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotl_vec(test_vec, 11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotl_vec(test_vec, 12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotl_vec(test_vec, 13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) }) test_that("rotr_vec rotates right by specified number of places", { test_vec <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) - expect_equal(rotr_vec(test_vec, 13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotr_vec(test_vec, 12), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotr_vec(test_vec, 11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotr_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 
90, 100)) - expect_equal(rotr_vec(test_vec, 9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotr_vec(test_vec, 8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotr_vec(test_vec, 7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotr_vec(test_vec, 6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotr_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotr_vec(test_vec, 4), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotr_vec(test_vec, 3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotr_vec(test_vec, 2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotr_vec(test_vec, 1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotr_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotr_vec(test_vec, -1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotr_vec(test_vec, -2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotr_vec(test_vec, -3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) - expect_equal(rotr_vec(test_vec, -4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40)) - expect_equal(rotr_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50)) - expect_equal(rotr_vec(test_vec, -6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60)) - expect_equal(rotr_vec(test_vec, -7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70)) - expect_equal(rotr_vec(test_vec, -8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80)) - expect_equal(rotr_vec(test_vec, -9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90)) - expect_equal(rotr_vec(test_vec, -10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) - expect_equal(rotr_vec(test_vec, -11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10)) - expect_equal(rotr_vec(test_vec, -12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20)) - expect_equal(rotr_vec(test_vec, -13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30)) + expect_equal( + rotr_vec(test_vec, 13), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + 
) + expect_equal( + rotr_vec(test_vec, 12), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotr_vec(test_vec, 11), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotr_vec(test_vec, 10), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotr_vec(test_vec, 9), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotr_vec(test_vec, 8), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotr_vec(test_vec, 7), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotr_vec(test_vec, 6), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotr_vec(test_vec, 5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotr_vec(test_vec, 4), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotr_vec(test_vec, 3), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotr_vec(test_vec, 2), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotr_vec(test_vec, 1), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotr_vec(test_vec, 0), c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotr_vec(test_vec, -1), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotr_vec(test_vec, -2), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotr_vec(test_vec, -3), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) + expect_equal( + rotr_vec(test_vec, -4), c(50, 60, 70, 80, 90, 100, 10, 20, 30, 40) + ) + expect_equal( + rotr_vec(test_vec, -5), c(60, 70, 80, 90, 100, 10, 20, 30, 40, 50) + ) + expect_equal( + rotr_vec(test_vec, -6), c(70, 80, 90, 100, 10, 20, 30, 40, 50, 60) + ) + expect_equal( + rotr_vec(test_vec, -7), c(80, 90, 100, 10, 20, 30, 40, 50, 60, 70) + ) + expect_equal( + rotr_vec(test_vec, -8), c(90, 100, 10, 20, 30, 40, 50, 60, 70, 80) + ) + expect_equal( + rotr_vec(test_vec, -9), c(100, 10, 20, 30, 40, 50, 60, 70, 80, 90) + ) + expect_equal( + rotr_vec(test_vec, -10), c(10, 
20, 30, 40, 50, 60, 70, 80, 90, 100) + ) + expect_equal( + rotr_vec(test_vec, -11), c(20, 30, 40, 50, 60, 70, 80, 90, 100, 10) + ) + expect_equal( + rotr_vec(test_vec, -12), c(30, 40, 50, 60, 70, 80, 90, 100, 10, 20) + ) + expect_equal( + rotr_vec(test_vec, -13), c(40, 50, 60, 70, 80, 90, 100, 10, 20, 30) + ) }) From 9c0a716391e02f75990914fcd25ddda8f4880f33 Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Tue, 7 Jun 2022 11:20:22 +0000 Subject: [PATCH 64/84] Update documentation --- man/count_graphlets_ego.Rd | 5 +- man/emd_cs.Rd | 6 +- man/gdd.Rd | 8 +- man/gdd_for_all_graphs.Rd | 10 +-- man/graph_features_to_histograms.Rd | 10 ++- man/graphlet_key.Rd | 3 +- man/min_emd.Rd | 4 +- man/min_emd_exhaustive.Rd | 8 +- man/netdis.plot.Rd | 17 +++-- man/netemd.plot.Rd | 17 +++-- man/netemd_many_to_many.Rd | 29 +++++--- man/netemd_one_to_one.Rd | 110 +++++++++++++++++++++------- man/netemd_single_pair.Rd | 29 +++++--- man/orbit_key.Rd | 3 +- man/read_simple_graphs.Rd | 13 +++- 15 files changed, 186 insertions(+), 86 deletions(-) diff --git a/man/count_graphlets_ego.Rd b/man/count_graphlets_ego.Rd index 5ae422c8..35afac8d 100644 --- a/man/count_graphlets_ego.Rd +++ b/man/count_graphlets_ego.Rd @@ -20,8 +20,9 @@ count_graphlets_ego( Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. Currently only size 4 (default) and 5 are supported.} -\item{neighbourhood_size}{The number of steps from the source node used to select the -neighboring nodes to be included in the source node ego-network. (Default 2).} +\item{neighbourhood_size}{The number of steps from the source node used to +select the neighboring nodes to be included in the source node ego-network. +(Default 2).} \item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} nodes are returned. 
(Default 3).} diff --git a/man/emd_cs.Rd b/man/emd_cs.Rd index 1e580760..99a45be7 100755 --- a/man/emd_cs.Rd +++ b/man/emd_cs.Rd @@ -20,7 +20,7 @@ Distance between the two histograms by summing the absolute difference between the two cumulative histograms. } \references{ -Calculation of the Wasserstein Distance Between Probability Distributions on the Line -S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, 784-786 -\url{http://dx.doi.org/10.1137/1118101} +Calculation of the Wasserstein Distance Between Probability Distributions on +the Line S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, +784-786 \url{http://dx.doi.org/10.1137/1118101} } diff --git a/man/gdd.Rd b/man/gdd.Rd index 7444bbe7..3102d4b3 100644 --- a/man/gdd.Rd +++ b/man/gdd.Rd @@ -22,7 +22,8 @@ the number of graphlet orbits each node participates in.} Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. Currently only size 4 and 5 are supported.} -\item{ego_neighbourhood_size}{The number of steps from the source node used to select the +\item{ego_neighbourhood_size}{The number of steps from the source node used +to select the neighboring nodes to be included in the source node ego-network.} } \value{ @@ -30,6 +31,7 @@ List of graphlet-based degree distributions, with each distribution represented as a \code{dhist} discrete histogram object. } \description{ -Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object -using the ORCA fast graphlet orbit counting package. +Short-cut function to create graphlet-based degree distributions from +\code{igraph} graph object using the ORCA fast graphlet orbit counting +package. } diff --git a/man/gdd_for_all_graphs.Rd b/man/gdd_for_all_graphs.Rd index 7e6956bf..c9703631 100644 --- a/man/gdd_for_all_graphs.Rd +++ b/man/gdd_for_all_graphs.Rd @@ -25,12 +25,12 @@ gdd_for_all_graphs( \item{feature_type}{Type of graphlet-based degree distributions. 
Can be \code{graphlet} to count graphlets or \code{orbit} to count orbits.} -\item{max_graphlet_size}{Maximum size of graphlets to use when generating GDD. -Currently only size 4 and 5 are supported.} +\item{max_graphlet_size}{Maximum size of graphlets to use when generating +GDD. Currently only size 4 and 5 are supported.} -\item{ego_neighbourhood_size}{The number of steps from the source node used to select the -neighboring nodes to be included in the source node ego-network. If set to 0, ego-networks will not be -used.} +\item{ego_neighbourhood_size}{The number of steps from the source node used +to select the neighboring nodes to be included in the source node +ego-network. If set to 0, ego-networks will not be used.} \item{mc.cores}{Number of cores to use for parallel processing. Defaults to the \code{mc.cores} option set in the R environment.} diff --git a/man/graph_features_to_histograms.Rd b/man/graph_features_to_histograms.Rd index ae526218..816a847d 100644 --- a/man/graph_features_to_histograms.Rd +++ b/man/graph_features_to_histograms.Rd @@ -8,14 +8,16 @@ each feature.} graph_features_to_histograms(features_matrix) } \arguments{ -\item{features_matrix}{A matrix whose rows represent nodes and whose columns represent different node level features. This means that entry ij provides the value of feature j for node i.} +\item{features_matrix}{A matrix whose rows represent nodes and whose columns +represent different node level features. This means that entry ij provides +the value of feature j for node i.} } \value{ -Feature histograms: List of "discrete histograms" for each -feature +Feature histograms: List of "discrete histograms" for each feature } \description{ Converts a matrix of node level features (e.g. 
for example counts of multiple graphlets or orbits at each node) to -a set of histogram like objects (observed frequency distribution of each feature/column) +a set of histogram like objects (observed frequency distribution of each +feature/column) } diff --git a/man/graphlet_key.Rd b/man/graphlet_key.Rd index 23f2289b..b63c9315 100644 --- a/man/graphlet_key.Rd +++ b/man/graphlet_key.Rd @@ -7,7 +7,8 @@ graphlet_key(max_graphlet_size) } \arguments{ -\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported.} +\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. +Currently only size 2 to 5 are supported.} } \value{ Metadata list with the following named fields: diff --git a/man/min_emd.Rd b/man/min_emd.Rd index 2799f48a..08cf27f7 100644 --- a/man/min_emd.Rd +++ b/man/min_emd.Rd @@ -15,8 +15,8 @@ min_emd(dhist1, dhist2, method = "optimise") offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +minimum if multiple local minima EMDs exist. You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} } \value{ diff --git a/man/min_emd_exhaustive.Rd b/man/min_emd_exhaustive.Rd index cd02830b..91b46ce3 100644 --- a/man/min_emd_exhaustive.Rd +++ b/man/min_emd_exhaustive.Rd @@ -25,8 +25,8 @@ to calculate the EMD at all offsets where any knots from the two ECMFs align to ensure that the offset with the global minimum EMD is found. This is because of the piece-wise linear nature of the two ECMFs. 
Between any -two offsets where knots from the two ECMFs align, EMD will be either constant, -or uniformly increasing or decreasing. Therefore, there the EMD between two -sets of aligned knots cannot be smaller than the EMD at one or other of the -bounding offsets. +two offsets where knots from the two ECMFs align, EMD will be either +constant, or uniformly increasing or decreasing. Therefore, there the EMD +between two sets of aligned knots cannot be smaller than the EMD at one or +other of the bounding offsets. } diff --git a/man/netdis.plot.Rd b/man/netdis.plot.Rd index 44cc33ed..c5bb8bd6 100644 --- a/man/netdis.plot.Rd +++ b/man/netdis.plot.Rd @@ -15,17 +15,24 @@ netdis.plot( \arguments{ \item{netdislist}{Default output of \code{netdis_many_to_many}.} -\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting.} +\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} +to be used for plotting.} -\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default).} +\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} +function from the \code{pheatmap} package. The dendrogram will appear if +\code{docluster} is TRUE (default).} \item{main}{Title of the plot.} -\item{docluster}{controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn.} +\item{docluster}{controls the order of the rows and columns. If TRUE +(default) the rows and columns will be reordered to create the dendrogram. If +FALSE, then only the heatmap is drawn.} } \value{ -Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heatmap and dendrogram for the network comparisons via +\code{pheatmap}. 
} \description{ -Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heatmap and dendrogram for the network comparisons via +\code{pheatmap}. } diff --git a/man/netemd.plot.Rd b/man/netemd.plot.Rd index 269e8010..91177187 100644 --- a/man/netemd.plot.Rd +++ b/man/netemd.plot.Rd @@ -12,19 +12,26 @@ netemd.plot( ) } \arguments{ -\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default).} +\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} +function from the \code{pheatmap} package. The dendrogram will appear if +\code{docluster} is TRUE (default).} \item{main}{Title of the plot.} -\item{docluster}{controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn.} +\item{docluster}{controls the order of the rows and columns. If TRUE +(default) the rows and columns will be reordered to create the dendrogram. If +FALSE, then only the heatmap is drawn.} \item{netdislist}{Default output of \code{netdis_many_to_many}.} -\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting.} +\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} +to be used for plotting.} } \value{ -Provides a heat map and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heat map and dendrogram for the network comparisons via +\code{pheatmap}. } \description{ -Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +Provides a heatmap and dendrogram for the network comparisons via +\code{pheatmap}. 
} diff --git a/man/netemd_many_to_many.Rd b/man/netemd_many_to_many.Rd index 0e7e8da9..0ce1f608 100644 --- a/man/netemd_many_to_many.Rd +++ b/man/netemd_many_to_many.Rd @@ -18,16 +18,22 @@ netemd_many_to_many( ) } \arguments{ -\item{graphs}{A list of network/graph objects from the \code{igraph} package. \code{graphs} can be set to \code{NULL} (default) if \code{dhists} is provided.} +\item{graphs}{A list of network/graph objects from the \code{igraph} package. +\code{graphs} can be set to \code{NULL} (default) if \code{dhists} is +provided.} -\item{dhists}{A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}.} +\item{dhists}{A list whose elements contain either: A list of \code{dhist} +discrete histogram objects for each graph, or a list a matrix of network +features (each column representing a feature). \code{dhists} can be set to +\code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object +can be obtained from \code{graph_features_to_histograms}.} \item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +minimum if multiple local minima EMDs exist. 
You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to @@ -43,15 +49,16 @@ minimal EMDs and associated offsets for all pairs of histograms} the \code{mc.cores} option set in the R environment.} \item{feature_type}{Type of graphlet-based feature to count: "graphlet" -counts the number of graphlets each node participates in; "orbit" (default) calculates -the number of graphlet orbits each node participates in.} +counts the number of graphlets each node participates in; "orbit" (default) +calculates the number of graphlet orbits each node participates in.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. Possible values are 4, and 5 (default).} \item{ego_neighbourhood_size}{The number of steps from the source node to -include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} +include nodes for each ego-network. NetEmd was proposed for individual nodes +alone, hence the default value is 0.} } \value{ NetEMD measures between all pairs of graphs for which features @@ -60,10 +67,10 @@ parameter. If set to FALSE, a list is returned with the following named elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, \code{comp_spec}: a comparison specification table containing the graph names and indices within the input GDD list for each pair of graphs compared. 
-If \code{return_details} is set to FALSE, the list also contains the following -matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD -used to compute the NetEMD, \code{min_offsets}: the associated offsets giving -the minimal EMD for each GDD +If \code{return_details} is set to FALSE, the list also contains the +following matrices for each graph pair: \code{min_emds}: the minimal EMD for +each GDD used to compute the NetEMD, \code{min_offsets}: the associated +offsets giving the minimal EMD for each GDD } \description{ NetEMDs between all graph pairs using provided Graphlet-based Degree diff --git a/man/netemd_one_to_one.Rd b/man/netemd_one_to_one.Rd index caebd1e7..7750c68b 100644 --- a/man/netemd_one_to_one.Rd +++ b/man/netemd_one_to_one.Rd @@ -18,21 +18,30 @@ netemd_one_to_one( ) } \arguments{ -\item{graph_1}{A network/graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is provided.} +\item{graph_1}{A network/graph object from the \code{igraph} package. +\code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is +provided.} -\item{graph_2}{A network/graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is provided.} +\item{graph_2}{A network/graph object from the \code{igraph} package. +\code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is +provided.} -\item{dhists_1}{Either, a \code{dhist} discrete histogram object, or list of such objects, or a matrix of network features (each column representing a feature). \code{dhists_1} can be set to \code{NULL} (default) if \code{graph_1} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}.} +\item{dhists_1}{Either, a \code{dhist} discrete histogram object, or list of +such objects, or a matrix of network features (each column representing a +feature). 
\code{dhists_1} can be set to \code{NULL} (default) if +\code{graph_1} is provided. A \code{dhist} object can be obtained from +\code{graph_features_to_histograms}.} \item{dhists_2}{Same as \code{dhists_1}.} -\item{method}{The method to be used to find the minimum EMD across all potential -offsets for each pair of histograms. Default is "optimise" to use +\item{method}{The method to be used to find the minimum EMD across all +potential offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the -histograms at all offsets that are candidates for the minimal EMD at the cost of computational time.} +minimum if multiple local minima EMDs exist. You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the +histograms at all offsets that are candidates for the minimal EMD at the cost +of computational time.} \item{return_details}{Logical indicating whether to return the individual minimal EMDs and associated offsets for all pairs of histograms.} @@ -44,43 +53,59 @@ which results in no smoothing. Care should be taken to select a (e.g.for the integer domain a width of 1 is the natural choice).} \item{feature_type}{Type of graphlet-based feature to count: "graphlet" -counts the number of graphlets each node participates in; "orbit" (default) calculates -the number of graphlet orbits each node participates in.} +counts the number of graphlets each node participates in; "orbit" (default) +calculates the number of graphlet orbits each node participates in.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. 
Possible values are 4, and 5 (default).} \item{ego_neighbourhood_size}{The number of steps from the source node to -include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} +include nodes for each ego-network. NetEmd was proposed for individual nodes +alone, hence the default value is 0.} } \value{ -NetEMD measure for the two sets of discrete histograms (or graphs). If -(\code{return_details = FALSE}) then a list with the following named elements is returned -\code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: -the minimal EMD for each pair of histograms, \code{min_offsets}: the associated -offsets giving the minimal EMD for each pair of histograms +NetEMD measure for the two sets of discrete histograms (or graphs). +If (\code{return_details = FALSE}) then a list with the following named +elements is returned \code{net_emd}: the NetEMD for the set of histogram +pairs (or graphs), \code{min_emds}: the minimal EMD for each pair of +histograms, \code{min_offsets}: the associated offsets giving the minimal EMD +for each pair of histograms } \description{ Calculates the network Earth Mover's Distance (EMD) between -two sets of network features. This is done by individually normalising the distribution -of each feature so that they have unit mass and unit variance. Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. +two sets of network features. This is done by individually normalising the +distribution of each feature so that they have unit mass and unit variance. +Then the minimun EMD between the same pair of features (one for each +corresponding graph) is calculated by considering all possible translations +of the feature distributions. Finally the average over all features is +reported. 
This is calculated as follows: 1. Normalise each feature histogram to have unit mass and unit variance. - 2. For each feature, find the minimum EMD between each pair of histograms considering all possible histogram translations. + 2. For each feature, find the minimum EMD between each pair of histograms + considering all possible histogram translations. 3. Take the average minimum EMD across all features. } \examples{ require(igraph) graph_1 <- graph.lattice(c(8, 8)) graph_2 <- graph.lattice(c(44, 44)) -netemd_one_to_one(graph_1 = graph_1, graph_2 = graph_2, feature_type = "orbit", max_graphlet_size = 5) +netemd_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + feature_type = "orbit", + max_graphlet_size = 5 +) # Providing a matrix of network features props_a <- count_orbits_per_node(graph = graph_1, max_graphlet_size = 5) props_b <- count_orbits_per_node(graph = graph_2, max_graphlet_size = 5) -netemd_one_to_one(dhists_1 = props_a, dhists_2 = props_b, smoothing_window_width = 1) +netemd_one_to_one( + dhists_1 = props_a, + dhists_2 = props_b, + smoothing_window_width = 1 +) # Providing the network features as lists of dhist objects dhists_1 <- graph_features_to_histograms(props_a) @@ -91,17 +116,46 @@ netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2) # A variation of NetEmd: Using the Laplacian spectrum # Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = FALSE, sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = FALSE, sparse = FALSE) +Lapg_1 <- igraph::laplacian_matrix( + graph = graph_1, + normalized = FALSE, + sparse = FALSE +) +Lapg_2 <- igraph::laplacian_matrix( + graph = graph_2, + normalized = FALSE, + sparse = FALSE +) # Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = TRUE, sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = TRUE, sparse = FALSE) +NLapg_1 <- igraph::laplacian_matrix( + graph = graph_1, + normalized = 
TRUE, + sparse = FALSE +) +NLapg_2 <- igraph::laplacian_matrix( + graph = graph_2, + normalized = TRUE, + sparse = FALSE +) # Spectra (This may take a couple of minutes). -props_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) -props_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) +props_1 <- cbind( + L.Spectra = eigen(Lapg_1)$values, + NL.Spectra = eigen(NLapg_1)$values +) +props_2 <- cbind( + L.Spectra = eigen(Lapg_2)$values, + NL.Spectra = eigen(NLapg_2)$values +) -netemd_one_to_one(dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# Use of smoothing window 1 is given for discrete integer distributions. If +# the network features are considered continuous variables +# smoothing_window_width equal to zero is recommended. +netemd_one_to_one( + dhists_1 = props_1, + dhists_2 = props_2, + smoothing_window_width = 0 +) } diff --git a/man/netemd_single_pair.Rd b/man/netemd_single_pair.Rd index 5cf3b3ed..0c805821 100644 --- a/man/netemd_single_pair.Rd +++ b/man/netemd_single_pair.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_emd.R \name{netemd_single_pair} \alias{netemd_single_pair} -\title{Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms} +\title{Internal function to compute the minimum Earth Mover's Distance between +standarized and translated histograms} \usage{ netemd_single_pair( dhist1, @@ -16,8 +17,8 @@ netemd_single_pair( offsets for each pair of histograms. Default is "optimise" to use R's built-in \code{stats::optimise} method to efficiently find the offset with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. 
You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +minimum if multiple local minima EMDs exist. You can alternatively specify +the "exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to @@ -26,18 +27,23 @@ which results in no smoothing. Care should be taken to select a \code{smoothing_window_width} that is appropriate for the discrete domain (e.g.for the integer domain a width of 1 is the natural choice)} -\item{dhists_1}{A \code{dhist} discrete histogram object or a list of such objects} +\item{dhists_1}{A \code{dhist} discrete histogram object or a list of such +objects} -\item{dhists_2}{A \code{dhist} discrete histogram object or a list of such objects} +\item{dhists_2}{A \code{dhist} discrete histogram object or a list of such +objects} } \value{ A list with the following named elements -\code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated -offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. +\code{net_emd}: the NetEMD for the set of histogram pairs, +\code{min_offsets}: the associated offsets giving the minimal EMD for each +pair of histograms and \code{min_offset_std}: Offset used in the standardised +histograms. } \description{ Calculates the minimum Earth Mover's Distance (EMD) between two -discrete histograms after normalising each histogram to unit mass and variance. +discrete histograms after normalising each histogram to unit mass and +variance. This is calculated as follows: 1. Normalise each histogram to have unit mass and unit variance 2. 
Find the minimum EMD between the histograms @@ -51,5 +57,10 @@ props_2 <- count_orbits_per_node(graph = goldstd_2, max_graphlet_size = 5) dhists_1 <- graph_features_to_histograms(props_1) dhists_2 <- graph_features_to_histograms(props_2) # Obtain the minimum NetEMD_edges between the histograms -netemd_single_pair(dhists_1[[1]], dhists_2[[1]], method = "optimise", smoothing_window_width = 0) +netemd_single_pair( + dhists_1[[1]], + dhists_2[[1]], + method = "optimise", + smoothing_window_width = 0 +) } diff --git a/man/orbit_key.Rd b/man/orbit_key.Rd index 79d8afb4..e7921d92 100644 --- a/man/orbit_key.Rd +++ b/man/orbit_key.Rd @@ -7,7 +7,8 @@ orbit_key(max_graphlet_size) } \arguments{ -\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported.} +\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. +Currently only size 2 to 5 are supported.} } \value{ Metadata list with the following named fields: diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index 95416894..5a4addc9 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -50,10 +50,17 @@ following order: previous alterations) } \examples{ -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +# Set source directory for Virus protein-protein interaction edge files +# stored in the netdist package. 
+source_dir <- system.file( + file.path("extdata", "VRPINS"), + package = "netdist" +) print(source_dir) # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_1 <- read_simple_graph( + file.path(source_dir, "EBV.txt"), + format = "ncol" +) graph_1 } From 681106cb34c81130038f3a5831e24e840fa8936e Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Tue, 7 Jun 2022 12:30:16 +0100 Subject: [PATCH 65/84] styler vignettes --- .github/workflows/quality.yaml | 2 +- vignettes/ManyToMany.Rmd | 84 +++++++------- vignettes/NetEmdTimeOrdering.Rmd | 54 ++++----- vignettes/NetdisGPStepByStep.Rmd | 126 ++++++++++++--------- vignettes/NetdisStepByStep.Rmd | 128 +++++++++++++--------- vignettes/NewNetdisCustomisations.Rmd | 128 ++++++++++++---------- vignettes/PreComputedProps.Rmd | 90 +++++++-------- vignettes/default_pairwise_usage.Rmd | 79 ++++++------- vignettes/dendrogram_example_net_dis.Rmd | 95 ++++++++-------- vignettes/dendrogram_example_net_emd.Rmd | 65 +++++------ vignettes/netdis_customisations.Rmd | 74 +++++++------ vignettes/netdis_pairwise_comparisons.Rmd | 47 ++++---- vignettes/quickstart_netdis_2graphs.Rmd | 108 ++++++++++-------- 13 files changed, 585 insertions(+), 495 deletions(-) diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 7a53a048..8a29ff60 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -28,7 +28,7 @@ jobs: any::styler - name: Styler - run: styler::style_pkg(dry='fail') + run: styler::style_pkg(filetype = c('R', 'Rprofile', 'Rmd'), dry='fail') shell: Rscript {0} - name: Lintr diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd index 722a0c0b..89dc2ea5 100644 --- a/vignettes/ManyToMany.Rmd +++ b/vignettes/ManyToMany.Rmd @@ -12,10 +12,10 @@ editor_options: --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -46,20 +46,20 
@@ Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 ```{r, netwokrs,fig.align='center',fig.dim=c(8,4)} # Create networks set.seed(3171) -gLat_1 <- igraph::graph.lattice(c(20,20)) -gLat_2 <- igraph::graph.lattice(c(40,40)) -gRing_1 <- igraph::make_ring(20^2) +gLat_1 <- igraph::graph.lattice(c(20, 20)) +gLat_2 <- igraph::graph.lattice(c(40, 40)) +gRing_1 <- igraph::make_ring(20^2) gRing_2 <- igraph::make_ring(40^2) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gTree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gTree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -par(mfrow=c(1,2)) -plot(gLat_1,vertex.size=0.8,vertex.label=NA) -plot(gLat_2,vertex.size=0.8,vertex.label=NA) -plot(gRing_1,vertex.size=0.8,vertex.label=NA) -plot(gRing_2,vertex.size=0.8,vertex.label=NA) -plot(gTree_1,vertex.size=0.8,vertex.label=NA) -plot(gTree_2,vertex.size=0.8,vertex.label=NA) +par(mfrow = c(1, 2)) +plot(gLat_1, vertex.size = 0.8, vertex.label = NA) +plot(gLat_2, vertex.size = 0.8, vertex.label = NA) +plot(gRing_1, vertex.size = 0.8, vertex.label = NA) +plot(gRing_2, vertex.size = 0.8, vertex.label = NA) +plot(gTree_1, vertex.size = 0.8, vertex.label = NA) +plot(gTree_2, vertex.size = 0.8, vertex.label = NA) ``` ## NetEmd using subgraph counts @@ -67,9 +67,9 @@ plot(gTree_2,vertex.size=0.8,vertex.label=NA) Subgraph count based NetEmd comparisons: ```{r, netemdorbits,fig.align='center'} # NetEMD using subgraph counts. -glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) +glist <- list(Lat_1 = gLat_1, Lat_2 = gLat_2, Ring_1 = gRing_1, Ring_2 = gRing_1, Tree_1 = gTree_1, Tree_2 = gTree_2) -netemdlist <- netemd_many_to_many(graphs = glist,smoothing_window_width = 1,mc.cores = 1) #Use of smoothing window 1 is given for discrete integer distributions. 
If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +netemdlist <- netemd_many_to_many(graphs = glist, smoothing_window_width = 1, mc.cores = 1) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. netemdlist ``` @@ -83,7 +83,7 @@ mat Illustration of the multiple NetEmd comparisons based on subgraph counts. ```{r,netemdorbitsPLOT,fig.align='center',fig.dim=c(8,8)} -netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd subgraph counts") +netemd.plot(netemdlist = netemdlist, clustering_method = "ward.D", main = "NetEmd subgraph counts") ``` ## NetEmd using the Laplacian and Normalized Laplacian Spectrum @@ -91,20 +91,20 @@ netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd subgra Pre-compute the Laplacian and normalized Laplacian for each graph considered: ```{r, netemdspectrum} # NetEMD using the Laplacian and normalized Laplacian Spectrum. -SPECT<-list() +SPECT <- list() -#This step may take several minutes. -for(i in 1:length(glist)){ - Lapg <- igraph::laplacian_matrix(graph = glist[[i]],normalized = FALSE,sparse = FALSE) - NLap <- igraph::laplacian_matrix(graph = glist[[i]],normalized = TRUE,sparse = FALSE) - SPECT[[ names(glist)[i] ]] <- cbind(L.Spectra= eigen(Lapg)$values, NL.Spectra= eigen(NLap)$values) +# This step may take several minutes. 
+for (i in 1:length(glist)) { + Lapg <- igraph::laplacian_matrix(graph = glist[[i]], normalized = FALSE, sparse = FALSE) + NLap <- igraph::laplacian_matrix(graph = glist[[i]], normalized = TRUE, sparse = FALSE) + SPECT[[names(glist)[i]]] <- cbind(L.Spectra = eigen(Lapg)$values, NL.Spectra = eigen(NLap)$values) } str(SPECT) ``` Compute NetEmd: ```{r} -netemdlist <- netemd_many_to_many(dhists = SPECT,smoothing_window_width = 0) #Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +netemdlist <- netemd_many_to_many(dhists = SPECT, smoothing_window_width = 0) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. netemdlist ``` @@ -112,7 +112,7 @@ netemdlist ### Illustration of the multiple NetEmd comparisons based on the Laplacian and Normalized Laplacian spectra ```{r,netemdspectrumPLOT ,fig.align='center',fig.dim=c(8,8)} -netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd Spectra") +netemd.plot(netemdlist = netemdlist, clustering_method = "ward.D", main = "NetEmd Spectra") ``` @@ -135,26 +135,26 @@ Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 ```{r, netdisgoldstandnetworks,fig.align='center',fig.dim=c(8,4)} # Create networks set.seed(3171) -gLat_1 <- igraph::graph.lattice(c(20,20)) -gLat_2 <- igraph::graph.lattice(c(40,40)) -gRing_1 <- igraph::make_ring(20^2) +gLat_1 <- igraph::graph.lattice(c(20, 20)) +gLat_2 <- igraph::graph.lattice(c(40, 40)) +gRing_1 <- igraph::make_ring(20^2) gRing_2 <- igraph::make_ring(40^2) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gTree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gTree_2 <- 
igraph::as.undirected(make_tree(n = 40^2, children = 3)) # Create a random graph to be used as a gold-standard -gst_1 <- igraph::as.undirected( graph.star(20^2) ) -gst_2 <- igraph::as.undirected( graph.star(40^2) ) +gst_1 <- igraph::as.undirected(graph.star(20^2)) +gst_2 <- igraph::as.undirected(graph.star(40^2)) -par(mfrow=c(1,2)) -plot(gst_1,vertex.size=0.8,vertex.label=NA) -plot(gst_2,vertex.size=0.8,vertex.label=NA) +par(mfrow = c(1, 2)) +plot(gst_1, vertex.size = 0.8, vertex.label = NA) +plot(gst_2, vertex.size = 0.8, vertex.label = NA) ``` Obtain the comparison via Netdis using each of the reference graph networks. ```{r,netdisgoldstand ,fig.align='center'} -glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) +glist <- list(Lat_1 = gLat_1, Lat_2 = gLat_2, Ring_1 = gRing_1, Ring_2 = gRing_1, Tree_1 = gTree_1, Tree_2 = gTree_2) # Netdis using the goldstd_1 graph as gold-standard reference point netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = gst_1) @@ -177,11 +177,11 @@ cross_comp_to_matrix(measure = netdis_mat_gst2$netdis, cross_comparison_spec = n Heatmap of the Netdis comparisons: ```{r,netdisgoldstandPLOT ,fig.align='center',fig.dim=c(8,8)} -#Network comparisons heatmap with Gold-Standard 1 -netdis.plot(netdislist = netdis_mat_gst1, whatrow = 2,main = "Netdis GoldStd-1") +# Network comparisons heatmap with Gold-Standard 1 +netdis.plot(netdislist = netdis_mat_gst1, whatrow = 2, main = "Netdis GoldStd-1") -#Network comparisons heatmap with Gold-Standard 2 -netdis.plot(netdislist = netdis_mat_gst2, whatrow = 2,main = "Netdis GoldStd-2") +# Network comparisons heatmap with Gold-Standard 2 +netdis.plot(netdislist = netdis_mat_gst2, whatrow = 2, main = "Netdis GoldStd-2") ``` @@ -196,7 +196,7 @@ netdis_mat ``` ```{r,netdisGPPLOT ,fig.align='center',fig.dim=c(8,8)} -netdis.plot(netdislist = netdis_mat, whatrow = 2,main = "Netdis-GP") +netdis.plot(netdislist = netdis_mat, whatrow = 2, main = 
"Netdis-GP") ``` ### Using Netdis with no expectation ($E_w=0$) @@ -210,7 +210,7 @@ netdis_mat ``` ```{r,netdiszeroPLOT ,fig.align='center',fig.dim=c(8,8)} -netdis.plot(netdislist = netdis_mat, whatrow = 2,main = "Netdis-zero") +netdis.plot(netdislist = netdis_mat, whatrow = 2, main = "Netdis-zero") ``` ------------------------- diff --git a/vignettes/NetEmdTimeOrdering.Rmd b/vignettes/NetEmdTimeOrdering.Rmd index 01c91197..2a199a41 100644 --- a/vignettes/NetEmdTimeOrdering.Rmd +++ b/vignettes/NetEmdTimeOrdering.Rmd @@ -28,17 +28,17 @@ For other vignettes in this package see the ["Menu"](V-Menu.html). The package contains the world trade networks and pre-computed subgraph/graphlet counts in the R data object ` worldtradesub`. This object contains a list of two lists. The first list is `worldtradesub$wtnets` which contains a small sample of the trade networks (2001-2014) and the second `worldtradesub$Counts` which contains pre-computed counts for a larger set of trade networks going from 1985 to 2014. ```{r, message=FALSE} - library("netdist") - library("igraph") - data(worldtradesub) - summary(worldtradesub) - wtnets<- worldtradesub$wtnets - summary(wtnets) +library("netdist") +library("igraph") +data(worldtradesub) +summary(worldtradesub) +wtnets <- worldtradesub$wtnets +summary(wtnets) ``` These world trade networks are denser than typically sparse social networks. For example, the edge density for the network in 2001 is `r igraph::graph.density(worldtradesub$wtnets$wtn2001)`. Here is a plot of this network highlighting the relatively large number of edges: ```{r,fig.align='center',fig.dim=c(5,5)} - plot(wtnets$wtn2001,vertex.size=5,vertex.label.cex=0.4) +plot(wtnets$wtn2001, vertex.size = 5, vertex.label.cex = 0.4) ``` @@ -47,11 +47,11 @@ These world trade networks are denser than typically sparse social networks. For In this example **NetEmd** will consider orbit counts of subgraphs containing up to 5 nodes. 
If NetEmd is to be called a single time, then the command `netemd_many_to_many(graphs = wtnets)` would suffice. The following code provides such an example: ```{r} - # As the trade networks are considerable dense, this example first considers a small number of networks. - #This example may take some minutes to run. - netemd_result <- netemd_many_to_many(graphs = wtnets[1:4],mc.cores = 1) +# As the trade networks are considerable dense, this example first considers a small number of networks. +# This example may take some minutes to run. +netemd_result <- netemd_many_to_many(graphs = wtnets[1:4], mc.cores = 1) - print(netemd_result) +print(netemd_result) ``` However, if there are pre-computed counts or features NetEmd can be called via these features instead. @@ -61,26 +61,26 @@ World trade networks consist of relatively dense networks, thus leading to longe ```{r} # This example may take more than a few minutes to run (approx. 20 mins) , and it is not necessary to run it for the upcoming examples as a larger set of counts has been already computed. -if(FALSE){# It is not necessary to run, as these counts are already available in. - Counts <- list() - for(i in 1:length(wtnets)){ - Counts[[ names(wtnets)[i] ]] <- count_orbits_per_node(graph = wtnets[[i]],max_graphlet_size = 5) - } +if (FALSE) { # It is not necessary to run, as these counts are already available in. + Counts <- list() + for (i in 1:length(wtnets)) { + Counts[[names(wtnets)[i]]] <- count_orbits_per_node(graph = wtnets[[i]], max_graphlet_size = 5) + } } ``` Now, with pre-computed counts NetEmd can be calculated more rapidly as the computations of the counts are often the bottle neck in the computational time of NetEmd. 
NetEmd will be called `r length(worldtradesub$Counts) * (length(worldtradesub$Counts) - 1)/2 ` times in order to obtain all pairwise comparisons between the trade networks from 1985 to 2014 (networks with pre-computed subgraph counts): ```{r} - # The pre-computed counts already in the package - Counts <- worldtradesub$Counts - - #Calling NetEmd - netemd_result <- netemd_many_to_many(dhists = Counts ,mc.cores = 1) - - #Results - netemd_matrix <- cross_comp_to_matrix(measure = netemd_result$netemds, cross_comparison_spec = netemd_result$comp_spec) - - print(netemd_matrix[1:10,1:5]) +# The pre-computed counts already in the package +Counts <- worldtradesub$Counts + +# Calling NetEmd +netemd_result <- netemd_many_to_many(dhists = Counts, mc.cores = 1) + +# Results +netemd_matrix <- cross_comp_to_matrix(measure = netemd_result$netemds, cross_comparison_spec = netemd_result$comp_spec) + +print(netemd_matrix[1:10, 1:5]) ``` # Evidence of change in world trade @@ -88,7 +88,7 @@ Now, with pre-computed counts NetEmd can be calculated more rapidly as the compu Based on the comparison of the world trade networks across the years, we can identify periods of time where possible considerable changes in world trade have occurred. The following heat map clearly shows the existence of two potential changes in the world trade system, and which correspond to 1995-1996 and 2010-2011. 
```{r,fig.align='center',fig.dim=c(8.5,8.5)} - netemd.plot(netemdlist=netemd_result,clustering_method="ward.D",main="NetEmd",docluster = FALSE) +netemd.plot(netemdlist = netemd_result, clustering_method = "ward.D", main = "NetEmd", docluster = FALSE) ``` The World Trade Organization (WTO) said the following about these years: diff --git a/vignettes/NetdisGPStepByStep.Rmd b/vignettes/NetdisGPStepByStep.Rmd index f7ee738f..988efe97 100644 --- a/vignettes/NetdisGPStepByStep.Rmd +++ b/vignettes/NetdisGPStepByStep.Rmd @@ -13,10 +13,10 @@ chunk_output_type: console --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -92,11 +92,11 @@ Generation of tree-like networks with 400 nodes and 1600 nodes. ```{r, netwokrs,fig.align='center',fig.dim=c(6,6)} # Create networks set.seed(34) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gTree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gTree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -plot(gTree_1,vertex.size=0.8,vertex.label=NA) -plot(gTree_2,vertex.size=0.8,vertex.label=NA) +plot(gTree_1, vertex.size = 0.8, vertex.label = NA) +plot(gTree_2, vertex.size = 0.8, vertex.label = NA) ``` @@ -126,17 +126,19 @@ num_bins <- 100 One of the first steps in Netdis is the extraction of all ego-networks in each of the query networks: ```{r} # Get ego-networks for query graphs -ego_1 <- make_named_ego_graph(gTree_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(gTree_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) -head(ego_1,n=2) -head(ego_2,n=2) +ego_1 <- make_named_ego_graph(gTree_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +ego_2 <- 
make_named_ego_graph(gTree_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) +head(ego_1, n = 2) +head(ego_2, n = 2) ``` ## Count the number of nodes and the subgraphs in the ego-networks of each graph ($N_w$) @@ -159,13 +161,17 @@ densities_1 <- ego_network_density(graphlet_counts = subgraph_counts_1) densities_2 <- ego_network_density(graphlet_counts = subgraph_counts_2) # Adaptively bin ego-network densities -binned_densities_1 <- binned_densities_adaptive(densities = densities_1, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - -binned_densities_2 <- binned_densities_adaptive(densities = densities_2, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) +binned_densities_1 <- binned_densities_adaptive( + densities = densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) + +binned_densities_2 <- binned_densities_adaptive( + densities = densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) str(binned_densities_1) str(binned_densities_2) @@ -174,13 +180,17 @@ str(binned_densities_2) ## Calculate expected subgraph counts in each density bin by the Geometric-Poisson approximation ($E_w$) With the ego-network binning obtained, the Geometric-Poisson approximation of the expected subgraph counts, $E_w$, can be obtained for each subgraph $w$ and each density bin: ```{r} -binned_gp_subgraph_counts_1 <- density_binned_counts_gp(graphlet_counts = subgraph_counts_1, - density_interval_indexes = binned_densities_1$interval_indexes, - max_graphlet_size = max_subgraph_size) - -binned_gp_subgraph_counts_2 <- density_binned_counts_gp(graphlet_counts = subgraph_counts_2, - density_interval_indexes = binned_densities_2$interval_indexes, - max_graphlet_size = max_subgraph_size) +binned_gp_subgraph_counts_1 <- density_binned_counts_gp( + graphlet_counts = subgraph_counts_1, + density_interval_indexes = 
binned_densities_1$interval_indexes, + max_graphlet_size = max_subgraph_size +) + +binned_gp_subgraph_counts_2 <- density_binned_counts_gp( + graphlet_counts = subgraph_counts_2, + density_interval_indexes = binned_densities_2$interval_indexes, + max_graphlet_size = max_subgraph_size +) binned_gp_subgraph_counts_1 binned_gp_subgraph_counts_2 ``` @@ -189,27 +199,35 @@ binned_gp_subgraph_counts_2 With $E_w$ now obtained, Netdis-GP, can be compute as per its construction by first centring the observed counts: ```{r} # Calculate expected subgraph counts for each ego network -exp_gp_subgraph_counts_1 <- netdis_expected_counts(graphlet_counts = subgraph_counts_1, - density_breaks = binned_densities_1$breaks, - density_binned_reference_counts = binned_gp_subgraph_counts_1, - max_graphlet_size = max_subgraph_size, - scale_fn=NULL) - - -exp_gp_subgraph_counts_2 <- netdis_expected_counts(graphlet_counts = subgraph_counts_2, - density_breaks = binned_densities_2$breaks, - density_binned_reference_counts = binned_gp_subgraph_counts_2, - max_graphlet_size = max_subgraph_size, - scale_fn=NULL) +exp_gp_subgraph_counts_1 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_1, + density_breaks = binned_densities_1$breaks, + density_binned_reference_counts = binned_gp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size, + scale_fn = NULL +) + + +exp_gp_subgraph_counts_2 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_2, + density_breaks = binned_densities_2$breaks, + density_binned_reference_counts = binned_gp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size, + scale_fn = NULL +) # Centre subgraph counts by subtracting expected counts -centred_subgraph_counts_1 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_1, - exp_graphlet_counts = exp_gp_subgraph_counts_1, - max_graphlet_size = max_subgraph_size) - -centred_subgraph_counts_2 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_2, - exp_graphlet_counts = 
exp_gp_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +centred_subgraph_counts_1 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_1, + exp_graphlet_counts = exp_gp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size +) + +centred_subgraph_counts_2 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_2, + exp_graphlet_counts = exp_gp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) head(centred_subgraph_counts_1) head(centred_subgraph_counts_2) @@ -229,9 +247,11 @@ sum_subgraph_counts_2 Finally, the total centred counts can be used to obtain the Netdis statistic: ```{r} -netdis_result <- netdis_uptok(centred_graphlet_count_vector_1 = sum_subgraph_counts_1, - centred_graphlet_count_vector_2 = sum_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +netdis_result <- netdis_uptok( + centred_graphlet_count_vector_1 = sum_subgraph_counts_1, + centred_graphlet_count_vector_2 = sum_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) print(netdis_result) ``` diff --git a/vignettes/NetdisStepByStep.Rmd b/vignettes/NetdisStepByStep.Rmd index 9b69845e..7162f142 100644 --- a/vignettes/NetdisStepByStep.Rmd +++ b/vignettes/NetdisStepByStep.Rmd @@ -13,10 +13,10 @@ chunk_output_type: console --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -80,11 +80,11 @@ Generation of tree-like networks with 400 nodes and 1600 nodes. 
```{r, netwokrs,fig.align='center',fig.dim=c(6,6)} # Create networks set.seed(34) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gTree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gTree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -plot(gTree_1,vertex.size=0.8,vertex.label=NA) -plot(gTree_2,vertex.size=0.8,vertex.label=NA) +plot(gTree_1, vertex.size = 0.8, vertex.label = NA) +plot(gTree_2, vertex.size = 0.8, vertex.label = NA) ``` @@ -115,17 +115,19 @@ num_bins <- 100 One of the first steps in Netdis is the extraction of all ego-networks in each of the query networks: ```{r} # Get ego-networks for query graphs -ego_1 <- make_named_ego_graph(gTree_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(gTree_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) -tail(ego_1,n=2) -tail(ego_2,n=2) +ego_1 <- make_named_ego_graph(gTree_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +ego_2 <- make_named_ego_graph(gTree_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) +tail(ego_1, n = 2) +tail(ego_2, n = 2) ``` ## Count the number of nodes and the subgraphs in ego-networks of query graphs ($N_w$) @@ -158,8 +160,8 @@ For this case the user must provide the gold-standard network of their choosing. The following considers a tree-like network with `r 30^2` nodes as the gold-standard. 
```{r,fig.align='center',fig.dim=c(6,6)} # Network used as gold-standard -gst_1 <- erdos.renyi.game(n = 30^2,p.or.m = graph.density(graph = gTree_2)) -plot(gst_1,vertex.size=0.8,vertex.label=NA) +gst_1 <- erdos.renyi.game(n = 30^2, p.or.m = graph.density(graph = gTree_2)) +plot(gst_1, vertex.size = 0.8, vertex.label = NA) ``` ### Obtain the gold-standard ego-network counts and their binning according to their edge-density ($\rho(.)$) @@ -167,25 +169,31 @@ plot(gst_1,vertex.size=0.8,vertex.label=NA) To calculate the expected counts, $E_w$, the counts of the ego-networks of the gold-standard network need to be obtained first: ```{r} # Obtain subgraph counts and binning for gold-standard -ego_gst_1 <- make_named_ego_graph(graph = gst_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -subgraph_counts_gst_1 <- ego_to_graphlet_counts(ego_networks = ego_gst_1, - max_graphlet_size = max_subgraph_size) +ego_gst_1 <- make_named_ego_graph( + graph = gst_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +subgraph_counts_gst_1 <- ego_to_graphlet_counts( + ego_networks = ego_gst_1, + max_graphlet_size = max_subgraph_size +) head(subgraph_counts_gst_1) ``` Subsequently, these ego-networks are binned according to their edge density: ```{r} -densities_gst_1<- ego_network_density(graphlet_counts = subgraph_counts_gst_1) +densities_gst_1 <- ego_network_density(graphlet_counts = subgraph_counts_gst_1) # Adaptively bin ego-network densities -binned_densities_gst_1 <- binned_densities_adaptive(densities = densities_gst_1, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) +binned_densities_gst_1 <- binned_densities_adaptive( + densities = densities_gst_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) str(binned_densities_gst_1) ``` @@ -197,8 +205,10 @@ $E_w$ is estimated based on the average subgraph counts of ego-networks per dens ```{r} 
# Scale ego-network subgraph counts by dividing by the total number of k-tuples in the # ego-network (where k is the subgraph size) -scaled_subgraph_counts_ref <- scale_graphlet_counts_ego(graphlet_counts = subgraph_counts_gst_1, - max_graphlet_size =max_subgraph_size) +scaled_subgraph_counts_ref <- scale_graphlet_counts_ego( + graphlet_counts = subgraph_counts_gst_1, + max_graphlet_size = max_subgraph_size +) str(scaled_subgraph_counts_ref) ``` @@ -215,27 +225,35 @@ ref_binned_canonical_subgraph_counts After obtaining the average scaled subgraph counts per density bin, the subgraph counts of the query networks can be centred: ```{r} # Scale the reference counts of the gold-standard network to the sizes of each of the query ego-networks. -exp_subgraph_counts_1 <- netdis_expected_counts(graphlet_counts = subgraph_counts_1, - density_breaks = binned_densities_gst_1$breaks, - density_binned_reference_counts = ref_binned_canonical_subgraph_counts, - max_graphlet_size = max_subgraph_size, - scale_fn=count_graphlet_tuples) - - -exp_subgraph_counts_2 <- netdis_expected_counts(graphlet_counts = subgraph_counts_2, - density_breaks = binned_densities_gst_1$breaks, - density_binned_reference_counts = ref_binned_canonical_subgraph_counts, - max_graphlet_size = max_subgraph_size, - scale_fn=count_graphlet_tuples) +exp_subgraph_counts_1 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_1, + density_breaks = binned_densities_gst_1$breaks, + density_binned_reference_counts = ref_binned_canonical_subgraph_counts, + max_graphlet_size = max_subgraph_size, + scale_fn = count_graphlet_tuples +) + + +exp_subgraph_counts_2 <- netdis_expected_counts( + graphlet_counts = subgraph_counts_2, + density_breaks = binned_densities_gst_1$breaks, + density_binned_reference_counts = ref_binned_canonical_subgraph_counts, + max_graphlet_size = max_subgraph_size, + scale_fn = count_graphlet_tuples +) # Centre subgraph counts by subtracting expected counts -centred_subgraph_counts_1 <- 
netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_1, - exp_graphlet_counts = exp_subgraph_counts_1, - max_graphlet_size = max_subgraph_size) - -centred_subgraph_counts_2 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_2, - exp_graphlet_counts = exp_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +centred_subgraph_counts_1 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_1, + exp_graphlet_counts = exp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size +) + +centred_subgraph_counts_2 <- netdis_subtract_exp_counts( + graphlet_counts = subgraph_counts_2, + exp_graphlet_counts = exp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) tail(centred_subgraph_counts_1) tail(centred_subgraph_counts_2) @@ -256,9 +274,11 @@ sum_subgraph_counts_2 Finally, the total centred counts can be used to obtain the Netdis statistic: ```{r} -netdis_result <- netdis_uptok(centred_graphlet_count_vector_1 = sum_subgraph_counts_1, - centred_graphlet_count_vector_2 = sum_subgraph_counts_2, - max_graphlet_size = max_subgraph_size) +netdis_result <- netdis_uptok( + centred_graphlet_count_vector_1 = sum_subgraph_counts_1, + centred_graphlet_count_vector_2 = sum_subgraph_counts_2, + max_graphlet_size = max_subgraph_size +) print(netdis_result) ``` diff --git a/vignettes/NewNetdisCustomisations.Rmd b/vignettes/NewNetdisCustomisations.Rmd index ba6c49b6..e8cbe645 100644 --- a/vignettes/NewNetdisCustomisations.Rmd +++ b/vignettes/NewNetdisCustomisations.Rmd @@ -12,10 +12,10 @@ chunk_output_type: console --- ```{r, include = FALSE} - knitr::opts_chunk$set( +knitr::opts_chunk$set( collapse = TRUE, comment = "#>" - ) +) ``` # Introduction @@ -43,14 +43,14 @@ Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 # Create lattice, Ring and Tree like networks of sizes 20^2 and 40^2. 
# Create networks set.seed(3171) -gLat_1 <- graph.lattice(c(20,20)) -gLat_2 <- graph.lattice(c(40,40)) -gRing_1 <- make_ring(20^2) +gLat_1 <- graph.lattice(c(20, 20)) +gLat_2 <- graph.lattice(c(40, 40)) +gRing_1 <- make_ring(20^2) gRing_2 <- make_ring(40^2) -gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) -gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) +gTree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) +gTree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) +glist <- list(Lat_1 = gLat_1, Lat_2 = gLat_2, Ring_1 = gRing_1, Ring_2 = gRing_1, Tree_1 = gTree_1, Tree_2 = gTree_2) # Create a random graph to be used as a gold-standard gst <- igraph::as.undirected(graph.star(1000)) @@ -61,34 +61,38 @@ gst <- igraph::as.undirected(graph.star(1000)) ## Using Netdis with a reference graph as a proxy for $E_w$ For this variant a reference graph or gold-standard graph has to be given in `ref_graph`: ```{r,netdisgoldstand,fig.align='center',fig.dim=c(7,7)} -netdis_mat_gst <- netdis_many_to_many(graphs = glist, - ref_graph = gst - ) -netdis.plot(netdislist = netdis_mat_gst,whatrow = 2, main = "Netdis with reference graph") +netdis_mat_gst <- netdis_many_to_many( + graphs = glist, + ref_graph = gst +) +netdis.plot(netdislist = netdis_mat_gst, whatrow = 2, main = "Netdis with reference graph") ``` ## Using Netdis with a constant valued expectation, $E_w=k$ For this variant please set `ref_graph` to the desired constant $k$. In this example we consider $k=0$ and $k=5$. 
Considering $k=0$ is equivalent to computing Netdis without background expectations: ```{r,netdisconstant,fig.align='center',fig.dim=c(7,7)} -netdis_mat_zero <- netdis_many_to_many(graphs = glist, - ref_graph = 0 - ) -netdis.plot(netdislist = netdis_mat_zero,whatrow = 2, main = "Netdis Ew=0") - -netdis_mat_5 <- netdis_many_to_many(graphs = glist, - ref_graph = 5 - ) -netdis.plot(netdislist = netdis_mat_5,whatrow = 2, main = "Netdis Ew=5") +netdis_mat_zero <- netdis_many_to_many( + graphs = glist, + ref_graph = 0 +) +netdis.plot(netdislist = netdis_mat_zero, whatrow = 2, main = "Netdis Ew=0") + +netdis_mat_5 <- netdis_many_to_many( + graphs = glist, + ref_graph = 5 +) +netdis.plot(netdislist = netdis_mat_5, whatrow = 2, main = "Netdis Ew=5") ``` ## Using Netdis-GP, Geometric-Poisson approximation for $E_w$ In order to obtain the Netdis-GP variant set `ref_graph=NULL` (default). ```{r,fig.align='center',fig.dim=c(7,7)} -netdisgp_mat <- netdis_many_to_many(graphs = glist, - ref_graph = NULL - ) +netdisgp_mat <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL +) netdis.plot(netdisgp_mat, whatrow = 2, main = "Netdis-GP") ``` @@ -103,7 +107,7 @@ mybinning <- function(densities) { min_counts_per_interval <- 5 num_intervals <- 3 # - if( length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + if (length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals @@ -118,7 +122,7 @@ mybinning <- function(densities) { breaks = breaks ) } - + # Let us see an example output of the binning function binning_example <- mybinning(runif(20)) binning_example$breaks @@ -126,12 +130,13 @@ binning_example$interval_indexes binning_example$densities # Calculate Netdis -netdisgp_mat_mybin <- netdis_many_to_many(graphs = glist, - ref_graph = NULL, - binning_fn = mybinning - ) +netdisgp_mat_mybin <- netdis_many_to_many( + graphs 
= glist, + ref_graph = NULL, + binning_fn = mybinning +) -netdis.plot(netdislist = netdisgp_mat_mybin,whatrow = 2, main = "Netdis-GP with mybinning") +netdis.plot(netdislist = netdisgp_mat_mybin, whatrow = 2, main = "Netdis-GP with mybinning") ``` Note that whenever $E_w$ is taken as a constant value, then the binning will not have an effect on the computation of Netdis. @@ -163,42 +168,45 @@ These values can be directly imputed and changed into the shortcut Netdis functi ```{r,fig.align='center',fig.dim=c(7,7)} # (We only recommend changing these default values for those users that have a clear understanding of graph theory behind it) -#(change values with care as not all combinations may be possible). +# (change values with care as not all combinations may be possible). -#Defining a new binning function: +# Defining a new binning function: binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 10, #10-egos required per bin - num_intervals = 20) #Start binning with 20 bins - -#Changing parameter values in Netdis: -netdisgp_mat_custom <- netdis_many_to_many(graphs = glist, - ref_graph = NULL, - max_graphlet_size = 4, #Subgraphs/graphlets up to size 4 considered. - neighbourhood_size = 3,# 3-step ego-networks - min_ego_nodes = 5, #ego-networks with at least five nodes - min_ego_edges = 4, #ego-networks with at least 4 edges - binning_fn = binning_fn #Providing a custom binning function - ) + min_counts_per_interval = 10, # 10-egos required per bin + num_intervals = 20 +) # Start binning with 20 bins + +# Changing parameter values in Netdis: +netdisgp_mat_custom <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL, + max_graphlet_size = 4, # Subgraphs/graphlets up to size 4 considered. 
+ neighbourhood_size = 3, # 3-step ego-networks + min_ego_nodes = 5, # ego-networks with at least five nodes + min_ego_edges = 4, # ego-networks with at least 4 edges + binning_fn = binning_fn # Providing a custom binning function +) ``` Here the default parameters are used, and a heatmap of the result of Netdis with default parameters and Netdis with the previously modified parameters is given: ```{r ,fig.align='center',fig.dim=c(7,7)} -#Default binning +# Default binning binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) -#Default computation of Netdis -netdisgp_mat <- netdis_many_to_many(graphs = glist, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn - ) -netdis.plot(netdislist = netdisgp_mat,whatrow = 2, main = "Netdis-GP: Default parameter values") -netdis.plot(netdislist = netdisgp_mat_custom,whatrow = 2, main = "Netdis-GP: illustrative parameter changes") - + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) +# Default computation of Netdis +netdisgp_mat <- netdis_many_to_many( + graphs = glist, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn +) +netdis.plot(netdislist = netdisgp_mat, whatrow = 2, main = "Netdis-GP: Default parameter values") +netdis.plot(netdislist = netdisgp_mat_custom, whatrow = 2, main = "Netdis-GP: illustrative parameter changes") ``` diff --git a/vignettes/PreComputedProps.Rmd b/vignettes/PreComputedProps.Rmd index 815f7e85..564c9b1c 100644 --- a/vignettes/PreComputedProps.Rmd +++ b/vignettes/PreComputedProps.Rmd @@ -41,35 +41,35 @@ NetEmd and Netdis use subgraph counts, however, NetEmd takes counts directly fro ```{r, 
netemd,fig.align='center',fig.dim=c(8,4)} # Create lattice networks -gLat_1 <- igraph::graph.lattice(c(20,20)) -gLat_2 <- igraph::graph.lattice(c(44,44)) +gLat_1 <- igraph::graph.lattice(c(20, 20)) +gLat_2 <- igraph::graph.lattice(c(44, 44)) -par(mfrow=c(1,2)) -plot(gLat_1,vertex.size=4,vertex.label=NA) -plot(gLat_2,vertex.size=4,vertex.label=NA) +par(mfrow = c(1, 2)) +plot(gLat_1, vertex.size = 4, vertex.label = NA) +plot(gLat_2, vertex.size = 4, vertex.label = NA) ``` The simple computation of NetEmd without pre-computed features: ```{r} -netemd_one_to_one(graph_1=gLat_1,graph_2=gLat_2,feature_type="orbit",smoothing_window_width = 1) +netemd_one_to_one(graph_1 = gLat_1, graph_2 = gLat_2, feature_type = "orbit", smoothing_window_width = 1) ``` ### Providing a matrix of network features ```{r} -counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) -counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) -head(counts_1[,1:4]) +counts_1 <- count_orbits_per_node(graph = gLat_1, max_graphlet_size = 5) +counts_2 <- count_orbits_per_node(graph = gLat_2, max_graphlet_size = 5) +head(counts_1[, 1:4]) -netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2,smoothing_window_width = 1) +netemd_one_to_one(dhists_1 = counts_1, dhists_2 = counts_2, smoothing_window_width = 1) ``` ### Providing the network features as lists of dhist objects ```{r} -dhists_1<- graph_features_to_histograms(features_matrix = counts_1) -dhists_2<- graph_features_to_histograms(features_matrix = counts_2) +dhists_1 <- graph_features_to_histograms(features_matrix = counts_1) +dhists_2 <- graph_features_to_histograms(features_matrix = counts_2) -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2,smoothing_window_width = 1) +netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 1) ``` ___ @@ -78,35 +78,35 @@ ___ Computation of the Laplacian and Normalized Laplacian: ```{r, netemdEigen} # Networks -gLat_1 <- graph.lattice(c(20,20)) -gLat_2 <- 
graph.lattice(c(44,44)) +gLat_1 <- graph.lattice(c(20, 20)) +gLat_2 <- graph.lattice(c(44, 44)) -#Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = FALSE,sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = FALSE,sparse = FALSE) +# Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1, normalized = FALSE, sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2, normalized = FALSE, sparse = FALSE) -#Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = TRUE,sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = TRUE,sparse = FALSE) +# Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1, normalized = TRUE, sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2, normalized = TRUE, sparse = FALSE) # Providing a matrix of network features (e.g. Spectra). (This may take a couple of minutes). -spec_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -spec_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) +spec_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) +spec_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) head(spec_1) ``` Similarly to counts, all other features can be given as a matrix or as dhist objects: ```{r} -netemd_one_to_one(dhists_1 = spec_1,dhists_2 = spec_2, smoothing_window_width = 0) +netemd_one_to_one(dhists_1 = spec_1, dhists_2 = spec_2, smoothing_window_width = 0) # Providing pre-computed dhist objects from network features -dhists_1<- graph_features_to_histograms(spec_1) -dhists_2<- graph_features_to_histograms(spec_2) +dhists_1 <- graph_features_to_histograms(spec_1) +dhists_2 <- graph_features_to_histograms(spec_2) -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2, smoothing_window_width = 0) +netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2, 
smoothing_window_width = 0) ``` ------------------------- @@ -124,15 +124,15 @@ The selection of a gold-standard graph as a substitute for $E_w$ could be done w source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") ``` For illustration purposes consider the lattice networks as possible gold-standard networks: ```{r,fig.align='center'} # Lattice graphs to be used as gold-standard as a reference point comparison -goldstd_1 <- igraph::graph.lattice(c(20,20)) #Graph with 8^2 nodes -goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes +goldstd_1 <- igraph::graph.lattice(c(20, 20)) # Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44, 44)) # Graph with 44^2 nodes ``` Now obtain the subgraph counts for all networks. @@ -146,17 +146,17 @@ props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) Compute Netdis using the pre-computed counts and any of the example gold-standard networks. ```{r} -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_1) +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_2) ``` Comparison to the result of Netdis without pre-computed counts. 
```{r} # Netdis using the goldstd_1 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) # Netdis using the goldstd_2 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) ``` @@ -169,17 +169,17 @@ This Netdis variant focuses on detecting more meso-level discrepancies between t source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # Providing pre-calculated subgraph counts. 
props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL) ``` @@ -192,17 +192,17 @@ Comparing the networks via their observed ego counts without centring them, (equ source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -#Netdis using no expectations (or equivalently, expectation equal to zero). -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) +# Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) # Providing pre-calculated subgraph counts. 
props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0) ``` ------------------------- diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd index 74db95f1..6041a04d 100644 --- a/vignettes/default_pairwise_usage.Rmd +++ b/vignettes/default_pairwise_usage.Rmd @@ -11,8 +11,8 @@ vignette: > ```{r, include = FALSE} knitr::opts_chunk$set( -collapse = TRUE, -comment = "#>" + collapse = TRUE, + comment = "#>" ) ``` @@ -41,25 +41,29 @@ Although the `virusppi` list of PPI networks is loaded along with the `netdist` source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as undirected igraph objects, with no loops, multiple edges or degree zero nodes. -graph_1 <- read_simple_graph(file = file.path(source_dir, "EBV.txt"), - format = "ncol") +graph_1 <- read_simple_graph( + file = file.path(source_dir, "EBV.txt"), + format = "ncol" +) -graph_2 <- read_simple_graph(file = file.path(source_dir, "ECL.txt"), - format = "ncol") +graph_2 <- read_simple_graph( + file = file.path(source_dir, "ECL.txt"), + format = "ncol" +) # Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. graph_1 -#Note this graph is the same as +# Note this graph is the same as # virusppi$EBV # Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. graph_2 -#Note this graph is the same as +# Note this graph is the same as # virusppi$ECL -#A simple visualization of the graphs. -plot(graph_1,vertex.size=4,vertex.label=NA) -plot(graph_2,vertex.size=4,vertex.label=NA) +# A simple visualization of the graphs. 
+plot(graph_1, vertex.size = 4, vertex.label = NA) +plot(graph_2, vertex.size = 4, vertex.label = NA) ``` Other networks loaded in this package are discussed in ["NetEmd: World trade networks"](NetEmdTimeOrdering.html). You can also see `?virusppi` and `?worldtradesub`. @@ -97,34 +101,36 @@ source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects # Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") + format = "ncol" +) # Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") + format = "ncol" +) # One to one NetEmd comparison. -netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +netemd_one_to_one(graph_1 = graph_1, graph_2 = graph_2, feature_type = "orbit", smoothing_window_width = 1) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
``` ## Comparing two graphs with NetEmd via their Laplacian spectrum ```{r, netemdEigen,fig.align='center'} -#Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) +# Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = FALSE, sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = FALSE, sparse = FALSE) -#Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) +# Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = TRUE, sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = TRUE, sparse = FALSE) -#Spectra (this may take a couple of minutes). -props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) +# Spectra (this may take a couple of minutes). +props_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) -head(props_1,n=3) -head(props_2,n=3) +head(props_1, n = 3) +head(props_2, n = 3) -netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +netemd_one_to_one(dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0) # If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
``` ------------------------- @@ -168,18 +174,18 @@ The selection of a gold-standard graph as a substitute for $E_w$ could be done w ```{r,netdisgoldstand,fig.align='center',fig.dim=c(6,6)} # Lattice graphs to be used as a gold-standard reference point -goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes -goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes +goldstd_1 <- igraph::graph.lattice(c(8, 8)) # Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44, 44)) # Graph with 44^2 nodes -plot(goldstd_1,vertex.size=4,vertex.label=NA) -plot(goldstd_2,vertex.size=4,vertex.label=NA) +plot(goldstd_1, vertex.size = 4, vertex.label = NA) +plot(goldstd_2, vertex.size = 4, vertex.label = NA) # Netdis using the goldstd_1 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_1) # Netdis using the goldstd_2 graph as gold-standard reference point -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) ``` ## Netdis-GP: Using a Geometric-Poisson approximation @@ -201,8 +207,8 @@ where $l$ is the number of connected subgraphs of size $k$, for example, $l=6$ f This variant focuses on detecting more meso-level discrepancies between the ego-network structures. ```{r, netdisGP} -#Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. 
+netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) ``` @@ -210,9 +216,8 @@ netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) Comparing the networks via their observed ego counts without centring them, (equivalent to using expectation equal to zero). This variant thus focus on detecting small discrepancies between the networks. ```{r,netdiszero} -#Netdis using no expectations (or equivalently, expectation equal to zero). -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) - +# Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) ``` ------------------------- diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index 5c0a3445..1090a4ab 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -33,31 +33,35 @@ supported values for the `format` parameter. ```{r, message=FALSE} library("netdist") library("igraph") -edge_format = "ncol" +edge_format <- "ncol" # Load reference graph (used for Netdis. 
Not required for NetEmd -ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), - "ER_1250_10_1") +ref_path <- file.path( + system.file(file.path("extdata", "random"), package = "netdist"), + "ER_1250_10_1" +) ref_graph <- read_simple_graph(ref_path, format = edge_format) # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), - package = "netdist") + package = "netdist" +) edge_format <- "ncol" file_pattern <- "*" # Load all graphs in the source folder matching the filename pattern query_graphs <- read_simple_graphs(source_dir, - format = edge_format, - pattern = file_pattern) + format = edge_format, + pattern = file_pattern +) print(names(query_graphs)) ``` ```{r,fig.align='center',fig.dim=c(5,5)} -plot(query_graphs$EBV,vertex.label=NA,vertex.size=8) -plot(query_graphs$`HSV-1`,vertex.label=NA,vertex.size=8) -plot(query_graphs$KSHV,vertex.label=NA,vertex.size=8) -plot(query_graphs$VZV,vertex.label=NA,vertex.size=8) -plot(query_graphs$ECL,vertex.label=NA,vertex.size=4) +plot(query_graphs$EBV, vertex.label = NA, vertex.size = 8) +plot(query_graphs$`HSV-1`, vertex.label = NA, vertex.size = 8) +plot(query_graphs$KSHV, vertex.label = NA, vertex.size = 8) +plot(query_graphs$VZV, vertex.label = NA, vertex.size = 8) +plot(query_graphs$ECL, vertex.label = NA, vertex.size = 4) ``` # Generate Netdis measures between each pair of query graphs @@ -74,10 +78,12 @@ neighbourhood_size <- 2 ## Netdis using an ER reference graph ```{r} # Calculate netdis measure for subgraphs up to size max_subgraph_size -netdis_result <- netdis_many_to_many(graphs = query_graphs, - ref_graph = ref_graph, - max_graphlet_size = max_subgraph_size, - neighbourhood_size = neighbourhood_size) +netdis_result <- netdis_many_to_many( + graphs = query_graphs, + ref_graph = ref_graph, + max_graphlet_size = max_subgraph_size, + neighbourhood_size = neighbourhood_size +) # Netdis measure for subgraphs of 
size 3 res3 <- netdis_result$netdis["netdis3", ] @@ -105,31 +111,33 @@ cex <- 1 # Dendrogram based on Netdis measure for subgraphs of size 3 title <- paste("Netdis: subgraph size = ", 3, sep = "") plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), - use.edge.length = FALSE, - edge.width = cex*2, - main = title, - cex.lab = cex, cex.axis = cex, - cex.main = cex, cex.sub = cex, - cex = cex) + use.edge.length = FALSE, + edge.width = cex * 2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex +) # Dendrogram based on Netdis measure for subgraphs of size 4 -title = paste("Netdis: subgraph size = ", 4, sep = "") +title <- paste("Netdis: subgraph size = ", 4, sep = "") plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), - use.edge.length = FALSE, - edge.width = cex*2, - main = title, - cex.lab = cex, cex.axis = cex, - cex.main = cex, cex.sub = cex, - cex = cex) + use.edge.length = FALSE, + edge.width = cex * 2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex +) ``` diff --git a/vignettes/dendrogram_example_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd index 4a70fb8d..e0601588 100644 --- a/vignettes/dendrogram_example_net_emd.Rmd +++ b/vignettes/dendrogram_example_net_emd.Rmd @@ -13,41 +13,42 @@ vignette: > library("netdist") # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = ".txt" +edge_format <- "ncol" +file_pattern <- ".txt" -# Calculate graphlet-based degree distributions for all orbits in graphlets -# comprising up to 4 nodes for all graphs. This only needs to be done once -# per graph (feature_type = "orbit", max_graphlet_size = 4).. +# Calculate graphlet-based degree distributions for all orbits in graphlets +# comprising up to 4 nodes for all graphs. 
This only needs to be done once +# per graph (feature_type = "orbit", max_graphlet_size = 4).. # If feature_type is set to "feature_type", orbit counts for orbits in the # same graphlet will be summed to generate graphlet counts -# If max_graphlet_size is set to 5, graphlet-based degree distributions will +# If max_graphlet_size is set to 5, graphlet-based degree distributions will # be calculated for graphlets comprising up to 5 nodes. virus_gdds <- gdd_for_all_graphs( - source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 4) + source_dir = source_dir, format = edge_format, pattern = file_pattern, + feature_type = "orbit", max_graphlet_size = 4 +) names(virus_gdds) -# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- +# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- # based degree distributions using the default fast "optimise" method and no # smoothing (default). The "optimise" method uses the built-in R optimise # function to efficiently find the offset with the minimum EMD, but is not # guaranteed to find the global minimum if EMD as a function of offset -# is non-convex and/or multimodal. The smoothing window width determines +# is non-convex and/or multimodal. The smoothing window width determines # whether to calculate the NetEMD from the unaltered discrete GDD histograms -# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" -# smoothing by "smearing" the discrete GDD histogram point masses across bins +# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" +# smoothing by "smearing" the discrete GDD histogram point masses across bins # of unit width (smoothing_window_width = 1). 
Returns a named list containing: -# (i) the NetEMDs and (ii) a table containing the graph names and indices +# (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists = virus_gdds, smoothing_window_width = 0) -# You can also specify method = "fixed_step" to use the much slower method of -# exhaustively evaluating the EMD at all offsets separated by a fixed step. -# The default step size is 1/2 the the minimum spacing between locations in -# either histogram after normalising to unit variance. However, you can +# You can also specify method = "fixed_step" to use the much slower method of +# exhaustively evaluating the EMD at all offsets separated by a fixed step. +# The default step size is 1/2 the the minimum spacing between locations in +# either histogram after normalising to unit variance. However, you can # specifiy your own fixed step using the optional "step_size" parameter. 
-# Note that this step size is applied to the histograms after they have been +# Note that this step size is applied to the histograms after they have been # normalised to unit variance # Convert to matrix for input to dendrogram method @@ -56,25 +57,27 @@ netemd_mat ``` ```{r} -cex=1 -title = paste("NetEMD: max graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +cex <- 1 +title <- paste("NetEMD: max graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netemd_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex * 2, main = title, cex.lab = cex, cex.axis = cex, cex.main = cex, + cex.sub = cex, cex = cex +) -# The gdd_for_all_graphs and netemd_many_to_many functions will run in +# The gdd_for_all_graphs and netemd_many_to_many functions will run in # parallel using multiple threads where supported. The number of threads -# used is determined by the global R option "mc.cores". You can inspect the -# current value of this using options("mc.cores") and set it with +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with # options("mc.cores" = ). To fully utilise a modern consumer -# processor, this should be set to 2x the number of available processor +# processor, this should be set to 2x the number of available processor # cores as each core supports two threads. 
``` ```{r} -cex=1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("NetEMD: max graphlet size = ", 4, sep = "") +cex <- 1.5 +col <- colorRampPalette(colors = c("blue", "white"))(100) +title <- paste("NetEMD: max graphlet size = ", 4, sep = "") heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) ``` diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd index 0c1c724a..0040450c 100644 --- a/vignettes/netdis_customisations.Rmd +++ b/vignettes/netdis_customisations.Rmd @@ -29,10 +29,10 @@ min_ego_nodes <- 3 min_ego_edges <- 1 # Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" +) ref_graph <- read_simple_graph(ref_path, format = "ncol") - ``` ## Load query graphs @@ -46,11 +46,12 @@ graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) print(results$netdis) print(results$comp_spec) @@ -60,23 +61,23 @@ print(results$comp_spec) ```{r} binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 10, - num_intervals = 50) + min_counts_per_interval = 10, + num_intervals = 50 +) # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn) + ref_graph, + max_graphlet_size = 
max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn +) print(results$netdis) print(results$comp_spec) - - ``` ## With Modified Expected Counts: Geometric Poisson @@ -84,17 +85,19 @@ print(results$comp_spec) bin_counts_fn <- density_binned_counts_gp exp_counts_fn <- purrr::partial(netdis_expected_counts, - scale_fn = NULL) + scale_fn = NULL +) # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn) + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn +) print(results$netdis) print(results$comp_spec) @@ -108,15 +111,16 @@ exp_counts_fn <- netdis_expected_counts # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn) + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn +) print(results$netdis) print(results$comp_spec) -``` \ No newline at end of file +``` diff --git a/vignettes/netdis_pairwise_comparisons.Rmd b/vignettes/netdis_pairwise_comparisons.Rmd index d5809c4d..650454c5 100644 --- a/vignettes/netdis_pairwise_comparisons.Rmd +++ 
b/vignettes/netdis_pairwise_comparisons.Rmd @@ -33,10 +33,10 @@ min_bin_count <- 5 num_bins <- 100 # Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" +) ref_graph <- read_simple_graph(ref_path, format = "ncol") - ``` ## Compare two graphs @@ -45,18 +45,21 @@ ref_graph <- read_simple_graph(ref_path, format = "ncol") source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") + format = "ncol" +) graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") + format = "ncol" +) # Calculate netdis statistics netdis_one_to_one(graph_1, graph_2, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) ``` ## Compare one graph to many other graphs @@ -68,11 +71,12 @@ graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] # Calculate netdis statistics netdis_one_to_many(graph_1, graphs_compare, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) ``` ## Do pairwise netdis calculations for many graphs @@ -83,12 +87,13 @@ graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") # Calculate netdis statistics results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - 
min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) print(results$netdis) print(results$comp_spec) -``` \ No newline at end of file +``` diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index a55f1d82..87764511 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -23,11 +23,12 @@ source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") + format = "ncol" +) graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - + format = "ncol" +) ``` ## Set Netdis parameters @@ -50,15 +51,17 @@ num_bins <- 100 ## Generate ego networks ```{r} # Get ego networks for query graphs and reference graph -ego_1 <- make_named_ego_graph(graph_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(graph_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) ``` ## Count graphlets in ego networks @@ -71,65 +74,76 @@ graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graph ## Use a reference graph to calculate expected graphlet counts in ego network density bins ```{r} # Load reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + 
package = "netdist" +) ref_graph <- read_simple_graph(ref_path, format = "ncol") -ego_ref <- make_named_ego_graph(ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) -scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, - max_graphlet_size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego( + graphlet_counts_ref, + max_graphlet_size +) # Get ego-network densities densities_ref <- ego_network_density(graphlet_counts_ref) # Adaptively bin ref ego-network densities -binned_densities <- binned_densities_adaptive(densities_ref, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins +) ref_ego_density_bins <- binned_densities$breaks # Average ref graphlet counts across density bins ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts_ref, - binned_densities$interval_indexes) - + scaled_graphlet_counts_ref, + binned_densities$interval_indexes +) ``` ## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} # Calculate expected graphlet counts (using ref graph ego network density bins) -exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, - ref_ego_density_bins, - ref_binned_graphlet_counts, - max_graphlet_size, - scale_fn=count_graphlet_tuples) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn = 
count_graphlet_tuples +) -exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, - ref_ego_density_bins, - ref_binned_graphlet_counts, - max_graphlet_size, - scale_fn=count_graphlet_tuples) +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples +) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) - -centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) +centred_graphlet_counts_1 <- netdis_subtract_exp_counts( + graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size +) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts( + graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size +) ``` ## Sum centred graphlet counts across all ego networks @@ -142,9 +156,11 @@ sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) ## Calculate netdis statistics ```{r} -netdis_result <- netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) +netdis_result <- netdis_uptok( + sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size +) print(netdis_result) ``` From f11f06837809b789f8f5c1a4a0183a6387a51b92 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Tue, 7 Jun 2022 12:54:54 +0100 Subject: [PATCH 66/84] Regenerated NAMESPACE and RcppExports.R after devtools::document() --- NAMESPACE | 10 ++-------- R/RcppExports.R | 10 ++++++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e0d5d89d..85e26932 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,8 @@ # Generated by roxygen2: do not edit by hand -export(NetEmdSmoothV2) export(NetEmdSmooth) +export(NetEmdSmoothV2) +export(NetEmdSmoothV2_old) export(adaptive_breaks) export(area_between_dhist_ecmfs) 
export(as_smoothed_dhist) @@ -50,10 +51,7 @@ export(min_emd) export(min_emd_exhaustive) export(min_emd_optimise) export(min_emd_optimise_fast) -export(netEMDSpeedTest) export(netEMDSpeedTestSmooth) -export(net_emd) -export(net_emds_for_all_graphs) export(netdis) export(netdis.plot) export(netdis_centred_graphlet_counts) @@ -79,9 +77,5 @@ export(shift_dhist) export(simplify_graph) export(single_density_bin) export(sort_dhist) -export(zeros_to_ones) -import(Rcpp) -importFrom(Rcpp,evalCpp) importFrom(Rcpp,sourceCpp) useDynLib(netdist, .registration=TRUE) -exportPattern("^[[:alpha:]]+") diff --git a/R/RcppExports.R b/R/RcppExports.R index e718b542..38e022c2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -45,3 +45,13 @@ NetEmdSmoothV2 <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { .Call(`_netdist_NetEmdSmoothV2`, loc1, val1, binWidth1, loc2, val2, binWidth2) } +#' @title +#' Compute EMD +NULL + +#' +#' @export +NetEmdSmoothV2_old <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { + .Call(`_netdist_NetEmdSmoothV2_old`, loc1, val1, binWidth1, loc2, val2, binWidth2) +} + From f31de5b5ec3eef8ac29b09a25cae123bfb5cd81e Mon Sep 17 00:00:00 2001 From: jack89roberts Date: Tue, 7 Jun 2022 13:24:53 +0100 Subject: [PATCH 67/84] vignette line length fixes --- .github/workflows/build.yml | 2 +- R/data.R | 38 ++++++++--- R/dhist.R | 6 +- vignettes/ManyToMany.Rmd | 76 ++++++++++++++++++---- vignettes/NetEmdTimeOrdering.Rmd | 26 ++++++-- vignettes/NetdisGPStepByStep.Rmd | 11 +++- vignettes/NetdisStepByStep.Rmd | 23 +++++-- vignettes/NewNetdisCustomisations.Rmd | 41 +++++++++--- vignettes/PreComputedProps.Rmd | 82 ++++++++++++++++++------ vignettes/default_pairwise_usage.Rmd | 64 +++++++++++++----- vignettes/dendrogram_example_net_dis.Rmd | 21 ++++-- vignettes/dendrogram_example_net_emd.Rmd | 23 +++++-- vignettes/quickstart_netdis_2graphs.Rmd | 15 ++++- 13 files changed, 329 insertions(+), 99 deletions(-) diff --git a/.github/workflows/build.yml 
b/.github/workflows/build.yml index 160d59eb..1c1ff79a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -61,7 +61,7 @@ jobs: git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git add man/\* NAMESPACE - git commit -m "Update documentation" || echo "No changes to commit" + git commit -m "Update documentation (bot)" || echo "No changes to commit" git pull --ff-only git push origin diff --git a/R/data.R b/R/data.R index 4753ed66..688cee76 100644 --- a/R/data.R +++ b/R/data.R @@ -36,9 +36,17 @@ #' } #' #' @format A list of \code{igraph} objects. -#' @source \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. -#' @source \strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, Parkinson J (2009) The Modular Organization of Protein Interactions in Escherichia coli. PLoS Comput Biol 5(10): e1000523. \url{https://doi.org/10.1371/journal.pcbi.1000523} -#' @source \strong{Taxonomy ground truth:} NCBI taxonomy database. \url{https://www.ncbi.nlm.nih.gov/taxonomy} +#' @source \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, +#' Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily +#' Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): +#' e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table +#' S2 in the supporting information. +#' @source \strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, +#' Parkinson J (2009) The Modular Organization of Protein Interactions in +#' Escherichia coli. PLoS Comput Biol 5(10): e1000523. 
+#' \url{https://doi.org/10.1371/journal.pcbi.1000523} +#' @source \strong{Taxonomy ground truth:} NCBI taxonomy database. +#' \url{https://www.ncbi.nlm.nih.gov/taxonomy} #' @encoding UTF-8 "virusppi" @@ -48,16 +56,28 @@ #' World trade networks from 1985–2014 #' -#' The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the United Nations division COMTRADE [Division, 2015] for the years 2001-2014. +#' The world trade data set consists of a small sample of world trade networks +#' for the years 2001-2014, and pre-computed subgraph counts of a larger set of +#' world trade networks (1985–2014). The world trade networks are based on the +#' data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the +#' United Nations division COMTRADE [Division, 2015] for the years 2001-2014. #' #' \itemize{ -#' \item wtnets: List of \code{igraph} objects providing the world trade networks from 2001–2014. -#' \item Counts: Pre-computed graphlet counts for the world trade networks in the years 1985-2014. +#' \item wtnets: List of \code{igraph} objects providing the world trade +#' networks from 2001–2014. +#' \item Counts: Pre-computed graphlet counts for the world trade networks in +#' the years 1985-2014. #' } #' -#' @format A list of two elements. The first element, 'wtnets', is a list of \code{igraph} objects providing a small sample of world trade networks from 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph counts of world trade networks in the years 1985-2014. -#' @source \strong{World trade networks:}. United nations commodity trade statistics database (UN comtrade). http://comtrade.un.org/, 2015. -#' @source \strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and Mo H. 
(2005) World trade flows: 1962-2000. Technical report, National Bureau of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). +#' @format A list of two elements. The first element, 'wtnets', is a list of +#' \code{igraph} objects providing a small sample of world trade networks from +#' 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph +#' counts of world trade networks in the years 1985-2014. +#' @source \strong{World trade networks:}. United nations commodity trade +#' statistics database (UN comtrade). http://comtrade.un.org/, 2015. +#' @source \strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and +#' Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau +#' of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). #' #' @encoding UTF-8 "worldtradesub" diff --git a/R/dhist.R b/R/dhist.R index 0ff24b9a..fe81d357 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -491,7 +491,8 @@ dhist_variance <- function(dhist) { bin_uppers <- mean_centred_locations + hw # See comment in issue #21 on Github repository for verification that E[X^2] # is calculated as below for a uniform bin - bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + bin_lowers * bin_uppers) / 3 + bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + + bin_lowers * bin_uppers) / 3 variance <- sum(dhist$masses * bin_x2_integrals) / sum(dhist$masses) } return(variance) @@ -563,7 +564,8 @@ normalise_dhist_variance <- function(dhist) { # If smoothing_window_width not zero, then update it to reflect the variance # normalisation if (dhist$smoothing_window_width != 0) { - normalised_smoothing_window_width <- dhist$smoothing_window_width / std_dev + normalised_smoothing_window_width <- + dhist$smoothing_window_width / std_dev dhist <- update_dhist( dhist, smoothing_window_width = normalised_smoothing_window_width diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd index 89dc2ea5..c69e9f21 100644 --- 
a/vignettes/ManyToMany.Rmd +++ b/vignettes/ManyToMany.Rmd @@ -67,9 +67,23 @@ plot(gTree_2, vertex.size = 0.8, vertex.label = NA) Subgraph count based NetEmd comparisons: ```{r, netemdorbits,fig.align='center'} # NetEMD using subgraph counts. -glist <- list(Lat_1 = gLat_1, Lat_2 = gLat_2, Ring_1 = gRing_1, Ring_2 = gRing_1, Tree_1 = gTree_1, Tree_2 = gTree_2) +glist <- list( + Lat_1 = gLat_1, + Lat_2 = gLat_2, + Ring_1 = gRing_1, + Ring_2 = gRing_1, + Tree_1 = gTree_1, + Tree_2 = gTree_2 +) -netemdlist <- netemd_many_to_many(graphs = glist, smoothing_window_width = 1, mc.cores = 1) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# Use of smoothing window 1 is given for discrete integer distributions. If the +# network features are considered continuous variables smoothing_window_width +# equal to zero is recommended. +netemdlist <- netemd_many_to_many( + graphs = glist, + smoothing_window_width = 1, + mc.cores = 1 +) netemdlist ``` @@ -77,13 +91,19 @@ netemdlist To read the results in a matrix form: ```{r,fig.align='center'} # Creating a comparison matrix: -mat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) +mat <- cross_comp_to_matrix( + measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec +) mat ``` Illustration of the multiple NetEmd comparisons based on subgraph counts. ```{r,netemdorbitsPLOT,fig.align='center',fig.dim=c(8,8)} -netemd.plot(netemdlist = netemdlist, clustering_method = "ward.D", main = "NetEmd subgraph counts") +netemd.plot( + netemdlist = netemdlist, + clustering_method = "ward.D", + main = "NetEmd subgraph counts" +) ``` ## NetEmd using the Laplacian and Normalized Laplacian Spectrum @@ -95,16 +115,25 @@ SPECT <- list() # This step may take several minutes. 
for (i in 1:length(glist)) { - Lapg <- igraph::laplacian_matrix(graph = glist[[i]], normalized = FALSE, sparse = FALSE) - NLap <- igraph::laplacian_matrix(graph = glist[[i]], normalized = TRUE, sparse = FALSE) - SPECT[[names(glist)[i]]] <- cbind(L.Spectra = eigen(Lapg)$values, NL.Spectra = eigen(NLap)$values) + Lapg <- igraph::laplacian_matrix( + graph = glist[[i]], normalized = FALSE, sparse = FALSE + ) + NLap <- igraph::laplacian_matrix( + graph = glist[[i]], normalized = TRUE, sparse = FALSE + ) + SPECT[[names(glist)[i]]] <- cbind( + L.Spectra = eigen(Lapg)$values, NL.Spectra = eigen(NLap)$values + ) } str(SPECT) ``` Compute NetEmd: ```{r} -netemdlist <- netemd_many_to_many(dhists = SPECT, smoothing_window_width = 0) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# Use of smoothing window 1 is given for discrete integer distributions. If the +# network features are considered continuous variables smoothing +# window_width equal to zero is recommended. +netemdlist <- netemd_many_to_many(dhists = SPECT, smoothing_window_width = 0) netemdlist ``` @@ -112,7 +141,9 @@ netemdlist ### Illustration of the multiple NetEmd comparisons based on the Laplacian and Normalized Laplacian spectra ```{r,netemdspectrumPLOT ,fig.align='center',fig.dim=c(8,8)} -netemd.plot(netemdlist = netemdlist, clustering_method = "ward.D", main = "NetEmd Spectra") +netemd.plot( + netemdlist = netemdlist, clustering_method = "ward.D", main = "NetEmd Spectra" +) ``` @@ -154,7 +185,14 @@ plot(gst_2, vertex.size = 0.8, vertex.label = NA) Obtain the comparison via Netdis using each of the reference graph networks. 
```{r,netdisgoldstand ,fig.align='center'} -glist <- list(Lat_1 = gLat_1, Lat_2 = gLat_2, Ring_1 = gRing_1, Ring_2 = gRing_1, Tree_1 = gTree_1, Tree_2 = gTree_2) +glist <- list( + Lat_1 = gLat_1, + Lat_2 = gLat_2, + Ring_1 = gRing_1, + Ring_2 = gRing_1, + Tree_1 = gTree_1, + Tree_2 = gTree_2 +) # Netdis using the goldstd_1 graph as gold-standard reference point netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = gst_1) @@ -170,18 +208,28 @@ netdis_mat_gst2 To read the results in a matrix form: ```{r,fig.align='center'} # Creating a comparison matrix: -cross_comp_to_matrix(measure = netdis_mat_gst1$netdis, cross_comparison_spec = netdis_mat_gst1$comp_spec) +cross_comp_to_matrix( + measure = netdis_mat_gst1$netdis, + cross_comparison_spec = netdis_mat_gst1$comp_spec +) -cross_comp_to_matrix(measure = netdis_mat_gst2$netdis, cross_comparison_spec = netdis_mat_gst2$comp_spec) +cross_comp_to_matrix( + measure = netdis_mat_gst2$netdis, + cross_comparison_spec = netdis_mat_gst2$comp_spec +) ``` Heatmap of the Netdis comparisons: ```{r,netdisgoldstandPLOT ,fig.align='center',fig.dim=c(8,8)} # Network comparisons heatmap with Gold-Standard 1 -netdis.plot(netdislist = netdis_mat_gst1, whatrow = 2, main = "Netdis GoldStd-1") +netdis.plot( + netdislist = netdis_mat_gst1, whatrow = 2, main = "Netdis GoldStd-1" +) # Network comparisons heatmap with Gold-Standard 2 -netdis.plot(netdislist = netdis_mat_gst2, whatrow = 2, main = "Netdis GoldStd-2") +netdis.plot( + netdislist = netdis_mat_gst2, whatrow = 2, main = "Netdis GoldStd-2" +) ``` diff --git a/vignettes/NetEmdTimeOrdering.Rmd b/vignettes/NetEmdTimeOrdering.Rmd index 2a199a41..44a48e7a 100644 --- a/vignettes/NetEmdTimeOrdering.Rmd +++ b/vignettes/NetEmdTimeOrdering.Rmd @@ -47,8 +47,8 @@ plot(wtnets$wtn2001, vertex.size = 5, vertex.label.cex = 0.4) In this example **NetEmd** will consider orbit counts of subgraphs containing up to 5 nodes. 
If NetEmd is to be called a single time, then the command `netemd_many_to_many(graphs = wtnets)` would suffice. The following code provides such an example: ```{r} -# As the trade networks are considerable dense, this example first considers a small number of networks. -# This example may take some minutes to run. +# As the trade networks are considerable dense, this example first considers a +# small number of networks. This example may take some minutes to run. netemd_result <- netemd_many_to_many(graphs = wtnets[1:4], mc.cores = 1) print(netemd_result) @@ -60,11 +60,15 @@ However, if there are pre-computed counts or features NetEmd can be called via t World trade networks consist of relatively dense networks, thus leading to longer computational times for the calculation of the subgraph counts. Hence, it is advisable to pre-compute counts in case there is a need to call NetEmd multiple times. This may, for example, be the case when adding a new network to the data set. The following illustrates the extraction of subgraph counts for the small network sample. ```{r} -# This example may take more than a few minutes to run (approx. 20 mins) , and it is not necessary to run it for the upcoming examples as a larger set of counts has been already computed. -if (FALSE) { # It is not necessary to run, as these counts are already available in. +# This example may take more than a few minutes to run (approx. 20 mins), and +# it is not necessary to run it for the upcoming examples as a larger set of +# counts has been already computed. 
+if (FALSE) { Counts <- list() for (i in 1:length(wtnets)) { - Counts[[names(wtnets)[i]]] <- count_orbits_per_node(graph = wtnets[[i]], max_graphlet_size = 5) + Counts[[names(wtnets)[i]]] <- count_orbits_per_node( + graph = wtnets[[i]], max_graphlet_size = 5 + ) } } ``` @@ -78,7 +82,10 @@ Counts <- worldtradesub$Counts netemd_result <- netemd_many_to_many(dhists = Counts, mc.cores = 1) # Results -netemd_matrix <- cross_comp_to_matrix(measure = netemd_result$netemds, cross_comparison_spec = netemd_result$comp_spec) +netemd_matrix <- cross_comp_to_matrix( + measure = netemd_result$netemds, + cross_comparison_spec = netemd_result$comp_spec +) print(netemd_matrix[1:10, 1:5]) ``` @@ -88,7 +95,12 @@ print(netemd_matrix[1:10, 1:5]) Based on the comparison of the world trade networks across the years, we can identify periods of time where possible considerable changes in world trade have occurred. The following heat map clearly shows the existence of two potential changes in the world trade system, and which correspond to 1995-1996 and 2010-2011. ```{r,fig.align='center',fig.dim=c(8.5,8.5)} -netemd.plot(netemdlist = netemd_result, clustering_method = "ward.D", main = "NetEmd", docluster = FALSE) +netemd.plot( + netemdlist = netemd_result, + clustering_method = "ward.D", + main = "NetEmd", + docluster = FALSE +) ``` The World Trade Organization (WTO) said the following about these years: diff --git a/vignettes/NetdisGPStepByStep.Rmd b/vignettes/NetdisGPStepByStep.Rmd index 988efe97..c5351bcc 100644 --- a/vignettes/NetdisGPStepByStep.Rmd +++ b/vignettes/NetdisGPStepByStep.Rmd @@ -116,7 +116,8 @@ neighbourhood_size <- 2 min_ego_nodes <- 3 min_ego_edges <- 1 -# Ego-network density binning parameters. Here, the minimum number of ego-networks per bin and the starting number of bins +# Ego-network density binning parameters. 
Here, the minimum number of +# ego-networks per bin and the starting number of bins min_bin_count <- 5 num_bins <- 100 ``` @@ -145,8 +146,12 @@ head(ego_2, n = 2) Once the ego-networks are extracted, the subgraph counts for all ego-network are obtained for each network being compared: ```{r} # Subgraphs counts for ego-networks in query graphs -subgraph_counts_1 <- ego_to_graphlet_counts(ego_networks = ego_1, max_graphlet_size = max_subgraph_size) -subgraph_counts_2 <- ego_to_graphlet_counts(ego_networks = ego_2, max_graphlet_size = max_subgraph_size) +subgraph_counts_1 <- ego_to_graphlet_counts( + ego_networks = ego_1, max_graphlet_size = max_subgraph_size +) +subgraph_counts_2 <- ego_to_graphlet_counts( + ego_networks = ego_2, max_graphlet_size = max_subgraph_size +) head(subgraph_counts_1) head(subgraph_counts_2) diff --git a/vignettes/NetdisStepByStep.Rmd b/vignettes/NetdisStepByStep.Rmd index 7162f142..0499acad 100644 --- a/vignettes/NetdisStepByStep.Rmd +++ b/vignettes/NetdisStepByStep.Rmd @@ -104,7 +104,8 @@ neighbourhood_size <- 2 min_ego_nodes <- 3 min_ego_edges <- 1 -# Ego-network density binning parameters. Here, the minimum number of ego-networks per bin and the starting number of bins +# Ego-network density binning parameters. 
Here, the minimum number of +# ego-networks per bin and the starting number of bins min_bin_count <- 5 num_bins <- 100 ``` @@ -135,8 +136,12 @@ tail(ego_2, n = 2) Once the ego-networks are extracted, the subgraph counts for all ego-network are obtained for each network being compared: ```{r} # Subgraphs counts for ego-networks in query graphs -subgraph_counts_1 <- ego_to_graphlet_counts(ego_networks = ego_1, max_graphlet_size = max_subgraph_size) -subgraph_counts_2 <- ego_to_graphlet_counts(ego_networks = ego_2, max_graphlet_size = max_subgraph_size) +subgraph_counts_1 <- ego_to_graphlet_counts( + ego_networks = ego_1, max_graphlet_size = max_subgraph_size +) +subgraph_counts_2 <- ego_to_graphlet_counts( + ego_networks = ego_2, max_graphlet_size = max_subgraph_size +) tail(subgraph_counts_1) tail(subgraph_counts_2) @@ -203,8 +208,8 @@ str(binned_densities_gst_1) $E_w$ is estimated based on the average subgraph counts of ego-networks per density bin for each given subgraph. However, as the query networks and the gold-standard may have different number of nodes, the counts of the gold-standard network are first scaled to a "standard" or "canonical" scale from which they can be scaled back towards networks of different sizes. 
The following code first shows the computation of the subgraph counts for the ego-networks in the gold-standard network with their corresponding scaling: ```{r} -# Scale ego-network subgraph counts by dividing by the total number of k-tuples in the -# ego-network (where k is the subgraph size) +# Scale ego-network subgraph counts by dividing by the total number of k-tuples +# in the ego-network (where k is the subgraph size) scaled_subgraph_counts_ref <- scale_graphlet_counts_ego( graphlet_counts = subgraph_counts_gst_1, max_graphlet_size = max_subgraph_size @@ -215,7 +220,10 @@ str(scaled_subgraph_counts_ref) Finally, the standard or canonical $E_w$ can be obtained by taking the average per bin of the scaled subgraph counts: ```{r} # Average of the scaled reference subgraph counts in each density bin -ref_binned_canonical_subgraph_counts <- mean_density_binned_graphlet_counts(graphlet_counts = scaled_subgraph_counts_ref, density_interval_indexes = binned_densities_gst_1$interval_indexes) +ref_binned_canonical_subgraph_counts <- mean_density_binned_graphlet_counts( + graphlet_counts = scaled_subgraph_counts_ref, + density_interval_indexes = binned_densities_gst_1$interval_indexes +) ref_binned_canonical_subgraph_counts ``` @@ -224,7 +232,8 @@ ref_binned_canonical_subgraph_counts After obtaining the average scaled subgraph counts per density bin, the subgraph counts of the query networks can be centred: ```{r} -# Scale the reference counts of the gold-standard network to the sizes of each of the query ego-networks. +# Scale the reference counts of the gold-standard network to the sizes of each +# of the query ego-networks. 
exp_subgraph_counts_1 <- netdis_expected_counts( graphlet_counts = subgraph_counts_1, density_breaks = binned_densities_gst_1$breaks, diff --git a/vignettes/NewNetdisCustomisations.Rmd b/vignettes/NewNetdisCustomisations.Rmd index e8cbe645..249c37d3 100644 --- a/vignettes/NewNetdisCustomisations.Rmd +++ b/vignettes/NewNetdisCustomisations.Rmd @@ -50,7 +50,14 @@ gRing_2 <- make_ring(40^2) gTree_1 <- igraph::as.undirected(make_tree(n = 20^2, children = 3)) gTree_2 <- igraph::as.undirected(make_tree(n = 40^2, children = 3)) -glist <- list(Lat_1 = gLat_1, Lat_2 = gLat_2, Ring_1 = gRing_1, Ring_2 = gRing_1, Tree_1 = gTree_1, Tree_2 = gTree_2) +glist <- list( + Lat_1 = gLat_1, + Lat_2 = gLat_2, + Ring_1 = gRing_1, + Ring_2 = gRing_1, + Tree_1 = gTree_1, + Tree_2 = gTree_2 +) # Create a random graph to be used as a gold-standard gst <- igraph::as.undirected(graph.star(1000)) @@ -65,7 +72,10 @@ netdis_mat_gst <- netdis_many_to_many( graphs = glist, ref_graph = gst ) -netdis.plot(netdislist = netdis_mat_gst, whatrow = 2, main = "Netdis with reference graph") +netdis.plot( + netdislist = netdis_mat_gst, whatrow = 2, + main = "Netdis with reference graph" +) ``` ## Using Netdis with a constant valued expectation, $E_w=k$ @@ -107,7 +117,9 @@ mybinning <- function(densities) { min_counts_per_interval <- 5 num_intervals <- 3 # - if (length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + if (length(densities) < min_counts_per_interval) { + min_counts_per_interval <- length(densities) + } breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals @@ -136,7 +148,11 @@ netdisgp_mat_mybin <- netdis_many_to_many( binning_fn = mybinning ) -netdis.plot(netdislist = netdisgp_mat_mybin, whatrow = 2, main = "Netdis-GP with mybinning") +netdis.plot( + netdislist = netdisgp_mat_mybin, + whatrow = 2, + main = "Netdis-GP with mybinning" +) ``` Note that whenever $E_w$ is taken as a constant value, then the binning will 
not have an effect on the computation of Netdis. @@ -167,8 +183,9 @@ num_bins <- 100 These values can be directly imputed and changed into the shortcut Netdis function calls. However, not all combinations may be possible. The following shows the use of subgraphs up to size 4, with 3-step ego-networks and where only ego-networks with at least 5 nodes and 4 edges can be considered. Furthermore, the binning of the ego-networks will be sett to start with 20 bins and each bin will be required to have at least 20 elements. ```{r,fig.align='center',fig.dim=c(7,7)} -# (We only recommend changing these default values for those users that have a clear understanding of graph theory behind it) -# (change values with care as not all combinations may be possible). +# (We only recommend changing these default values for those users that have a +# clear understanding of graph theory behind it. Change values with care as not +# all combinations may be possible). # Defining a new binning function: binning_fn <- purrr::partial(binned_densities_adaptive, @@ -205,8 +222,16 @@ netdisgp_mat <- netdis_many_to_many( min_ego_edges = min_ego_edges, binning_fn = binning_fn ) -netdis.plot(netdislist = netdisgp_mat, whatrow = 2, main = "Netdis-GP: Default parameter values") -netdis.plot(netdislist = netdisgp_mat_custom, whatrow = 2, main = "Netdis-GP: illustrative parameter changes") +netdis.plot( + netdislist = netdisgp_mat, + whatrow = 2, + main = "Netdis-GP: Default parameter values" +) +netdis.plot( + netdislist = netdisgp_mat_custom, + whatrow = 2, + main = "Netdis-GP: illustrative parameter changes" +) ``` diff --git a/vignettes/PreComputedProps.Rmd b/vignettes/PreComputedProps.Rmd index 564c9b1c..8bc8a8d8 100644 --- a/vignettes/PreComputedProps.Rmd +++ b/vignettes/PreComputedProps.Rmd @@ -51,7 +51,12 @@ plot(gLat_2, vertex.size = 4, vertex.label = NA) The simple computation of NetEmd without pre-computed features: ```{r} -netemd_one_to_one(graph_1 = gLat_1, graph_2 = gLat_2, feature_type 
= "orbit", smoothing_window_width = 1) +netemd_one_to_one( + graph_1 = gLat_1, + graph_2 = gLat_2, + feature_type = "orbit", + smoothing_window_width = 1 +) ``` @@ -61,7 +66,9 @@ counts_1 <- count_orbits_per_node(graph = gLat_1, max_graphlet_size = 5) counts_2 <- count_orbits_per_node(graph = gLat_2, max_graphlet_size = 5) head(counts_1[, 1:4]) -netemd_one_to_one(dhists_1 = counts_1, dhists_2 = counts_2, smoothing_window_width = 1) +netemd_one_to_one( + dhists_1 = counts_1, dhists_2 = counts_2, smoothing_window_width = 1 +) ``` ### Providing the network features as lists of dhist objects @@ -69,7 +76,9 @@ netemd_one_to_one(dhists_1 = counts_1, dhists_2 = counts_2, smoothing_window_wid dhists_1 <- graph_features_to_histograms(features_matrix = counts_1) dhists_2 <- graph_features_to_histograms(features_matrix = counts_2) -netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 1) +netemd_one_to_one( + dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 1 +) ``` ___ @@ -82,31 +91,48 @@ gLat_1 <- graph.lattice(c(20, 20)) gLat_2 <- graph.lattice(c(44, 44)) # Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1, normalized = FALSE, sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2, normalized = FALSE, sparse = FALSE) +Lapg_1 <- igraph::laplacian_matrix( + graph = gLat_1, normalized = FALSE, sparse = FALSE +) +Lapg_2 <- igraph::laplacian_matrix( + graph = gLat_2, normalized = FALSE, sparse = FALSE +) # Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1, normalized = TRUE, sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2, normalized = TRUE, sparse = FALSE) +NLapg_1 <- igraph::laplacian_matrix( + graph = gLat_1, normalized = TRUE, sparse = FALSE +) +NLapg_2 <- igraph::laplacian_matrix( + graph = gLat_2, normalized = TRUE, sparse = FALSE +) -# Providing a matrix of network features (e.g. Spectra). (This may take a couple of minutes). 
-spec_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) -spec_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) +# Providing a matrix of network features (e.g. Spectra). (This may take a +# couple of minutes). +spec_1 <- cbind( + L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values +) +spec_2 <- cbind( + L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values +) head(spec_1) ``` Similarly to counts, all other features can be given as a matrix or as dhist objects: ```{r} -netemd_one_to_one(dhists_1 = spec_1, dhists_2 = spec_2, smoothing_window_width = 0) +netemd_one_to_one( + dhists_1 = spec_1, dhists_2 = spec_2, smoothing_window_width = 0 +) # Providing pre-computed dhist objects from network features dhists_1 <- graph_features_to_histograms(spec_1) dhists_2 <- graph_features_to_histograms(spec_2) -netemd_one_to_one(dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 0) +netemd_one_to_one( + dhists_1 = dhists_1, dhists_2 = dhists_2, smoothing_window_width = 0 +) ``` ------------------------- @@ -120,7 +146,8 @@ Netdis uses counts from the resulting ego-networks of each of the nodes in a gra The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard graph and which is summarized in $E_w$. ```{r,netdisgoldstand} -# Set source directory for virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for virus protein-protein interaction edge files stored +# in the netdist package. 
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects @@ -146,8 +173,16 @@ props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) Compute Netdis using the pre-computed counts and any of the example gold-standard networks. ```{r} -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_1) -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, graphlet_counts_ref = props_goldstd_2) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_1 +) +netdis_one_to_one( + graphlet_counts_1 = props_1, + graphlet_counts_2 = props_2, + graphlet_counts_ref = props_goldstd_2 +) ``` Comparison to the result of Netdis without pre-computed counts. @@ -165,21 +200,25 @@ netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = goldstd_2) This Netdis variant focuses on detecting more meso-level discrepancies between the ego-network structures. ```{r, netdisGP} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction edge files stored +# in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +# Netdis using the Geometric-Poisson approximation as a way to obtain background +# expectations. netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) # Providing pre-calculated subgraph counts. 
props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL) +netdis_one_to_one( + graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = NULL +) ``` @@ -188,7 +227,8 @@ Comparing the networks via their observed ego counts without centring them, (equ ```{r,netdiszero} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction edge files stored +# in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects @@ -202,7 +242,9 @@ netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = 0) props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) -netdis_one_to_one(graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0) +netdis_one_to_one( + graphlet_counts_1 = props_1, graphlet_counts_2 = props_2, ref_graph = 0 +) ``` ------------------------- diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd index 6041a04d..67683313 100644 --- a/vignettes/default_pairwise_usage.Rmd +++ b/vignettes/default_pairwise_usage.Rmd @@ -37,10 +37,12 @@ The `netdist` package also includes examples of a few real networks. These are p Although the `virusppi` list of PPI networks is loaded along with the `netdist` package, the following code shows how to read a network data from a file in disk: ```{r, graphs,fig.align='center',fig.dim=c(6,6)} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction edge files stored +# in the netdist package. 
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -# Load query graphs as undirected igraph objects, with no loops, multiple edges or degree zero nodes. +# Load query graphs as undirected igraph objects, with no loops, multiple edges +# or degree zero nodes. graph_1 <- read_simple_graph( file = file.path(source_dir, "EBV.txt"), format = "ncol" @@ -51,12 +53,14 @@ graph_2 <- read_simple_graph( format = "ncol" ) -# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +# Herpes virus EBV protein-protein interaction graph with 60 nodes +# and 208 edges. graph_1 # Note this graph is the same as # virusppi$EBV -# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +# Herpes virus ECL protein-protein interaction graph with 1941 nodes +# and 3989 edges. graph_2 # Note this graph is the same as # virusppi$ECL @@ -95,42 +99,69 @@ and where $p_{t_i}(G)$ and $p_{t_i}(G')$ are the distributions of ${t_i}$ on $G$ ## Comparing two graphs with NetEmd ```{r, netemd,fig.align='center'} -# Set source directory for Virus protein-protein interaction network edge files stored in the netdist package. +# Set source directory for Virus protein-protein interaction network edge files +# stored in the netdist package. source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs as igraph objects -# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +# Herpes virus EBV protein-protein interaction graph with 60 nodes +# and 208 edges. graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol" ) -# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +# Herpes virus ECL protein-protein interaction graph with 1941 nodes +# and 3989 edges. graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol" ) # One to one NetEmd comparison. 
-netemd_one_to_one(graph_1 = graph_1, graph_2 = graph_2, feature_type = "orbit", smoothing_window_width = 1) # Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# Use of smoothing window 1 is given for discrete integer distributions. If the +# network features are considered continuous variables smoothing_window_width +# equal to zero is recommended. +netemd_one_to_one( + graph_1 = graph_1, + graph_2 = graph_2, + feature_type = "orbit", + smoothing_window_width = 1 +) ``` ## Comparing two graphs with NetEmd via their Laplacian spectrum ```{r, netemdEigen,fig.align='center'} # Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = FALSE, sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = FALSE, sparse = FALSE) +Lapg_1 <- igraph::laplacian_matrix( + graph = graph_1, normalized = FALSE, sparse = FALSE +) +Lapg_2 <- igraph::laplacian_matrix( + graph = graph_2, normalized = FALSE, sparse = FALSE +) # Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = graph_1, normalized = TRUE, sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = graph_2, normalized = TRUE, sparse = FALSE) +NLapg_1 <- igraph::laplacian_matrix( + graph = graph_1, normalized = TRUE, sparse = FALSE +) +NLapg_2 <- igraph::laplacian_matrix( + graph = graph_2, normalized = TRUE, sparse = FALSE +) # Spectra (this may take a couple of minutes). 
-props_1 <- cbind(L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values) -props_2 <- cbind(L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values) +props_1 <- cbind( + L.Spectra = eigen(Lapg_1)$values, NL.Spectra = eigen(NLapg_1)$values +) +props_2 <- cbind( + L.Spectra = eigen(Lapg_2)$values, NL.Spectra = eigen(NLapg_2)$values +) head(props_1, n = 3) head(props_2, n = 3) -netemd_one_to_one(dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0) # If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +# If the network features are considered continuous variables +# smoothing_window_width equal to zero is recommended. +netemd_one_to_one( + dhists_1 = props_1, dhists_2 = props_2, smoothing_window_width = 0 +) ``` ------------------------- @@ -207,7 +238,8 @@ where $l$ is the number of connected subgraphs of size $k$, for example, $l=6$ f This variant focuses on detecting more meso-level discrepancies between the ego-network structures. ```{r, netdisGP} -# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +# Netdis using the Geometric-Poisson approximation as a way to obtain background +# expectations. 
netdis_one_to_one(graph_1 = graph_1, graph_2 = graph_2, ref_graph = NULL) ``` diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index 1090a4ab..265f97b7 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -87,7 +87,9 @@ netdis_result <- netdis_many_to_many( # Netdis measure for subgraphs of size 3 res3 <- netdis_result$netdis["netdis3", ] -netdis3_mat <- cross_comp_to_matrix(measure = res3, cross_comparison_spec = netdis_result$comp_spec) +netdis3_mat <- cross_comp_to_matrix( + measure = res3, cross_comparison_spec = netdis_result$comp_spec +) print("Netdis: subgraph size = 3") print(netdis3_mat) @@ -134,11 +136,20 @@ plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"),