Fix@cran blas (#3)

* Fixed compilation error in R 4.2.0. See https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Fortran-character-strings and https://www.stats.ox.ac.uk/pub/bdr/BLAS/README.txt * Use roxygen2 and markdown for documentation * Changed maintainer * import head and tail from utils in order to suppress cran note * remove news.md from Rbuildignore * updated cran-comments and README * Fixed BLAS compilation error also on dist.cpp * Added a comment regarding rchk * update CRAN-SUBMISSION
tanaylab · Apr 14, 2022 · b5ce39b · b5ce39b
1 parent a402007
commit b5ce39b
Show file tree

Hide file tree

Showing 38 changed files with 1,157 additions and 568 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,4 +1,11 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
-README\.md
-NEWS\.md
+^README\.md$
+^README\.Rmd$
+^push_misha_manual$
+^_pkgdown\.yml$
+^\.git\.*
+^\.gitignore\.*
+^README_cache$
+^cran-comments\.md$
+^CRAN-SUBMISSION$
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,11 @@
-.Rproj.user
-.Rhistory
-.RData
-.Ruserdata
-src/*.o
-src/*.so
-src/*.dll
-README_cache/*
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+src/*.o
+src/*.so
+src/*.dll
+README_cache/*
+inst/doc
+README_cache/
+..Rcheck
diff --git a/.travis.yml b/.travis.yml
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
@@ -0,0 +1,3 @@
+Version: 2.3.17
+Date: 2022-04-14 11:57:00 UTC
+SHA: 0d3cde73826253c86ffa98007855f54c44b37278
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,25 +1,36 @@
-Package: tgstat
 Type: Package
+Package: tgstat
 Title: Amos Tanay's Group High Performance Statistical Utilities
-Version: 2.3.16
-Depends: R (>= 3.5.0)
-Imports: utils
-SystemRequirements: C++11
-OS_type: unix
-Date: 2020-09-02
-Author: Michael Hoichman
-Maintainer: Michael Hoichman <[email protected]>
-Description: A collection of high performance utilities to compute distance,
-    correlation, auto correlation, clustering and other tasks.
-    Contains graph clustering algorithm described in "MetaCell: analysis of
-    single-cell RNA-seq data using K-nn graph partitions" (Yael Baran,
-    Akhiad Bercovich, Arnau Sebe-Pedros, Yaniv Lubling, Amir Giladi,
-    Elad Chomsky, Zohar Meir, Michael Hoichman, Aviezer Lifshitz & Amos Tanay,
+Version: 2.3.17
+Date: 2022-04-13
+Authors@R: c(
+    person("Michael", "Hoichman", , "[email protected]", role = "aut"),
+    person("Aviezer", "Lifshitz", , "[email protected]", role = c("aut", "cre"))
+  )  
+Author: Michael Hoichman [aut], Aviezer Lifshitz [aut, cre]
+Maintainer: Aviezer Lifshitz <[email protected]>
+Description: A collection of high performance utilities to compute
+    distance, correlation, auto correlation, clustering and other tasks.
+    Contains graph clustering algorithm described in "MetaCell: analysis
+    of single-cell RNA-seq data using K-nn graph partitions" (Yael Baran,
+    Akhiad Bercovich, Arnau Sebe-Pedros, Yaniv Lubling, Amir Giladi, Elad
+    Chomsky, Zohar Meir, Michael Hoichman, Aviezer Lifshitz & Amos Tanay,
     2019 <doi:10.1186/s13059-019-1812-2>).
 License: GPL-2
+BugReports: https://github.com/tanaylab/tgstat/issues
+Depends: 
+    R (>= 3.5.0)
+Imports: 
+    utils
+Suggests: 
+    knitr,
+    rmarkdown
+VignetteBuilder: 
+    knitr
+Encoding: UTF-8
 LazyLoad: yes
-RoxygenNote: 6.1.1
 NeedsCompilation: yes
-Packaged: 2020-09-02 18:10:14 UTC; hoichman
-Authors@R: person("Misha", "Hoichman", email = "[email protected]", role = c("aut", "cre"))
-BugReports: https://github.com/tanaylab/tgstat/issues
+OS_type: unix
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.1.2
+SystemRequirements: C++11
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,3 +1,23 @@
-useDynLib(tgstat, .registration = TRUE)
-exportPattern("^[[:alpha:]]+")
+# Generated by roxygen2: do not edit by hand
 
+export(tgs_cor)
+export(tgs_cor_knn)
+export(tgs_dist)
+export(tgs_finite)
+export(tgs_graph)
+export(tgs_graph_cover)
+export(tgs_graph_cover_resample)
+export(tgs_knn)
+export(tgs_matrix_tapply)
+importFrom(utils,head)
+importFrom(utils,tail)
+useDynLib(tgstat,tgs_cor_blas)
+useDynLib(tgstat,tgs_cor_graph)
+useDynLib(tgstat,tgs_cross_cor)
+useDynLib(tgstat,tgs_cross_cor_blas)
+useDynLib(tgstat,tgs_cross_cor_knn)
+useDynLib(tgstat,tgs_dist_blas)
+useDynLib(tgstat,tgs_graph2cluster)
+useDynLib(tgstat,tgs_graph2cluster_multi_edges)
+useDynLib(tgstat,tgs_graph2cluster_multi_full)
+useDynLib(tgstat,tgs_graph2cluster_multi_hash)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# tgstat 2.3.17
+
+- Fix compilation issues with R 4.2.0
+- Use roxygen2 and markdown for documentation
+
 # tgstat 2.3.16
 
 - Fix compilation issues on Debian.

diff --git a/R/cor.R b/R/cor.R
@@ -0,0 +1,125 @@
+#' Calculates correlation or auto-correlation
+#'
+#' Calculates correlation between two matrices columns or auto-correlation
+#' between a matrix columns.
+#'
+#' 'tgs_cor' is very similar to 'stats::cor'. Unlike the latter it uses
+#' all available CPU cores to compute the correlation in a much faster way. The
+#' basic implementation of 'pairwise.complete.obs' is also more efficient
+#' giving overall great run-time advantage.
+#'
+#' Unlike 'stats::cor' 'tgs_cor' implements only two modes of treating
+#' data containing NA, which are equivalent to 'use="everything"' and
+#' 'use="pairwise.complete.obs"'. Please refer to the documentation of
+#' 'stats::cor' for more details.
+#'
+#' 'tgs_cor(x, y, spearman = FALSE)' is equivalent to 'cor(x, y, method =
+#' "pearson")'; 'tgs_cor(x, y, spearman = TRUE)' is equivalent to 'cor(x, y, method
+#' = "spearman")'; 'tgs_cor(x, y, pairwise.complete.obs = TRUE, spearman = TRUE)' is
+#' equivalent to 'cor(x, y, use = "pairwise.complete.obs", method =
+#' "spearman")'; and 'tgs_cor(x, y, pairwise.complete.obs = TRUE, spearman = FALSE)' is
+#' equivalent to 'cor(x, y, use = "pairwise.complete.obs", method = "pearson")'.
+#'
+#' 'tgs_cor' can output its result in "tidy" format: a data frame with three
+#' columns named 'col1', 'col2' and 'cor'. Only the correlation values whose
+#' absolute value is equal to or above the 'threshold' are reported. For auto-correlation
+#' (i.e. when 'y=NULL') a pair of columns numbered X and Y is reported only if
+#' X < Y.
+#'
+#' 'tgs_cor_knn' works similarly to 'tgs_cor'. Unlike the latter it returns
+#' only the highest 'knn' correlations for each column in 'x'. The result of
+#' 'tgs_cor_knn' is always outputted in "tidy" format.
+#'
+#' One of the reasons to opt for 'tgs_cor_knn' over a pair of calls to 'tgs_cor'
+#' and 'tgs_knn' is the reduced memory consumption of the former. In the
+#' auto-correlation case (i.e. 'y=NULL'), given that the number of columns NC
+#' exceeds the number of rows NR, 'tgs_cor' memory consumption becomes a
+#' factor of NCxNC. In contrast, 'tgs_cor_knn' would consume in a similar
+#' scenario a factor of max(NCxNR,NCxKNN). Similarly, 'tgs_cor(x,y)' would
+#' consume memory as a factor of NCXxNCY, whereas 'tgs_cor_knn(x,y,knn)' would
+#' reduce that to max((NCX+NCY)xNR,NCXxKNN).
+#'
+#' @aliases tgs_cor tgs_cor_knn
+#' @param x numeric matrix
+#' @param y numeric matrix
+#' @param pairwise.complete.obs see below
+#' @param spearman if 'TRUE' Spearman correlation is computed, otherwise
+#' Pearson
+#' @param tidy if 'TRUE' data is outputted in tidy format
+#' @param threshold absolute threshold above which values are outputted in tidy
+#' format
+#' @param knn the number of highest correlations returned per column
+#' @return 'tgs_cor_knn' or 'tgs_cor' with 'tidy=TRUE' return a data frame,
+#' where each row represents the correlation between a column of 'x' and a
+#' column of 'y' (or two columns of 'x' itself if 'y==NULL'). 'tgs_cor' with
+#' 'tidy=FALSE' returns a matrix of correlation values, where \code{val[X,Y]}
+#' represents the correlation between columns X and Y of the input matrices (if
+#' 'y' is not 'NULL') or the correlation between columns X and Y of 'x' (if 'y'
+#' is 'NULL').
+#' @keywords ~correlation
+#' @examples
+#' \donttest{
+#' # Note: all the available CPU cores might be used
+#'
+#' set.seed(seed = 0)
+#' rows <- 100
+#' cols <- 1000
+#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
+#' m <- matrix(vals, nrow = rows, ncol = cols)
+#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
+#'
+#' r1 <- tgs_cor(m, spearman = FALSE)
+#' r2 <- tgs_cor(m, pairwise.complete.obs = TRUE, spearman = TRUE)
+#' r3 <- tgs_cor_knn(m, NULL, 5, spearman = FALSE)
+#' }
+#'
+#' \dontshow{
+#' options(tgs_use.blas = FALSE)
+#' options(tgs_max.processes = 1)
+#'
+#' set.seed(seed = 0)
+#' rows <- 100
+#' cols <- 100
+#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
+#' m <- matrix(vals, nrow = rows, ncol = cols)
+#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
+#'
+#' r1 <- tgs_cor(m, spearman = FALSE)
+#' r2 <- tgs_cor(m, pairwise.complete.obs = TRUE, spearman = TRUE)
+#' r3 <- tgs_cor_knn(m, NULL, 5, spearman = FALSE)
+#' }
+#'
+#' @export tgs_cor
+tgs_cor <- function(x, y = NULL, pairwise.complete.obs = FALSE, spearman = FALSE, tidy = FALSE, threshold = 0) {
+    if (missing(x)) { # no input matrix supplied: abort with the usage string
+        stop("Usage: tgs_cor(x, y = NULL, pairwise.complete.obs = FALSE, spearman = FALSE, tidy = FALSE, threshold = 0)", call. = FALSE)
+    }
+
+    if (is.null(y)) { # 'y' is NULL: only 'x' is handed to the native routine (auto-correlation)
+        if (!.tgs_use_blas() || pairwise.complete.obs && spearman && !tgs_finite(x)) { # '&&' binds tighter than '||'
+            .Call("tgs_cor", x, pairwise.complete.obs, spearman, tidy, threshold, new.env(parent = parent.frame())) # non-BLAS routine
+        } else {
+            .Call("tgs_cor_blas", x, pairwise.complete.obs, spearman, tidy, threshold, new.env(parent = parent.frame())) # BLAS routine
+        }
+    } else { # 'y' supplied: both matrices are handed to the cross-correlation routines
+        if (!.tgs_use_blas() || pairwise.complete.obs && spearman && (!tgs_finite(x) || !tgs_finite(y))) { # same rule, checking 'x' and 'y'
+            .Call("tgs_cross_cor", x, y, pairwise.complete.obs, spearman, tidy, threshold, new.env(parent = parent.frame())) # non-BLAS routine
+        } else {
+            .Call("tgs_cross_cor_blas", x, y, pairwise.complete.obs, spearman, tidy, threshold, new.env(parent = parent.frame())) # BLAS routine
+        }
+    }
+} # the value of the selected .Call() is the function's return value
+
+#' @rdname tgs_cor
+#' @export
+tgs_cor_knn <- function(x, y, knn, pairwise.complete.obs = FALSE, spearman = FALSE, threshold = 0) {
+    if (missing(x) || missing(knn)) { # 'x' and 'knn' are mandatory: abort with the usage string
+        stop("Usage: tgs_cor_knn(x, y, knn, pairwise.complete.obs = FALSE, spearman = FALSE, threshold = 0)", call. = FALSE)
+    }
+
+    if (is.null(y)) { # NOTE(review): 'y' has no default and is not covered by the missing() guard; pass NULL explicitly for auto-correlation
+        .Call("tgs_cor_knn", x, knn, pairwise.complete.obs, spearman, threshold, new.env(parent = parent.frame())) # only 'x' is passed
+    } else {
+        .Call("tgs_cross_cor_knn", x, y, knn, pairwise.complete.obs, spearman, threshold, new.env(parent = parent.frame())) # 'x' and 'y' are passed
+    }
+} # the value of the selected .Call() is the function's return value
diff --git a/R/dist.R b/R/dist.R
@@ -0,0 +1,70 @@
+#' Calculates distances between the matrix rows
+#'
+#' Calculates distances between the matrix rows.
+#'
+#' This function is very similar to 'stats::dist'. Unlike the latter, it
+#' uses all available CPU cores to compute the distances in a much faster way.
+#'
+#' Unlike 'stats::dist', 'tgs_dist' always uses the "euclidean" metric (see the
+#' 'method' parameter of the 'dist' function). Thus:
+#'
+#' 'tgs_dist(x)' is equivalent to 'dist(x, method = "euclidean")'
+#'
+#' 'tgs_dist' can output its result in "tidy" format: a data frame with three
+#' columns named 'row1', 'row2' and 'dist'. Only the distances that are less than
+#' or equal to the 'threshold' are reported. Distance between row number X and Y
+#' is reported only if X < Y. 'diag' and 'upper' parameters are ignored when
+#' the result is returned in "tidy" format.
+#'
+#' @param x numeric matrix
+#' @param diag see 'dist' documentation
+#' @param upper see 'dist' documentation
+#' @param tidy if 'TRUE' data is outputted in tidy format
+#' @param threshold threshold below which values are outputted in tidy format
+#' @return If 'tidy' is 'FALSE' - the output is similar to that of 'dist'
+#' function. If 'tidy' is 'TRUE' - 'tgs_dist' returns a data frame, where each
+#' row represents the distance between a pair of original rows.
+#' @keywords ~distance
+#' @examples
+#' \donttest{
+#' # Note: all the available CPU cores might be used
+#'
+#' set.seed(seed = 0)
+#' rows <- 100
+#' cols <- 1000
+#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
+#' m <- matrix(vals, nrow = rows, ncol = cols)
+#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
+#' r <- tgs_dist(m)
+#' }
+#'
+#' \dontshow{
+#' options(tgs_use.blas = FALSE)
+#' options(tgs_max.processes = 1)
+#'
+#' set.seed(seed = 0)
+#' rows <- 100
+#' cols <- 100
+#' vals <- sample(1:(rows * cols / 2), rows * cols, replace = TRUE)
+#' m <- matrix(vals, nrow = rows, ncol = cols)
+#' m[sample(1:(rows * cols), rows * cols / 1000)] <- NA
+#' r <- tgs_dist(m)
+#' }
+#'
+#' @export tgs_dist
+tgs_dist <- function(x, diag = FALSE, upper = FALSE, tidy = FALSE, threshold = Inf) {
+    if (missing(x)) { # no input matrix supplied: abort with the usage string
+        stop("Usage: tgs_dist(x, diag = FALSE, upper = FALSE, tidy = FALSE, threshold = Inf)", call. = FALSE)
+    }
+
+    attrs <- list( # attribute set handed to the native routine; names mirror those of a "dist" object
+        Size = nrow(x), Labels = dimnames(x)[[1L]], Diag = diag,
+        Upper = upper, method = "euclidean", call = match.call(), class = "dist"
+    )
+
+    if (.tgs_use_blas()) { # pick the BLAS or the non-BLAS native routine; arguments are identical
+        .Call("tgs_dist_blas", x, attrs, tidy, threshold, dimnames(x)[[1L]], new.env(parent = parent.frame()))
+    } else {
+        .Call("tgs_dist", x, attrs, tidy, threshold, dimnames(x)[[1L]], new.env(parent = parent.frame()))
+    }
+} # the value of the selected .Call() is the function's return value