se-sic · bockthom · Nov 8, 2020 · Oct 20, 2019 · Aug 8, 2020 · Aug 8, 2020
diff --git a/README.md b/README.md
@@ -128,6 +128,8 @@ Alternatively, you can run `Rscript install.R` to install the packages.
 - `lubridate`: For convenient date conversion and parsing
 - `viridis`: For plotting of networks with nice colors
 - `jsonlite`: For parsing the issue data
+- `rTensor`: For calculating EDCPTD centrality
+- `Matrix`: For sparse matrix representation of large adjacency matrices
 
 ### Submodule
 
@@ -409,6 +411,10 @@ Additionally, for more examples, the file `showcase.R` is worth a look.
     * Functionality to add vertex attributes to existing networks
 - `util-networks-metrics.R`
     * A set of network-metric functions
+- `util-networks-misc.R`
+    * Helper functions for network creation (e.g., create adjacency matrices)
+- `util-tensor.R`
+    * Functionality to build fourth-order tensors
 - `util-core-peripheral.R`
     * Author classification (core and peripheral) and related functions
 - `util-motifs.R`

diff --git a/install.R b/install.R
@@ -37,7 +37,9 @@ packages = c(
     "markovchain",
     "lubridate",
     "viridis",
-    "jsonlite"
+    "jsonlite",
+    "rTensor",
+    "Matrix"
 )
 
 

diff --git a/showcase.R b/showcase.R
@@ -18,6 +18,7 @@
 ## Copyright 2017-2018 by Thomas Bock <[email protected]>
 ## Copyright 2018 by Jakob Kronawitter <[email protected]>
 ## Copyright 2019 by Klara Schlueter <[email protected]>
+## Copyright 2020 by Anselm Fehnker <[email protected]>
 ## All Rights Reserved.
 
 
@@ -122,6 +123,17 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf)
 # net = x$get.author.network()
 # save(net, file = sprintf("busybox_%s.network", x$get.network.conf.variable(var.name = "author.relation")))
 
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Calculate EDCPTD centrality ---------------------------------------------
+
+## get one author network per relation ("cochange", "mail", and "issue") using the network builder 'x'
+author.networks = get.author.networks.for.multiple.relations(x, c("cochange", "mail", "issue"))
+
+## create the fourth-order tensor from the list of author networks
+fourth.order.tensor = FourthOrderTensor$new(author.networks)
+
+## calculate the EDCPTD centrality scores on the basis of the fourth-order tensor
+edcptd.scores = calculate.EDCPTD.centrality(fourth.order.tensor)
 
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Range-level data --------------------------------------------------------

diff --git a/util-data.R b/util-data.R
@@ -18,7 +18,7 @@
 ## Copyright 2017 by Felix Prasse <[email protected]>
 ## Copyright 2017 by Ferdinand Frank <[email protected]>
 ## Copyright 2018-2019 by Jakob Kronawitter <[email protected]>
-## Copyright 2019 by Anselm Fehnker <[email protected].de>
+## Copyright 2019-2020 by Anselm Fehnker <anselm@muenster.de>
 ## All Rights Reserved.
 
 
@@ -1313,23 +1313,29 @@ ProjectData = R6::R6Class("ProjectData",
             return(mylist)
         },
 
-        #' Get the list of authors by only looking only at the specified data source.
+        #' Get the list of authors for the specified data sources.
         #'
         #' *Note*: The constant \code{DATASOURCE.TO.ARTIFACT.FUNCTION} denotes the mapping between
         #' data source and the method which is retrieving the data for each data source.
         #'
-        #' @param data.source the data source which can be either \code{"commits"}, \code{"mails"},
-        #'                    or \code{"issues"} [default: "commits"]
+        #' @param data.sources the data sources from which the authors should be retrieved,
+        #'                    can be either \code{"commits"}, \code{"mails"}, or \code{"issues"},
+        #'                    or any combination of them [default: c("commits", "mails", "issues")]
         #'
         #' @return a data.frame of unique author names (columns \code{name} and \code{author.email}),
         #'         extracted from the specified data source
-        get.authors.by.data.source = function(data.source = c("commits", "mails", "issues")) {
+        get.authors.by.data.source = function(data.sources = c("commits", "mails", "issues")) {
 
-            data.source = match.arg(data.source)
+            data.sources = match.arg.or.default(data.sources, several.ok = TRUE)
 
             ## retrieve author names from chosen data source
-            data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]]
-            data = self[[data.source.func]]()[c("author.name", "author.email")]
+            data = lapply(data.sources, function(data.source){
+                data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]]
+                data.source.authors = self[[data.source.func]]()[c("author.name", "author.email")]
+                return (data.source.authors)
+            })
+
+            data = plyr::rbind.fill(data)
 
             ## remove duplicates
             data = unique(data)

diff --git a/util-init.R b/util-init.R
@@ -17,6 +17,7 @@
 ## Copyright 2017 by Sofie Kemper <[email protected]>
 ## Copyright 2017 by Felix Prasse <[email protected]>
 ## Copyright 2019 by Klara Schlüter <[email protected]>
+## Copyright 2019-2020 by Anselm Fehnker <[email protected]>
 ## All Rights Reserved.
 
 
@@ -62,3 +63,5 @@ source("util-core-peripheral.R")
 source("util-networks-metrics.R")
 source("util-networks-covariates.R")
 source("util-plot-evaluation.R")
+source("util-networks-misc.R")
+source("util-tensor.R")
diff --git a/util-networks-misc.R b/util-networks-misc.R
@@ -0,0 +1,238 @@
+## This file is part of coronet, which is free software: you
+## can redistribute it and/or modify it under the terms of the GNU General
+## Public License as published by  the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+##
+## Copyright 2016 by Sofie Kemper <[email protected]>
+## Copyright 2016 by Claus Hunsen <[email protected]>
+## Copyright 2016-2018 by Thomas Bock <[email protected]>
+## Copyright 2017 by Angelika Schmid <[email protected]>
+## Copyright 2019 by Jakob Kronawitter <[email protected]>
+## Copyright 2019-2020 by Anselm Fehnker <[email protected]>
+## All Rights Reserved.
+
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Libraries ---------------------------------------------------------------
+
+requireNamespace("parallel") # for parallel computation
+requireNamespace("igraph") # networks
+requireNamespace("Matrix") # for sparse matrices
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Get active authors  -----------------------------------------------------
+
+#' Collect the names of all authors that occur as vertices in at least one of the given networks.
+#'
+#' @param networks the list of networks whose vertex names (i.e., author names) are collected
+#' @param globally if \code{TRUE}, the names of all networks are merged into a single duplicate-free and
+#'                 alphabetically sorted collection; otherwise, the names are returned separately for each
+#'                 network [default: TRUE]
+#'
+#' @return the author names, either merged for all networks or as a list with one entry per network
+get.author.names.from.networks = function(networks, globally = TRUE) {
+
+    ## extract the vertex names of every single network
+    authors.per.network = lapply(networks, function(net) igraph::V(net)$name)
+
+    if (globally) {
+        ## merge the per-network names and strip the names that 'unlist' may have attached to the entries
+        all.authors = unname(unlist(authors.per.network, recursive = FALSE))
+
+        ## keep only the first occurrence of each name and sort the result alphabetically ascending
+        return(sort(all.authors[!duplicated(all.authors)]))
+    }
+
+    ## per-network view requested: hand out the names as they are
+    return(authors.per.network)
+}
+
+#' Collect the names of all authors that are active in at least one of the data sources during the data ranges.
+#'
+#' @param data.ranges the list of the data ranges
+#' @param data.sources the data sources from which the author names should be retrieved,
+#'                    can be either \code{"commits"}, \code{"mails"}, or \code{"issues"},
+#'                    or any combination of them [default: c("commits", "mails", "issues")]
+#' @param globally if \code{TRUE}, the names of all data ranges are merged into a single duplicate-free and
+#'                 alphabetically sorted collection; otherwise, the names are returned separately for each
+#'                 data range [default: TRUE]
+#'
+#' @return the author names, either merged for all data ranges or as a list with one entry per data range
+get.author.names.from.data = function(data.ranges, data.sources = c("commits", "mails", "issues"), globally = TRUE) {
+
+    data.sources = match.arg.or.default(data.sources, several.ok = TRUE)
+
+    ## per range, determine the authors of the selected data sources and keep only their names
+    authors.per.range = lapply(data.ranges, function(range.data) {
+        range.authors = range.data$get.authors.by.data.source(data.sources)
+        return(range.authors$author.name)
+    })
+
+    if (globally) {
+        ## merge the per-range names and strip the names that 'unlist' may have attached to the entries
+        all.authors = unname(unlist(authors.per.range, recursive = FALSE))
+
+        ## keep only the first occurrence of each name and sort the result alphabetically ascending
+        return(sort(all.authors[!duplicated(all.authors)]))
+    }
+
+    ## per-range view requested: hand out the names as they are
+    return(authors.per.range)
+}
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Adjacency matrices ----------------------------------------------------
+
+#' Get a sparse expanded adjacency matrix for a network.
+#'
+#' The adjacency matrix is expanded as it may contain rows and columns for authors which are not part of the network
+#' but given in the \code{authors} parameter. However, this also means that authors present in the network
+#' but not given in the \code{authors} parameter are not contained in the expanded adjacency matrix.
+#'
+#' NOTE(review): the vertices of the network are not filtered against \code{authors} before their values are
+#' written into the matrix by name. A network containing a vertex that is missing in \code{authors} will presumably
+#' raise a subscript error instead of being dropped silently -- verify against the callers.
+#'
+#' @param network the given network
+#' @param authors all authors that are wanted in the adjacency matrix; they are used as row and column names
+#' @param weighted decides if the adjacency matrix shall be weighted, i.e., if the values of the edge
+#'                 attribute "weight" are used as matrix entries [default: FALSE]
+#'
+#' @return the sparse adjacency matrix of the network, having one row and one column for each entry of
+#'         \code{authors}
+get.expanded.adjacency = function(network, authors, weighted = FALSE) {
+
+    ## create an empty sparse matrix with one row and one column per author
+    ## NOTE(review): newer releases of the 'Matrix' package appear to deprecate the 'giveCsparse' argument
+    ## in favor of 'repr' -- confirm against the installed version
+    matrix = Matrix::sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE)
+    matrix = as(matrix, "dgTMatrix")
+
+    ## add row and column names
+    rownames(matrix) = authors
+    colnames(matrix) = authors
+
+    ## a network without any vertex does not contribute anything, the empty matrix is returned as is
+    if (igraph::vcount(network) > 0) {
+
+        if (weighted) {
+            ## get the weighted adjacency matrix for the current network
+            matrix.data = igraph::get.adjacency(network, attr = "weight")
+        } else {
+            ## get the unweighted adjacency matrix for the current network
+            matrix.data = igraph::get.adjacency(network)
+        }
+
+        ## order the rows and columns of the adjacency matrix by their names
+        if (nrow(matrix.data) > 1) { # for a 1x1 matrix ordering does not work
+            matrix.data = matrix.data[order(rownames(matrix.data)), order(colnames(matrix.data))]
+        }
+
+        ## save the activity data per author: the values are copied into the expanded matrix by
+        ## addressing the rows and columns via the author names
+        if (nrow(matrix.data) > 0) {
+            matrix[rownames(matrix.data), colnames(matrix.data)] = matrix.data
+        }
+
+        ## in the unweighted case, reduce every positive entry to 1
+        if (!weighted) {
+            matrix[matrix > 0] = 1
+        }
+
+    }
+
+    return(matrix)
+}
+
+#' Build one sparse adjacency matrix for each network in the list.
+#' All adjacency matrices are expanded in such a way that they use the same set
+#' of authors, which is derived from all networks in the list.
+#'
+#' @param networks list of networks
+#' @param weighted decides if the adjacency matrices shall be weighted [default: FALSE]
+#'
+#' @return the list of adjacency matrices, one for each network
+get.expanded.adjacency.matrices = function(networks, weighted = FALSE){
+
+    ## the authors of all networks together determine the rows and columns of every matrix
+    all.authors = get.author.names.from.networks(networks)
+
+    ## expand the networks one by one
+    expanded.matrices = parallel::mclapply(networks, function(network) {
+        return(get.expanded.adjacency(network, all.authors, weighted))
+    })
+
+    return(expanded.matrices)
+}
+
+#' Gets a list of networks, converts them to sparse adjacency matrices, and sums up the adjacency matrices cumulatively.
+#' This means that the first entry of the returned list is just the adjacency matrix from the first network,
+#' the second entry is the sum of the first and the second adjacency matrix, and so on.
+#'
+#' In the unweighted case, the values of all stored entries of each cumulated matrix are reset to 1 after summing.
+#'
+#' NOTE(review): the first matrix is accessed unconditionally, so an empty list of networks is presumably
+#' not supported -- verify against the callers.
+#'
+#' @param networks list of networks
+#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE]
+#'
+#' @return the list of cumulated adjacency matrices
+get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) {
+    ## get expanded adjacency matrices first
+    matrices = get.expanded.adjacency.matrices(networks, weighted)
+
+    ## pair-wise sum of matrices: m.cumul(n) = m.cumul(n - 1) + m(n)
+    ## (intermediate results consecutively stored in matrices.cumulated)
+    matrices.cumulated = list(matrices[[1]]) # first one is complete already
+
+    if (length(matrices) > 1) {
+        for (m in 2:(length(matrices))){
+
+            ## add the current matrix to the previous cumulated matrix and
+            ## re-apply the row and column names of the previous cumulated matrix to the sum
+            matrices.cumulated[[m]] = matrices.cumulated[[m - 1]] + matrices[[m]]
+            rownames(matrices.cumulated[[m]]) = rownames(matrices.cumulated[[m - 1]])
+            colnames(matrices.cumulated[[m]]) = colnames(matrices.cumulated[[m - 1]])
+
+            if (!weighted) {
+
+                ## search for a non-zero entry and set it to an arbitrary number (e.g., 42)
+                ## to force that all non-zero entries are correctly set to 1 afterwards
+                ## NOTE(review): the assignment presumably brings the internal slots of the sparse matrix into
+                ## a consistent state before the value slot is overwritten below -- TODO confirm
+                not.zero.idxs = which(matrices.cumulated[[m]] >= 1, arr.ind = TRUE)
+                if (nrow(not.zero.idxs) > 0) {
+                    first.not.zero.idx = not.zero.idxs[1, ]
+                    names(first.not.zero.idx) = c("row", "col")
+                    matrices.cumulated[[m]][first.not.zero.idx[["row"]], first.not.zero.idx[["col"]]] = 42
+                    ## overwrite the value slot with as many ones as there are stored row indices
+                    matrices.cumulated[[m]]@x = rep(1, length(matrices.cumulated[[m]]@i))
+                }
+            }
+        }
+    }
+
+    return(matrices.cumulated)
+}
+
+#' Converts a list of adjacency matrices to a three-dimensional array.
+#'
+#' The first two dimensions of the array are both taken from the number of rows of the first matrix in the list,
+#' i.e., the matrices are expected to be square and of equal size. The third dimension indexes the matrices in
+#' the order in which they are given in the list. The row and column names of the array are taken from the
+#' first matrix in the list.
+#'
+#' @param adjacency.list the list of adjacency matrices (needs to contain at least one matrix)
+#'
+#' @return the converted array, containing the value 0 wherever the corresponding matrix has no non-zero entry
+convert.adjacency.matrix.list.to.array = function(adjacency.list){
+
+    ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result
+    first.matrix = adjacency.list[[1]]
+    adjacency.array = array(data = 0, dim = c(nrow(first.matrix), nrow(first.matrix), length(adjacency.list)))
+    rownames(adjacency.array) = rownames(first.matrix)
+    colnames(adjacency.array) = colnames(first.matrix)
+
+    ## copy the activity values from the adjacency matrices in the list to the corresponding array slices
+    for (i in seq_along(adjacency.list)){
+        adjacency = adjacency.list[[i]]
+        activity.indices = which(adjacency != 0, arr.ind = TRUE)
+
+        ## use 'seq_len' here: for a matrix without any non-zero entry, '1:nrow(...)' would iterate
+        ## over 1 and 0 and, thus, access a non-existing row of the index matrix
+        for (j in seq_len(nrow(activity.indices))){
+            row.idx = as.vector(activity.indices[j, 1])
+            col.idx = as.vector(activity.indices[j, 2])
+            adjacency.array[row.idx, col.idx, i] = adjacency[row.idx, col.idx]
+        }
+    }
+
+    return(adjacency.array)
+}
diff --git a/util-networks.R b/util-networks.R
@@ -17,6 +17,7 @@
 ## Copyright 2017-2019 by Thomas Bock <[email protected]>
 ## Copyright 2018 by Barbara Eckl <[email protected]>
 ## Copyright 2018-2019 by Jakob Kronawitter <[email protected]>
+## Copyright 2020 by Anselm Fehnker <[email protected]>
 ## All Rights Reserved.
 
 
@@ -195,7 +196,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder",
             ## also corresponding author information. Re-add author vertices back to the network now by accessing the
             ## complete author list:
             ## 1) get all authors on commits
-            authors = private$proj.data$get.authors.by.data.source(data.source = "commits")
+            authors = private$proj.data$get.authors.by.data.source(data.sources = "commits")
             ## 2) only select author names
             authors = authors["author.name"]
             ## 3) rename single column to "name" to correct mapping to vertex attribute "name"