diff --git a/README.md b/README.md index 3171ca4e..952e19a9 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,7 @@ Alternatively, you can run `Rscript install.R` to install the packages. - `viridis`: For plotting of networks with nice colors - `jsonlite`: For parsing the issue data - `rTensor`: For calculating EDCPTD centrality +- `Matrix`: For sparse matrix representation of large adjacency matrices ### Submodule @@ -410,6 +411,10 @@ Additionally, for more examples, the file `showcase.R` is worth a look. * Functionality to add vertex attributes to existing networks - `util-networks-metrics.R` * A set of network-metric functions +- `util-networks-misc.R` + * Helper functions for network creation (e.g., create adjacency matrices) +- `util-tensor.R` + * Functionality to build fourth-order tensors - `util-core-peripheral.R` * Author classification (core and peripheral) and related functions - `util-motifs.R` diff --git a/showcase.R b/showcase.R index 27d67275..7ddd5097 100644 --- a/showcase.R +++ b/showcase.R @@ -118,13 +118,13 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) ## Calculate EDCPTD centrality --------------------------------------------- ## get author networks for each relation -author.networks = get.author.networks(x, c("cochange", "mail", "issue")) +author.networks = get.author.networks.for.multiple.relations(x, c("cochange", "mail", "issue")) -## create forth-order tensor -forth.order.tensor = ForthOrderTensor$new(author.networks) +## create fourth-order tensor +fourth.order.tensor = FourthOrderTensor$new(author.networks) ## calculate EDCPTD scores -edcptd.scores = calculate.EDCPTD.centrality(forth.order.tensor) +edcptd.scores = calculate.EDCPTD.centrality(fourth.order.tensor) ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Range-level data -------------------------------------------------------- diff --git a/util-init.R b/util-init.R index f2eb296e..f649eab8 100644 --- a/util-init.R +++ 
b/util-init.R @@ -60,4 +60,5 @@ source("util-plot.R") source("util-core-peripheral.R") source("util-networks-metrics.R") source("util-networks-covariates.R") +source("util-networks-misc.R") source("util-tensor.R") diff --git a/util-networks-misc.R b/util-networks-misc.R new file mode 100644 index 00000000..f8baf36a --- /dev/null +++ b/util-networks-misc.R @@ -0,0 +1,203 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2016 by Sofie Kemper +## Copyright 2016 by Claus Hunsen +## Copyright 2016-2018 by Thomas Bock +## Copyright 2017 by Angelika Schmid +## Copyright 2019 by Jakob Kronawitter +## Copyright 2019-2020 by Anselm Fehnker +## All Rights Reserved. + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Get active authors ----------------------------------------------------- + +#' Get all author names that are active in at least one of the networks. 
#'
#' The vertex names of every network are collected. Depending on \code{globally}, the names
#' are either merged into one duplicate-free, alphabetically sorted vector or kept separately
#' for each network.
#'
#' @param networks the list of networks
#' @param globally decides if all author names are in one list or in separate for each network [default: TRUE]
#'
#' @return if \code{globally} is TRUE, the sorted author names without duplicates;
#'         otherwise, a list holding the author names of each network
get.author.names.from.networks = function(networks, globally = TRUE) {

    ## collect the vertex names of each network separately
    authors.per.network = lapply(networks, function(network) igraph::V(network)$name)

    ## the per-network view needs no further processing
    if (!globally) {
        return(authors.per.network)
    }

    ## merge the per-network vectors into a single vector and drop the element names
    ## that 'unlist' derives from the list structure
    all.authors = unname(unlist(authors.per.network, recursive = FALSE))

    ## keep the first occurrence of each author and order alphabetically ascending
    all.authors = unique(all.authors)
    all.authors = all.authors[order(all.authors)]

    return(all.authors)
}

#' Get all author names that are active in at least one of the date ranges.
#'
#' For each data range, the artifacts are grouped by the column "author.name" and the names of
#' the resulting groups are taken as the active authors of this range. Depending on
#' \code{globally}, the names are either merged into one duplicate-free, alphabetically sorted
#' vector or kept separately for each range.
#'
#' @param dataRanges the list of the data ranges
#' @param isMailAnalysis bool if the data is a mail analysis: if TRUE, the "mails" of a range
#'                       are grouped; otherwise, the "commits"
#' @param globally decides if all author names are in one list or in separate for each range [default: TRUE]
#'
#' @return if \code{globally} is TRUE, the sorted author names without duplicates;
#'         otherwise, a list holding the author names of each range
get.author.names.from.data = function(dataRanges, isMailAnalysis, globally = TRUE) {

    ## for each range, get the authors who have at least one mail/commit in this range
    authors.per.range = lapply(dataRanges, function(range.data) {
        data.source = if (isMailAnalysis) "mails" else "commits"
        grouped.artifacts = range.data$group.artifacts.by.data.column(data.source, "author.name")
        return(names(grouped.artifacts))
    })

    ## the per-range view needs no further processing
    if (!globally) {
        return(authors.per.range)
    }

    ## merge the per-range vectors into a single vector and drop the element names
    ## that 'unlist' derives from the list structure
    all.authors = unname(unlist(authors.per.range, recursive = FALSE))

    ## keep the first occurrence of each author and order alphabetically ascending
    all.authors = unique(all.authors)
    all.authors = all.authors[order(all.authors)]

    return(all.authors)
}

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Adjacency matrices ----------------------------------------------------

#' Get a sparse adjacency matrix for a network.
#'
#' The resulting matrix has one row and one column per entry of \code{authors}. The entries of
#' the network's adjacency matrix are written to the positions identified by the vertex names.
#' In the unweighted case, all positive entries are set to 1.
#'
#' @param network the given network
#' @param authors all authors that are wanted in the adjacency matrix
#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE]
#'
#' @return the sparse adjacency matrix of the network
get.expanded.adjacency = function(network, authors, weighted = FALSE) {

    ## set up an all-zero sparse matrix that has one row and one column per author
    author.count = length(authors)
    expanded = Matrix::sparseMatrix(i = c(), j = c(), dims = c(author.count, author.count), giveCsparse = FALSE)
    expanded = as(expanded, "dgTMatrix")

    ## label rows and columns with the author names
    rownames(expanded) = authors
    colnames(expanded) = authors

    ## a network without vertices does not contribute any entry
    if (igraph::vcount(network) == 0) {
        return(expanded)
    }

    ## get the adjacency matrix of the network; the edge attribute "weight" is used only on request
    edge.attribute = if (weighted) "weight" else NULL
    network.adjacency = igraph::get.adjacency(network, attr = edge.attribute)

    ## order the adjacency matrix by the vertex names
    if (nrow(network.adjacency) > 1) { # for a 1x1 matrix ordering does not work
        row.order = order(rownames(network.adjacency))
        col.order = order(colnames(network.adjacency))
        network.adjacency = network.adjacency[row.order, col.order]
    }

    ## save the activity data per author
    ## NOTE(review): presumably every vertex name has to be contained in 'authors' for this
    ## name-based assignment -- confirm against the callers
    if (nrow(network.adjacency) > 0) {
        expanded[rownames(network.adjacency), colnames(network.adjacency)] = network.adjacency
    }

    ## without weights, only the existence of a connection is recorded
    if (!weighted) {
        expanded[expanded > 0] = 1
    }

    return(expanded)
}

#' Gets a list of networks, converts them to sparse adjacency matrices and sums up the adjacency matrices cumulatively.
#' This means that the first entry of the return list is just the adjacency matrix from the first network,
#' the second entry is the sum of the first and the second entry, and so on.
#'
#' All matrices are expanded to the authors of all given networks, so that they have identical
#' dimensions and can be summed up. In the unweighted case, the stored entries of each cumulated
#' matrix are reset to 1.
#'
#' @param networks list of networks
#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE]
#'
#' @return the list of cumulated adjacency matrices (an empty list if no networks are given)
get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) {

    ## nothing to cumulate for an empty list of networks
    if (length(networks) == 0) {
        return(list())
    }

    ## get the expanded adjacency matrix of every single network first; all of them are
    ## expanded to the same set of authors, as 'get.expanded.adjacency' handles one network only
    authors = get.author.names.from.networks(networks)
    matrices = lapply(networks, get.expanded.adjacency, authors, weighted)

    ## pair-wise sum of matrices: m.cumul(m) = m.cumul(m - 1) + matrices(m)
    ## (intermediate results consecutively stored in matrices.cumulated)
    matrices.cumulated = vector("list", length(matrices))
    matrices.cumulated[[1]] = matrices[[1]] # first one is complete already

    for (m in seq_along(matrices)[-1]) {

        cumulated = matrices.cumulated[[m - 1]] + matrices[[m]]
        rownames(cumulated) = rownames(matrices.cumulated[[m - 1]])
        colnames(cumulated) = colnames(matrices.cumulated[[m - 1]])

        if (!weighted) {
            ## search for a non-zero entry and set it to an arbitrary number (e.g., 42)
            ## to force that all non-zero entries are correctly set to 1 afterwards
            not.zero.idxs = which(cumulated >= 1, arr.ind = TRUE)
            if (nrow(not.zero.idxs) > 0) {
                first.not.zero.idx = not.zero.idxs[1, ]
                names(first.not.zero.idx) = c("row", "col")
                cumulated[first.not.zero.idx[["row"]], first.not.zero.idx[["col"]]] = 42
                cumulated@x = rep(1, length(cumulated@i))
            }
        }

        matrices.cumulated[[m]] = cumulated
    }

    return(matrices.cumulated)
}

#' Converts a list of adjacency matrices to an array.
#'
#' The result is a 3-dimensional array (SIENA data format) in which the i-th slice holds the
#' non-zero entries of the i-th adjacency matrix. The size of the slices and the row and
#' column names are taken from the first matrix of the list.
#'
#' @param adjacency.list the list of adjacency matrices
#'
#' @return the converted array (an array with the dimensions 0 x 0 x 0 if the list is empty)
convert.adjacency.matrix.list.to.array = function(adjacency.list){

    ## without any matrix, there is neither a size nor names to derive
    if (length(adjacency.list) == 0) {
        return(array(data = 0, dim = c(0, 0, 0)))
    }

    ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result
    first.adjacency = adjacency.list[[1]]
    size = nrow(first.adjacency)
    result = array(data = 0, dim = c(size, size, length(adjacency.list)))
    rownames(result) = rownames(first.adjacency)
    colnames(result) = colnames(first.adjacency)

    ## copy the activity values from the adjacency matrices in the list to the corresponding array slices
    for (i in seq_along(adjacency.list)) {
        adjacency = adjacency.list[[i]]
        activity.indices = which(adjacency != 0, arr.ind = TRUE)

        ## 'seq_len' yields no iteration at all for a matrix that contains zeros only
        for (j in seq_len(nrow(activity.indices))) {
            row.idx = as.vector(activity.indices[j, 1])
            col.idx = as.vector(activity.indices[j, 2])
            result[row.idx, col.idx, i] = adjacency[row.idx, col.idx]
        }
    }

    return(result)
}

## ---- the following lines belong to the file util-tensor.R ----

## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
##
## Copyright 2019-2020 by Anselm Fehnker
## All Rights Reserved.

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## FourthOrderTensor class ------------------------------------------------

#' The class \code{FourthOrderTensor} creates an (author x relation x author x relation)
#' tensor from a list of networks. The tensor as well as the dimensions and lists
#' of the authors and relations are stored.
#'
#' The entry [i, l, j, l] of the tensor holds the adjacency value between the i-th and the j-th
#' author in the network of the l-th relation. All other entries are zero.
FourthOrderTensor = R6::R6Class("FourthOrderTensor",

    ## * private ----------------------------------------------------------

    private = list(
        ## * * data -------------------------------------------------------

        dim = NULL, # (number of authors, number of relations, number of authors, number of relations)
        relations = NULL, # names of the given list of networks
        authors = NULL, # names of all authors that occur in at least one of the networks
        tensor = NULL, # the tensor built from the networks

        ## * * tensor creation --------------------------------------------

        #' Creates a fourth-order tensor from a list of networks using their
        #' adjacency matrices.
        #'
        #' @param networks the list of networks
        #' @param weighted bool if the tensor shall be weighted [default: FALSE]
        #'
        #' @return the created tensor
        build.tensor.from.networks = function(networks, weighted = FALSE) {

            ## get adjacency matrices from networks
            adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, private$authors, weighted)

            ## create an array with the size of the fourth-order tensor that only contains zeros
            tensor.data = array(0, dim = private$dim)

            ## transfer entries from adjacency matrices to array
            for (l in seq_along(adjacency.matrices)) {

                adjacency = as(adjacency.matrices[[l]], "dgTMatrix")

                ## 'seq_along' yields no iteration at all for a matrix without any entries
                for (entry in seq_along(adjacency@x)) {
                    ## Transfer the entries from the adjacency matrix to the tensor.
                    ## The slots 'i' and 'j' of the sparse matrix hold indexes starting with 0,
                    ## while the indexes of an array start with 1, so the indexes need to be shifted
                    tensor.data[adjacency@i[entry] + 1, l, adjacency@j[entry] + 1, l] = adjacency@x[entry]
                }
            }

            ## convert array to tensor
            tensor = rTensor::as.tensor(tensor.data)

            return(tensor)
        }
    ),

    ## * public -----------------------------------------------------------

    public = list(

        #' Constructor of the class. Constructs a new fourth-order tensor instance
        #' based on the given list of networks.
        #'
        #' @param networks the given list of networks; its names are used as relations
        #' @param weighted bool if the tensor shall be weighted [default: FALSE]
        initialize = function(networks, weighted = FALSE) {

            private$relations = names(networks)
            private$authors = get.author.names.from.networks(networks)
            private$dim = c(length(private$authors), length(private$relations), length(private$authors), length(private$relations))
            private$tensor = private$build.tensor.from.networks(networks, weighted)

        },

        #' Get the list of authors of the tensor.
        #'
        #' @return the list of authors
        get.authors = function() {
            return(private$authors)
        },

        #' Get the list of relations of the tensor.
        #'
        #' @return the list of relations
        get.relations = function() {
            return(private$relations)
        },

        #' Get the tensor data saved in the object.
        #'
        #' @return the tensor data
        get.tensor = function() {
            return(private$tensor)
        }
    )
)

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /

#' Get the author networks of a project for several relations.
#'
#' For each relation, the network configuration of the network builder is updated to this
#' relation (via \code{update.network.conf}) and the author network is retrieved. Relations
#' whose author network does not contain any vertex are logged and left out.
#'
#' @param network.builder the network builder to retrieve the author networks from
#' @param relations the relations of the wanted networks
#'
#' @return the list of networks
get.author.networks.for.multiple.relations = function(network.builder, relations) {

    networks = lapply(relations, function(rel) {

        ## retrieve network for relation
        network.builder$update.network.conf(updated.values = list(author.relation = rel))
        retrieved.network = network.builder$get.author.network()

        ## check if network is not empty
        if (igraph::vcount(retrieved.network) > 0) {
            logging::loginfo("Added %s data to list", rel)
            return(retrieved.network)
        } else {
            logging::logwarn("There is no %s data available for the current project", rel)
            return(NA)
        }
    })

    ## add names of the relations
    names(networks) = relations

    ## removes empty networks
    networks = networks[!is.na(networks)]

    return(networks)
}

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Calculate centrality ----------------------------------------------------

#' Calculate EDCPTD centrality for a given fourth-order tensor.
#' EDCPTD centrality based on: Chaos 27, 063108 (2017); https://doi.org/10.1063/1.4985185
#'
#' The tensor is decomposed with \code{rTensor::cp} using a single component. The score of an
#' author is then accumulated over all relations from the absolute products of the author
#' factors (first and third mode) and the relation factors (second and fourth mode).
#'
#' @param fourth.order.tensor the given tensor
#'
#' @return data frame with EDCPTD score for every author
calculate.EDCPTD.centrality = function(fourth.order.tensor) {

    ## create data frame for results (one row per author, all scores starting with 0)
    authors = fourth.order.tensor$get.authors()
    results = data.frame(names = authors, EDCPTD.score = numeric(length(authors)))

    ## decompose tensor. 'num_components = 1' needed for EDCPTD centrality.
    ## 'max_iter' and 'tol' chosen from default in documentation.
    decomposition = rTensor::cp(fourth.order.tensor$get.tensor(), num_components = 1, max_iter = 25, tol = 1e-05)

    ## factor matrices of the four modes of the tensor
    factors = decomposition[["U"]]

    ## calculate EDCPTD centrality
    ## NOTE(review): the division by 2 is applied to the accumulated score in every iteration,
    ## so the contributions of earlier relations are halved repeatedly -- confirm that this
    ## matches the formula of the referenced paper
    for (y in seq_along(fourth.order.tensor$get.relations())) {
        results[["EDCPTD.score"]] = (results[["EDCPTD.score"]]
                                     + abs(factors[[1]][, 1] * factors[[2]][, 1][y])
                                     + abs(factors[[3]][, 1] * factors[[4]][, 1][y])) / 2
    }

    return(results)
}