From 13f58a71e0e141c2ff94538918cccd145608c6f1 Mon Sep 17 00:00:00 2001 From: Klara Date: Tue, 2 Jul 2019 14:49:35 +0200 Subject: [PATCH 01/39] Count authors AND committers for artifact editor count The function add.vertex.attribute.artifact.editor.count now counts authors and committers as editors (before, only authors were considered). For example, if an artifact has been changed by one commit which was authored by Author1 and committed by Author2, the vertex attribute editor.count now equals 2 (instead of 1). Signed-off-by: Klara Schlueter --- tests/test-networks-covariates.R | 2 +- util-networks-covariates.R | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 44233926..2960f291 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -822,7 +822,7 @@ test_that("Test add.vertex.attribute.artifact.editor.count", { expected.attributes = list( range = network.covariates.test.build.expected( - c(1L), c(1L), c(3L, 1L)), + c(1L), c(2L), c(3L, 1L)), cumulative = network.covariates.test.build.expected( c(1L), c(2L), c(3L, 1L)), all.ranges = network.covariates.test.build.expected( diff --git a/util-networks-covariates.R b/util-networks-covariates.R index b6510817..3f05511e 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -679,11 +679,13 @@ add.vertex.attribute.artifact.editor.count = function(list.of.networks, project. 
nets.with.attr = split.and.add.vertex.attribute( list.of.networks, project.data, name, aggregation.level, default.value, function(range, range.data, net) { - lapply(range.data$group.authors.by.data.column("commits", "artifact"), + vertex.attributes = lapply(range.data$group.authors.by.data.column("commits", "artifact"), function(x) { - length(unique(x[["author.name"]])) + editor.count = length(unique(c(x[["author.name"]], x[["committer.name"]]))) + return(editor.count) } ) + return(vertex.attributes) } ) From ff1e147ba563b2d71f8228afd49492a315a5ad48 Mon Sep 17 00:00:00 2001 From: Klara Date: Thu, 11 Jul 2019 03:20:57 +0200 Subject: [PATCH 02/39] Add possibility of editor definition to editor.count vertex.attribute Add a parameter "editor.definition" to the function add.vertex.attribute.artifact.editor.count. The new parameter should be a subset of "author" and "committer" and decides who is counted as editor while computing the attribute values. Signed-off-by: Klara Schlueter --- util-networks-covariates.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/util-networks-covariates.R b/util-networks-covariates.R index 3f05511e..36aca3c9 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -666,6 +666,7 @@ add.vertex.attribute.author.role = function(list.of.networks, classification.res #' \code{"project.cumulative"}, \code{"project.all.ranges"}, and #' \code{"complete"}. See \code{split.data.by.networks} for #' more details. [default: "range"] +#' @param editor.definition Determines, who is counted as editor of an artifact. [default: c("author", "committer")] #' @param default.value The default value to add if a vertex has no matching value [default: 0] #' #' @return A list of networks with the added attribute @@ -673,15 +674,20 @@ add.vertex.attribute.artifact.editor.count = function(list.of.networks, project. 
aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete"), + editor.definition = c("author", "committer"), default.value = 0) { aggregation.level = match.arg.or.default(aggregation.level, default = "range") + ## match editor definitions to column name in commit dataframe + editor.definition = match.arg.or.default(editor.definition, several.ok = TRUE) + editor.definition = lapply(editor.definition, function(editor) {paste0(editor, ".name")}) + nets.with.attr = split.and.add.vertex.attribute( list.of.networks, project.data, name, aggregation.level, default.value, function(range, range.data, net) { vertex.attributes = lapply(range.data$group.authors.by.data.column("commits", "artifact"), - function(x) { - editor.count = length(unique(c(x[["author.name"]], x[["committer.name"]]))) + function(artifact.commits) { + editor.count = length(unique(unlist(lapply(editor.definition, function(editor.type) {artifact.commits[[editor.type]]})))) return(editor.count) } ) From 35dbc8ff55974752b9928cc643311c0d1c327468 Mon Sep 17 00:00:00 2001 From: Klara Date: Thu, 11 Jul 2019 03:36:51 +0200 Subject: [PATCH 03/39] Add cases for new parameter to test add.vertex.attribute.editor.count Signed-off-by: Klara Schlueter --- tests/test-networks-covariates.R | 50 ++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 2960f291..1deb9000 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -818,9 +818,35 @@ test_that("Test add.vertex.attribute.artifact.editor.count", { networks.and.data = get.network.covariates.test.networks("artifact") - expected.attributes = network.covariates.test.build.expected(list(1L), list(1L), list(3L, 1L)) - - expected.attributes = list( + expected.attributes.author = list( + range = network.covariates.test.build.expected( + c(1L), c(1L), c(3L, 1L)), + cumulative = 
network.covariates.test.build.expected( + c(1L), c(2L), c(3L, 1L)), + all.ranges = network.covariates.test.build.expected( + c(2L), c(2L), c(3L, 1L)), + project.cumulative = network.covariates.test.build.expected( + c(1L), c(2L), c(3L, 1L)), + project.all.ranges = network.covariates.test.build.expected( + c(2L), c(2L), c(3L, 1L)), + complete = network.covariates.test.build.expected( + c(2L), c(2L), c(3L, 1L)) + ) + expected.attributes.committer = list( + range = network.covariates.test.build.expected( + c(1L), c(1L), c(2L, 1L)), + cumulative = network.covariates.test.build.expected( + c(1L), c(1L), c(2L, 1L)), + all.ranges = network.covariates.test.build.expected( + c(1L), c(1L), c(2L, 1L)), + project.cumulative = network.covariates.test.build.expected( + c(1L), c(1L), c(2L, 1L)), + project.all.ranges = network.covariates.test.build.expected( + c(1L), c(1L), c(2L, 1L)), + complete = network.covariates.test.build.expected( + c(1L), c(1L), c(2L, 1L)) + ) + expected.attributes.both = list( range = network.covariates.test.build.expected( c(1L), c(2L), c(3L, 1L)), cumulative = network.covariates.test.build.expected( @@ -838,14 +864,26 @@ test_that("Test add.vertex.attribute.artifact.editor.count", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.artifact.editor.count( + networks.with.attr.author = add.vertex.attribute.artifact.editor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level, editor.definition = "author" + ) + networks.with.attr.committer = add.vertex.attribute.artifact.editor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level, editor.definition = "committer" + ) + networks.with.attr.both = add.vertex.attribute.artifact.editor.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) - actual.attributes = lapply(networks.with.attr, 
igraph::get.vertex.attribute, name = "editor.count") + actual.attributes.author = lapply(networks.with.attr.author, igraph::get.vertex.attribute, name = "editor.count") + actual.attributes.committer = lapply(networks.with.attr.committer, igraph::get.vertex.attribute, name = "editor.count") + actual.attributes.both = lapply(networks.with.attr.both, igraph::get.vertex.attribute, name = "editor.count") - expect_equal(expected.attributes[[level]], actual.attributes) + expect_equal(expected.attributes.author[[level]], actual.attributes.author) + expect_equal(expected.attributes.committer[[level]], actual.attributes.committer) + expect_equal(expected.attributes.both[[level]], actual.attributes.both) }) }) From 3ddd859ace1be4fa8292f5c2057489edea4b62b2 Mon Sep 17 00:00:00 2001 From: Klara Date: Thu, 11 Jul 2019 03:47:59 +0200 Subject: [PATCH 04/39] Update changelog Signed-off-by: Klara Schlueter Date: Fri, 12 Jul 2019 14:09:16 +0200 Subject: [PATCH 05/39] Apply reviews Signed-off-by: Klara Schlueter --- NEWS.md | 2 +- tests/test-networks-covariates.R | 4 ++-- util-networks-covariates.R | 13 +++++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9886b5ed..f23e9fb8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ ## Unversioned ### Added -- Add a parameter 'editor.definition' to the function 'add.vertex.attribute.artifact.editor.count' which can be used to define, if author or committer or both count as editors when computing the attribute values. (ff1e147ba563b2d71f8228afd49492a315a5ad48) +- Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. 
(#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) ## 3.5 diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 1deb9000..eb7d71e2 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -866,7 +866,7 @@ test_that("Test add.vertex.attribute.artifact.editor.count", { lapply(AGGREGATION.LEVELS, function(level) { networks.with.attr.author = add.vertex.attribute.artifact.editor.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], - aggregation.level = level, editor.definition = "author" + aggregation.level = level ) networks.with.attr.committer = add.vertex.attribute.artifact.editor.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], @@ -874,7 +874,7 @@ test_that("Test add.vertex.attribute.artifact.editor.count", { ) networks.with.attr.both = add.vertex.attribute.artifact.editor.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], - aggregation.level = level + aggregation.level = level, editor.definition = c("author", "committer") ) actual.attributes.author = lapply(networks.with.attr.author, igraph::get.vertex.attribute, name = "editor.count") diff --git a/util-networks-covariates.R b/util-networks-covariates.R index 36aca3c9..4c4a945a 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -666,7 +666,8 @@ add.vertex.attribute.author.role = function(list.of.networks, classification.res #' \code{"project.cumulative"}, \code{"project.all.ranges"}, and #' \code{"complete"}. See \code{split.data.by.networks} for #' more details. [default: "range"] -#' @param editor.definition Determines, who is counted as editor of an artifact. [default: c("author", "committer")] +#' @param editor.definition Determines who is counted as editor of an artifact (one or more of +#' \code{c("author", "committer")}). 
[default: "author"] #' @param default.value The default value to add if a vertex has no matching value [default: 0] #' #' @return A list of networks with the added attribute @@ -679,15 +680,19 @@ add.vertex.attribute.artifact.editor.count = function(list.of.networks, project. aggregation.level = match.arg.or.default(aggregation.level, default = "range") ## match editor definitions to column name in commit dataframe - editor.definition = match.arg.or.default(editor.definition, several.ok = TRUE) - editor.definition = lapply(editor.definition, function(editor) {paste0(editor, ".name")}) + if (missing(editor.definition)) { + editor.definition = "author" + } else { + editor.definition = match.arg.or.default(editor.definition, choices = c("author", "committer"), several.ok = TRUE) + } + editor.definition = paste0(editor.definition, ".name") nets.with.attr = split.and.add.vertex.attribute( list.of.networks, project.data, name, aggregation.level, default.value, function(range, range.data, net) { vertex.attributes = lapply(range.data$group.authors.by.data.column("commits", "artifact"), function(artifact.commits) { - editor.count = length(unique(unlist(lapply(editor.definition, function(editor.type) {artifact.commits[[editor.type]]})))) + editor.count = length(unique(unlist(artifact.commits[editor.definition]))) return(editor.count) } ) From 400400e26fd48d7cf3bbafb2ac8b96851d2d4e71 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Wed, 12 Jun 2019 17:25:45 +0200 Subject: [PATCH 06/39] Remove duplicated word in README.md Reported-by: Christian Hechtl Signed-off-by: Thomas Bock --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b540e6b..43cade69 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ While `proximity` triggers a file/function-based commit analysis in `Codeface`, When using this network library, the user only needs to give the `artifact` parameter to the [`ProjectConf`](#projectconf) constructor, 
which automatically ensures that the correct tagging is selected. The configuration files `{project-name}_{tagging}.conf` are mandatory and contain some basic configuration regarding a performed `Codeface` analysis (e.g., project name, name of the corresponding repository, name of the mailing list, etc.). -For further details on those files, please have a look at some [example files](https://github.com/siemens/codeface/tree/master/conf) files in the `Codeface` repository. +For further details on those files, please have a look at some [example files](https://github.com/siemens/codeface/tree/master/conf) in the `Codeface` repository. All the `*.list` files listed above are output files of `codeface-extraction` and contain meta data of, e.g., commits or e-mails to the mailing list, etc., in CSV format. This network library lazily loads and processes these files when needed. From 1608e28ca36610c58d2a5447d12ee2052c6eb976 Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Tue, 3 Sep 2019 16:23:46 +0200 Subject: [PATCH 07/39] Add ProjectConf parameter 'mails.filter.patchstack.mails' This commit adds the possibility to filter out patchstack mails from the mails of the ProjectData. The option can be toggled using the newly added configuration option 'mails.filter.patchstack.mails'. In a thread, a patchstack spans the first sequence of mails where each mail has been authored by the thread creator and has been sent within a short time window after the preceding mail. The mails spanned by a patchstack are called 'patchstack mails' and all patchstack mails but the first one are filtered when this option is active. The "short time window" mentioned above can be controlled using the newly added constant 'PATCHSTACK.MAIL.DECAY.THRESHOLD' in util-data.R. Several new functions have been introduced and existing getters and setters for PaStA, synchronicity, mails and commits have been refactored to integrate the new behaviour. 
Mainly, updating functionality that previously resided in the setters has been moved to separate updating functions to avoid setters that repeatedly have to call other setters for updating purposes. Also, the two filtering functions 'filter.patchstack.mails' and 'filter.pasta.data' have been added which filter out patchstack mails and PaStA data that corresponded to deleted patchstack mails, respectively. In the file test-data.R the configuration option 'pasta' was enabled for a test case since it was used within the test case. Due to internal caching behaviour of the ProjectData some of the asserting statements had to be moved. Signed-off-by: Claus Hunsen Signed-off-by: Jakob Kronawitter --- tests/test-data.R | 10 +- util-conf.R | 6 + util-data.R | 368 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 299 insertions(+), 85 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index ed7c7d8d..63e48ba2 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -34,6 +34,7 @@ test_that("Compare two ProjectData objects", { ##initialize a ProjectData object with the ProjectConf and clone it into another one proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("pasta", TRUE) proj.data.one = ProjectData$new(project.conf = proj.conf) proj.data.two = proj.data.one$clone() @@ -43,19 +44,20 @@ test_that("Compare two ProjectData objects", { ## second object, as well, and test for equality. 
##change the second data object - proj.data.one$get.commits() + + proj.data.two$get.pasta() expect_false(proj.data.one$equals(proj.data.two), "Two not identical ProjectData objects.") - proj.data.two$get.commits() + proj.data.one$get.pasta() expect_true(proj.data.one$equals(proj.data.two), "Two identical ProjectData objects.") - proj.data.two$get.pasta() + proj.data.one$get.commits() expect_false(proj.data.one$equals(proj.data.two), "Two not identical ProjectData objects.") - proj.data.one$get.pasta() + proj.data.two$get.commits() expect_true(proj.data.one$equals(proj.data.two), "Two identical ProjectData objects.") diff --git a/util-conf.R b/util-conf.R index 974ae65a..0aecfc43 100644 --- a/util-conf.R +++ b/util-conf.R @@ -355,6 +355,12 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, allowed = c(TRUE, FALSE), allowed.number = 1 ), + mails.filter.patchstack.mails = list( + default = FALSE, + type = "logical", + allowed = c(TRUE, FALSE), + allowed.number = 1 + ), synchronicity = list( default = FALSE, type = "logical", diff --git a/util-data.R b/util-data.R index 6492ebc7..76fc1db4 100644 --- a/util-data.R +++ b/util-data.R @@ -63,6 +63,9 @@ DATASOURCE.TO.ARTIFACT.COLUMN = list( "issues" = "issue.id" ) +## the maximum time difference between subsequent mails of a patchstack +PATCHSTACK.MAIL.DECAY.THRESHOLD = "30 seconds" + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## ProjectData ------------------------------------------------------------- @@ -101,6 +104,7 @@ ProjectData = R6::R6Class("ProjectData", commits = NULL, # data.frame ## mails mails = NULL, # data.frame + mails.patchstacks = NULL, # list ## issues issues = NULL, #data.frame ## authors @@ -113,37 +117,138 @@ ProjectData = R6::R6Class("ProjectData", ## timestamps of mail, issue and commit data data.timestamps = NULL, #data.frame - ## * * filtering commits ------------------------------------------- + ## * * commit filtering 
-------------------------------------------- - #' Filter commits retrieved by the method \code{get.commits} after potentially removing untracked files and the - #' base artifact (see parameters). + #' Filter commits by potentially removing untracked files and the base artifact (see parameters). #' + #' @param commits the data.frame of commits to be filtered #' @param remove.untracked.files flag whether untracked files are kept or removed #' @param remove.base.artifact flag whether the base artifact is kept or removed #' - #' @return the commits retrieved by the method \code{get.commits} after all filters have been applied - filter.commits = function(remove.untracked.files, remove.base.artifact) { + #' @return the commits after all filters have been applied + filter.commits = function(commits, remove.untracked.files, remove.base.artifact) { logging::logdebug("filter.commits: starting.") - ## get commit data - commit.data = self$get.commits() - ## filter out the untracked files if (remove.untracked.files) { - commit.data = subset(commit.data, file != UNTRACKED.FILE) + commits = subset(commits, file != UNTRACKED.FILE) } ## filter out the base artifacts (i.e., Base_Feature, File_Level) if (remove.base.artifact) { - commit.data = subset(commit.data, !(artifact %in% BASE.ARTIFACTS)) + commits = subset(commits, !(artifact %in% BASE.ARTIFACTS)) } logging::logdebug("filter.commits: finished.") - return(commit.data) + return(commits) + }, + + ## * * mail filtering ---------------------------------------------- + + #' Filters patchstack mails from the mails that are currently cached in the field \code{mails} and returns them. + #' Detected patchstacks are also stored in the field \code{patchstack.mails}. They are used later in the + #' function \code{filter.pasta.data} to also accommodate for the deleted mails in the PaStA data. 
+ #' + #' In a thread, a patchstack spans the first sequence of mails where each mail has been authored by the thread + #' creator and has been sent within a short time window (see \code{PATCHSTACK.MAIL.DECAY.THRESHOLD}) after the + #' preceding mail. + #' The mails spanned by a patchstack are called 'patchstack mails'. + #' + #' For each patchstack, all patchstack mails but the first one are filtered. + #' + #' @return the mail data after filtering patchstack mails + filter.patchstack.mails = function() { + logging::logdebug("filter.patchstack.mails: starting.") + + ## retrieve mails grouped by thread IDs + thread.data = self$group.authors.by.data.column("mails", "thread") + + result = parallel::mclapply(thread.data, function(thread) { + + ## ensure that all mails within the thread are ordered correctly + thread = thread[order(thread["date"]), ] + + running = TRUE + i = 1 + + ## find the largest index 'i' for which holds that each mail up to index 'i' has been authored by the + ## thread creator and that all mails up to index 'i' have been received within a successive time window + ## of 'PATCHSTACK.MAIL.DECAY.THRESHOLD' + while (i < nrow(thread) && running) { + if (thread[1, "author.name"] == thread[i + 1, "author.name"] && + thread[i + 1, "date"] - thread[i, "date"] <= + lubridate::as.duration(PATCHSTACK.MAIL.DECAY.THRESHOLD)) { + i = i + 1 + } else { + running = FALSE + } + } + + ## return the mails of the thread with all patchstack mails but the first one being removed + return (list(keep = thread[setdiff(seq_len(nrow(thread)), seq_len(i)[-1]), ], + patchstack = thread[seq_len(i), ])) + }) + + ## override thread data with filtered thread data + thread.data = lapply(result, function(x) x[["keep"]]) + + ## flatten the list of mail-dataframes (i.e. thread.data) to a single mail-dataframe + mails = plyr::rbind.fill(thread.data) + + ## Retrieve patchstacks from the result above which are used to manipulate the PaStA data. 
This needs to be + ## done because the PaSta data relates to some of the filtered mails and must be adjusted accordingly. + patchstacks = lapply(result, function(x) x[["patchstack"]]) + + ## only patchstacks that contain at least two mails are considered patchstacks + patchstacks = patchstacks[lapply(patchstacks, nrow) > 1] + + ## store patchstack information + private$mails.patchstacks = patchstacks + + logging::logdebug("filter.patchstack.mails: finished.") + return(mails) }, ## * * PaStA data -------------------------------------------------- + #' Uses the information about the deleted patchstack mails that are stored in the field \code{patchstack.mails} + #' to also filter out PaStA information that relates to the deleted mails. The PaStA information is not + #' discarded completely however but instead is gathered for each patchstack and is assigned to the first mail + #' in each patchstack because this very first mail has not been filtered and represents the patchstack. + #' + #' @return the filtered PaStA data + filter.pasta.data = function() { + logging::logdebug("filter.pasta.data: starting.") + + new.pasta = parallel::mclapply(private$mails.patchstacks, function(patchstack) { + + ## get all PaStA data that relates to the current mail (do not drop data.frame structure!) + pasta.tmp = private$pasta[private$pasta[["message.id"]] %in% patchstack[["message.id"]], , drop = FALSE] + + ## override all old message IDs with the message ID of the first mail in the patchstack since it + ## is the only one that is kept (if any data is available in 'pasta.tmp') + if (nrow(pasta.tmp) > 0) { + pasta.tmp["message.id"] = patchstack[1, "message.id"] + } + + return(pasta.tmp) + }) + ## combine new re-written PaStA data + new.pasta = plyr::rbind.fill(new.pasta) + + ## remove old items from PaStA data + ## 1) flatten the list of mail-dataframes (i.e. 
patchstacks) to a single mail-dataframe + patchstack.mails = plyr::rbind.fill(private$mails.patchstacks) + ## 2) delete any PaStA information that relate to message IDs of mails that will be discarded + pasta = private$pasta[!(private$pasta[["message.id"]] %in% patchstack.mails[["message.id"]]), ] + + ## append the new pasta to the old pasta + pasta = plyr::rbind.fill(pasta, new.pasta) + + logging::logdebug("filter.pasta.data: finished.") + return(pasta) + }, + #' Aggregate PaStA data for convenient merging to main data sources. #' #' In detail, the given PaStA data is independently aggregated by both the @@ -153,17 +258,14 @@ ProjectData = R6::R6Class("ProjectData", #' #' **Note**: The column \code{commit.hash} gets renamed to \code{hash} to match #' the corresponding column in the commit data (see \code{read.commits}). - #' - #' @param pasta.data a data.frame of PaStA data as retrieved from - #' \code{ProjectData$get.pasta.data} - aggregate.pasta.data = function(pasta.data) { + aggregate.pasta.data = function() { logging::logdebug("aggregate.pasta.data: starting.") ## check for data first - if (nrow(pasta.data) == 0) { + if (nrow(private$pasta) == 0) { ## take (empty) input data and no rows from it - private$pasta.mails = pasta.data[0, ] - private$pasta.commits = pasta.data[0, ] + private$pasta.mails = create.empty.pasta.list() + private$pasta.commits = create.empty.pasta.list() } else { ## compute aggregated data.frames for easier merging ## 1) define group function (determines result in aggregated data.frame cells) @@ -171,13 +273,13 @@ ProjectData = R6::R6Class("ProjectData", ## 2) aggregate by message ID group.col = "message.id" private$pasta.mails = aggregate( - as.formula(sprintf(". ~ %s", group.col)), pasta.data, + as.formula(sprintf(". ~ %s", group.col)), private$pasta, group.fun, na.action = na.pass ) ## 3) aggregate by commit hash group.col = "commit.hash" private$pasta.commits = aggregate( - as.formula(sprintf(". 
~ %s", group.col)), pasta.data, + as.formula(sprintf(". ~ %s", group.col)), private$pasta, group.fun, na.action = na.pass ) } @@ -189,6 +291,98 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("aggregate.pasta.data: finished.") }, + #' Updates the PaStA column that is appended to commits using the currently available PaStA data from the field + #' \code{pasta.commits}. + update.pasta.commit.data = function() { + logging::logdebug("update.pasta.commit.data: starting.") + + ## return immediately if no commits available (NOTE(review): the guard below tests 'private$mails'; presumably 'private$commits' was intended -- confirm) + if (!is.null(private$mails)) { + + ## remove previous PaStA data + private$commits["pasta"] = NULL + private$commits["revision.set.id"] = NULL + + ## merge PaStA data + private$commits = merge(private$commits, private$pasta.commits, + by = "hash", all.x = TRUE, sort = FALSE) + } + + logging::logdebug("update.pasta.commit.data: finished.") + }, + + #' Updates the PaStA column that is appended to mails using the currently available PaStA data from the field + #' \code{pasta.mails}. + update.pasta.mail.data = function() { + logging::logdebug("update.pasta.mail.data: starting.") + + ## return immediately if no mails available + if (!is.null(private$mails)) { + + ## remove previous PaStA data + private$mails["pasta"] = NULL + private$mails["revision.set.id"] = NULL + + ## merge PaStA data + private$mails = merge(private$mails, private$pasta.mails, + by = "message.id", all.x = TRUE, sort = FALSE) + } + + logging::logdebug("update.pasta.mail.data: finished.") + }, + + #' Recomputes the values of the cached fields \code{pasta.mails} and \code{pasta.commits} using the currently + #' available PaStA information of the field \code{pasta} and also assigns/updates this PaStA information to + #' \code{mails} and \code{commits}. + #' + #' This method should be called whenever the field \code{pasta} is changed. 
+ update.pasta.data = function() { + logging::logdebug("update.pasta.data: starting.") + + ## filter patchstack mails from PaStA data if configured + if (private$project.conf$get.value("mails.filter.patchstack.mails")) { + private$pasta = private$filter.pasta.data() + } + + ## aggregate by message IDs and commit hashes + private$aggregate.pasta.data() + + ## update mail data by attaching PaStA data + if (!is.null(private$mails)) { + private$update.pasta.mail.data() + } + + ## update commit data by attaching PaStA data + if (!is.null(private$commits)) { + private$update.pasta.commit.data() + } + + logging::logdebug("update.pasta.data: finished.") + }, + + ## * * synchronicity data ------------------------------------------ + + #' Updates the synchronicity column that is appended to commits using the currently available synchronicity data + #' from the field \code{synchronicity}. + #' + #' This method should be called whenever the field \code{synchronicity} is changed. + update.synchronicity.data = function() { + logging::logdebug("update.synchronicity.data: starting.") + + ## update commit data by attaching synchronicity data + if (!is.null(private$commits)) { + ## remove previous synchronicity data + private$commits["synchronicity"] = NULL + + ## merge synchronicity data + private$commits = merge(private$commits, private$synchronicity, + by = "hash", all.x = TRUE, sort = FALSE) + + } + + logging::logdebug("update.synchronicity.data: finished.") + }, + ## * * timestamps -------------------------------------------------- #' Call the getters of the specified data sources in order to @@ -388,6 +582,7 @@ ProjectData = R6::R6Class("ProjectData", get.commits.filtered = function() { if (is.null(private$commits.filtered)) { private$commits.filtered = private$filter.commits( + self$get.commits(), private$project.conf$get.value("commits.filter.untracked.files"), private$project.conf$get.value("commits.filter.base.artifact") ) @@ -408,7 +603,7 @@ ProjectData = 
R6::R6Class("ProjectData", #' #' @seealso get.commits.filtered get.commits.filtered.uncached = function(remove.untracked.files, remove.base.artifact) { - return (private$filter.commits(remove.untracked.files, remove.base.artifact)) + return (private$filter.commits(self$get.commits(), remove.untracked.files, remove.base.artifact)) }, #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. @@ -445,44 +640,38 @@ ProjectData = R6::R6Class("ProjectData", set.commits = function(commit.data) { logging::loginfo("Setting commit data.") - # TODO: Also check for correct shape (column names and data types) of the passed data - if (is.null(commit.data)) { commit.data = create.empty.commits.list() } - ## append synchronicity data if wanted + ## temporarily store commit data to enable attachment of PaStA stuff + private$commits = commit.data + + ## add synchronicity data if wanted if (private$project.conf$get.value("synchronicity")) { - logging::loginfo("Adding synchronicity data.") - synchronicity.data = self$get.synchronicity() - ## remove previous synchronicity data - if ("synchronicity" %in% colnames(commit.data)) { - commit.data["synchronicity"] = NULL + if (is.null(private$synchronicity)) { + ## get data (no assignment because we just want to trigger anything synchronicity-related) + self$get.synchronicity() + } else { + ## update all synchronicity-related data + private$update.synchronicity.data() } - commit.data = merge(commit.data, synchronicity.data, - by = "hash", all.x = TRUE, sort = FALSE) } ## add PaStA data if wanted if (private$project.conf$get.value("pasta")) { - logging::loginfo("Adding PaStA data.") - ## get data - self$get.pasta() # no assignment because we just want to trigger the read-in - ## remove previous PaStA data - if ("pasta" %in% colnames(commit.data)) { - commit.data["pasta"] = NULL - commit.data["revision.set.id"] = NULL + if (is.null(private$pasta)) { + ## get data (no assignment because we just want to 
trigger anything PaStA-related) + self$get.pasta() + } else { + ## update all PaStA-related data + private$update.pasta.data() } - ## merge PaStA data - commit.data = merge(commit.data, private$pasta.commits, - by = "hash", all.x = TRUE, sort = FALSE) } ## sort by date again (because 'merge' is doing bullshit!) commit.data = commit.data[order(commit.data[["date"]], decreasing = FALSE), ] # sort! - private$commits = commit.data - ## remove cached data for filtered commits as these need to be re-computed after ## changing the data private$commits.filtered = NULL @@ -500,16 +689,19 @@ ProjectData = R6::R6Class("ProjectData", if (private$project.conf$get.value("synchronicity")) { ## if data are not read already, read them if (is.null(private$synchronicity)) { - synchronicity.data = read.synchronicity( + private$synchronicity = read.synchronicity( self$get.data.path.synchronicity(), private$project.conf$get.value("artifact"), private$project.conf$get.value("synchronicity.time.window") ) - ## set actual data - self$set.synchronicity(synchronicity.data) + ## no read of commit data needed here! + + ## update all synchronicity-related data + private$update.synchronicity.data() } } else { + logging::logwarn("You have not set the ProjectConf parameter 'synchronicity' to 'TRUE'! Ignoring...") ## mark synchronicity data as empty self$set.synchronicity(NULL) } @@ -534,10 +726,11 @@ ProjectData = R6::R6Class("ProjectData", ## add synchronicity data to the commit data if configured if (private$project.conf$get.value("synchronicity")) { - logging::loginfo("Updating synchronicity data.") - if (!is.null(private$commits)) { - self$set.commits(private$commits) - } + + ## no read of commit data needed here! 
+ + ## update all synchronicity-related data + private$update.synchronicity.data() } }, @@ -553,12 +746,21 @@ ProjectData = R6::R6Class("ProjectData", if (private$project.conf$get.value("pasta")) { ## if data are not read already, read them if (is.null(private$pasta)) { - pasta.data = read.pasta(self$get.data.path.pasta()) - - ## set actual data - self$set.pasta(pasta.data) + ## read PaStA data from disk + private$pasta = read.pasta(self$get.data.path.pasta()) + + ## read mail data if filtering patchstack mails + if (is.null(private$mails) + && private$project.conf$get.value("mails.filter.patchstack.mails")) { + ## just triggering read-in, no storage + self$get.mails() + } else { + ## update all PaStA-related data + private$update.pasta.data() + } } } else { + logging::logwarn("You have not set the ProjectConf parameter 'pasta' to 'TRUE'! Ignoring...") ## mark PaStA data as empty self$set.pasta(NULL) } @@ -581,17 +783,19 @@ ProjectData = R6::R6Class("ProjectData", ## set the actual data private$pasta = data - ## aggregate by message IDs and commit hashes - private$aggregate.pasta.data(private$pasta) - ## add PaStA data to commit and mail data if configured if (private$project.conf$get.value("pasta")) { - logging::loginfo("Updating PaStA data.") - if (!is.null(private$commits)) { - self$set.commits(private$commits) - } - if (!is.null(private$mails)) { - self$set.mails(private$mails) + + ## read mail data if filtering patchstack mails + if (is.null(private$mails) && + private$project.conf$get.value("mails.filter.patchstack.mails")) { + ## just triggering read-in, no storage + self$get.mails() + + } else { + ## update all PaStA-related data + private$update.pasta.data() + } } }, @@ -609,7 +813,7 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(private$mails)) { mails.read = read.mails(self$get.data.path()) - self$set.mails(data = mails.read) + self$set.mails(mails.read) } private$extract.timestamps(source = "mails") @@ -619,33 +823,35 @@ ProjectData = 
R6::R6Class("ProjectData", #' Set the mail data to the given new data and add PaStA data #' if configured in the field \code{project.conf}. #' - #' @param data the new mail data - set.mails = function(data) { + #' @param mail.data the new mail data + set.mails = function(mail.data) { logging::loginfo("Setting e-mail data.") - if (is.null(data)) { - data = create.empty.mails.list() + if (is.null(mail.data)) { + mail.data = create.empty.mails.list() + } + + ## temporarily store mail data to enable attachment of PaStA stuff + private$mails = mail.data + + ## filter patchstack mails and store again + if (private$project.conf$get.value("mails.filter.patchstack.mails")) { + private$mails = private$filter.patchstack.mails() } ## add PaStA data if wanted if (private$project.conf$get.value("pasta")) { - logging::loginfo("Adding PaStA data.") - ## get data - self$get.pasta() # no assignment because we just want to trigger the read-in - ## remove previous PaStA data - if ("pasta" %in% colnames(data)) { - data["pasta"] = NULL - data["revision.set.id"] = NULL + if (is.null(private$pasta)) { + ## get data (no assignment because we just want to trigger anything PaStA-related) + self$get.pasta() + } else { + ## update all PaStA-related data + private$update.pasta.data() } - ## merge PaStA data - data = merge(data, private$pasta.mails, - by = "message.id", all.x = TRUE, sort = FALSE) } ## sort by date again (because 'merge' is doing bullshit!) - data = data[order(data[["date"]], decreasing = FALSE), ] # sort! - - private$mails = data + private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ] # sort! }, #' Get the author data. 
From a932c8cdaa6fe5149c798bc09d9e421ba679c48d Mon Sep 17 00:00:00 2001 From: Jakob Date: Sun, 22 Sep 2019 10:50:26 +0200 Subject: [PATCH 08/39] Add tests for the recently introduced patchstack mail filtering Signed-off-by: Jakob Kronawitter --- tests/test-data.R | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/test-data.R b/tests/test-data.R index 63e48ba2..67cc17ed 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -125,3 +125,52 @@ test_that("Compare two RangeData objects", { expect_false(proj.data.base$equals(range.data.four)) }) + +test_that("Filter patchstack mails", { + + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("mails.filter.patchstack.mails", TRUE) + + ## create the project data + proj.data = ProjectData$new(proj.conf) + + ## retrieve the mails while filtering patchstack mails + mails.filtered = proj.data$get.mails() + + ## create new project with filtering disabled + proj.conf$update.value("mails.filter.patchstack.mails", FALSE) + proj.data = ProjectData$new(proj.conf) + + ## retrieve the mails without filtering patchstack mails + mails.unfiltered = proj.data$get.mails() + + ## get message ids + mails.filtered.mids = mails.filtered[["message.id"]] + mails.unfiltered.mids = mails.unfiltered[["message.id"]] + + expect_equal(setdiff(mails.unfiltered.mids, mails.filtered.mids), c("", + "", + "", + "", + "")) +}) + +test_that("Filter patchstack mails with PaStA enabled", { + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("mails.filter.patchstack.mails", TRUE) + proj.conf$update.value("pasta", TRUE) + + proj.data = ProjectData$new(proj.conf) + + ## retrieve filtered PaStA data by calling 'get.pasta' which calls the filtering functionality internally + filtered.pasta = proj.data$get.pasta() + + ## ensure that PaStA data relating to Hans' mail 2 and 3 do not exist anymore since they 
have also been filtered + ## during patchstack mail filtering + expect_false("" %in% filtered.pasta[["message.id"]]) + expect_false("" %in% filtered.pasta[["message.id"]]) + + ## ensure that all three PaStA entries that existed previously do still exist but have been associated to the + ## very first mail of the patchstack + expect_equal(3, sum(filtered.pasta[["message.id"]] == "")) +}) From f014eeb182f07df106c603fc8470067020fe701c Mon Sep 17 00:00:00 2001 From: Jakob Date: Tue, 24 Sep 2019 11:50:47 +0200 Subject: [PATCH 09/39] Update copyright headers Signed-off-by: Jakob Kronawitter --- tests/test-data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test-data.R b/tests/test-data.R index 67cc17ed..78f27675 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -12,7 +12,8 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## ## Copyright 2018 by Christian Hechtl -## Copyright 2018 by Claus Hunsen +## Copyright 2018-2019 by Claus Hunsen +## Copyright 2019 by Jakob Kronawitter ## All Rights Reserved. From 9f53e8cc12edc032ad1d93231a91b73f57988c3a Mon Sep 17 00:00:00 2001 From: Jakob Date: Tue, 24 Sep 2019 11:55:29 +0200 Subject: [PATCH 10/39] Update changelog Signed-off-by: Jakob Kronawitter --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index f23e9fb8..df5f5f0b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) +- Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. 
(1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) ## 3.5 From 7949fa47ee5a67904f96cd88e83830e883bcd463 Mon Sep 17 00:00:00 2001 From: Jakob Date: Wed, 2 Oct 2019 15:37:58 +0200 Subject: [PATCH 11/39] Add 'mails.filter.patchstack.mails' description to README Signed-off-by: Jakob Kronawitter --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 43cade69..23456686 100644 --- a/README.md +++ b/README.md @@ -521,6 +521,10 @@ There is no way to update the entries, except for the revision-based parameters. - `commits.filter.untracked.files` * Remove all information concerning untracked files from the commit data. This effect becomes clear when retrieving commits using `get.commits.filtered`, because then the result of which does not contain any commits that solely changed untracked files. Networks built on top of this `ProjectData` do also not contain any information about untracked files. * [*`TRUE`*, `FALSE`] +- `mails.filter.patchstack.mails` + * Filter patchstack mails from the mail data. In a thread, a patchstack spans the first sequence of mails where each mail has been authored by the thread creator and has been sent within a short time window after the preceding mail. The mails spanned by a patchstack are called +'patchstack mails' and for each patchstack, every patchstack mail but the first one are filtered when `mails.filter.patchstack.mails = TRUE`. 
+ * [`TRUE`, *`FALSE`*] - `issues.only.comments` * Only use comments from the issue data on disk and no further events such as references and label changes * [*`TRUE`*, `FALSE`] From c4a8348b2a044a090b8026c74bc3d11e1734839a Mon Sep 17 00:00:00 2001 From: Jakob Date: Wed, 2 Oct 2019 15:51:25 +0200 Subject: [PATCH 12/39] Improve documentation Signed-off-by: Jakob Kronawitter --- util-data.R | 55 +++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/util-data.R b/util-data.R index 76fc1db4..a6f1d517 100644 --- a/util-data.R +++ b/util-data.R @@ -119,9 +119,9 @@ ProjectData = R6::R6Class("ProjectData", ## * * commit filtering -------------------------------------------- - #' Filter commits by potentially removing untracked files and the base artifact (see parameters). + #' Filter commits by potentially removing commits to untracked files or to the base artifact (see parameters). #' - #' @param commits the data.frame of commits to be filtered + #' @param commits the data.frame of commits on which filtering will be applied #' @param remove.untracked.files flag whether untracked files are kept or removed #' @param remove.base.artifact flag whether the base artifact is kept or removed #' @@ -145,8 +145,8 @@ ProjectData = R6::R6Class("ProjectData", ## * * mail filtering ---------------------------------------------- - #' Filters patchstack mails from the mails that are currently cached in the field \code{mails} and returns them. - #' Detected patchstacks are also stored in the field \code{patchstack.mails}. They are used later in the + #' Filter patchstack mails from the mails that are currently cached in the field \code{mails} and return them. + #' Store detected patchstacks in the field \code{patchstack.mails}. They are used later in the #' function \code{filter.pasta.data} to also accommodate for the deleted mails in the PaStA data. 
#' #' In a thread, a patchstack spans the first sequence of mails where each mail has been authored by the thread @@ -163,6 +163,7 @@ ProjectData = R6::R6Class("ProjectData", ## retrieve mails grouped by thread IDs thread.data = self$group.authors.by.data.column("mails", "thread") + ## extract the patchstack mails and the filtered mails for each thread result = parallel::mclapply(thread.data, function(thread) { ## ensure that all mails within the thread are ordered correctly @@ -196,7 +197,7 @@ ProjectData = R6::R6Class("ProjectData", mails = plyr::rbind.fill(thread.data) ## Retrieve patchstacks from the result above which are used to manipulate the PaStA data. This needs to be - ## done because the PaSta data relates to some of the filtered mails and must be adjusted accordingly. + ## done because the PaStA data relates to some of the filtered mails and must be adjusted accordingly. patchstacks = lapply(result, function(x) x[["patchstack"]]) ## only patchstacks that contain at least two mails are considered patchstacks @@ -211,10 +212,12 @@ ProjectData = R6::R6Class("ProjectData", ## * * PaStA data -------------------------------------------------- - #' Uses the information about the deleted patchstack mails that are stored in the field \code{patchstack.mails} - #' to also filter out PaStA information that relates to the deleted mails. The PaStA information is not - #' discarded completely however but instead is gathered for each patchstack and is assigned to the first mail - #' in each patchstack because this very first mail has not been filtered and represents the patchstack. + #' Use the information about the deleted patchstack mails that are stored in the field \code{patchstack.mails} + #' to also filter out PaStA information that relates to the deleted mails. 
+ #' + #' The PaStA information is not discarded completely however but instead is gathered for each patchstack and is + #' assigned to the first mail in each patchstack because this very first mail has not been filtered and + #' represents the patchstack. #' #' @return the filtered PaStA data filter.pasta.data = function() { @@ -222,7 +225,7 @@ ProjectData = R6::R6Class("ProjectData", new.pasta = parallel::mclapply(private$mails.patchstacks, function(patchstack) { - ## get all PaStA data that relates to the current mail (do not drop data.frame structure!) + ## get all PaStA data that relates to the current patchstack (do not drop data.frame structure!) pasta.tmp = private$pasta[private$pasta[["message.id"]] %in% patchstack[["message.id"]], , drop = FALSE] ## override all old message IDs with the message ID of the first mail in the patchstack since it @@ -242,7 +245,7 @@ ProjectData = R6::R6Class("ProjectData", ## 2) delete any PaStA information that relate to message IDs of mails that will be discarded pasta = private$pasta[!(private$pasta[["message.id"]] %in% patchstack.mails[["message.id"]]), ] - ## append the new pasta to the old pasta + ## append the new pasta data to the old pasta data pasta = plyr::rbind.fill(pasta, new.pasta) logging::logdebug("filter.pasta.data: finished.") @@ -291,8 +294,8 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("aggregate.pasta.data: finished.") }, - #' Updates the PaStA column that is appended to mails using the currently available PaStA data from the field - #' \code{pasta.commits}. + #' Update the PaStA-related columns \code{pasta} and \code{revision.set.id} that are appended to \code{commits} + #' using the currently available PaStA data from the field \code{pasta.commits}. 
update.pasta.commit.data = function() { logging::logdebug("update.pasta.commit.data: starting.") @@ -311,8 +314,8 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("update.pasta.commit.data: finished.") }, - #' Updates the PaStA column that is appended to mails using the currently available PaStA data from the field - #' \code{pasta.mails}. + #' Update the PaStA-related columns \code{pasta} and \code{revision.set.id} that are appended to \code{mails} + #' using the currently available PaStA data from the field \code{pasta.mails}. update.pasta.mail.data = function() { logging::logdebug("update.pasta.mail.data: starting.") @@ -331,8 +334,8 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("update.pasta.mail.data: finished.") }, - #' Recomputes the values of the cached fields \code{pasta.mails} and \code{pasta.commits} using the currrently - #' available PaStA information of the field \code{pasta} and also assigns/updates this PaStA information to + #' Recompute the values of the cached fields \code{pasta.mails} and \code{pasta.commits} using the currrently + #' available PaStA information of the field \code{pasta} and also assign/update this PaStA information to #' \code{mails} and \code{commits}. #' #' This method should be called whenever the field \code{pasta} is changed. @@ -362,8 +365,8 @@ ProjectData = R6::R6Class("ProjectData", ## * * synchronicity data ------------------------------------------ - #' Updates the synchronicity column that is appended to commits using the currently available synchronicity data - #' from the field \code{synchronicity}. + #' Update the column \code{synchronicity} that is appended to commits using the currently available + #' synchronicity data from the field \code{synchronicity}. #' #' This method should be called whenever the field \code{synchronicity} is changed. 
update.synchronicity.data = function() { @@ -644,7 +647,7 @@ ProjectData = R6::R6Class("ProjectData", commit.data = create.empty.commits.list() } - ## temporarily store commit data to enable attachment of PaStA stuff + ## store commit data private$commits = commit.data ## add synchronicity data if wanted @@ -677,9 +680,8 @@ ProjectData = R6::R6Class("ProjectData", private$commits.filtered = NULL }, - #' Get the synchronicity data. - #' If it does not already exist call the read method. - #' Call the setter function to set the data. + #' Get the synchronicity data. If it is not already stored in the ProjectData, this function triggers a read in + #' from disk. #' #' @return the synchronicity data get.synchronicity = function() { @@ -734,9 +736,8 @@ ProjectData = R6::R6Class("ProjectData", } }, - #' Get the PaStA data. - #' If it does not already exist call the read method. - #' Call the setter function to set the data. + #' Get the PaStA data. If it is not already stored in the ProjectData, this function triggers a read in + #' from disk. #' #' @return the PaStA data get.pasta = function() { @@ -831,7 +832,7 @@ ProjectData = R6::R6Class("ProjectData", mail.data = create.empty.mails.list() } - ## temporarily store mail data to enable attachment of PaStA stuff + ## store mail data private$mails = mail.data ## filter patchstack mails and store again From 32ee3f4ad6d12741a4f77b8a103187304e1a353d Mon Sep 17 00:00:00 2001 From: Jakob Date: Fri, 1 Nov 2019 17:46:10 +0100 Subject: [PATCH 13/39] Ensure correct ordering and handle PaStA duplicates Previously, it was possible that in certain use cases the ordering of mails and commits could end up mixed up because a sorting operation was lacking. This is added in this commit. For PaStA, it is possible that after filtering patchstack mails, duplicated PaStA rows can occur which are filtered from now on. 
Also, to make debugging easier in the future, PaStA data gets ordered after adding new PaStA data which results from the filtering of patchstack mails. Signed-off-by: Jakob Kronawitter --- tests/test-data.R | 20 ++++++++++++-------- util-data.R | 23 +++++++++++++++++++---- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index 78f27675..589759e8 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -166,12 +166,16 @@ test_that("Filter patchstack mails with PaStA enabled", { ## retrieve filtered PaStA data by calling 'get.pasta' which calls the filtering functionality internally filtered.pasta = proj.data$get.pasta() - ## ensure that PaStA data relating to Hans' mail 2 and 3 do not exist anymore since they have also been filtered - ## during patchstack mail filtering - expect_false("" %in% filtered.pasta[["message.id"]]) - expect_false("" %in% filtered.pasta[["message.id"]]) - - ## ensure that all three PaStA entries that existed previously do still exist but have been associated to the - ## very first mail of the patchstack - expect_equal(3, sum(filtered.pasta[["message.id"]] == "")) + ## ensure that the remaining mails have not been touched + expect_true("" %in% filtered.pasta[["message.id"]]) + expect_true("" %in% filtered.pasta[["message.id"]]) + expect_true("" %in% filtered.pasta[["message.id"]]) + expect_equal(2, sum(filtered.pasta[["message.id"]] == "")) + + ## ensure that out of three PaStA entries that existed previously, all of which pointing to the same commit hash, + ## one new PaStA entry has been created with has assigned the message ID of the first patchstack mail + expect_true("" %in% filtered.pasta[["message.id"]]) + + ## ensure that there are no other entries than the ones that have been verified to exist above + expect_equal(6, nrow(filtered.pasta)) }) diff --git a/util-data.R b/util-data.R index a6f1d517..e1e424df 100644 --- a/util-data.R +++ b/util-data.R @@ -239,6 +239,9 @@ ProjectData = 
R6::R6Class("ProjectData", ## combine new re-written PaStA data new.pasta = plyr::rbind.fill(new.pasta) + ## remove potential duplicates + new.pasta = unique(new.pasta) + ## remove old items from PaStA data ## 1) flatten the list of mail-dataframes (i.e. patchstacks) to a single mail-dataframe patchstack.mails = plyr::rbind.fill(private$mails.patchstacks) @@ -248,6 +251,9 @@ ProjectData = R6::R6Class("ProjectData", ## append the new pasta data to the old pasta data pasta = plyr::rbind.fill(pasta, new.pasta) + ## reestablish ordering using the 'revision.set.id' column of the PaStA data + pasta = pasta[order(pasta[["revision.set.id"]]), ] + logging::logdebug("filter.pasta.data: finished.") return(pasta) }, @@ -309,6 +315,9 @@ ProjectData = R6::R6Class("ProjectData", ## merge PaStA data private$commits = merge(private$commits, private$pasta.commits, by = "hash", all.x = TRUE, sort = FALSE) + + ## sort by date again because 'merge' disturbs the order + private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] } logging::logdebug("update.pasta.commit.data: finished.") @@ -329,6 +338,9 @@ ProjectData = R6::R6Class("ProjectData", ## merge PaStA data private$mails = merge(private$mails, private$pasta.mails, by = "message.id", all.x = TRUE, sort = FALSE) + + ## sort by date again because 'merge' disturbs the order + private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ] } logging::logdebug("update.pasta.mail.data: finished.") @@ -381,6 +393,9 @@ ProjectData = R6::R6Class("ProjectData", private$commits = merge(private$commits, private$synchronicity, by = "hash", all.x = TRUE, sort = FALSE) + ## sort by date again because 'merge' disturbs the order + private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] + } logging::logdebug("update.synchronicity.data: finished.") @@ -672,8 +687,8 @@ ProjectData = R6::R6Class("ProjectData", } } - ## sort by date again (because 'merge' is doing 
bullshit!) - commit.data = commit.data[order(commit.data[["date"]], decreasing = FALSE), ] # sort! + ## sort by date + private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] ## remove cached data for filtered commits as these need to be re-computed after ## changing the data @@ -851,8 +866,8 @@ ProjectData = R6::R6Class("ProjectData", } } - ## sort by date again (because 'merge' is doing bullshit!) - private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ] # sort! + ## sort by date + private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ] }, #' Get the author data. From 451284e36bdab0b810cd6d9b3cf936c32a418c82 Mon Sep 17 00:00:00 2001 From: Jakob Kronawitter Date: Sun, 10 Nov 2019 21:27:31 +0100 Subject: [PATCH 14/39] Update in-line comment Signed-off-by: Jakob Kronawitter --- tests/test-data.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index 589759e8..f996eefe 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -172,8 +172,8 @@ test_that("Filter patchstack mails with PaStA enabled", { expect_true("" %in% filtered.pasta[["message.id"]]) expect_equal(2, sum(filtered.pasta[["message.id"]] == "")) - ## ensure that out of three PaStA entries that existed previously, all of which pointing to the same commit hash, - ## one new PaStA entry has been created with has assigned the message ID of the first patchstack mail + ## ensure that the three PaStA entries relating to the filtered patchstack mails have been merged to a single new + ## PaStA entry which has assigned the message ID of the first patchstack mail expect_true("" %in% filtered.pasta[["message.id"]]) ## ensure that there are no other entries than the ones that have been verified to exist above From 8b2a52d38475a59c55feb17bb54ed12b9252a937 Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Wed, 17 Jul 2019 15:34:01 +0200 Subject: [PATCH 15/39] Add R 
version 3.6 to test suite This is related to issue #161. Signed-off-by: Claus Hunsen --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index b49c4b66..5e8724f4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,6 +20,7 @@ r: - 3.3 - 3.4 - 3.5 + - 3.6 # TravisCI container sudo: required From 33d63fd50c4b29d45a9ca586c383650f7d29efd5 Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Fri, 24 Jan 2020 12:26:28 +0100 Subject: [PATCH 16/39] Ensure sorting of commit-count and LOC-count data.frames Due to weird circumstances, tests are failing only on R 3.3, indicating string mismatches in all commit-count and LOC-count data.frames [1]. The reason for these failures is that the row ordering of the respective data.frames is not deterministic for authors exhibiting the same frequency value. To fix this problem, the resulting data.frames of all corresponding functions are ordered by the author and committer names, respectively. Props to @hechtlC for a valuable discussion. 
[1] https://travis-ci.com/se-passau/coronet/jobs/278309743 Signed-off-by: Claus Hunsen --- util-core-peripheral.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/util-core-peripheral.R b/util-core-peripheral.R index e57e162c..028a0856 100644 --- a/util-core-peripheral.R +++ b/util-core-peripheral.R @@ -14,7 +14,7 @@ ## Copyright 2017 by Mitchell Joblin ## Copyright 2017 by Ferdinand Frank ## Copyright 2017 by Sofie Kemper -## Copyright 2017-2019 by Claus Hunsen +## Copyright 2017-2020 by Claus Hunsen ## Copyright 2017 by Felix Prasse ## Copyright 2018-2019 by Christian Hechtl ## Copyright 2018 by Klara Schlüter @@ -637,7 +637,7 @@ get.committer.not.author.commit.count = function(range.data) { res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `commits.df` WHERE `committer.name` <> `author.name` GROUP BY `committer.name`, `author.name` - ORDER BY `freq` DESC") + ORDER BY `freq` DESC, `author.name` ASC") logging::logdebug("get.committer.not.author.commit.count: finished.") return(res) @@ -664,7 +664,7 @@ get.committer.and.author.commit.count = function(range.data) { res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `commits.df` WHERE `committer.name` = `author.name` GROUP BY `committer.name`, `author.name` - ORDER BY `freq` DESC") + ORDER BY `freq` DESC, `author.name` ASC") logging::logdebug("get.committer.and.author.commit.count: finished.") return(res) @@ -699,7 +699,7 @@ get.committer.or.author.commit.count = function(range.data) { res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `ungrouped` GROUP BY `name` - ORDER BY `freq` DESC") + ORDER BY `freq` DESC, `name` ASC") logging::logdebug("get.committer.or.author.commit.count: finished.") return(res) @@ -725,7 +725,7 @@ get.committer.commit.count = function(range.data) { ## Execute a query to get the commit count per author res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `commits.df` - GROUP BY `committer.name` ORDER BY `freq` DESC") + GROUP BY `committer.name` ORDER 
BY `freq` DESC, `committer.name` ASC") logging::logdebug("get.committer.commit.count: finished.") return(res) @@ -751,7 +751,7 @@ get.author.commit.count = function(proj.data) { ## Execute a query to get the commit count per author res = sqldf::sqldf("SELECT `author.name`, COUNT(*) AS `freq` FROM `commits.df` - GROUP BY `author.name` ORDER BY `freq` DESC") + GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC") logging::logdebug("get.author.commit.count: finished.") return(res) @@ -813,7 +813,7 @@ get.author.loc.count = function(proj.data) { ## Execute a query to get the changed lines per author res = sqldf::sqldf("SELECT `author.name`, SUM(`added.lines`) + SUM(`deleted.lines`) AS `loc` FROM `commits.df` - GROUP BY `author.name` ORDER BY `loc` DESC") + GROUP BY `author.name` ORDER BY `loc` DESC, `author.name` ASC") logging::logdebug("get.author.loc.count: finished.") return(res) From 41ce589b3b50fd581a10e6af33ac6b1bbea63bb8 Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Fri, 24 Jan 2020 15:53:14 +0100 Subject: [PATCH 17/39] Update .travis.yml The configuration file for Travis CI caused some warnings in the Travis backend (see also https://config.travis-ci.com/explore): > [warn] on root: deprecated key: "sudo" (The key `sudo` has no effect > anymore.) > [info] on root: missing os, using the default "linux" To fix these warnings, the configuration is heavily adapted: - Use Linux OS explicitly. - Use distribution 'xenial' (16.04) instead of 'trusty'. - Re-order and comment configuration parts. Signed-off-by: Claus Hunsen --- .travis.yml | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5e8724f4..2933e44c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,37 +11,32 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
## -## Copyright 2017-2018 by Claus Hunsen +## Copyright 2017-2018,2020 by Claus Hunsen ## All Rights Reserved. +# TravisCI container +os: linux +dist: xenial +warnings_are_errors: false +# R environment, dependencies and information language: r r: - 3.3 - 3.4 - 3.5 - 3.6 - -# TravisCI container -sudo: required -dist: trusty -warnings_are_errors: false - -# # Branches -# branches: -# only: -# - travis -# - claus-updates - -# R dependencies and information cache: packages repos: CRAN: https://cloud.r-project.org -# installation +# Installation install: + # package dependencies - sudo apt-get install libudunits2-dev + # package installation - Rscript install.R +# Tests script: - Rscript tests.R From 4220a799b27e58a90b32df3cd3d195c2a37b04aa Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Fri, 24 Jan 2020 21:01:02 +0100 Subject: [PATCH 18/39] Update changelog Signed-off-by: Claus Hunsen --- NEWS.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS.md b/NEWS.md index df5f5f0b..651e641c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,15 @@ - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) - Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. 
(1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) +### Changed/Improved + +- Add R version 3.6 to test suite (8b2a52d38475a59c55feb17bb54ed12b9252a937, #161) +- Update `.travis.yml` to improve compatibility with Travis CI (41ce589b3b50fd581a10e6af33ac6b1bbea63bb8) + +### Fixed + +- Ensure sorting of commit-count and LOC-count data.frames to fix tests with R 3.3 (33d63fd50c4b29d45a9ca586c383650f7d29efd5) + ## 3.5 From d4af515f859ce16ffaa0963d6d3d4086bcbb7377 Mon Sep 17 00:00:00 2001 From: Klara Date: Sun, 14 Jul 2019 21:34:31 +0200 Subject: [PATCH 19/39] Add function plot.commit.editor.types.by.author Produces a barplot showing for every editor the number of commits for which he is only author, only committer, and both author and committer for the given project data. Signed-off-by: Klara Schlueter --- showcase.R | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/showcase.R b/showcase.R index 16861cac..ce4797fd 100644 --- a/showcase.R +++ b/showcase.R @@ -17,6 +17,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter +## Copyright 2019 by Klara Schlüter ## All Rights Reserved. @@ -121,6 +122,37 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) y.data = RangeData$new(project.conf = proj.conf, range = ranges[[22]]) y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) +#' Produces a barplot showing for every editor the number of commits for which he is only author, only committer, and +#' both author and committer. +#' +#' @param data The project data. 
+plot.commit.editor.types.by.author = function(data) { + + ## get editor data + and = get.committer.and.author.commit.count(data) + or = get.committer.not.author.commit.count(data) + + ## build data frame as required for plotting + both = data.frame(and[["author.name"]], and[["freq"]]) + colnames(both) = c("editor", "author and committer") + + author = aggregate(or$freq, by = list(or$author.name), FUN = sum) + colnames(author) = c("editor", "only author") + + committer = aggregate(or$freq, by = list(or$committer.name), FUN = sum) + colnames(committer) = c("editor", "only committer") + + plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) + plot.data[is.na(plot.data)] = 0 + editors = plot.data[["editor"]] + plot.data = plot.data[2:4] + rownames(plot.data) = editors + + ## draw plot + barplot(t(plot.data), main = "Types of commit edits per author", ylab = "commit count", col = heat.colors(3), las = 2) + legend("topright", names(plot.data), fill = heat.colors(3), cex = 0.5) +} + ## * Data retrieval -------------------------------------------------------- # y.data$get.commits() From aa542a215f59bc3ed869cfefbc5a25fa050b1fc9 Mon Sep 17 00:00:00 2001 From: Klara Date: Tue, 30 Jul 2019 08:29:59 +0200 Subject: [PATCH 20/39] Add function plot.commit.edit.types.in.project Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer and author are different. Signed-off-by: Klara Schlueter --- showcase.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/showcase.R b/showcase.R index ce4797fd..3175f6fe 100644 --- a/showcase.R +++ b/showcase.R @@ -153,6 +153,26 @@ plot.commit.editor.types.by.author = function(data) { legend("topright", names(plot.data), fill = heat.colors(3), cex = 0.5) } +#' Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer +#' and author are different. 
+#' +#' @param data The project data. +plot.commit.edit.types.in.project = function(data) { + + ## get commit count + and = get.committer.and.author.commit.count(data) + or = get.committer.not.author.commit.count(data) + + ## build data frame as required for plotting + same = sum(and$freq) + different = sum(or$freq) + plot.data = data.frame(same, different) + + ## draw plot + barplot(t(plot.data), ylab = "commit count", col = heat.colors(3), ylim = c(0, same + different + (same + different)/4)) + legend("topright", c("author = committer", "author /= committer"), fill = heat.colors(3)) +} + ## * Data retrieval -------------------------------------------------------- # y.data$get.commits() From 33a1f040a118d7f8fe4f795c2a2c5112d130fd59 Mon Sep 17 00:00:00 2001 From: Klara Date: Tue, 30 Jul 2019 14:41:11 +0200 Subject: [PATCH 21/39] Draw editor type plot by author using ggplot2 Signed-off-by: Klara Schlueter --- showcase.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/showcase.R b/showcase.R index 3175f6fe..c1e1d22d 100644 --- a/showcase.R +++ b/showcase.R @@ -144,13 +144,17 @@ plot.commit.editor.types.by.author = function(data) { plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) plot.data[is.na(plot.data)] = 0 - editors = plot.data[["editor"]] - plot.data = plot.data[2:4] - rownames(plot.data) = editors + ## prepare data for a stacked barplot (prepare for stacking the editor types) + plot.data = reshape2::melt(plot.data) + names(plot.data) = c("editor", "editor type", "commit count") + ## draw plot - barplot(t(plot.data), main = "Types of commit edits per author", ylab = "commit count", col = heat.colors(3), las = 2) - legend("topright", names(plot.data), fill = heat.colors(3), cex = 0.5) + ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = editor, y = `commit count`, fill = `editor type`)) + + ## use data frame values instead of counting entries + ggplot2::geom_bar(stat = 'identity') + + ## rotate 
y-axis labels by 90 degree + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) } #' Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer From 305cc209717aed1252ffe14e3cfa03c1e807bea4 Mon Sep 17 00:00:00 2001 From: Klara Date: Tue, 30 Jul 2019 16:17:16 +0200 Subject: [PATCH 22/39] Draw edit type plot for project using ggplot2 Signed-off-by: Klara Schlueter --- showcase.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/showcase.R b/showcase.R index c1e1d22d..cfff6282 100644 --- a/showcase.R +++ b/showcase.R @@ -168,13 +168,13 @@ plot.commit.edit.types.in.project = function(data) { or = get.committer.not.author.commit.count(data) ## build data frame as required for plotting - same = sum(and$freq) - different = sum(or$freq) - plot.data = data.frame(same, different) + plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or$freq), sum(and$freq))) + colnames(plot.data) = c("edit types", "commit count") ## draw plot - barplot(t(plot.data), ylab = "commit count", col = heat.colors(3), ylim = c(0, same + different + (same + different)/4)) - legend("topright", c("author = committer", "author /= committer"), fill = heat.colors(3)) + ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + + ## use data frame values instead of counting entries + ggplot2::geom_bar(stat = 'identity') } ## * Data retrieval -------------------------------------------------------- From 0a0a5903e7c609dfe805a3471749eb2241efafe2 Mon Sep 17 00:00:00 2001 From: Klara Date: Thu, 1 Aug 2019 10:53:50 +0200 Subject: [PATCH 23/39] Move evaluation plotting functions to new file util-evaluation-plot.R Signed-off-by: Klara Schlueter --- README.md | 2 ++ showcase.R | 55 ----------------------------- util-evaluation-plot.R | 79 ++++++++++++++++++++++++++++++++++++++++++ util-init.R | 1 + 4 files changed, 82 insertions(+), 55 
deletions(-) create mode 100644 util-evaluation-plot.R diff --git a/README.md b/README.md index 23456686..42dfe3c2 100644 --- a/README.md +++ b/README.md @@ -415,6 +415,8 @@ Additionally, for more examples, the file `showcase.R` is worth a look. * Functionality for the identification of network motifs (subgraph patterns) - `util-plot.R` * Everything needed for plotting networks +- `util-evaluation-plot.R` + * Plotting functions for data evaluation - `util-misc.R` * Helper functions and also legacy functions, both needed in the other files - `showcase.R` diff --git a/showcase.R b/showcase.R index cfff6282..041a8b21 100644 --- a/showcase.R +++ b/showcase.R @@ -122,61 +122,6 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) y.data = RangeData$new(project.conf = proj.conf, range = ranges[[22]]) y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) -#' Produces a barplot showing for every editor the number of commits for which he is only author, only committer, and -#' both author and committer. -#' -#' @param data The project data. 
-plot.commit.editor.types.by.author = function(data) { - - ## get editor data - and = get.committer.and.author.commit.count(data) - or = get.committer.not.author.commit.count(data) - - ## build data frame as required for plotting - both = data.frame(and[["author.name"]], and[["freq"]]) - colnames(both) = c("editor", "author and committer") - - author = aggregate(or$freq, by = list(or$author.name), FUN = sum) - colnames(author) = c("editor", "only author") - - committer = aggregate(or$freq, by = list(or$committer.name), FUN = sum) - colnames(committer) = c("editor", "only committer") - - plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) - plot.data[is.na(plot.data)] = 0 - ## prepare data for a stacked barplot (prepare for stacking the editor types) - plot.data = reshape2::melt(plot.data) - names(plot.data) = c("editor", "editor type", "commit count") - - - ## draw plot - ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = editor, y = `commit count`, fill = `editor type`)) + - ## use data frame values instead of counting entries - ggplot2::geom_bar(stat = 'identity') + - ## rotate y-axis labels by 90 degree - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) -} - -#' Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer -#' and author are different. -#' -#' @param data The project data. 
-plot.commit.edit.types.in.project = function(data) { - - ## get commit count - and = get.committer.and.author.commit.count(data) - or = get.committer.not.author.commit.count(data) - - ## build data frame as required for plotting - plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or$freq), sum(and$freq))) - colnames(plot.data) = c("edit types", "commit count") - - ## draw plot - ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + - ## use data frame values instead of counting entries - ggplot2::geom_bar(stat = 'identity') -} - ## * Data retrieval -------------------------------------------------------- # y.data$get.commits() diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R new file mode 100644 index 00000000..d233c9ae --- /dev/null +++ b/util-evaluation-plot.R @@ -0,0 +1,79 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2019 by Klara Schlüter +## All Rights Reserved. 
+ +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Libraries --------------------------------------------------------------- + +requireNamespace("ggplot2") ## plotting + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Plot functions ---------------------------------------------------------- + +#' Produces a barplot showing for every editor the number of commits for which he is only author, only committer, and +#' both author and committer. +#' +#' @param data The project data. +plot.commit.editor.types.by.author = function(data) { + + ## get editor data + and = get.committer.and.author.commit.count(data) + or = get.committer.not.author.commit.count(data) + + ## build data frame as required for plotting + both = data.frame(and[["author.name"]], and[["freq"]]) + colnames(both) = c("editor", "author and committer") + + author = aggregate(or$freq, by = list(or$author.name), FUN = sum) + colnames(author) = c("editor", "only author") + + committer = aggregate(or$freq, by = list(or$committer.name), FUN = sum) + colnames(committer) = c("editor", "only committer") + + plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) + plot.data[is.na(plot.data)] = 0 + ## prepare data for a stacked barplot (prepare for stacking the editor types) + plot.data = reshape2::melt(plot.data) + names(plot.data) = c("editor", "editor type", "commit count") + + + ## draw plot + ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = editor, y = `commit count`, fill = `editor type`)) + + ## use data frame values instead of counting entries + ggplot2::geom_bar(stat = 'identity') + + ## rotate y-axis labels by 90 degree + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +} + +#' Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer +#' and author are different. +#' +#' @param data The project data. 
+plot.commit.edit.types.in.project = function(data) { + + ## get commit count + and = get.committer.and.author.commit.count(data) + or = get.committer.not.author.commit.count(data) + + ## build data frame as required for plotting + plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or$freq), sum(and$freq))) + colnames(plot.data) = c("edit types", "commit count") + + ## draw plot + ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + + ## use data frame values instead of counting entries + ggplot2::geom_bar(stat = 'identity') +} diff --git a/util-init.R b/util-init.R index e307c788..edd8e1c3 100644 --- a/util-init.R +++ b/util-init.R @@ -60,3 +60,4 @@ source("util-plot.R") source("util-core-peripheral.R") source("util-networks-metrics.R") source("util-networks-covariates.R") +source("util-evaluation-plot.R") From a4c22ab0cc7df348d0542e5cb6adcdb36ece0cfa Mon Sep 17 00:00:00 2001 From: Klara Date: Thu, 1 Aug 2019 13:32:49 +0200 Subject: [PATCH 24/39] Add possibility of scaling editor type plot to 100% A parameter plot.percentage is added to the function plot.commit.editor.types.by.author. If true, the barplot shows the relative number of differently edited commits per author: each bar in the barplot (representing the commits of one editor) is scaled to 100%. Otherwise, the absolute number of commits per author is shown in the plot. Signed-off-by: Klara Schlueter --- util-evaluation-plot.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index d233c9ae..48d2bbed 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -27,7 +27,10 @@ requireNamespace("ggplot2") ## plotting #' both author and committer. #' #' @param data The project data.
-plot.commit.editor.types.by.author = function(data) { +#' @param plot.percentage If true, the barplot shows the relative number of differently edited commits per author: each +#' bar in the barplot (representing the commits of one editor) is scaled to 100%. Otherwise, the +#' absolute number of commits per author is shown in the plot. +plot.commit.editor.types.by.author = function(data, plot.percentage = FALSE) { ## get editor data and = get.committer.and.author.commit.count(data) @@ -45,6 +48,12 @@ plot.commit.editor.types.by.author = function(data) { plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) plot.data[is.na(plot.data)] = 0 + + ## if desired, calculate percentage of editor types per author + if(plot.percentage) { + plot.data = cbind(plot.data[1], t(apply(plot.data[2:4], 1, function(x) {x/sum(x)}))) + } + ## prepare data for a stacked barplot (prepare for stacking the editor types) plot.data = reshape2::melt(plot.data) names(plot.data) = c("editor", "editor type", "commit count") From cc4caaece84c2434227537cffe63ea62d2a61cc0 Mon Sep 17 00:00:00 2001 From: Klara Date: Thu, 1 Aug 2019 16:48:02 +0200 Subject: [PATCH 25/39] Order bars in plot of plot.commit.editor.types.by.author The bars in the plot are ordered as follows (criteria are listed with descending priority): the editor who committed the most commits that were authored by someone else, is represented by the rightmost bar. The editor who committed the most commits that were also authored by him is represented by the rightmost bar. The remaining editors (who are only authors) are sorted by the number of commits.
Signed-off-by: Klara Schlueter --- util-evaluation-plot.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 48d2bbed..a7ef113b 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -54,13 +54,15 @@ plot.commit.editor.types.by.author = function(data, plot.percentage = FALSE) { plot.data = cbind(plot.data[1], t(apply(plot.data[2:4], 1, function(x) {x/sum(x)}))) } + ## compute order of bars from data: only author < author and committer < only committer + ordered.editors = plot.data$editor[with(plot.data, order(`only committer`, `author and committer`, `only author`))] + ## prepare data for a stacked barplot (prepare for stacking the editor types) plot.data = reshape2::melt(plot.data) names(plot.data) = c("editor", "editor type", "commit count") - ## draw plot - ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = editor, y = `commit count`, fill = `editor type`)) + + ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = factor(editor, levels = ordered.editors), y = `commit count`, fill = `editor type`)) + ## use data frame values instead of counting entries ggplot2::geom_bar(stat = 'identity') + ## rotate y-axis labels by 90 degree From 442e2d7eda77c8343bcd28eae5da36082025ca79 Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 16 Sep 2019 13:35:12 +0200 Subject: [PATCH 26/39] Add possibility of scaling edit type plot to 100% A parameter relative.y.scale is added to the function plot.commit.edit.types.in.project. If true, the y axis shows the percentage of the number of commits of the edit type with respect to all commits. If false, y axis shows the absolute number of commits.
Signed-off-by: Klara Schlueter --- util-evaluation-plot.R | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index a7ef113b..505234d3 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -27,10 +27,10 @@ requireNamespace("ggplot2") ## plotting #' both author and committer. #' #' @param data The project data. -#' @param plot.percentage If true, the barplot shows the relative number of differently edited commits per author: each -#' bar in the barplot (representing the commits of one editor) is scaled to 100%. Otherwise, the -#' absolute number of commits per author is shown in the plot. -plot.commit.editor.types.by.author = function(data, plot.percentage = FALSE) { +#' @param percentage.per.author If true, the barplot shows the relative number of differently edited commits per author: each +#' bar in the barplot (representing the commits of one editor) is scaled to 100%. Otherwise, the +#' absolute number of commits per author is shown in the plot. [default: FALSE] +plot.commit.editor.types.by.author = function(data, percentage.per.author = FALSE) { ## get editor data and = get.committer.and.author.commit.count(data) @@ -73,7 +73,9 @@ plot.commit.editor.types.by.author = function(data, plot.percentage = FALSE) { #' and author are different. #' #' @param data The project data. -plot.commit.edit.types.in.project = function(data) { +#' @param relative.y.scale If true, the y axis shows the percentage of the number of commits of the special edit type with +#' respect to all commits. If false, the y axis shows the absolut number of commits. 
+plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { ## get commit count and = get.committer.and.author.commit.count(data) @@ -83,6 +85,11 @@ plot.commit.edit.types.in.project = function(data) { plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or$freq), sum(and$freq))) colnames(plot.data) = c("edit types", "commit count") + ## if desired, calculate values for y axis labes showing percentage of all commits + if(relative.y.scale) { + plot.data = cbind(plot.data[1], plot.data[2]/sum(plot.data[2])) + } + ## draw plot ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + ## use data frame values instead of counting entries From 6719d42d2b9ce6a48738f512ef4245e08b3532cf Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 16 Sep 2019 14:10:31 +0200 Subject: [PATCH 27/39] Update changelog Signed-off-by: Klara Schlueter [CH: Resolved conflict during rebase.] --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 651e641c..90acfaa6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) - Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. 
(1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) +- Add a new file util-evaluation-plot.R containing functions to plot commit edit types per author and project (PR #171) ### Changed/Improved From bcefcfff137f5e4bacaca6eb429586e8e8c66de3 Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 16 Sep 2019 14:22:42 +0200 Subject: [PATCH 28/39] Fix parameter name issue in util-evaluation-plot.R Signed-off-by: Klara Schlueter --- util-evaluation-plot.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 505234d3..15478c7e 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -50,7 +50,7 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS plot.data[is.na(plot.data)] = 0 ## if desired, calculate percentage of editor types per author - if(plot.percentage) { + if(percentage.per.author) { plot.data = cbind(plot.data[1], t(apply(plot.data[2:4], 1, function(x) {x/sum(x)}))) } From 04e1b94dcafe35b742e9d5f09dda30a7906c7271 Mon Sep 17 00:00:00 2001 From: Klara Date: Tue, 24 Sep 2019 13:20:08 +0200 Subject: [PATCH 29/39] Apply Thomas' Review on evaluation plot functions Signed-off-by: Klara Schlueter [CH: Resolved conflicts while rebasing, updated commit hashes.] --- NEWS.md | 2 +- util-evaluation-plot.R | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index 90acfaa6..d35f6a66 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,7 @@ ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) - Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. 
The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. (1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) -- Add a new file util-evaluation-plot.R containing functions to plot commit edit types per author and project (PR #171) +- Add a new file util-evaluation-plot.R containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9) ### Changed/Improved diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 15478c7e..898ada88 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -27,9 +27,11 @@ requireNamespace("ggplot2") ## plotting #' both author and committer. #' #' @param data The project data. -#' @param percentage.per.author If true, the barplot shows the relative number of differently edited commits per author: each +#' @param percentage.per.author If \code{TRUE}, the barplot shows the relative number of differently edited commits per author: each #' bar in the barplot (representing the commits of one editor) is scaled to 100%. Otherwise, the #' absolute number of commits per author is shown in the plot. 
[default: FALSE] +#' +#' @return a ggplot2/ggraph plot object plot.commit.editor.types.by.author = function(data, percentage.per.author = FALSE) { ## get editor data @@ -40,10 +42,10 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS both = data.frame(and[["author.name"]], and[["freq"]]) colnames(both) = c("editor", "author and committer") - author = aggregate(or$freq, by = list(or$author.name), FUN = sum) + author = aggregate(or[["freq"]], by = list(or[["author.name"]]), FUN = sum) colnames(author) = c("editor", "only author") - committer = aggregate(or$freq, by = list(or$committer.name), FUN = sum) + committer = aggregate(or[["freq"]], by = list(or[["committer.name"]]), FUN = sum) colnames(committer) = c("editor", "only committer") plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) @@ -51,30 +53,39 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS ## if desired, calculate percentage of editor types per author if(percentage.per.author) { - plot.data = cbind(plot.data[1], t(apply(plot.data[2:4], 1, function(x) {x/sum(x)}))) + name.column = plot.data[1] + value.columns = plot.data[2:4] + + ## scale data values per author (represented by one line) to 100% + scaled.value.columns = apply(value.columns, 1, function(x) {x/sum(x)}) + + plot.data = cbind(name.column, t(scaled.value.columns)) } ## compute order of bars from data: only author < author and committer < only committer - ordered.editors = plot.data$editor[with(plot.data, order(`only committer`, `author and committer`, `only author`))] + ordered.editors = plot.data[["editor"]][with(plot.data, order(`only committer`, `author and committer`, `only author`))] ## prepare data for a stacked barplot (prepare for stacking the editor types) plot.data = reshape2::melt(plot.data) names(plot.data) = c("editor", "editor type", "commit count") ## draw plot - ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = factor(editor, 
levels = ordered.editors), y = `commit count`, fill = `editor type`)) + + plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = factor(editor, levels = ordered.editors), y = `commit count`, fill = `editor type`)) + ## use data frame values instead of counting entries ggplot2::geom_bar(stat = 'identity') + ## rotate y-axis labels by 90 degree ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) + return(plot) } #' Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer #' and author are different. #' #' @param data The project data. -#' @param relative.y.scale If true, the y axis shows the percentage of the number of commits of the special edit type with -#' respect to all commits. If false, the y axis shows the absolut number of commits. +#' @param relative.y.scale If \code{TRUE}, the y axis shows the percentage of the number of commits of the special edit type with +#' respect to all commits. If \code{FALSE}, the y axis shows the absolut number of commits. 
[default: FALSE] +#' +#' @return a ggplot2/ggraph plot object plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { ## get commit count @@ -82,7 +93,7 @@ plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { or = get.committer.not.author.commit.count(data) ## build data frame as required for plotting - plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or$freq), sum(and$freq))) + plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or[["freq"]]), sum(and[["freq"]]))) colnames(plot.data) = c("edit types", "commit count") ## if desired, calculate values for y axis labes showing percentage of all commits @@ -91,7 +102,8 @@ plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { } ## draw plot - ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + + plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + ## use data frame values instead of counting entries ggplot2::geom_bar(stat = 'identity') + return(plot) } From f18c3a3a028899876b33f3487e4e03d0ac929fc4 Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 30 Sep 2019 13:51:39 +0200 Subject: [PATCH 30/39] Break lines after 120 characters in util-evaluation-plots Signed-off-by: Klara Schlueter --- util-evaluation-plot.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 898ada88..63ebbbed 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -27,9 +27,10 @@ requireNamespace("ggplot2") ## plotting #' both author and committer. #' #' @param data The project data. -#' @param percentage.per.author If \code{TRUE}, the barplot shows the relative number of differently edited commits per author: each -#' bar in the barplot (representing the commits of one editor) is scaled to 100%. 
Otherwise, the -#' absolute number of commits per author is shown in the plot. [default: FALSE] +#' @param percentage.per.author If \code{TRUE}, the barplot shows the relative number of differently edited commits per +#' author: each bar in the barplot (representing the commits of one editor) is scaled to +#' 100%. Otherwise, the absolute number of commits per author is shown in the plot. +#' [default: FALSE] #' #' @return a ggplot2/ggraph plot object plot.commit.editor.types.by.author = function(data, percentage.per.author = FALSE) { @@ -63,14 +64,16 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS } ## compute order of bars from data: only author < author and committer < only committer - ordered.editors = plot.data[["editor"]][with(plot.data, order(`only committer`, `author and committer`, `only author`))] + ordered.editors = plot.data[["editor"]][with(plot.data, + order(`only committer`, `author and committer`, `only author`))] ## prepare data for a stacked barplot (prepare for stacking the editor types) plot.data = reshape2::melt(plot.data) names(plot.data) = c("editor", "editor type", "commit count") ## draw plot - plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = factor(editor, levels = ordered.editors), y = `commit count`, fill = `editor type`)) + + plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = factor(editor, levels = ordered.editors), + y = `commit count`, fill = `editor type`)) + ## use data frame values instead of counting entries ggplot2::geom_bar(stat = 'identity') + ## rotate y-axis labels by 90 degree @@ -78,12 +81,13 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS return(plot) } -#' Produces a barplot showing for how many commits committer and author are the same person and for how many commits committer -#' and author are different. 
+#' Produces a barplot showing for how many commits committer and author are the same person and for how many commits +#' committer and author are different. #' #' @param data The project data. -#' @param relative.y.scale If \code{TRUE}, the y axis shows the percentage of the number of commits of the special edit type with -#' respect to all commits. If \code{FALSE}, the y axis shows the absolut number of commits. [default: FALSE] +#' @param relative.y.scale If \code{TRUE}, the y axis shows the percentage of the number of commits of the special edit +#' type with respect to all commits. If \code{FALSE}, the y axis shows the absolut number of +#' commits. [default: FALSE] #' #' @return a ggplot2/ggraph plot object plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { From 6ccc9d28267d8aab5f6371de9b7ce9fd7b3ab679 Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 30 Sep 2019 13:54:55 +0200 Subject: [PATCH 31/39] Add commit hash for new file util-evaluation-plots in changelog Signed-off-by: Klara Schlueter [CH: Resolved conflicts while rebasing, updated commit hashes.] --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d35f6a66..5bcab600 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,7 @@ ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) - Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. (1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) -- Add a new file util-evaluation-plot.R containing functions to plot commit edit types per author and project. 
(PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9) +- Add a new file util-evaluation-plot.R containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9. 0a0a5903e7c609dfe805a3471749eb2241efafe2) ### Changed/Improved From e00f108f9cf5bb8b43440558f3b75ca422cc9544 Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 30 Sep 2019 13:58:56 +0200 Subject: [PATCH 32/39] Add spaces in if statement in util-evaluation-plot Signed-off-by: Klara Schlueter --- util-evaluation-plot.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 63ebbbed..345e0a1e 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -53,7 +53,7 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS plot.data[is.na(plot.data)] = 0 ## if desired, calculate percentage of editor types per author - if(percentage.per.author) { + if (percentage.per.author) { name.column = plot.data[1] value.columns = plot.data[2:4] @@ -101,7 +101,7 @@ plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { colnames(plot.data) = c("edit types", "commit count") ## if desired, calculate values for y axis labes showing percentage of all commits - if(relative.y.scale) { + if (relative.y.scale) { plot.data = cbind(plot.data[1], plot.data[2]/sum(plot.data[2])) } From b0754f53ec694de18ba44834591be1db3c1529b5 Mon Sep 17 00:00:00 2001 From: Klara Date: Mon, 30 Sep 2019 14:08:56 +0200 Subject: [PATCH 33/39] Remove wrong copyright header from showcase.R Signed-off-by: Klara Schlueter --- showcase.R | 1 - 1 file changed, 1 deletion(-) diff --git a/showcase.R b/showcase.R index 041a8b21..16861cac 100644 --- a/showcase.R +++ b/showcase.R @@ -17,7 +17,6 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter -## 
Copyright 2019 by Klara Schlüter ## All Rights Reserved. From 0f6883b930964219492eec93cfff7221e19e8c29 Mon Sep 17 00:00:00 2001 From: Klara Date: Sat, 5 Oct 2019 19:11:05 +0200 Subject: [PATCH 34/39] Add example calls for evaluation plots in showcase Signed-off-by: Klara Schlueter --- showcase.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/showcase.R b/showcase.R index 16861cac..8a2828a5 100644 --- a/showcase.R +++ b/showcase.R @@ -17,6 +17,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter +## Copyright 2019 by Klara Schlueter ## All Rights Reserved. @@ -80,6 +81,13 @@ revisions.callgraph = proj.conf$get.value("revisions.callgraph") x.data = ProjectData$new(project.conf = proj.conf) x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf) +## * Evaluation plots ------------------------------------------------------ + +# edit.types = plot.commit.edit.types.in.project(x.data) +# edit.types.scaled = plot.commit.edit.types.in.project(x.data, TRUE) +# editor.types = plot.commit.editor.types.by.author(x.data) +# editor.types.scaled = plot.commit.editor.types.by.author(x.data, TRUE) + ## * Data retrieval -------------------------------------------------------- # x.data$get.commits() From 4e35682b99f3aa900701c55279866c2c3ed0add8 Mon Sep 17 00:00:00 2001 From: Klara Date: Sat, 5 Oct 2019 19:18:26 +0200 Subject: [PATCH 35/39] Adapt docs to coding conventions in util-evaluation-plots Signed-off-by: Klara Schlueter [CH: Fixed typo in commit title.] --- util-evaluation-plot.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 345e0a1e..80938b8a 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -26,8 +26,8 @@ requireNamespace("ggplot2") ## plotting #' Produces a barplot showing for every editor the number of commits for which he is only author, only committer, and #' both author and committer. 
#' -#' @param data The project data. -#' @param percentage.per.author If \code{TRUE}, the barplot shows the relative number of differently edited commits per +#' @param data the project data +#' @param percentage.per.author if \code{TRUE}, the barplot shows the relative number of differently edited commits per #' author: each bar in the barplot (representing the commits of one editor) is scaled to #' 100%. Otherwise, the absolute number of commits per author is shown in the plot. #' [default: FALSE] @@ -84,8 +84,8 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS #' Produces a barplot showing for how many commits committer and author are the same person and for how many commits #' committer and author are different. #' -#' @param data The project data. -#' @param relative.y.scale If \code{TRUE}, the y axis shows the percentage of the number of commits of the special edit +#' @param data the project data +#' @param relative.y.scale if \code{TRUE}, the y axis shows the percentage of the number of commits of the special edit #' type with respect to all commits. If \code{FALSE}, the y axis shows the absolut number of #' commits. [default: FALSE] #' From dcf3de6f49cf96a1d52e8bfec9fb515f4fc55a79 Mon Sep 17 00:00:00 2001 From: Klara Date: Sun, 12 Jan 2020 13:00:44 +0100 Subject: [PATCH 36/39] Apply Claus' and Christian's Review Signed-off-by: Klara Schlueter [CH: Resolved conflicts while rebasing.] --- NEWS.md | 2 +- util-evaluation-plot.R | 42 +++++++++++++++++++++++++++++------------- util-init.R | 1 + 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5bcab600..5fb60fed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,7 @@ ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. 
(#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) - Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. (1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) -- Add a new file util-evaluation-plot.R containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9. 0a0a5903e7c609dfe805a3471749eb2241efafe2) +- Add a new file `util-evaluation-plot.R` containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9. 0a0a5903e7c609dfe805a3471749eb2241efafe2) ### Changed/Improved diff --git a/util-evaluation-plot.R b/util-evaluation-plot.R index 80938b8a..57155fcc 100644 --- a/util-evaluation-plot.R +++ b/util-evaluation-plot.R @@ -21,7 +21,7 @@ requireNamespace("ggplot2") ## plotting ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Plot functions ---------------------------------------------------------- +## Plots regarding commit edit and editor types --------------------------------------- #' Produces a barplot showing for every editor the number of commits for which he is only author, only committer, and #' both author and committer. 
@@ -41,13 +41,13 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS ## build data frame as required for plotting both = data.frame(and[["author.name"]], and[["freq"]]) - colnames(both) = c("editor", "author and committer") + colnames(both) = c("editor", "author.and.committer") author = aggregate(or[["freq"]], by = list(or[["author.name"]]), FUN = sum) - colnames(author) = c("editor", "only author") + colnames(author) = c("editor", "only.author") committer = aggregate(or[["freq"]], by = list(or[["committer.name"]]), FUN = sum) - colnames(committer) = c("editor", "only committer") + colnames(committer) = c("editor", "only.committer") plot.data = merge(merge(both, author, all = TRUE), committer, all = TRUE) plot.data[is.na(plot.data)] = 0 @@ -65,19 +65,27 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS ## compute order of bars from data: only author < author and committer < only committer ordered.editors = plot.data[["editor"]][with(plot.data, - order(`only committer`, `author and committer`, `only author`))] + order(`only.committer`, `author.and.committer`, `only.author`))] ## prepare data for a stacked barplot (prepare for stacking the editor types) plot.data = reshape2::melt(plot.data) - names(plot.data) = c("editor", "editor type", "commit count") + names(plot.data) = c("editor", "editor.type", "commit.count") ## draw plot plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(x = factor(editor, levels = ordered.editors), - y = `commit count`, fill = `editor type`)) + + y = `commit.count`, fill = `editor.type`)) + ## use data frame values instead of counting entries ggplot2::geom_bar(stat = 'identity') + ## rotate y-axis labels by 90 degree - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) + + ## set proper legend items and title + ggplot2::scale_fill_discrete(name = "Commit edit 
type", + labels = c("author and committer", "only author", "only committer")) + + ## add proper axis labels + ggplot2::labs( + x = "Authors", + y = "Commit count" + ) return(plot) } @@ -97,17 +105,25 @@ plot.commit.edit.types.in.project = function(data, relative.y.scale = FALSE) { or = get.committer.not.author.commit.count(data) ## build data frame as required for plotting - plot.data = data.frame(c("author /= committer", "author = committer"), c(sum(or[["freq"]]), sum(and[["freq"]]))) - colnames(plot.data) = c("edit types", "commit count") + plot.data = data.frame(c("author.!=.committer", "author.=.committer"), c(sum(or[["freq"]]), sum(and[["freq"]]))) + colnames(plot.data) = c("edit.types", "commit.count") ## if desired, calculate values for y axis labes showing percentage of all commits if (relative.y.scale) { - plot.data = cbind(plot.data[1], plot.data[2]/sum(plot.data[2])) + plot.data = cbind(plot.data[1], plot.data[2] / sum(plot.data[2])) } ## draw plot - plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit count`, x = `edit types`)) + + plot = ggplot2::ggplot(data = plot.data, mapping = ggplot2::aes(y = `commit.count`, x = `edit.types`)) + ## use data frame values instead of counting entries - ggplot2::geom_bar(stat = 'identity') + ggplot2::geom_bar(stat = 'identity') + + ## set proper bar labels + ggplot2::scale_x_discrete(labels = c("author.!=.committer" = "author != committer", + "author.=.committer" = "author = committer")) + + ## add proper axis labels + ggplot2::labs( + x = "Edit types", + y = "Commit count" + ) return(plot) } diff --git a/util-init.R b/util-init.R index edd8e1c3..94534871 100644 --- a/util-init.R +++ b/util-init.R @@ -16,6 +16,7 @@ ## Copyright 2017 by Raphael Nömmer ## Copyright 2017 by Sofie Kemper ## Copyright 2017 by Felix Prasse +## Copyright 2019 by Klara Schlüter ## All Rights Reserved. 
From 1e719ece21768478b0abd9a12013238f7d5b1c8a Mon Sep 17 00:00:00 2001 From: Claus Hunsen Date: Thu, 30 Jan 2020 16:49:52 +0100 Subject: [PATCH 37/39] Apply last review comments from PR #171 There are the following changes in this commit: - Add missing spaces around binary operation '/' in Line 61. - Move 't' command from Line 63 to Line 61. - Rename the newly added file to 'util-plot-evaluation.R' for convenience reasons. - Adapt the changelog, the README file, and the initialization script to the new file name. Signed-off-by: Claus Hunsen --- NEWS.md | 2 +- README.md | 2 +- util-init.R | 2 +- util-evaluation-plot.R => util-plot-evaluation.R | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) rename util-evaluation-plot.R => util-plot-evaluation.R (97%) diff --git a/NEWS.md b/NEWS.md index 5fb60fed..4a0a2724 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,7 @@ ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48) - Add the possibility to filter out patchstack mails from the mails of the `ProjectData`. The option can be toggled using the newly added configuration option `mails.filter.patchstack.mails`. (1608e28ca36610c58d2a5447d12ee2052c6eb976, a932c8cdaa6fe5149c798bc09d9e421ba679c48d) -- Add a new file `util-evaluation-plot.R` containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9. 0a0a5903e7c609dfe805a3471749eb2241efafe2) +- Add a new file `util-plot-evaluation.R` containing functions to plot commit edit types per author and project. (PR #171, d4af515f859ce16ffaa0963d6d3d4086bcbb7377, aa542a215f59bc3ed869cfefbc5a25fa050b1fc9. 
0a0a5903e7c609dfe805a3471749eb2241efafe2) ### Changed/Improved diff --git a/README.md b/README.md index 42dfe3c2..94b84b39 100644 --- a/README.md +++ b/README.md @@ -415,7 +415,7 @@ Additionally, for more examples, the file `showcase.R` is worth a look. * Functionality for the identification of network motifs (subgraph patterns) - `util-plot.R` * Everything needed for plotting networks -- `util-evaluation-plot.R` +- `util-plot-evaluation.R` * Plotting functions for data evaluation - `util-misc.R` * Helper functions and also legacy functions, both needed in the other files diff --git a/util-init.R b/util-init.R index 94534871..df6db710 100644 --- a/util-init.R +++ b/util-init.R @@ -61,4 +61,4 @@ source("util-plot.R") source("util-core-peripheral.R") source("util-networks-metrics.R") source("util-networks-covariates.R") -source("util-evaluation-plot.R") +source("util-plot-evaluation.R") diff --git a/util-evaluation-plot.R b/util-plot-evaluation.R similarity index 97% rename from util-evaluation-plot.R rename to util-plot-evaluation.R index 57155fcc..97dbe409 100644 --- a/util-evaluation-plot.R +++ b/util-plot-evaluation.R @@ -58,9 +58,9 @@ plot.commit.editor.types.by.author = function(data, percentage.per.author = FALS value.columns = plot.data[2:4] ## scale data values per author (represented by one line) to 100% - scaled.value.columns = apply(value.columns, 1, function(x) {x/sum(x)}) + scaled.value.columns = t(apply(value.columns, 1, function(x) {x / sum(x)})) - plot.data = cbind(name.column, t(scaled.value.columns)) + plot.data = cbind(name.column, scaled.value.columns) } ## compute order of bars from data: only author < author and committer < only committer From f6a6d5b485d981f60af67f7a73fc240c4d970003 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Fri, 21 Feb 2020 18:51:33 +0100 Subject: [PATCH 38/39] Again remove duplicated word in README.md Signed-off-by: Thomas Bock --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md 
b/README.md index 94b84b39..a325d7f4 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Alternatively, you can run `Rscript install.R` to install the packages. Please insert the project into yours by use of [git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules). Furthermore, the file `install.R` installs all needed R packages (see [below](#needed-r-packages)) into your R library. -Although, the use of of [packrat](https://rstudio.github.io/packrat/) with your project is recommended. +Although, the use of [packrat](https://rstudio.github.io/packrat/) with your project is recommended. This library is written in a way to not interfere with the loading order of your project's `R` packages (i.e., `library()` calls), so that the library does not lead to masked definitions. From 75ae4a505b36bd73f554a982645c82171c808618 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Fri, 21 Feb 2020 18:53:28 +0100 Subject: [PATCH 39/39] Version v3.6 Signed-off-by: Thomas Bock --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4a0a2724..67c3c6e1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # coronet – Changelog -## Unversioned +## 3.6 ### Added - Add a parameter `editor.definition` to the function `add.vertex.attribute.artifact.editor.count` which can be used to define, if author or committer or both count as editors when computing the attribute values. (#92, ff1e147ba563b2d71f8228afd49492a315a5ad48)