From 9589e52cdb727d60cbb3f4e9a056aaff0675bd49 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sun, 13 Dec 2020 18:43:08 +0100 Subject: [PATCH 01/43] Add test commit message data In order to test new functionality (i.e. the read.commit.messages function) new files containing test commit data were needed. Add two files containing messages corresponding to the test commit data that already exists. See #180 Signed-off-by: Niklas Schneider --- .../testing/test_feature/feature/commitMessages.list | 7 +++++++ .../testing/test_proximity/proximity/commitMessages.list | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 tests/codeface-data/results/testing/test_feature/feature/commitMessages.list create mode 100644 tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list diff --git a/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list b/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list new file mode 100644 index 00000000..0f02072a --- /dev/null +++ b/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list @@ -0,0 +1,7 @@ +32712;"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"Add stuff" +32713;"5a5ec9675e98187e1e92561e1888aa6f04faa338";" Add some more stuff" +32710;"3a0ed78458b3976243db6829f63eba3eead26774";" I added important things the things are nothing" +32714;"1143db502761379c2bfcecc2007fc34282e7ee61";" I wish it would work now" +32715;"418d1dc4929ad1df251d2aeb833dd45757b04a6f";"Wish intensifies" +32716;"d01921773fae4bed8186b0aa411d6a2f7a6626e6";" ... 
still doesn't work as expected " +32711;"0a1a5c523d835459c42f33e863623138555e2526";"" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list b/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list new file mode 100644 index 00000000..0f02072a --- /dev/null +++ b/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list @@ -0,0 +1,7 @@ +32712;"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"Add stuff" +32713;"5a5ec9675e98187e1e92561e1888aa6f04faa338";" Add some more stuff" +32710;"3a0ed78458b3976243db6829f63eba3eead26774";" I added important things the things are nothing" +32714;"1143db502761379c2bfcecc2007fc34282e7ee61";" I wish it would work now" +32715;"418d1dc4929ad1df251d2aeb833dd45757b04a6f";"Wish intensifies" +32716;"d01921773fae4bed8186b0aa411d6a2f7a6626e6";" ... still doesn't work as expected " +32711;"0a1a5c523d835459c42f33e863623138555e2526";"" From 85b1d0572c0fb9f4c062bceb1363b0398f98b85f Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sun, 13 Dec 2020 18:50:48 +0100 Subject: [PATCH 02/43] Add new function to read commit messages Add the read.commit.messages and create.empty.commits.list functions to util-read.R as well as a unit test to test the new functions. This allows to read 'commitMessages.list' files and return the commit data separated into message title and body. See #180 Signed-off-by: Niklas Schneider --- tests/test-read.R | 26 ++++++++++++ util-read.R | 106 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/tests/test-read.R b/tests/test-read.R index 55233199..d9aed004 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -17,6 +17,7 @@ ## Copyright 2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker +## Copyright 2020 by Niklas Schneider ## All Rights Reserved. 
@@ -138,6 +139,31 @@ test_that("Read the raw commit data with the file artifact.", { }) +test_that("Read the commit message data.", { + + ## configuration object for the datapath + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, "file") + + ## read the actual data + commit.message.data.read = read.commit.messages(proj.conf$get.value("datapath")) + + ## build the expected data.frame + commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713, 32710, 32714, 32715, 32716, 32711)), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", + "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", + "418d1dc4929ad1df251d2aeb833dd45757b04a6f", "d01921773fae4bed8186b0aa411d6a2f7a6626e6", + "0a1a5c523d835459c42f33e863623138555e2526"), + title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", ""), + message.body = c("", "", "the things are nothing", "", "intensifies", "still doesn't work as expected", "" )) + + ## check the results + expect_identical(commit.message.data.read, commit.data.expected, info = "Commit message data.") +}) + + + + + test_that("Read the synchronicity data.", { ## configuration object proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) diff --git a/util-read.R b/util-read.R index 674cc90f..ddd567e3 100644 --- a/util-read.R +++ b/util-read.R @@ -19,6 +19,7 @@ ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker +## Copyright 2020 by Niklas Schneider ## All Rights Reserved. 
## Note: @@ -60,6 +61,19 @@ COMMITS.LIST.DATA.TYPES = c( "character", "character", "character", "numeric" ) + +## column names of a dataframe containing commit messages (see file 'commitMessages.list' and function \code{read.commit.messages}) +COMMIT.MESSAGE.LIST.COLUMNS = c( + "commit.id", # id + "hash", "title", "message.body" +) + +## declare the datatype for each column in the constant 'COMMIT.MESSAGE.LIST.COLUMNS' +COMMIT.MESSAGE.LIST.DATA.TYPES = c( + "character", + "character", "character", "character" +) + #' Read the commits from the 'commits.list' file. #' #' @param data.path the path to the commit list @@ -168,6 +182,96 @@ read.commits = function(data.path, artifact) { return(commit.data) } + +#' +#' @return the empty dataframe +create.empty.commit.message.list = function() { + return (create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) +} + + +#' Read the commit messages from the 'commitMessages.list' file. +#' +#' @param data.path the path to the commit list +#' +#' @return a dataframe with id, hash, title and message body´ +read.commit.messages = function(data.path) { + logging::logdebug("read.commit.messages: starting.") + + ## read the file with the commit messages + file = file.path(data.path, "commitMessages.list") + + commit.message.data.unprocessed = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, + encoding = "UTF-8"), silent = TRUE) + + + ## handle the case that the list of commits is empty + if (inherits(commit.message.data.unprocessed, "try-error")) { + logging::logwarn("There are no commits available for the current environment.") + logging::logwarn("Datapath: %s", data.path) + + logging::loginfo("Hello error") + + ## return a dataframe with the correct columns but zero rows + return(create.empty.commit.message.list()) + } + + ## split the message string with the new line symbol. 
+ message.split = strsplit(commit.message.data.unprocessed[[3]], " ") + + ## prepare the message.split-object so that it contains a two-element + ## vector for each commit + for (i in seq(1, length(message.split))) { + v = message.split[[i]] + + ## clear the message from empty lines + message.split[[i]] = v[v != ""] + + ## if the commit message was completely empty, add empty title and body + if (length(message.split[[i]]) == 0) { + message.split[[i]] = c("", "") + } + + ## if there is only one line, create an empty body + else if (length(message.split[[i]]) == 1) { + message.split[[i]] = c(message.split[[i]], "") + } + + ## if there are more than two lines, merge all except for the first one + else if (length(message.split[[i]]) > 2) { + message.split[[i]] = c(message.split[[i]][[1]], + paste(tail(message.split[[i]], -1), + collapse = " ")) + } + } + + ## split the list of vectors from above into two vectors + commit.titles = 1 : length(message.split) + commit.message.bodies = 1 : length(message.split) + for (i in seq(1, length(message.split))) { + ## put the first element of each vector in the title vector + commit.titles[[i]] = message.split[[i]][[1]] + ## put the second one in the body vector + commit.message.bodies[[i]] = message.split[[i]][[2]] + } + + ## create a data frame containing all four necessary columns + commit.message.data = data.frame(commit.message.data.unprocessed[[1]], # commit.id + commit.message.data.unprocessed[[2]], # hash + commit.titles, # title + commit.message.bodies) #message.body + + ## set column names for new data frame + colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS + + ## Make commit.id have numeric type and set row names + commit.message.data[["commit.id"]] = sprintf("", commit.message.data[["commit.id"]]) + row.names(commit.message.data) = seq_len(nrow(commit.message.data)) + + return(commit.message.data) +} + + #' Create an empty dataframe which has the same shape as a dataframe containing commits. 
The dataframe has the column #' names and column datatypes defined in \code{COMMITS.LIST.COLUMNS} and \code{COMMITS.LIST.DATA.TYPES}, respectively. #' @@ -565,3 +669,5 @@ read.synchronicity = function(data.path, artifact, time.window) { create.empty.synchronicity.list = function() { return (create.empty.data.frame(SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)) } + + From 61618db1d9e5a24bfabab10ccefae60802c69f51 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 16 Dec 2020 12:14:09 +0100 Subject: [PATCH 03/43] Change commitMessages.list test files to have the right line breaks Discussion has shown that codeface separates lines with five spaces, not four. So the two test files have been modified to account for that fact. See discussion in #180 Signed-off-by: Niklas Schneider --- .../testing/test_feature/feature/commitMessages.list | 10 +++++----- .../test_proximity/proximity/commitMessages.list | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list b/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list index 0f02072a..8b830352 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list +++ b/tests/codeface-data/results/testing/test_feature/feature/commitMessages.list @@ -1,7 +1,7 @@ 32712;"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"Add stuff" -32713;"5a5ec9675e98187e1e92561e1888aa6f04faa338";" Add some more stuff" -32710;"3a0ed78458b3976243db6829f63eba3eead26774";" I added important things the things are nothing" -32714;"1143db502761379c2bfcecc2007fc34282e7ee61";" I wish it would work now" -32715;"418d1dc4929ad1df251d2aeb833dd45757b04a6f";"Wish intensifies" -32716;"d01921773fae4bed8186b0aa411d6a2f7a6626e6";" ... 
still doesn't work as expected " +32713;"5a5ec9675e98187e1e92561e1888aa6f04faa338";" Add some more stuff " +32710;"3a0ed78458b3976243db6829f63eba3eead26774";" I added important things the things are nothing" +32714;"1143db502761379c2bfcecc2007fc34282e7ee61";" I wish it would work now" +32715;"418d1dc4929ad1df251d2aeb833dd45757b04a6f";"Wish intensifies" +32716;"d01921773fae4bed8186b0aa411d6a2f7a6626e6";" ... still doesn't work as expected " 32711;"0a1a5c523d835459c42f33e863623138555e2526";"" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list b/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list index 0f02072a..8b830352 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/commitMessages.list @@ -1,7 +1,7 @@ 32712;"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"Add stuff" -32713;"5a5ec9675e98187e1e92561e1888aa6f04faa338";" Add some more stuff" -32710;"3a0ed78458b3976243db6829f63eba3eead26774";" I added important things the things are nothing" -32714;"1143db502761379c2bfcecc2007fc34282e7ee61";" I wish it would work now" -32715;"418d1dc4929ad1df251d2aeb833dd45757b04a6f";"Wish intensifies" -32716;"d01921773fae4bed8186b0aa411d6a2f7a6626e6";" ... still doesn't work as expected " +32713;"5a5ec9675e98187e1e92561e1888aa6f04faa338";" Add some more stuff " +32710;"3a0ed78458b3976243db6829f63eba3eead26774";" I added important things the things are nothing" +32714;"1143db502761379c2bfcecc2007fc34282e7ee61";" I wish it would work now" +32715;"418d1dc4929ad1df251d2aeb833dd45757b04a6f";"Wish intensifies" +32716;"d01921773fae4bed8186b0aa411d6a2f7a6626e6";" ... 
still doesn't work as expected " 32711;"0a1a5c523d835459c42f33e863623138555e2526";"" From 17a61edfd1995daa0723520b1bdef48809462540 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 16 Dec 2020 12:16:59 +0100 Subject: [PATCH 04/43] Adapt commit message read test to new test files Change the line breaks in the expected output to \n's. See #180 Signed-off-by: Niklas Schneider --- tests/test-read.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-read.R b/tests/test-read.R index d9aed004..4a3a3370 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -154,7 +154,7 @@ test_that("Read the commit message data.", { "418d1dc4929ad1df251d2aeb833dd45757b04a6f", "d01921773fae4bed8186b0aa411d6a2f7a6626e6", "0a1a5c523d835459c42f33e863623138555e2526"), title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", ""), - message.body = c("", "", "the things are nothing", "", "intensifies", "still doesn't work as expected", "" )) + message.body = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "" )) ## check the results expect_identical(commit.message.data.read, commit.data.expected, info = "Commit message data.") From c624c909a34c08b620acd8fd7d5916e2b6477839 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 16 Dec 2020 12:18:27 +0100 Subject: [PATCH 05/43] Adapt read.commit.messages to handle line breaks correctly Replace five spaces with \n, remove any white space at the beginning and the end of a commit message. 
See #180 Signed-off-by: Niklas Schneider --- util-read.R | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/util-read.R b/util-read.R index ddd567e3..2f6546f2 100644 --- a/util-read.R +++ b/util-read.R @@ -62,7 +62,8 @@ COMMITS.LIST.DATA.TYPES = c( ) -## column names of a dataframe containing commit messages (see file 'commitMessages.list' and function \code{read.commit.messages}) +## column names of a dataframe containing commit messages (see file +## 'commitMessages.list' and function \code{read.commit.messages}) COMMIT.MESSAGE.LIST.COLUMNS = c( "commit.id", # id "hash", "title", "message.body" @@ -74,6 +75,12 @@ COMMIT.MESSAGE.LIST.DATA.TYPES = c( "character", "character", "character" ) +## declare the constant (5 spaces) which is used by codeface to separate lines in +## commit messages +COMMIT.MESSAGE.LINE.SEP.CODEFACE = " " +## declare the constant to how line breaks should look like in the data +COMMIT.MESSAGE.LINE.SEP.REPLACE = "\n" + #' Read the commits from the 'commits.list' file. #' #' @param data.path the path to the commit list @@ -183,14 +190,11 @@ read.commits = function(data.path, artifact) { } -#' -#' @return the empty dataframe -create.empty.commit.message.list = function() { - return (create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) -} - #' Read the commit messages from the 'commitMessages.list' file. +#' Turn line breaks represented with five spaces into \n line breaks and +#' ignore initial spaces. Also remove spaces at the beginning and the end of +#' the message. 
#' #' @param data.path the path to the commit list #' @@ -210,14 +214,12 @@ read.commit.messages = function(data.path) { logging::logwarn("There are no commits available for the current environment.") logging::logwarn("Datapath: %s", data.path) - logging::loginfo("Hello error") - ## return a dataframe with the correct columns but zero rows return(create.empty.commit.message.list()) } - ## split the message string with the new line symbol. - message.split = strsplit(commit.message.data.unprocessed[[3]], " ") + ## split the message string with the new line symbol + message.split = strsplit(commit.message.data.unprocessed[[3]], COMMIT.MESSAGE.LINE.SEP.CODEFACE) ## prepare the message.split-object so that it contains a two-element ## vector for each commit @@ -227,6 +229,12 @@ read.commit.messages = function(data.path) { ## clear the message from empty lines message.split[[i]] = v[v != ""] + ## remove spaces before first line + message.split[[i]] = gsub("^\\s+", "", message.split[[i]]) + ## remove spaces at the end of the message + message.split[[i]] = gsub("$\\s+", "", message.split[[i]]) + + ##print(gsub("^\\s+", "", message.split[[i]])) ## if the commit message was completely empty, add empty title and body if (length(message.split[[i]]) == 0) { message.split[[i]] = c("", "") @@ -241,7 +249,7 @@ read.commit.messages = function(data.path) { else if (length(message.split[[i]]) > 2) { message.split[[i]] = c(message.split[[i]][[1]], paste(tail(message.split[[i]], -1), - collapse = " ")) + collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) # use an ascii line break instead } } @@ -280,6 +288,16 @@ create.empty.commits.list = function() { return (create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) } + +#' Create a empty dataframe which has the same shape as a dataframe containing commit messages. The dataframe has the column +#' names and column datatypes defined in \code{COMMIT.MESSAGE.LIST.COLUMNS} and \code{COMMIT.MESSAGE.LIST.DATA.TYPES}, respectively. 
+#' +#' @return the empty dataframe +create.empty.commit.message.list = function() { + return (create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) +} + + ## * Mail data ------------------------------------------------------------- ## column names of a dataframe containing mails (see file 'mails.list' and function \code{read.mails}) From fdc414ade1a640f533e809a25cfe012e42b3cffa Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 21 Dec 2020 11:29:30 +0100 Subject: [PATCH 06/43] Add functions that enable merging commit messages into data Get the commit message data using the new read function and merge either nothing, the title or message and title into the commit.data of the proj.conf instance. See #180 Signed-off-by: Niklas Schneider --- util-data.R | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/util-data.R b/util-data.R index 0531a204..f03978c7 100644 --- a/util-data.R +++ b/util-data.R @@ -21,6 +21,7 @@ ## Copyright 2017 by Ferdinand Frank ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019-2020 by Anselm Fehnker +## Copyright 2020 by Niklas Schneider ## All Rights Reserved. @@ -104,6 +105,7 @@ ProjectData = R6::R6Class("ProjectData", ## commits and commit data commits.filtered = NULL, # data.frame commits = NULL, # data.frame + commit.messages = NULL, # data.frame ## mails mails = NULL, # data.frame mails.patchstacks = NULL, # list @@ -508,6 +510,7 @@ ProjectData = R6::R6Class("ProjectData", reset.environment = function() { private$commits.filtered = NULL private$commits = NULL + private$commit.messages = NULL private$mails = NULL private$issues = NULL private$authors = NULL @@ -671,8 +674,27 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits) }, + #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. 
+ #' If the list of commits is not cached in the field \code{commit.messages}, call the read method first. + #' + #' @return the list of commit messages + get.commit.messages = function() { + logging::loginfo("Getting commit messages.´") + + ## if commit messages are not read already, do this + if (is.null(private$commit.messages)) { + commit.message.data = read.commit.messages(self$get.data.path()) + + ## cache the result + private$commit.messages = commit.message.data + } + + return(commit.message.data) + }, + #' Set the commit list of the project to a new one. - #' Add PaStA and sychronicity data if configured in the \code{project.conf}. + #' Add PaStA and synchronicity data if configured in the \code{project.conf} + #' as well as commit message data. #' #' @param commit.data the new list of commits set.commits = function(commit.data) { @@ -682,6 +704,27 @@ ProjectData = R6::R6Class("ProjectData", commit.data = create.empty.commits.list() } + + ## add commit message data if wanted + if (private$project.conf$get.value("commit.messages") != "none") { + logging::loginfo("Merging commit messages into commit data.") + + ## get commit messages + commit.messages = self$get.commit.messages() + + ## drop the hash column as we do not want it twice + commit.messages = commit.messages = commit.messages[-2] + + ## now there are only three columns left: commit.id, title, message.body + ## check whether to include only title or also the messages + if (private$project.conf$get.value("commit.messages") == "title") { + commit.messages = commit.messages[-2] + } + + ## merge them into the commit data + commit.data = merge(commit.data, commit.messages, by = "commit.id") + } + ## store commit data private$commits = commit.data @@ -707,6 +750,7 @@ ProjectData = R6::R6Class("ProjectData", } } + ## sort by date private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] From 5db90d804c585ce1cf7f312b9818d822c2ecaacb Mon Sep 17 00:00:00 2001 From: Niklas 
Schneider Date: Mon, 21 Dec 2020 11:32:27 +0100 Subject: [PATCH 07/43] Add new configuration option for commit messages Add the new attribute "commit.messages" to the project configuration class with options "none", "title" and "message" to make it possible to specify what exactly of the commit message data is to be merged to the commit data. See #180 Signed-off-by: Niklas Schneider --- util-conf.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/util-conf.R b/util-conf.R index a9c61b35..c973330a 100644 --- a/util-conf.R +++ b/util-conf.R @@ -21,6 +21,7 @@ ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker +## Copyright 2020 by Niklas Schneider ## All Rights Reserved. @@ -356,6 +357,12 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, allowed = c(TRUE, FALSE), allowed.number = 1 ), + commit.messages = list( + default = "none", + type = "character", + allowed = c("none", "title", "message"), + allowed.number = 1 + ), mails.filter.patchstack.mails = list( default = FALSE, type = "logical", From f80b24be0cf550936f6ce1183747d3b95e9fa71f Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 21 Dec 2020 11:39:02 +0100 Subject: [PATCH 08/43] Replace seq with seq_along and add missing log statement in util-read.R Signed-off-by: Niklas Schneider --- util-read.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util-read.R b/util-read.R index 2f6546f2..9ff975f2 100644 --- a/util-read.R +++ b/util-read.R @@ -223,7 +223,7 @@ read.commit.messages = function(data.path) { ## prepare the message.split-object so that it contains a two-element ## vector for each commit - for (i in seq(1, length(message.split))) { + for (i in seq_along(message.split)) { v = message.split[[i]] ## clear the message from empty lines @@ -276,6 +276,8 @@ read.commit.messages = function(data.path) { commit.message.data[["commit.id"]] = sprintf("", commit.message.data[["commit.id"]]) 
row.names(commit.message.data) = seq_len(nrow(commit.message.data)) + logging::logdebug("read.commit.messages: finished.") + return(commit.message.data) } From 941435708893b12042a751b9d4c84358259803c1 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 28 Dec 2020 10:35:47 +0100 Subject: [PATCH 09/43] Add tests for merging and fix bug when merging only titles Add two tests for testing the merge functionality for both full commit messages and titles only. Fix bug that merges message body instead of title when selecting option "title" See #180 Signed-off-by: Niklas Schneider --- tests/test-data.R | 100 +++++++++++++++++++++++++++++++++++++++++++++- util-data.R | 4 +- 2 files changed, 100 insertions(+), 4 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index f996eefe..f7455fdd 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -14,6 +14,7 @@ ## Copyright 2018 by Christian Hechtl ## Copyright 2018-2019 by Claus Hunsen ## Copyright 2019 by Jakob Kronawitter +## Copyright 2020 by Niklas Schneider ## All Rights Reserved. @@ -33,7 +34,7 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") test_that("Compare two ProjectData objects", { - ##initialize a ProjectData object with the ProjectConf and clone it into another one + ## initialize a ProjectData object with the ProjectConf and clone it into another one proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("pasta", TRUE) proj.data.one = ProjectData$new(project.conf = proj.conf) @@ -44,7 +45,7 @@ test_that("Compare two ProjectData objects", { ## Always change one data source in the one object, test for inequality, change it in the ## second object, as well, and test for equality. 
- ##change the second data object + ## change the second data object proj.data.two$get.pasta() @@ -179,3 +180,98 @@ test_that("Filter patchstack mails with PaStA enabled", { ## ensure that there are no other entries than the ones that have been verified to exist above expect_equal(6, nrow(filtered.pasta)) }) + + +test_that("Merge commit messages to commit data", { + ## initialize a ProjectData object with the ProjectConf + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commit.messages", "message") + proj.data = ProjectData$new(project.conf = proj.conf) + + commits = proj.data$get.commits() + + commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713, 32710, 32714, 32715, 32716, + 32711, 32711)), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 16:06:30", + "2016-07-12 16:06:32", "2016-07-12 16:06:32")), + author.name = c("Björn", "Olaf", "Olaf", "Karl", "Karl", "Thomas", "Thomas", "Thomas"), + author.email = c("bjoern@example.org", "olaf@example.org", "olaf@example.org", "karl@example.org", + "karl@example.org", "thomas@example.org", "thomas@example.org", "thomas@example.org"), + committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-20 10:00:44", "2016-07-12 17:05:55", + "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 16:06:30", + "2016-07-12 16:06:32", "2016-07-12 16:06:32")), + committer.name = c("Björn", "Björn", "Thomas", "Karl", "Karl", "Thomas", "Thomas", "Thomas"), + committer.email = c("bjoern@example.org", "bjoern@example.org", "thomas@example.org", "karl@example.org", + "karl@example.org", "thomas@example.org", "thomas@example.org", "thomas@example.org"), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", + "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", + 
"418d1dc4929ad1df251d2aeb833dd45757b04a6f", "d01921773fae4bed8186b0aa411d6a2f7a6626e6", + "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526"), + changed.files = as.integer(c(1, 1, 1, 1, 1, 1, 1, 1)), + added.lines = as.integer(c(1, 1, 1, 1, 1, 1, 1, 1)), + deleted.lines = as.integer(c(1, 0, 0, 0, 0, 0, 0, 0)), + diff.size = as.integer(c(2, 1, 1, 1, 1, 1, 1, 1)), + file = c("test.c", "test.c", "test2.c", "test3.c", UNTRACKED.FILE, + UNTRACKED.FILE, "test2.c", "test2.c"), + artifact = c("A", "A", "Base_Feature", "Base_Feature", + UNTRACKED.FILE.EMPTY.ARTIFACT, UNTRACKED.FILE.EMPTY.ARTIFACT, "Base_Feature", "foo"), + artifact.type = c("Feature", "Feature", "Feature","Feature", UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, + UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, "Feature", "Feature"), + artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1, 1)), + title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", "", ""), + message.body = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "", "")) + + # throw away the row names as they are permuted when merging and + # we do not care for their order in the test + rownames(commits) = NULL + rownames(commit.data.expected) = NULL + expect_identical(commits, commit.data.expected) +}) + +test_that("Merge commit message titles to commit data", { + ## initialize a ProjectData object with the ProjectConf + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commit.messages", "title") + proj.data = ProjectData$new(project.conf = proj.conf) + + commits = proj.data$get.commits() + + commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713, 32710, 32714, 32715, 32716, + 32711, 32711)), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 
16:06:30", + "2016-07-12 16:06:32", "2016-07-12 16:06:32")), + author.name = c("Björn", "Olaf", "Olaf", "Karl", "Karl", "Thomas", "Thomas", "Thomas"), + author.email = c("bjoern@example.org", "olaf@example.org", "olaf@example.org", "karl@example.org", + "karl@example.org", "thomas@example.org", "thomas@example.org", "thomas@example.org"), + committer.date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-20 10:00:44", "2016-07-12 17:05:55", + "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 16:06:30", + "2016-07-12 16:06:32", "2016-07-12 16:06:32")), + committer.name = c("Björn", "Björn", "Thomas", "Karl", "Karl", "Thomas", "Thomas", "Thomas"), + committer.email = c("bjoern@example.org", "bjoern@example.org", "thomas@example.org", "karl@example.org", + "karl@example.org", "thomas@example.org", "thomas@example.org", "thomas@example.org"), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", + "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", + "418d1dc4929ad1df251d2aeb833dd45757b04a6f", "d01921773fae4bed8186b0aa411d6a2f7a6626e6", + "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526"), + changed.files = as.integer(c(1, 1, 1, 1, 1, 1, 1, 1)), + added.lines = as.integer(c(1, 1, 1, 1, 1, 1, 1, 1)), + deleted.lines = as.integer(c(1, 0, 0, 0, 0, 0, 0, 0)), + diff.size = as.integer(c(2, 1, 1, 1, 1, 1, 1, 1)), + file = c("test.c", "test.c", "test2.c", "test3.c", UNTRACKED.FILE, + UNTRACKED.FILE, "test2.c", "test2.c"), + artifact = c("A", "A", "Base_Feature", "Base_Feature", + UNTRACKED.FILE.EMPTY.ARTIFACT, UNTRACKED.FILE.EMPTY.ARTIFACT, "Base_Feature", "foo"), + artifact.type = c("Feature", "Feature", "Feature","Feature", UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, + UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, "Feature", "Feature"), + artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1, 1)), + title = c("Add stuff", "Add some more stuff", "I added 
important things", "I wish it would work now", "Wish", "...", "", "")) + + # throw away the row names as they are permuted when merging and + # we do not care for their order in the test + rownames(commits) = NULL + rownames(commit.data.expected) = NULL + + expect_identical(commits, commit.data.expected) +}) diff --git a/util-data.R b/util-data.R index f03978c7..e0ae74d7 100644 --- a/util-data.R +++ b/util-data.R @@ -718,11 +718,11 @@ ProjectData = R6::R6Class("ProjectData", ## now there are only three columns left: commit.id, title, message.body ## check whether to include only title or also the messages if (private$project.conf$get.value("commit.messages") == "title") { - commit.messages = commit.messages[-2] + commit.messages = commit.messages[-3] } ## merge them into the commit data - commit.data = merge(commit.data, commit.messages, by = "commit.id") + commit.data = merge(commit.data, commit.messages, by.x = "commit.id") } ## store commit data From 359b12c39a7eb6ad7c27a4394099333a6fb2a242 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sat, 2 Jan 2021 12:30:18 +0100 Subject: [PATCH 10/43] Add description of changes to unversioned section of NEWS.md Signed-off-by: Niklas Schneider --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index d7b0c24f..ce7260e8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,13 @@ ## Unversioned +### Added +- Add functionality to read and process commit messages in order to merge them to the commit data (See issue #180). Three values are available for the new attribute `commit.messages` in `proj.conf`: + 1. `none` is the default value and does not change the previous behaviour of `proj.data$set.commits`. + 2. `title` merges the commit message titles (i.e. the first non white space line of a commit message) to the commit data. This gives the data frame an additional column `title`. + 3. `messages` merges both titles and message bodies to the commit data frame. 
This adds two new columns `title` and `message.body`. + + ### Changed/Improved - Add `.drone.yml` to enable running our CI pipelines on drone.io (PR #191, 1c5804b59c582cf34af6970b435add51452fbd11) From 70c8395ca882842a7a08c838cfdc98d6f155bbf4 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Thu, 7 Jan 2021 21:54:37 +0100 Subject: [PATCH 11/43] Remove unnecessary empty lines from several files Also exchange the merge attribute when merging data frames of commit messages from commit.id to hash. Signed-off-by: Niklas Schneider --- tests/test-read.R | 6 ------ util-data.R | 6 ++---- util-read.R | 16 ++++++---------- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/tests/test-read.R b/tests/test-read.R index 4a3a3370..e98b5651 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -92,7 +92,6 @@ test_that("Read the raw commit data with the feature artifact.", { expect_identical(dates, dates.expected, info = "Ordering by date.") }) - test_that("Read the raw commit data with the file artifact.", { ## configuration object for the datapath @@ -138,7 +137,6 @@ test_that("Read the raw commit data with the file artifact.", { expect_identical(dates, dates.expected, info = "Ordering by date.") }) - test_that("Read the commit message data.", { ## configuration object for the datapath @@ -160,10 +158,6 @@ test_that("Read the commit message data.", { expect_identical(commit.message.data.read, commit.data.expected, info = "Commit message data.") }) - - - - test_that("Read the synchronicity data.", { ## configuration object proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) diff --git a/util-data.R b/util-data.R index e0ae74d7..a998b487 100644 --- a/util-data.R +++ b/util-data.R @@ -704,7 +704,6 @@ ProjectData = R6::R6Class("ProjectData", commit.data = create.empty.commits.list() } - ## add commit message data if wanted if (private$project.conf$get.value("commit.messages") != "none") { logging::loginfo("Merging commit messages into 
commit data.") @@ -713,7 +712,7 @@ ProjectData = R6::R6Class("ProjectData", commit.messages = self$get.commit.messages() ## drop the hash column as we do not want it twice - commit.messages = commit.messages = commit.messages[-2] + commit.messages = commit.messages[-2] ## now there are only three columns left: commit.id, title, message.body ## check whether to include only title or also the messages @@ -722,7 +721,7 @@ ProjectData = R6::R6Class("ProjectData", } ## merge them into the commit data - commit.data = merge(commit.data, commit.messages, by.x = "commit.id") + commit.data = merge(commit.data, commit.messages, by.x = "hash") } ## store commit data @@ -750,7 +749,6 @@ ProjectData = R6::R6Class("ProjectData", } } - ## sort by date private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] diff --git a/util-read.R b/util-read.R index 9ff975f2..623a3b90 100644 --- a/util-read.R +++ b/util-read.R @@ -189,8 +189,6 @@ read.commits = function(data.path, artifact) { return(commit.data) } - - #' Read the commit messages from the 'commitMessages.list' file. #' Turn line breaks represented with five spaces into \n line breaks and #' ignore initial spaces. 
Also remove spaces at the beginning and the end of @@ -198,7 +196,7 @@ read.commits = function(data.path, artifact) { #' #' @param data.path the path to the commit list #' -#' @return a dataframe with id, hash, title and message body´ +#' @return a data frame with id, hash, title and message body´ read.commit.messages = function(data.path) { logging::logdebug("read.commit.messages: starting.") @@ -208,7 +206,6 @@ read.commit.messages = function(data.path) { commit.message.data.unprocessed = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, encoding = "UTF-8"), silent = TRUE) - ## handle the case that the list of commits is empty if (inherits(commit.message.data.unprocessed, "try-error")) { logging::logwarn("There are no commits available for the current environment.") @@ -234,7 +231,6 @@ read.commit.messages = function(data.path) { ## remove spaces at the end of the message message.split[[i]] = gsub("$\\s+", "", message.split[[i]]) - ##print(gsub("^\\s+", "", message.split[[i]])) ## if the commit message was completely empty, add empty title and body if (length(message.split[[i]]) == 0) { message.split[[i]] = c("", "") @@ -247,9 +243,11 @@ read.commit.messages = function(data.path) { ## if there are more than two lines, merge all except for the first one else if (length(message.split[[i]]) > 2) { - message.split[[i]] = c(message.split[[i]][[1]], - paste(tail(message.split[[i]], -1), - collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) # use an ascii line break instead + message.split[[i]] + = c(message.split[[i]][[1]], + paste(tail(message.split[[i]], -1), + ## use an ascii line break instead + collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) } } @@ -689,5 +687,3 @@ read.synchronicity = function(data.path, artifact, time.window) { create.empty.synchronicity.list = function() { return (create.empty.data.frame(SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)) } - - From 89a6ea64470687688c157618b7b0b4092830a46a Mon Sep 17 00:00:00 2001 From: Niklas 
Schneider Date: Thu, 7 Jan 2021 22:15:28 +0100 Subject: [PATCH 12/43] Fix a syntax error in util-read Signed-off-by: Niklas Schneider --- util-read.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/util-read.R b/util-read.R index 623a3b90..dcd3ad02 100644 --- a/util-read.R +++ b/util-read.R @@ -243,11 +243,10 @@ read.commit.messages = function(data.path) { ## if there are more than two lines, merge all except for the first one else if (length(message.split[[i]]) > 2) { - message.split[[i]] - = c(message.split[[i]][[1]], - paste(tail(message.split[[i]], -1), - ## use an ascii line break instead - collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) + message.split[[i]] = c(message.split[[i]][[1]], + paste(tail(message.split[[i]], -1), + ## use an ascii line break instead + collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) } } From 6e9147edc8c2eba0eab6952c73d16619e1887fe2 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Fri, 8 Jan 2021 12:05:07 +0100 Subject: [PATCH 13/43] Fix merging by hash instead of commit.id As commit.id was the first column of the data frame anyway, merging has not changed the order. But when using the hash column it is taken as the first column of the resulting data frame. Change the order of the columns in order to not break anything that relies on the order. 
See #180 Signed-off-by: Niklas Schneider --- tests/test-data.R | 1 + util-data.R | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index f7455fdd..e4db2b2e 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -226,6 +226,7 @@ test_that("Merge commit messages to commit data", { # we do not care for their order in the test rownames(commits) = NULL rownames(commit.data.expected) = NULL + expect_identical(commits, commit.data.expected) }) diff --git a/util-data.R b/util-data.R index a998b487..cec33588 100644 --- a/util-data.R +++ b/util-data.R @@ -711,8 +711,8 @@ ProjectData = R6::R6Class("ProjectData", ## get commit messages commit.messages = self$get.commit.messages() - ## drop the hash column as we do not want it twice - commit.messages = commit.messages[-2] + ## drop the commit.id column as we do not want it twice + commit.messages = commit.messages[-1] ## now there are only three columns left: commit.id, title, message.body ## check whether to include only title or also the messages @@ -721,7 +721,19 @@ ProjectData = R6::R6Class("ProjectData", } ## merge them into the commit data - commit.data = merge(commit.data, commit.messages, by.x = "hash") + commit.data = merge(commit.data, commit.messages, by.x = "hash", by.y = "hash") + + ## when merging by hash, the hash column is taken as the first column of the + ## resulting data frame + ## change that order back depending on how many columns the new data frame has + if (private$project.conf$get.value("commit.messages") == "title") { + ## one column less as message.body is not included + commit.data = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17)] + } + else { + commit.data = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] + } + } ## store commit data From c9c7ff7747c0245674a0ef765178bf844a6fe8fd Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 13 Jan 2021 14:52:14 +0100 
Subject: [PATCH 14/43] Modify README and NEWS Follow the review suggestions of @clhunsen. See #180 Signed-off-by: Niklas Schneider --- NEWS.md | 5 +--- README.md | 77 ++++++++++++++++++++++++++++++------------------------- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/NEWS.md b/NEWS.md index ce7260e8..8f54ee05 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,10 +3,7 @@ ## Unversioned ### Added -- Add functionality to read and process commit messages in order to merge them to the commit data (See issue #180). Three values are available for the new attribute `commit.messages` in `proj.conf`: - 1. `none` is the default value and does not change the previous behaviour of `proj.data$set.commits`. - 2. `title` merges the commit message titles (i.e. the first non white space line of a commit message) to the commit data. This gives the data frame an additional column `title`. - 3. `messages` merges both titles and message bodies to the commit data frame. This adds two new columns `title` and `message.body`. +- Add functionality to read and process commit messages in order to merge them to the commit data (See issue #180). Three values are available for the new attribute `commit.messages` in `ProjectConf`: `none`, `title` and `messages`. 
### Changed/Improved diff --git a/README.md b/README.md index f383099e..95b0b431 100644 --- a/README.md +++ b/README.md @@ -11,40 +11,43 @@ If you wonder: The name `coronet` derives as an acronym from the words "configur ## Table of contents -- [Integration](#integration) - * [Requirements](#requirements) - * [R](#r) - * [packrat (recommended)](#packrat) - * [Folder structure of the input data](#folder-structure-of-the-input-data) - * [Needed R packages](#needed-r-packages) - * [Submodule](#submodule) - * [Selecting the correct version](#selecting-the-correct-version) -- [Functionality](#functionality) - * [Configuration](#configuration) - * [Data sources](#data-sources) - * [Network construction](#network-construction) - * [Data sources for network construction](#data-sources-for-network-construction) - * [Types of networks](#types-of-networks) - * [Relations](#relations) - * [Edge-construction algorithms for author networks](#edge-construction-algorithms-for-author-networks) - * [Vertex and edge attributes](#vertex-and-edge-attributes) - * [Further functionalities](#further-functionalities) - * [Splitting data and networks based on defined time windows](#splitting-data-and-networks-based-on-defined-time-windows) - * [Cutting data to unified date ranges](#cutting-data-to-unified-date-ranges) - * [Handling data independently](#handling-data-independently) - * [How-to](#how-to) - * [File/Module overview](#filemodule-overview) -- [Configuration classes](#configuration-classes) - * [ProjectConf](#projectconf) - * [Basic information](#basic-information) - * [Artifact-related information](#artifact-related-information) - * [Revision-related information](#revision-related-information) - * [Data paths](#data-paths) - * [Splitting information](#splitting-information) - * [(Configurable) Data-retrieval-related parameters](#configurable-data-retrieval-related-parameters) - * [NetworkConf](#networkconf) -- [License](#license) -- [Work in progress](#work-in-progress) +- 
[coronet - The network library](#coronet---the-network-library) + - [Table of contents](#table-of-contents) + - [Integration](#integration) + - [Requirements](#requirements) + - [`R`](#r) + - [`packrat` (recommended)](#packrat-recommended) + - [Folder structure of the input data](#folder-structure-of-the-input-data) + - [Needed R packages](#needed-r-packages) + - [Submodule](#submodule) + - [Selecting the correct version](#selecting-the-correct-version) + - [Functionality](#functionality) + - [Configuration](#configuration) + - [Data sources](#data-sources) + - [Network construction](#network-construction) + - [Data sources for network construction](#data-sources-for-network-construction) + - [Types of networks](#types-of-networks) + - [Relations](#relations) + - [Edge-construction algorithms for author networks](#edge-construction-algorithms-for-author-networks) + - [Vertex and edge attributes](#vertex-and-edge-attributes) + - [Further functionalities](#further-functionalities) + - [Splitting data and networks based on defined time windows](#splitting-data-and-networks-based-on-defined-time-windows) + - [Cutting data to unified date ranges](#cutting-data-to-unified-date-ranges) + - [Handling data independently](#handling-data-independently) + - [How-to](#how-to) + - [File/Module overview](#filemodule-overview) + - [Configuration classes](#configuration-classes) + - [ProjectConf](#projectconf) + - [Basic information](#basic-information) + - [Artifact-related information](#artifact-related-information) + - [Revision-related information](#revision-related-information) + - [Data paths](#data-paths) + - [Splitting information](#splitting-information) + - [(Configurable) Data-retrieval-related parameters](#configurable-data-retrieval-related-parameters) + - [NetworkConf](#networkconf) + - [Contributing](#contributing) + - [License](#license) + - [Work in progress](#work-in-progress) ## Integration @@ -183,7 +186,11 @@ There are two distinguishable types of data sources 
that are both handled by the * Patch-stack analysis to link patches sent to mailing lists and upstream commits * Synchronicity information on commits (see also the parameter `synchronicity` in the [`ProjectConf`](#configurable-data-retrieval-related-parameters) class) * Synchronous commits are commits that change a source-code artifact that has also been changed by another author within a reasonable time-window. - + * Commit messages are available through the parameter `commit.messages`. Three values can be used: + 1. `none` is the default value and does not impact the configuration at all. + 2. `title` merges the commit message titles (i.e. the first non white space line of a commit message) to the commit data. This gives the data frame an additional column `title`. + 3. `messages` merges both titles and message bodies to the commit data frame. This adds two new columns `title` and `message.body`. + The important difference is that the *main data sources* are used internally to construct artifact vertices in relevant types of networks. Additionally, these data sources can be used as a basis for splitting `ProjectData` in a time-based or activity-based manner – obtaining `RangeData` instances as a result (see file `split.R` and the contained functions). Thus, `RangeData` objects contain only data of a specific period of time. The *additional data sources* are orthogonal to the main data sources, can augment them by additional information, and, thus, are not split at any time. 
From 0457dd5a27f80383d66d37573940a3651454b699 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 13 Jan 2021 15:00:40 +0100 Subject: [PATCH 15/43] Rename "message.body" column to "message" everywhere Following the review of #193 Signed-off-by: Niklas Schneider --- NEWS.md | 2 +- README.md | 2 +- tests/test-data.R | 2 +- tests/test-read.R | 2 +- util-read.R | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8f54ee05..4fca0a99 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ ## Unversioned ### Added -- Add functionality to read and process commit messages in order to merge them to the commit data (See issue #180). Three values are available for the new attribute `commit.messages` in `ProjectConf`: `none`, `title` and `messages`. +- Add functionality to read and process commit messages in order to merge them to the commit data (see issue #180). Three values are available for the new attribute `commit.messages` in `ProjectConf`: `none`, `title` and `messages`. ### Changed/Improved diff --git a/README.md b/README.md index 95b0b431..9a2aa4d2 100644 --- a/README.md +++ b/README.md @@ -189,7 +189,7 @@ There are two distinguishable types of data sources that are both handled by the * Commit messages are available through the parameter `commit.messages`. Three values can be used: 1. `none` is the default value and does not impact the configuration at all. 2. `title` merges the commit message titles (i.e. the first non white space line of a commit message) to the commit data. This gives the data frame an additional column `title`. - 3. `messages` merges both titles and message bodies to the commit data frame. This adds two new columns `title` and `message.body`. + 3. `messages` merges both titles and message bodies to the commit data frame. This adds two new columns `title` and `message`. 
The important difference is that the *main data sources* are used internally to construct artifact vertices in relevant types of networks. Additionally, these data sources can be used as a basis for splitting `ProjectData` in a time-based or activity-based manner – obtaining `RangeData` instances as a result (see file `split.R` and the contained functions). Thus, `RangeData` objects contain only data of a specific period of time. diff --git a/tests/test-data.R b/tests/test-data.R index e4db2b2e..108e5a8f 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -220,7 +220,7 @@ test_that("Merge commit messages to commit data", { UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, "Feature", "Feature"), artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1, 1)), title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", "", ""), - message.body = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "", "")) + message = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "", "")) # throw away the row names as they are permuted when merging and # we do not care for their order in the test diff --git a/tests/test-read.R b/tests/test-read.R index e98b5651..526a5e1f 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -152,7 +152,7 @@ test_that("Read the commit message data.", { "418d1dc4929ad1df251d2aeb833dd45757b04a6f", "d01921773fae4bed8186b0aa411d6a2f7a6626e6", "0a1a5c523d835459c42f33e863623138555e2526"), title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", ""), - message.body = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "" )) + message = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "" )) ## check the results expect_identical(commit.message.data.read, commit.data.expected, info = 
"Commit message data.") diff --git a/util-read.R b/util-read.R index dcd3ad02..4aabe1ba 100644 --- a/util-read.R +++ b/util-read.R @@ -66,7 +66,7 @@ COMMITS.LIST.DATA.TYPES = c( ## 'commitMessages.list' and function \code{read.commit.messages}) COMMIT.MESSAGE.LIST.COLUMNS = c( "commit.id", # id - "hash", "title", "message.body" + "hash", "title", "message" ) ## declare the datatype for each column in the constant 'COMMIT.MESSAGE.LIST.COLUMNS' @@ -264,7 +264,7 @@ read.commit.messages = function(data.path) { commit.message.data = data.frame(commit.message.data.unprocessed[[1]], # commit.id commit.message.data.unprocessed[[2]], # hash commit.titles, # title - commit.message.bodies) #message.body + commit.message.bodies) #message ## set column names for new data frame colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS From 7e61dcb5b9ca219d5180fd822fa0ec610950bb88 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 13 Jan 2021 15:58:31 +0100 Subject: [PATCH 16/43] Fix style issues and improve message processing Remove some empty lines and indent some lines. Also remove commit.message.data.unprocessed variable and use the commit.message.data variable from the beginning. Add column names beforehand in order to enable access without indices. 
See #193 Signed-off-by: Niklas Schneider --- util-read.R | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/util-read.R b/util-read.R index 4aabe1ba..c29f3e21 100644 --- a/util-read.R +++ b/util-read.R @@ -203,11 +203,11 @@ read.commit.messages = function(data.path) { ## read the file with the commit messages file = file.path(data.path, "commitMessages.list") - commit.message.data.unprocessed = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, + commit.message.data = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, encoding = "UTF-8"), silent = TRUE) ## handle the case that the list of commits is empty - if (inherits(commit.message.data.unprocessed, "try-error")) { + if (inherits(commit.message.data, "try-error")) { logging::logwarn("There are no commits available for the current environment.") logging::logwarn("Datapath: %s", data.path) @@ -215,8 +215,11 @@ read.commit.messages = function(data.path) { return(create.empty.commit.message.list()) } + ## set column names for new data frame + ## unprocessed data only has three columns so omit the "title" column + colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS[-3] ## split the message string with the new line symbol - message.split = strsplit(commit.message.data.unprocessed[[3]], COMMIT.MESSAGE.LINE.SEP.CODEFACE) + message.split = strsplit(commit.message.data[["message"]], COMMIT.MESSAGE.LINE.SEP.CODEFACE) ## prepare the message.split-object so that it contains a two-element ## vector for each commit @@ -235,24 +238,22 @@ read.commit.messages = function(data.path) { if (length(message.split[[i]]) == 0) { message.split[[i]] = c("", "") } - ## if there is only one line, create an empty body else if (length(message.split[[i]]) == 1) { message.split[[i]] = c(message.split[[i]], "") } - ## if there are more than two lines, merge all except for the first one else if (length(message.split[[i]]) > 2) { message.split[[i]] = 
c(message.split[[i]][[1]], paste(tail(message.split[[i]], -1), - ## use an ascii line break instead - collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) + ## use an ascii line break instead + collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) } } ## split the list of vectors from above into two vectors - commit.titles = 1 : length(message.split) - commit.message.bodies = 1 : length(message.split) + commit.titles = seq_along(message.split) + commit.message.bodies = seq_along(message.split) for (i in seq(1, length(message.split))) { ## put the first element of each vector in the title vector commit.titles[[i]] = message.split[[i]][[1]] @@ -261,12 +262,12 @@ read.commit.messages = function(data.path) { } ## create a data frame containing all four necessary columns - commit.message.data = data.frame(commit.message.data.unprocessed[[1]], # commit.id - commit.message.data.unprocessed[[2]], # hash + commit.message.data = data.frame(commit.message.data[["commit.id"]], # commit.id + commit.message.data[["hash"]], # hash commit.titles, # title commit.message.bodies) #message - ## set column names for new data frame + ## set all the column names colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS ## Make commit.id have numeric type and set row names From 8e28a1f167ec8205180aba26976996e4f9d7af75 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 13 Jan 2021 19:19:34 +0100 Subject: [PATCH 17/43] Put merge functionality into own function Create private function update.commit.message.data in util-data.R which handles the merge and change the location where it is called in set.commits. 
See #193 Signed-off-by: Niklas Schneider --- util-data.R | 77 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/util-data.R b/util-data.R index cec33588..89396460 100644 --- a/util-data.R +++ b/util-data.R @@ -423,12 +423,50 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("update.synchronicity.data: finished.") }, + ## * * commit messages --------------------------------------------- + + #' Add the columns \code{title} and \code{messages} to commits using the currently available + #' synchronicity data from the field \code{synchronicity}. + #' + #' This method should be called whenever the field \code{synchronicity} is changed. + update.commit.message.data = function() { + logging::loginfo("Merging commit messages into commit data.") + + if (!is.null(private$commits)) { + ## get commit messages + commit.messages = self$get.commit.messages() + + ## drop the commit.id column as we do not want it twice + commit.messages = subset(commit.messages, select=-c(commit.id)) + + ## now there are only three columns left: commit.id, title, message.body + ## check whether to include only title or also the messages + if (private$project.conf$get.value("commit.messages") == "title") { + commit.messages = subset(commit.messages, select=-c(message)) + } + + ## merge them into the commit data + commit.data = merge(private$commits, commit.messages, by.x = "hash", by.y = "hash") + + ## when merging by hash, the hash column is taken as the first column of the + ## resulting data frame + ## change that order back depending on how many columns the new data frame has + if (private$project.conf$get.value("commit.messages") == "title") { + ## one column less as message.body is not included + private$commits = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17)] + } + else { + private$commits = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] + } + } + }, + ## 
* * timestamps -------------------------------------------------- #' Call the getters of the specified data sources in order to #' initialize the sources and extract the timestamps. #' - #' @param data.sources the data sources to be prepated + #' @param data.sources the data sources to be prepared prepare.timestamps = function(data.sources) { for (source in data.sources) { self[[ paste0("get.", source) ]]() @@ -704,38 +742,6 @@ ProjectData = R6::R6Class("ProjectData", commit.data = create.empty.commits.list() } - ## add commit message data if wanted - if (private$project.conf$get.value("commit.messages") != "none") { - logging::loginfo("Merging commit messages into commit data.") - - ## get commit messages - commit.messages = self$get.commit.messages() - - ## drop the commit.id column as we do not want it twice - commit.messages = commit.messages[-1] - - ## now there are only three columns left: commit.id, title, message.body - ## check whether to include only title or also the messages - if (private$project.conf$get.value("commit.messages") == "title") { - commit.messages = commit.messages[-3] - } - - ## merge them into the commit data - commit.data = merge(commit.data, commit.messages, by.x = "hash", by.y = "hash") - - ## when merging by hash, the hash column is taken as the first column of the - ## resulting data frame - ## change that order back depending on how many columns the new data frame has - if (private$project.conf$get.value("commit.messages") == "title") { - ## one column less as message.body is not included - commit.data = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17)] - } - else { - commit.data = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] - } - - } - ## store commit data private$commits = commit.data @@ -761,6 +767,11 @@ ProjectData = R6::R6Class("ProjectData", } } + ## add commit message data if wanted + if (private$project.conf$get.value("commit.messages") != "none") { + 
private$update.commit.message.data() + } + ## sort by date private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] From 703ab3e7ec2ae44dcc3be747b97c70b4a778dbf9 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 13 Jan 2021 19:26:06 +0100 Subject: [PATCH 18/43] Fix error when returning a variable that is not defined Fix an error where the value of a variable that is defined in an if block is returned outside that if block. See #193 Signed-off-by: Niklas Schneider --- util-data.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/util-data.R b/util-data.R index 89396460..033d82c1 100644 --- a/util-data.R +++ b/util-data.R @@ -425,10 +425,9 @@ ProjectData = R6::R6Class("ProjectData", ## * * commit messages --------------------------------------------- - #' Add the columns \code{title} and \code{messages} to commits using the currently available - #' synchronicity data from the field \code{synchronicity}. - #' - #' This method should be called whenever the field \code{synchronicity} is changed. + #' Add the columns \code{title} and \code{messages} to commits using the selected + #' configuration option of \code{commit.messages} and the results of the + #' \code{get.commit.messages}. update.commit.message.data = function() { logging::loginfo("Merging commit messages into commit data.") @@ -727,7 +726,7 @@ ProjectData = R6::R6Class("ProjectData", private$commit.messages = commit.message.data } - return(commit.message.data) + return(private$commit.messages) }, #' Set the commit list of the project to a new one. 
From 7caaa8d53edccd359b920bc273e0f06618af767a Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Fri, 15 Jan 2021 14:52:34 +0100 Subject: [PATCH 19/43] Simplify data frame creation in read.commit.messages Replace a loop with a conversion of the list of vectors into a data frame and access its columns directly. See #193 Signed-off-by: Niklas Schneider --- util-read.R | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/util-read.R b/util-read.R index c29f3e21..1eb7d596 100644 --- a/util-read.R +++ b/util-read.R @@ -251,21 +251,25 @@ read.commit.messages = function(data.path) { } } - ## split the list of vectors from above into two vectors - commit.titles = seq_along(message.split) - commit.message.bodies = seq_along(message.split) - for (i in seq(1, length(message.split))) { - ## put the first element of each vector in the title vector - commit.titles[[i]] = message.split[[i]][[1]] - ## put the second one in the body vector - commit.message.bodies[[i]] = message.split[[i]][[2]] - } + ## convert list of vectors to a data frame with two columns + message.split = as.data.frame(do.call(rbind, message.split)) + colnames(message.split) = c("title", "message") + ## split the list of vectors from above into two vectors + # commit.titles = lapply(message.split, function (v) v[[1]][1]) + # commit.message.bodies = lapply(message.split, function (v) v[[2]]) + # for (i in seq(1, length(message.split))) { + # ## put the first element of each vector in the title vector + # commit.titles[[i]] = message.split[[i]][[1]] + # ## put the second one in the body vector + # commit.message.bodies[[i]] = message.split[[i]][[2]] + # } + # print(commit.titles) ## create a data frame containing all four necessary columns commit.message.data = data.frame(commit.message.data[["commit.id"]], # commit.id commit.message.data[["hash"]], # hash - commit.titles, # title - commit.message.bodies) #message + message.split[["title"]], # title + 
message.split[["message"]]) # message ## set all the column names colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS From 8dd410c2285589d7576d22c84d600a69105cb635 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Fri, 15 Jan 2021 15:12:11 +0100 Subject: [PATCH 20/43] Reorder functions in util read and replace special functions Move functions concerning reading commit messages and the constants used by them to a new section in util.read. Replace subset with proper indexing and minor comment fixes. See #193 Signed-off-by: Niklas Schneider --- util-data.R | 4 +- util-read.R | 238 +++++++++++++++++++++++++--------------------------- 2 files changed, 115 insertions(+), 127 deletions(-) diff --git a/util-data.R b/util-data.R index 033d82c1..ab4e3957 100644 --- a/util-data.R +++ b/util-data.R @@ -436,12 +436,12 @@ ProjectData = R6::R6Class("ProjectData", commit.messages = self$get.commit.messages() ## drop the commit.id column as we do not want it twice - commit.messages = subset(commit.messages, select=-c(commit.id)) + commit.messages = commit.messages[ , colnames(commit.messages) != "commit.id"] ## now there are only three columns left: commit.id, title, message.body ## check whether to include only title or also the messages if (private$project.conf$get.value("commit.messages") == "title") { - commit.messages = subset(commit.messages, select=-c(message)) + commit.messages = commit.messages[ , colnames(commit.messages) != "message"] } ## merge them into the commit data diff --git a/util-read.R b/util-read.R index 1eb7d596..69b3a6f6 100644 --- a/util-read.R +++ b/util-read.R @@ -61,26 +61,6 @@ COMMITS.LIST.DATA.TYPES = c( "character", "character", "character", "numeric" ) - -## column names of a dataframe containing commit messages (see file -## 'commitMessages.list' and function \code{read.commit.messages}) -COMMIT.MESSAGE.LIST.COLUMNS = c( - "commit.id", # id - "hash", "title", "message" -) - -## declare the datatype for each column in the constant 
'COMMIT.MESSAGE.LIST.COLUMNS' -COMMIT.MESSAGE.LIST.DATA.TYPES = c( - "character", - "character", "character", "character" -) - -## declare the constant (5 spaces) which is used by codeface to separate lines in -## commit messages -COMMIT.MESSAGE.LINE.SEP.CODEFACE = " " -## declare the constant to how line breaks should look like in the data -COMMIT.MESSAGE.LINE.SEP.REPLACE = "\n" - #' Read the commits from the 'commits.list' file. #' #' @param data.path the path to the commit list @@ -189,101 +169,6 @@ read.commits = function(data.path, artifact) { return(commit.data) } -#' Read the commit messages from the 'commitMessages.list' file. -#' Turn line breaks represented with five spaces into \n line breaks and -#' ignore initial spaces. Also remove spaces at the beginning and the end of -#' the message. -#' -#' @param data.path the path to the commit list -#' -#' @return a data frame with id, hash, title and message body´ -read.commit.messages = function(data.path) { - logging::logdebug("read.commit.messages: starting.") - - ## read the file with the commit messages - file = file.path(data.path, "commitMessages.list") - - commit.message.data = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, - encoding = "UTF-8"), silent = TRUE) - - ## handle the case that the list of commits is empty - if (inherits(commit.message.data, "try-error")) { - logging::logwarn("There are no commits available for the current environment.") - logging::logwarn("Datapath: %s", data.path) - - ## return a dataframe with the correct columns but zero rows - return(create.empty.commit.message.list()) - } - - ## set column names for new data frame - ## unprocessed data only has three columns so omit the "title" column - colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS[-3] - ## split the message string with the new line symbol - message.split = strsplit(commit.message.data[["message"]], COMMIT.MESSAGE.LINE.SEP.CODEFACE) - - ## prepare the message.split-object so that it 
contains a two-element - ## vector for each commit - for (i in seq_along(message.split)) { - v = message.split[[i]] - - ## clear the message from empty lines - message.split[[i]] = v[v != ""] - - ## remove spaces before first line - message.split[[i]] = gsub("^\\s+", "", message.split[[i]]) - ## remove spaces at the end of the message - message.split[[i]] = gsub("$\\s+", "", message.split[[i]]) - - ## if the commit message was completely empty, add empty title and body - if (length(message.split[[i]]) == 0) { - message.split[[i]] = c("", "") - } - ## if there is only one line, create an empty body - else if (length(message.split[[i]]) == 1) { - message.split[[i]] = c(message.split[[i]], "") - } - ## if there are more than two lines, merge all except for the first one - else if (length(message.split[[i]]) > 2) { - message.split[[i]] = c(message.split[[i]][[1]], - paste(tail(message.split[[i]], -1), - ## use an ascii line break instead - collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) - } - } - - ## convert list of vectors to a data frame with two columns - message.split = as.data.frame(do.call(rbind, message.split)) - colnames(message.split) = c("title", "message") - - ## split the list of vectors from above into two vectors - # commit.titles = lapply(message.split, function (v) v[[1]][1]) - # commit.message.bodies = lapply(message.split, function (v) v[[2]]) - # for (i in seq(1, length(message.split))) { - # ## put the first element of each vector in the title vector - # commit.titles[[i]] = message.split[[i]][[1]] - # ## put the second one in the body vector - # commit.message.bodies[[i]] = message.split[[i]][[2]] - # } - # print(commit.titles) - ## create a data frame containing all four necessary columns - commit.message.data = data.frame(commit.message.data[["commit.id"]], # commit.id - commit.message.data[["hash"]], # hash - message.split[["title"]], # title - message.split[["message"]]) # message - - ## set all the column names - colnames(commit.message.data) = 
COMMIT.MESSAGE.LIST.COLUMNS - - ## Make commit.id have numeric type and set row names - commit.message.data[["commit.id"]] = sprintf("<commit-%s>", commit.message.data[["commit.id"]]) - row.names(commit.message.data) = seq_len(nrow(commit.message.data)) - - logging::logdebug("read.commit.messages: finished.") - - return(commit.message.data) -} - - #' Create an empty dataframe which has the same shape as a dataframe containing commits. The dataframe has the column #' names and column datatypes defined in \code{COMMITS.LIST.COLUMNS} and \code{COMMITS.LIST.DATA.TYPES}, respectively. #' @@ -292,16 +177,6 @@ create.empty.commits.list = function() { return (create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) } - -#' Create a empty dataframe which has the same shape as a dataframe containing commit messages. The dataframe has the column -#' names and column datatypes defined in \code{COMMIT.MESSAGE.LIST.COLUMNS} and \code{COMMIT.MESSAGE.LIST.DATA.TYPES}, respectively. -#' -#' @return the empty dataframe -create.empty.commit.message.list = function() { - return (create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) -} - - ## * Mail data ------------------------------------------------------------- ## column names of a dataframe containing mails (see file 'mails.list' and function \code{read.mails}) @@ -545,6 +420,119 @@ create.empty.authors.list = function() { return (create.empty.data.frame(AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)) } +## * Commit message data --------------------------------------------------- + +## column names of a dataframe containing commit messages (see file +## 'commitMessages.list' and function \code{read.commit.messages}) +COMMIT.MESSAGE.LIST.COLUMNS = c( + "commit.id", # id + "hash", "title", "message" +) + +## declare the datatype for each column in the constant 'COMMIT.MESSAGE.LIST.COLUMNS' +COMMIT.MESSAGE.LIST.DATA.TYPES = c( + "character", + "character", "character", "character" +) + +## declare
the constant (5 spaces) which is used by codeface to separate lines in +## commit messages +COMMIT.MESSAGE.LINE.SEP.CODEFACE = paste0(rep(" ", 5), collapse = "") +## declare the constant to how line breaks should look like in the data +COMMIT.MESSAGE.LINE.SEP.REPLACE = "\n" + +#' Read the commit messages from the 'commitMessages.list' file. +#' Turn line breaks represented with five spaces into \n line breaks and +#' ignore initial spaces. Also remove spaces at the beginning and the end of +#' the message. +#' +#' @param data.path the path to the commit-messages list +#' +#' @return a data frame with id, hash, title and message body´ +read.commit.messages = function(data.path) { + logging::logdebug("read.commit.messages: starting.") + + ## read the file with the commit messages + file = file.path(data.path, "commitMessages.list") + + commit.message.data = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, + encoding = "UTF-8"), silent = TRUE) + + ## handle the case that the list of commits is empty + if (inherits(commit.message.data, "try-error")) { + logging::logwarn("There are no commit messages available for the current environment.") + logging::logwarn("Datapath: %s", data.path) + + ## return a dataframe with the correct columns but zero rows + return(create.empty.commit.message.list()) + } + + ## set column names for new data frame; unprocessed data only has three columns so omit the "title" column + colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS[COMMIT.MESSAGE.LIST.COLUMNS != "title"] + ## split the message string with the new line symbol + message.split = strsplit(commit.message.data[["message"]], COMMIT.MESSAGE.LINE.SEP.CODEFACE) + + ## prepare the 'message.split' object so that it contains a two-element + ## vector for each commit + for (i in seq_along(message.split)) { + v = message.split[[i]] + + ## clear the message from empty lines + message.split[[i]] = v[v != ""] + + ## remove spaces before first line + 
message.split[[i]] = gsub("^\\s+", "", message.split[[i]]) + ## remove spaces at the end of the message + message.split[[i]] = gsub("$\\s+", "", message.split[[i]]) + + ## if the commit message was completely empty, add empty title and body + if (length(message.split[[i]]) == 0) { + message.split[[i]] = c("", "") + } + ## if there is only one line, create an empty body + else if (length(message.split[[i]]) == 1) { + message.split[[i]] = c(message.split[[i]], "") + } + ## if there are more than two lines, merge all except for the first one + else if (length(message.split[[i]]) > 2) { + message.split[[i]] = c(message.split[[i]][[1]], + paste(tail(message.split[[i]], -1), + ## use an ascii line break instead + collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) + } + } + + ## convert list of vectors to a data frame with two columns + message.split = as.data.frame(do.call(rbind, message.split)) + colnames(message.split) = c("title", "message") + + ## create a data frame containing all four necessary columns + commit.message.data = data.frame(commit.message.data[["commit.id"]], # commit.id + commit.message.data[["hash"]], # hash + message.split[["title"]], # title + message.split[["message"]]) # message + + ## set all the column names + colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS + + ## Make commit.id have numeric type and set row names + commit.message.data[["commit.id"]] = sprintf("<commit-%s>", commit.message.data[["commit.id"]]) + row.names(commit.message.data) = seq_len(nrow(commit.message.data)) + + logging::logdebug("read.commit.messages: finished.") + + return(commit.message.data) +} + +#' Create a empty dataframe which has the same shape as a dataframe containing commit messages. +#' The dataframe has the column names and column datatypes defined in \code{COMMIT.MESSAGE.LIST.COLUMNS} and +#' \code{COMMIT.MESSAGE.LIST.DATA.TYPES}, respectively.
+#' +#' @return the empty dataframe +create.empty.commit.message.list = function() { + return (create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) +} + ## * PaStA data ------------------------------------------------------------ ## column names of a dataframe containing PaStA data (see function \code{read.pasta}) From eb1cec82f251f0d76219d675c266b81bb1ed584c Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Fri, 15 Jan 2021 15:18:42 +0100 Subject: [PATCH 21/43] Fix comments in and change order in 'set.commits' Also adapt 'update.commit.messages' to better match the implementation of similar methods. Add 'set.commit.messages' in order to be able to set the commit messages to NULL. See #193. Signed-off-by: Niklas Schneider --- tests/test-data.R | 12 +++---- util-data.R | 88 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 66 insertions(+), 34 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index 108e5a8f..9148e518 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -222,12 +222,12 @@ test_that("Merge commit messages to commit data", { title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", "", ""), message = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "", "")) - # throw away the row names as they are permuted when merging and - # we do not care for their order in the test + ## throw away the row names as they are permuted when merging and + ## we do not care for their order in the test rownames(commits) = NULL rownames(commit.data.expected) = NULL - expect_identical(commits, commit.data.expected) + expect_identical(commits, commit.data.expected, info = "Add commit messages with title") }) test_that("Merge commit message titles to commit data", { @@ -269,10 +269,10 @@ test_that("Merge commit message titles to commit data", { artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1, 1)), 
title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", "", "")) - # throw away the row names as they are permuted when merging and - # we do not care for their order in the test + ## throw away the row names as they are permuted when merging and + ## we do not care for their order in the test rownames(commits) = NULL rownames(commit.data.expected) = NULL - expect_identical(commits, commit.data.expected) + expect_identical(commits, commit.data.expected, info = "Add only commit title") }) diff --git a/util-data.R b/util-data.R index ab4e3957..727cfd62 100644 --- a/util-data.R +++ b/util-data.R @@ -425,20 +425,19 @@ ProjectData = R6::R6Class("ProjectData", ## * * commit messages --------------------------------------------- - #' Add the columns \code{title} and \code{messages} to commits using the selected - #' configuration option of \code{commit.messages} and the results of the - #' \code{get.commit.messages}. + #' Add the columns \code{title} and \code{message} to commits using the selected + #' configuration option of \code{commit.messages} and the results of the function \code{get.commit.messages}. 
update.commit.message.data = function() { logging::loginfo("Merging commit messages into commit data.") if (!is.null(private$commits)) { ## get commit messages - commit.messages = self$get.commit.messages() + commit.messages = private$commit.messages ## drop the commit.id column as we do not want it twice commit.messages = commit.messages[ , colnames(commit.messages) != "commit.id"] - ## now there are only three columns left: commit.id, title, message.body + ## now there are only three columns left: commit.id, title, message ## check whether to include only title or also the messages if (private$project.conf$get.value("commit.messages") == "title") { commit.messages = commit.messages[ , colnames(commit.messages) != "message"] @@ -711,24 +710,6 @@ ProjectData = R6::R6Class("ProjectData", return(private$commits) }, - #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. - #' If the list of commits is not cached in the field \code{commit.messages}, call the read method first. - #' - #' @return the list of commit messages - get.commit.messages = function() { - logging::loginfo("Getting commit messages.´") - - ## if commit messages are not read already, do this - if (is.null(private$commit.messages)) { - commit.message.data = read.commit.messages(self$get.data.path()) - - ## cache the result - private$commit.messages = commit.message.data - } - - return(private$commit.messages) - }, - #' Set the commit list of the project to a new one. #' Add PaStA and synchronicity data if configured in the \code{project.conf} #' as well as commit message data. 
@@ -744,6 +725,18 @@ ProjectData = R6::R6Class("ProjectData", ## store commit data private$commits = commit.data + ## add commit message data if wanted + if (private$project.conf$get.value("commit.messages") != "none") { + if (is.null(private$synchronicity)) { + ## get data that has been cached before + self$get.commit.messages() + } else { + ## update the commit message data + private$update.commit.message.data() + } + + } + ## add synchronicity data if wanted if (private$project.conf$get.value("synchronicity")) { if (is.null(private$synchronicity)) { @@ -766,11 +759,6 @@ ProjectData = R6::R6Class("ProjectData", } } - ## add commit message data if wanted - if (private$project.conf$get.value("commit.messages") != "none") { - private$update.commit.message.data() - } - ## sort by date private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] @@ -779,6 +767,50 @@ ProjectData = R6::R6Class("ProjectData", private$commits.filtered = NULL }, + #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. + #' If the list of commits is not cached in the field \code{commit.messages}, call the read method first. + #' + #' @return the list of commit messages + get.commit.messages = function() { + logging::loginfo("Getting commit messages.´") + + if (private$project.conf$get.value("commit.messages") != "none") { + ## if commit messages are not read already, do this + if (is.null(private$commit.messages)) { + commit.message.data = read.commit.messages(self$get.data.path()) + + ## cache the result + private$commit.messages = commit.message.data + + private$update.commit.message.data() + } + } else { + logging::logwarn("You have set the ProjectConf parameter 'commit.messages' to 'none'! 
Ignoring...") + ## mark synchronicity data as empty + self$set.commit.messages(NULL) + } + + return(private$commit.messages) + }, + + #' Set the commit message data to the given new data and, if configured in the field \code{project.conf}, + #' also update it for the commit data. + #' + #' @param data the new commit message data + set.commit.messages = function(data) { + logging::loginfo("Setting commit messages data.") + + if (is.null(data)) { + data = create.empty.commit.message.list() + } + + ## set the actual data + private$commit.messages = data + + ## add commit message data to the commit data if configured + update.commit.message.data() + }, + #' Get the synchronicity data. If it is not already stored in the ProjectData, this function triggers a read in #' from disk. #' From d5c8c7891dd5629a7e4e0ad20331b1743f0dcba4 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Fri, 15 Jan 2021 15:49:59 +0100 Subject: [PATCH 22/43] Add helper function to format 'commit.id' column Introduce new function 'format.commit.ids' along with a new section in util-read.R. Also put format "<commit-%s>" into a constant. See #193 Signed-off-by: Niklas Schneider --- util-read.R | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/util-read.R b/util-read.R index 69b3a6f6..f4544091 100644 --- a/util-read.R +++ b/util-read.R @@ -161,7 +161,7 @@ read.commits = function(data.path, artifact) { commit.data = commit.data[order(commit.data[["date"]], decreasing = FALSE), ] # sort!
## set pattern for commit ID for better recognition - commit.data[["commit.id"]] = sprintf("<commit-%s>", commit.data[["commit.id"]]) + commit.data[["commit.id"]] = format.commit.ids(commit.data[["commit.id"]]) row.names(commit.data) = seq_len(nrow(commit.data)) ## store the commit data @@ -516,7 +516,7 @@ read.commit.messages = function(data.path) { colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS ## Make commit.id have numeric type and set row names - commit.message.data[["commit.id"]] = sprintf("<commit-%s>", commit.message.data[["commit.id"]]) + commit.message.data[["commit.id"]] = format.commit.ids(commit.message.data[["commit.id"]]) row.names(commit.message.data) = seq_len(nrow(commit.message.data)) logging::logdebug("read.commit.messages: finished.") @@ -679,3 +679,20 @@ read.synchronicity = function(data.path, artifact, time.window) { create.empty.synchronicity.list = function() { return (create.empty.data.frame(SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)) } + + +## Helper functions -------------------------------------------------------- + +## declare a global format for the commit.id column in several data frames +COMMIT.ID.FORMAT = "<commit-%s>" + +#' Format a vector of commit ids into a global format +#' +#' @param commit.ids a vector containing all the commit ids to be formatted +#' +#' @return a vector with the formatted commit ids +format.commit.ids = function(commit.ids) { + return (sprintf(COMMIT.ID.FORMAT, commit.ids)) +} + + From 43e1894998e18faff3a65114fa65ee54e1d2f66e Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Fri, 15 Jan 2021 16:40:44 +0100 Subject: [PATCH 23/43] Change commit message merge process Take advice by @clhunsen to replace if else cascade for rearranging columns with better merge call. Also modify test-data tests regarding commit messages: Row names are no longer ignored.
See #193 Signed-off-by: Niklas Schneider --- tests/test-data.R | 10 ---------- util-data.R | 24 ++++++++---------------- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index 9148e518..e2450a96 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -222,11 +222,6 @@ test_that("Merge commit messages to commit data", { title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", "", ""), message = c("", "", "the things are\nnothing", "", "intensifies", "still\ndoesn't\nwork\nas expected", "", "")) - ## throw away the row names as they are permuted when merging and - ## we do not care for their order in the test - rownames(commits) = NULL - rownames(commit.data.expected) = NULL - expect_identical(commits, commit.data.expected, info = "Add commit messages with title") }) @@ -269,10 +264,5 @@ test_that("Merge commit message titles to commit data", { artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1, 1)), title = c("Add stuff", "Add some more stuff", "I added important things", "I wish it would work now", "Wish", "...", "", "")) - ## throw away the row names as they are permuted when merging and - ## we do not care for their order in the test - rownames(commits) = NULL - rownames(commit.data.expected) = NULL - expect_identical(commits, commit.data.expected, info = "Add only commit title") }) diff --git a/util-data.R b/util-data.R index 727cfd62..117f7889 100644 --- a/util-data.R +++ b/util-data.R @@ -434,28 +434,19 @@ ProjectData = R6::R6Class("ProjectData", ## get commit messages commit.messages = private$commit.messages - ## drop the commit.id column as we do not want it twice - commit.messages = commit.messages[ , colnames(commit.messages) != "commit.id"] - ## now there are only three columns left: commit.id, title, message ## check whether to include only title or also the messages if (private$project.conf$get.value("commit.messages") == "title") { 
commit.messages = commit.messages[ , colnames(commit.messages) != "message"] } + ## get a vector with the column names in the right order + col.names = unique(c(colnames(private$commits), colnames(commit.messages))) ## merge them into the commit data - commit.data = merge(private$commits, commit.messages, by.x = "hash", by.y = "hash") - - ## when merging by hash, the hash column is taken as the first column of the - ## resulting data frame - ## change that order back depending on how many columns the new data frame has - if (private$project.conf$get.value("commit.messages") == "title") { - ## one column less as message.body is not included - private$commits = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17)] - } - else { - private$commits = commit.data[, c(2, 3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] - } + private$commits = merge(private$commits, commit.messages, + by = c("commit.id", "hash"), all.x = TRUE, sort = FALSE) + ## adjust the column order + private$commits = private$commits[col.names] } }, @@ -774,7 +765,8 @@ ProjectData = R6::R6Class("ProjectData", get.commit.messages = function() { logging::loginfo("Getting commit messages.´") - if (private$project.conf$get.value("commit.messages") != "none") { + if (private$project.conf$get.value("commit.messages") == "title" | + private$project.conf$get.value("commit.messages") == "message") { ## if commit messages are not read already, do this if (is.null(private$commit.messages)) { commit.message.data = read.commit.messages(self$get.data.path()) From 70b3cb6c29a45efa772d99a74fef47f6b8474756 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sat, 16 Jan 2021 20:55:22 +0100 Subject: [PATCH 24/43] Change order of data sources to be alphabetical Change order in 'README.md', 'util-conf.R' and 'util-data.R' Also fix table of contents in the readme. 
See #193 Signed-off-by: Niklas Schneider --- README.md | 12 ++++++------ util-conf.R | 24 ++++++++++++------------ util-data.R | 54 ++++++++++++++++++++++++++--------------------------- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 9a2aa4d2..8f2f3ac5 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ If you wonder: The name `coronet` derives as an acronym from the words "configur ## Table of contents -- [coronet - The network library](#coronet---the-network-library) - - [Table of contents](#table-of-contents) + - [Integration](#integration) - [Requirements](#requirements) - [`R`](#r) @@ -182,14 +181,15 @@ There are two distinguishable types of data sources that are both handled by the * Issue data (called `"issues"` internally) - Additional (orthogonal) data sources (augmentable to main data sources, not splittable) + * Commit messages are available through the parameter `commit.messages` in the [`ProjectConf`](#configurable-data-retrieval-related-parameters) class. Three values can be used: + 1. `none` is the default value and does not impact the configuration at all. + 2. `title` merges the commit message titles (i.e. the first non white space line of a commit message) to the commit data. This gives the data frame an additional column `title`. + 3. `messages` merges both titles and message bodies to the commit data frame. This adds two new columns `title` and `message`. 
* [PaStA](https://github.com/lfd/PaStA/) data (patch-stack analysis, see also the parameter `pasta` in the [`ProjectConf`](#configurable-data-retrieval-related-parameters) class)) * Patch-stack analysis to link patches sent to mailing lists and upstream commits * Synchronicity information on commits (see also the parameter `synchronicity` in the [`ProjectConf`](#configurable-data-retrieval-related-parameters) class) * Synchronous commits are commits that change a source-code artifact that has also been changed by another author within a reasonable time-window. - * Commit messages are available through the parameter `commit.messages`. Three values can be used: - 1. `none` is the default value and does not impact the configuration at all. - 2. `title` merges the commit message titles (i.e. the first non white space line of a commit message) to the commit data. This gives the data frame an additional column `title`. - 3. `messages` merges both titles and message bodies to the commit data frame. This adds two new columns `title` and `message`. + The important difference is that the *main data sources* are used internally to construct artifact vertices in relevant types of networks. Additionally, these data sources can be used as a basis for splitting `ProjectData` in a time-based or activity-based manner – obtaining `RangeData` instances as a result (see file `split.R` and the contained functions). Thus, `RangeData` objects contain only data of a specific period of time. 
diff --git a/util-conf.R b/util-conf.R index c973330a..4e5b2dbf 100644 --- a/util-conf.R +++ b/util-conf.R @@ -363,6 +363,18 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, allowed = c("none", "title", "message"), allowed.number = 1 ), + issues.only.comments = list( + default = TRUE, + type = "logical", + allowed = c(TRUE, FALSE), + allowed.number = 1 + ), + issues.from.source = list( + default = c("jira", "github"), + type = "character", + allowed = c("jira", "github"), + allowed.number = Inf + ), mails.filter.patchstack.mails = list( default = FALSE, type = "logical", @@ -386,18 +398,6 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, type = "logical", allowed = c(TRUE, FALSE), allowed.number = 1 - ), - issues.only.comments = list( - default = TRUE, - type = "logical", - allowed = c(TRUE, FALSE), - allowed.number = 1 - ), - issues.from.source = list( - default = c("jira", "github"), - type = "character", - allowed = c("jira", "github"), - allowed.number = Inf ) ), diff --git a/util-data.R b/util-data.R index 117f7889..bea0855e 100644 --- a/util-data.R +++ b/util-data.R @@ -220,6 +220,33 @@ ProjectData = R6::R6Class("ProjectData", return(mails) }, + ## * * commit message data ------------------------------------------ + + #' Add the columns \code{title} and \code{message} to commits using the selected + #' configuration option of \code{commit.messages} and the results of the function \code{get.commit.messages}. 
+ update.commit.message.data = function() { + logging::loginfo("Merging commit messages into commit data.") + + if (!is.null(private$commits)) { + ## get commit messages + commit.messages = private$commit.messages + + ## now there are only three columns left: commit.id, title, message + ## check whether to include only title or also the messages + if (private$project.conf$get.value("commit.messages") == "title") { + commit.messages = commit.messages[ , colnames(commit.messages) != "message"] + } + + ## get a vector with the column names in the right order + col.names = unique(c(colnames(private$commits), colnames(commit.messages))) + ## merge them into the commit data + private$commits = merge(private$commits, commit.messages, + by = c("commit.id", "hash"), all.x = TRUE, sort = FALSE) + ## adjust the column order + private$commits = private$commits[col.names] + } + }, + ## * * PaStA data -------------------------------------------------- #' Use the information about the deleted patchstack mails that are stored in the field \code{patchstack.mails} @@ -423,33 +450,6 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("update.synchronicity.data: finished.") }, - ## * * commit messages --------------------------------------------- - - #' Add the columns \code{title} and \code{message} to commits using the selected - #' configuration option of \code{commit.messages} and the results of the function \code{get.commit.messages}. 
- update.commit.message.data = function() { - logging::loginfo("Merging commit messages into commit data.") - - if (!is.null(private$commits)) { - ## get commit messages - commit.messages = private$commit.messages - - ## now there are only three columns left: commit.id, title, message - ## check whether to include only title or also the messages - if (private$project.conf$get.value("commit.messages") == "title") { - commit.messages = commit.messages[ , colnames(commit.messages) != "message"] - } - - ## get a vector with the column names in the right order - col.names = unique(c(colnames(private$commits), colnames(commit.messages))) - ## merge them into the commit data - private$commits = merge(private$commits, commit.messages, - by = c("commit.id", "hash"), all.x = TRUE, sort = FALSE) - ## adjust the column order - private$commits = private$commits[col.names] - } - }, - ## * * timestamps -------------------------------------------------- #' Call the getters of the specified data sources in order to From 31e0f85ac145f61d94a7b5abf8c5abd3c925201a Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sat, 16 Jan 2021 21:03:59 +0100 Subject: [PATCH 25/43] Update 'NEWS.md' with commit hashes See #193 Signed-off-by: Niklas Schneider --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4fca0a99..ebb86686 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ ## Unversioned ### Added -- Add functionality to read and process commit messages in order to merge them to the commit data (see issue #180). Three values are available for the new attribute `commit.messages` in `ProjectConf`: `none`, `title` and `messages`. +- Add functionality to read and process commit messages in order to merge them to the commit data (see issue #180). 
Three values are available for the new attribute `commit.messages` in `ProjectConf`: `none`, `title` and `messages` (PR #193, 85b1d0572c0fb9f4c062bceb1363b0398f98b85f, fdc414ade1a640f533e809a25cfe012e42b3cffa, 43e1894998e18faff3a65114fa65ee54e1d2f66e) ### Changed/Improved From a0d5e327b57d7afa3e32b194285efb11522b9d94 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 20 Jan 2021 15:02:14 +0100 Subject: [PATCH 26/43] Add package 'data.table' to coronet and refactor README Add the package in 'install.R' and a description in the 'README.md'. Also rearrange the parameter descriptions of 'ProjectConf' to be sorted alphabetically. See #193 Signed-off-by: Niklas Schneider --- README.md | 22 ++++++++++++---------- install.R | 1 + 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8f2f3ac5..aae56ddf 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,6 @@ If you wonder: The name `coronet` derives as an acronym from the words "configur ## Table of contents - - - [Integration](#integration) - [Requirements](#requirements) - [`R`](#r) @@ -125,6 +123,7 @@ Alternatively, you can run `Rscript install.R` to install the packages. - `parallel`: For parallelization - `logging`: Logging - `sqldf`: For advanced aggregation of `data.frame` objects +- `data.table`: For faster data processing. - `testthat`: For the test suite - `patrick`: For the test suite - `ggplot2`: For plotting of data @@ -539,16 +538,23 @@ There is no way to update the entries, except for the revision-based parameters. - `commits.filter.untracked.files` * Remove all information concerning untracked files from the commit data. This effect becomes clear when retrieving commits using `get.commits.filtered`, because then the result of which does not contain any commits that solely changed untracked files. Networks built on top of this `ProjectData` do also not contain any information about untracked files. 
* [*`TRUE`*, `FALSE`] -- `mails.filter.patchstack.mails` - * Filter patchstack mails from the mail data. In a thread, a patchstack spans the first sequence of mails where each mail has been authored by the thread creator and has been sent within a short time window after the preceding mail. The mails spanned by a patchstack are called -'patchstack mails' and for each patchstack, every patchstack mail but the first one are filtered when `mails.filter.patchstack.mails = TRUE`. - * [`TRUE`, *`FALSE`*] +- `commit.messages` + * Read and add commit messages to commits. The column `title` will contain the first line of the message and, if selected, the column `message` will contain the rest. + * [*`none`*, `title`, `messages`] - `issues.only.comments` * Only use comments from the issue data on disk and no further events such as references and label changes * [*`TRUE`*, `FALSE`] - `issues.from.source` * Choose from which sources the issue data on disk is read in. Multiple sources can be chosen. * [*`github`, `jira`*] +- `mails.filter.patchstack.mails` + * Filter patchstack mails from the mail data. In a thread, a patchstack spans the first sequence of mails where each mail has been authored by the thread creator and has been sent within a short time window after the preceding mail. The mails spanned by a patchstack are called +'patchstack mails' and for each patchstack, every patchstack mail but the first one are filtered when `mails.filter.patchstack.mails = TRUE`. + * [`TRUE`, *`FALSE`*] +- `pasta` + * Read and integrate [PaStA](https://github.com/lfd/PaStA/) data with commit and mail data (columns `pasta` and `revision.set.id`) + * [`TRUE`, *`FALSE`*] + * **Note**: To include PaStA-based edge attributes, you need to give the `"pasta"` edge attribute for `edge.attributes`.
- `synchronicity` * Read and add synchronicity data to commits (column `synchronicity`) * [`TRUE`, *`FALSE`*] @@ -557,10 +563,6 @@ There is no way to update the entries, except for the revision-based parameters. * The time-window (in days) to use for synchronicity data if enabled by `synchronicity = TRUE` * [1, *5*, 10, 15] * **Note**: If, at least, one artifact in a commit has been edited by more than one developer within the configured time window, then the whole commit is considered to be synchronous. -- `pasta` - * Read and integrate [PaStA](https://github.com/lfd/PaStA/) data with commit and mail data (columns `pasta` and `revision.set.id`) - * [`TRUE`, *`FALSE`*] - * **Note**: To include PaStA-based edge attributes, you need to give the `"pasta"` edge attribute for `edge.attributes`. ### NetworkConf diff --git a/install.R b/install.R index d796d8d0..570dbe29 100644 --- a/install.R +++ b/install.R @@ -32,6 +32,7 @@ packages = c( "parallel", "logging", "sqldf", + "data.table", "testthat", "patrick", "ggplot2", From 4c4926918da223223d37d8e82bd9df2fec49db60 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 20 Jan 2021 15:04:10 +0100 Subject: [PATCH 27/43] Increase performance of commit message read Use the new data.table package to replace do.call with data.table::rbindlist which is faster in processing data.frames.
See #193 Signed-off-by: Niklas Schneider --- util-read.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util-read.R b/util-read.R index f4544091..dd4d4f03 100644 --- a/util-read.R +++ b/util-read.R @@ -36,7 +36,7 @@ requireNamespace("parallel") # for parallel computation requireNamespace("plyr") requireNamespace("digest") # for sha1 hashing of IDs requireNamespace("sqldf") # for SQL-selections on data.frames - +requireNamespace("data.table") # for faster data.frame processing ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Main data sources ------------------------------------------------------- @@ -503,7 +503,7 @@ read.commit.messages = function(data.path) { } ## convert list of vectors to a data frame with two columns - message.split = as.data.frame(do.call(rbind, message.split)) + message.split = data.table::rbindlist(lapply(message.split, as.data.frame.list)) colnames(message.split) = c("title", "message") ## create a data frame containing all four necessary columns From 19655ddd706a4fe63e9234addcfe39a5960a08fd Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 20 Jan 2021 15:10:09 +0100 Subject: [PATCH 28/43] Update my copyright notices Signed-off-by: Niklas Schneider --- tests/test-data.R | 2 +- tests/test-read.R | 2 +- util-conf.R | 2 +- util-data.R | 2 +- util-read.R | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index e2450a96..27936cbe 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -14,7 +14,7 @@ ## Copyright 2018 by Christian Hechtl ## Copyright 2018-2019 by Claus Hunsen ## Copyright 2019 by Jakob Kronawitter -## Copyright 2020 by Niklas Schneider +## Copyright 2020-2021 by Niklas Schneider ## All Rights Reserved. 
diff --git a/tests/test-read.R b/tests/test-read.R index 526a5e1f..0df71de1 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -17,7 +17,7 @@ ## Copyright 2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker -## Copyright 2020 by Niklas Schneider +## Copyright 2020-2021 by Niklas Schneider ## All Rights Reserved. diff --git a/util-conf.R b/util-conf.R index 4e5b2dbf..05fb4670 100644 --- a/util-conf.R +++ b/util-conf.R @@ -21,7 +21,7 @@ ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker -## Copyright 2020 by Niklas Schneider +## Copyright 2020-21 by Niklas Schneider ## All Rights Reserved. diff --git a/util-data.R b/util-data.R index bea0855e..036f8db7 100644 --- a/util-data.R +++ b/util-data.R @@ -21,7 +21,7 @@ ## Copyright 2017 by Ferdinand Frank ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019-2020 by Anselm Fehnker -## Copyright 2020 by Niklas Schneider +## Copyright 2020-2021 by Niklas Schneider ## All Rights Reserved. diff --git a/util-read.R b/util-read.R index dd4d4f03..704335bc 100644 --- a/util-read.R +++ b/util-read.R @@ -19,7 +19,7 @@ ## Copyright 2017-2018 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker -## Copyright 2020 by Niklas Schneider +## Copyright 2020-2021 by Niklas Schneider ## All Rights Reserved. ## Note: From a36bde46746110d188b2df9d1de9ccd507ef956b Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Wed, 20 Jan 2021 15:37:44 +0100 Subject: [PATCH 29/43] Fix spelling errors in 'README.md' and 'util-conf.R' See #193 Signed-off-by: Niklas Schneider --- README.md | 2 +- util-conf.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aae56ddf..3c12206c 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ Alternatively, you can run `Rscript install.R` to install the packages. 
- `parallel`: For parallelization - `logging`: Logging - `sqldf`: For advanced aggregation of `data.frame` objects -- `data.table`: For faster data processing. +- `data.table`: For faster data processing - `testthat`: For the test suite - `patrick`: For the test suite - `ggplot2`: For plotting of data diff --git a/util-conf.R b/util-conf.R index 05fb4670..5aeff8b3 100644 --- a/util-conf.R +++ b/util-conf.R @@ -21,7 +21,7 @@ ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker -## Copyright 2020-21 by Niklas Schneider +## Copyright 2020-2021 by Niklas Schneider ## All Rights Reserved. From aab07515f1498701371a1e82c50334ef2906e3d1 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 10:27:35 +0100 Subject: [PATCH 30/43] Use new helper function in tests to format commit ids Remove hardcoded string formatting and replace it in tests for creating expected data using the new function 'format.commit.ids'. See #193 Signed-off-by: Niklas Schneider --- tests/test-data.R | 4 ++-- tests/test-read.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test-data.R b/tests/test-data.R index 27936cbe..8794abef 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -190,7 +190,7 @@ test_that("Merge commit messages to commit data", { commits = proj.data$get.commits() - commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713, 32710, 32714, 32715, 32716, + commit.data.expected = data.frame(commit.id = format.commit.ids(c(32712, 32713, 32710, 32714, 32715, 32716, 32711, 32711)), date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 16:06:30", @@ -233,7 +233,7 @@ test_that("Merge commit message titles to commit data", { commits = proj.data$get.commits() - commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713, 32710, 32714, 32715, 32716, + 
commit.data.expected = data.frame(commit.id = format.commit.ids(c(32712, 32713, 32710, 32714, 32715, 32716, 32711, 32711)), date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 16:06:30", diff --git a/tests/test-read.R b/tests/test-read.R index 0df71de1..0e78b848 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -44,7 +44,7 @@ test_that("Read the raw commit data with the feature artifact.", { commit.data.read = read.commits(proj.conf$get.value("datapath"), proj.conf$get.value("artifact")) ## build the expected data.frame - commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32712, 32713, 32713, 32710, 32710, 32714, 32715, 32716, + commit.data.expected = data.frame(commit.id = format.commit.ids(c(32712, 32712, 32713, 32713, 32710, 32710, 32714, 32715, 32716, 32711, 32711)), date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:00:45", "2016-07-12 16:05:41", "2016-07-12 16:05:41", @@ -101,7 +101,7 @@ test_that("Read the raw commit data with the file artifact.", { commit.data.read = read.commits(proj.conf$get.value("datapath"), proj.conf$get.value("artifact")) ## build the expected data.frame - commit.data.expected = data.frame(commit.id = sprintf("", c(32716, 32717, 32718, 32719, 32720, 32721, 32715)), + commit.data.expected = data.frame(commit.id = format.commit.ids(c(32716, 32717, 32718, 32719, 32720, 32721, 32715)), date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:20", "2016-07-12 16:06:30", "2016-07-12 16:06:32")), @@ -146,7 +146,7 @@ test_that("Read the commit message data.", { commit.message.data.read = read.commit.messages(proj.conf$get.value("datapath")) ## build the expected data.frame - commit.data.expected = data.frame(commit.id = sprintf("", c(32712, 32713, 32710, 32714, 32715, 
32716, 32711)), + commit.data.expected = data.frame(commit.id = format.commit.ids(c(32712, 32713, 32710, 32714, 32715, 32716, 32711)), hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", "418d1dc4929ad1df251d2aeb833dd45757b04a6f", "d01921773fae4bed8186b0aa411d6a2f7a6626e6", From 0859b9afa0fb5c9d39b15c594af33b572e16d7a9 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 10:30:15 +0100 Subject: [PATCH 31/43] Replace for-loop with lapply call in function to read commit messages Follow @clhunsen's advice to create commit message data with an lapply to avoid having a for-loop and an additional lapply call afterwards See #193 Signed-off-by: Niklas Schneider --- util-read.R | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/util-read.R b/util-read.R index 704335bc..bc7742be 100644 --- a/util-read.R +++ b/util-read.R @@ -472,45 +472,42 @@ read.commit.messages = function(data.path) { ## split the message string with the new line symbol message.split = strsplit(commit.message.data[["message"]], COMMIT.MESSAGE.LINE.SEP.CODEFACE) - ## prepare the 'message.split' object so that it contains a two-element - ## vector for each commit - for (i in seq_along(message.split)) { - v = message.split[[i]] - + ## prepare the 'message.split' object so that it contains a two-element vector for each commit + message.split.df = lapply(message.split, function(tuple) { ## clear the message from empty lines - message.split[[i]] = v[v != ""] + lines = tuple[tuple != ""] ## remove spaces before first line - message.split[[i]] = gsub("^\\s+", "", message.split[[i]]) + lines = gsub("^\\s+", "", lines) ## remove spaces at the end of the message - message.split[[i]] = gsub("$\\s+", "", message.split[[i]]) + lines = gsub("$\\s+", "", lines) + + ## set title and message empty in case there 
was on actual commit message or it was consisting of spaces only + title = "" + message = "" - ## if the commit message was completely empty, add empty title and body - if (length(message.split[[i]]) == 0) { - message.split[[i]] = c("", "") - } ## if there is only one line, create an empty body - else if (length(message.split[[i]]) == 1) { - message.split[[i]] = c(message.split[[i]], "") + if (length(lines) == 1) { + title = lines[[1]] } ## if there are more than two lines, merge all except for the first one - else if (length(message.split[[i]]) > 2) { - message.split[[i]] = c(message.split[[i]][[1]], - paste(tail(message.split[[i]], -1), - ## use an ascii line break instead - collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE)) + else if (length(lines) >= 2) { + title = lines[[1]] + ## use an ascii line break instead + message = paste(tail(lines, -1), collapse = COMMIT.MESSAGE.LINE.SEP.REPLACE) } - } - ## convert list of vectors to a data frame with two columns - message.split = data.table::rbindlist(lapply(message.split, as.data.frame.list)) - colnames(message.split) = c("title", "message") + return(data.table::data.table(title = title, message = message)) + }) + + ## convert to a data.table with two columns + message.split.df = data.table::rbindlist(message.split.df) ## create a data frame containing all four necessary columns commit.message.data = data.frame(commit.message.data[["commit.id"]], # commit.id commit.message.data[["hash"]], # hash - message.split[["title"]], # title - message.split[["message"]]) # message + message.split.df[["title"]], # title + message.split.df[["message"]]) # message ## set all the column names colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS From fc5d20f117867a2f7939b957f7f2dbc9fc069846 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 10:35:42 +0100 Subject: [PATCH 32/43] Fix minor comment issues and add checks before updating commit messages Add check for the ProjectConf attribute 'commit.messages' 
before calling 'update.commit.messages'. Also fix a few errors in comments as well as one if condition where the wrong attribute was checked. See #193 Signed-off-by: Niklas Schneider --- util-data.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/util-data.R b/util-data.R index 036f8db7..54d339e7 100644 --- a/util-data.R +++ b/util-data.R @@ -718,7 +718,7 @@ ProjectData = R6::R6Class("ProjectData", ## add commit message data if wanted if (private$project.conf$get.value("commit.messages") != "none") { - if (is.null(private$synchronicity)) { + if (is.null(private$scommit.messages)) { ## get data that has been cached before self$get.commit.messages() } else { @@ -778,7 +778,7 @@ ProjectData = R6::R6Class("ProjectData", } } else { logging::logwarn("You have set the ProjectConf parameter 'commit.messages' to 'none'! Ignoring...") - ## mark synchronicity data as empty + ## mark commit messages data as empty self$set.commit.messages(NULL) } @@ -800,7 +800,10 @@ ProjectData = R6::R6Class("ProjectData", private$commit.messages = data ## add commit message data to the commit data if configured - update.commit.message.data() + if (private$project.conf$get.value("commit.messages") == "title" | + private$project.conf$get.value("commit.messages") == "message") { + update.commit.message.data() + } }, #' Get the synchronicity data. If it is not already stored in the ProjectData, this function triggers a read in From 686459ebe92b464bb42d994e567616e22dffcf58 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 11:49:28 +0100 Subject: [PATCH 33/43] Initialize commit message data on RangeData-objects in 'util-split.R' Add the getter call to the 'additional.data' list. 
See #193 Signed-off-by: Niklas Schneider --- util-split.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util-split.R b/util-split.R index aedc276b..2d7f1531 100644 --- a/util-split.R +++ b/util-split.R @@ -19,6 +19,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock ## Copyright 2020 by Thomas Bock +## Copyright 2021 by Niklas Schneider ## All Rights Reserved. @@ -70,6 +71,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = ## initialize additional data sources to avoid multiple redundant initalizations later additional.data = list( authors = project.data$get.authors(), + commit.messages = project.data$get.commit.messages(), pasta = project.data$get.pasta(), synchronicity = project.data$get.synchronicity() ) @@ -157,7 +159,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = setter.name = sprintf("set.%s", data.source) cf.range.data[[setter.name]](df.list[[data.source]]) } - ## set additional data sources: authors, pasta, synchronicity + ## set additional data sources: authors, commit.messages, pasta, synchronicity for (data.source in additional.data.sources) { setter.name = sprintf("set.%s", data.source) cf.range.data[[setter.name]](additional.data[[data.source]]) From 613a773afe4aec27c19ca94f5d3f5ee2b06284f3 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 11:51:24 +0100 Subject: [PATCH 34/43] Fix minor spelling errors See #193 Signed-off-by: Niklas Schneider --- util-data.R | 2 +- util-read.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/util-data.R b/util-data.R index 54d339e7..711be016 100644 --- a/util-data.R +++ b/util-data.R @@ -718,7 +718,7 @@ ProjectData = R6::R6Class("ProjectData", ## add commit message data if wanted if (private$project.conf$get.value("commit.messages") != "none") { - if (is.null(private$scommit.messages)) { + if (is.null(private$commit.messages)) { ## get data that has been cached 
before self$get.commit.messages() } else { diff --git a/util-read.R b/util-read.R index bc7742be..e752372c 100644 --- a/util-read.R +++ b/util-read.R @@ -482,7 +482,7 @@ read.commit.messages = function(data.path) { ## remove spaces at the end of the message lines = gsub("$\\s+", "", lines) - ## set title and message empty in case there was on actual commit message or it was consisting of spaces only + ## set title and message empty in case there was no actual commit message or it was consisting of spaces only title = "" message = "" From 98e83b037ecc88d9a29e8e4ca93598a9978e85a2 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 11:51:58 +0100 Subject: [PATCH 35/43] Change all data split tests to include commit message data Add (empty) commit message data to all data split tests in 'test-split.R'. Also sort the additional data sources alphabetically in the tests. See #193 Signed-off-by: Niklas Schneider --- tests/test-split.R | 403 +++++++++++++++++++++++++++------------------ 1 file changed, 240 insertions(+), 163 deletions(-) diff --git a/tests/test-split.R b/tests/test-split.R index b97926f5..bd8c8e52 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -18,6 +18,7 @@ ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker +## Copyright 2021 by Niklas Schneider ## All Rights Reserved.
@@ -73,10 +74,11 @@ test_that("Split a data object time-based (split.basis = 'commits').", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -99,33 +101,39 @@ test_that("Split a data object time-based (split.basis = 'commits').", { "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commit.messages, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commit.messages, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22), ], "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15,29), ], "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = 
data$mails[rownames(data$mails) == 17, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -148,10 +156,11 @@ test_that("Split a data object time-based (split.basis = 'mails').", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -177,11 +186,11 @@ test_that("Split a data object time-based (split.basis = 'mails').", { "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 
18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] + commit.messages = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commit.messages, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commit.messages, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commit.messages, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], @@ -189,29 +198,35 @@ test_that("Split a data object time-based (split.basis = 'mails').", { "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] ), pasta = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2007-10-10 
12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") - }) @@ -230,10 +245,11 @@ test_that("Split a data object time-based (split.basis = 'issues').", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -257,33 +273,39 @@ test_that("Split a data object time-based (split.basis = 'issues').", { "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commits[0, ] ), - mails = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[0, ] + commit.messages = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = 
data$commit.messages, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commit.messages, + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 14:34, ], "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 35:36, ] ), - synchronicity = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) 
expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -307,10 +329,11 @@ test_that("Split a data object time-based (bins = ... ).", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -329,29 +352,32 @@ test_that("Split a data object time-based (bins = ... ).", { commits = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits ), - mails = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] + commit.messages = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages ), issues = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% 14:34, ] ), - synchronicity = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity + mails = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] ), pasta = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta + ), + synchronicity = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, 
function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") - }) ## * * ranges -------------------------------------------------------------- @@ -412,8 +438,6 @@ test_that("Test splitting data by networks", { result.data = results[[aggregation.level]] expected.range.names = expected.ranges[[aggregation.level]] - - lapply(seq_along(result.data), function(i) { result.entry = result.data[[i]] @@ -451,20 +475,21 @@ test_that("Test splitting data by ranges", { ## check data for all ranges expected.data = list( commits = lapply(expected.results, function(cf.data) cf.data$get.commits()), - mails = lapply(expected.results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(expected.results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(expected.results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(expected.results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(expected.results, function(cf.data) cf.data$get.pasta()) + mails = lapply(expected.results, function(cf.data) cf.data$get.mails()), + pasta = lapply(expected.results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(expected.results, function(cf.data) cf.data$get.synchronicity()) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, 
function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") - }) ## * activity-based -------------------------------------------------------- @@ -484,10 +509,11 @@ test_that("Split a data object activity-based (activity.type = 'commits').", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -510,33 +536,39 @@ test_that("Split a data object activity-based (activity.type = 'commits').", { "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:8, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 
16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") @@ -560,25 +592,29 @@ test_that("Split a data object activity-based (activity.type = 'commits').", { commits = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29), ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 
16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") @@ -604,29 +640,34 @@ test_that("Split a data object activity-based (activity.type = 'commits').", { "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 
16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") @@ -660,10 +701,11 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -692,13 +734,13 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commits[0, ] ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 
11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 17, ] + commit.messages = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commit.messages, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commit.messages, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commit.messages, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commit.messages, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commit.messages, + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], @@ -708,13 +750,13 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 
7:9, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 17, ] ), pasta = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, @@ -723,14 +765,23 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -754,25 +805,29 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { commits = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 
18:38:13-2016-07-12 16:05:38" = data$mails + commit.messages = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails ), pasta = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") @@ -798,29 +853,34 @@ test_that("Split a data object activity-based (activity.type = 'mails').", { "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] + commit.messages = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commit.messages, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 
18:38:13-2010-07-12 12:05:43" = data$issues[0, ], "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] ), pasta = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") @@ -853,10 +913,11 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = 
project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -881,11 +942,11 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$commits[3:8, ], "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] ), - mails = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], - "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + commit.messages = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$commit.messages, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$commit.messages, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], @@ -893,25 +954,32 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 29:30), ], "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] ), - synchronicity = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, - "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$synchronicity, - "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-10-05 15:30:02-2017-05-23 
12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$pasta, "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$pasta, "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$synchronicity, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$synchronicity, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -935,25 +1003,29 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { commits = list( "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commits ), - mails = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + commit.messages = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues ), - synchronicity = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] ), pasta = list( "2013-04-21 
23:52:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") @@ -979,29 +1051,34 @@ test_that("Split a data object activity-based (activity.type = 'issues').", { "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$commits[1:2, ], "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$commits[3:8, ] ), - mails = list( - "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] + commit.messages = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$commit.messages, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28), ], "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36), ] ), - synchronicity = list( - "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$synchronicity, - "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$mails[rownames(data$mails) %in% 
14:15, ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] ), pasta = list( "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$pasta, "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$synchronicity, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") From 2e42fca54d772c29ee2b3b633428b7bed5a13541 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 25 Jan 2021 12:18:24 +0100 Subject: [PATCH 36/43] Change all sliding window data tests to include commit message data Add (empty) commit message data to all data split tests in 'test-split-sliding-window.R'. Also sort the additional data sources alphabetically in the tests. 
See #193 Signed-off-by: Niklas Schneider --- tests/test-split-sliding-window.R | 511 ++++++++++++++++++------------ 1 file changed, 307 insertions(+), 204 deletions(-) diff --git a/tests/test-split-sliding-window.R b/tests/test-split-sliding-window.R index ea9d712e..e665ed13 100644 --- a/tests/test-split-sliding-window.R +++ b/tests/test-split-sliding-window.R @@ -18,6 +18,7 @@ ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker +## Copyright 2021 by Niklas Schneider ## All Rights Reserved. @@ -66,10 +67,11 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -97,12 +99,12 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commits[3:5, ], "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[rownames(data$mails) %in% c(16, 17), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commit.messages, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commit.messages, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commit.messages, + "2016-07-12 
16:03:29-2016-07-12 16:06:29" = data$commit.messages, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22), ], @@ -111,12 +113,12 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) == 29, ], "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$synchronicity, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$synchronicity, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[rownames(data$mails) %in% c(16, 17), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, @@ -124,14 +126,22 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$pasta, "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$synchronicity, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$synchronicity, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity ) ) 
results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -153,10 +163,11 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -188,14 +199,14 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commits[0, ], "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, 
], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] + commit.messages = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commit.messages, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commit.messages, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commit.messages, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commit.messages, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commit.messages, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commit.messages, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], @@ -206,14 +217,14 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$synchronicity, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$synchronicity, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$synchronicity, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = 
data$mails[rownames(data$mails) %in% 13:17, ] ), pasta = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, @@ -223,14 +234,24 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$pasta, "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$synchronicity, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -252,10 +273,11 @@ test_that("Split a data object time-based (split.basis = 'issues', sliding.windo project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - 
pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -281,11 +303,11 @@ test_that("Split a data object time-based (split.basis = 'issues', sliding.windo "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commits ), - mails = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + commit.messages = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commit.messages, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commit.messages, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commit.messages, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], @@ -293,25 +315,32 @@ test_that("Split a data object time-based (split.basis = 'issues', sliding.windo "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 14:34, ], "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:36, ] ), - synchronicity = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$synchronicity, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = 
data$mails[rownames(data$mails) %in% 14:17, ] ), pasta = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$pasta, "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$synchronicity, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -334,10 +363,11 @@ test_that("Split a data object time-based (bins = ... , sliding.window = TRUE)." project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -359,39 +389,43 @@ test_that("Split a data object time-based (bins = ... , sliding.window = TRUE)." 
"2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits, "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commits[0, ] ), - mails = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$mails[0, ] + commit.messages = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commit.messages ), issues = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% 14:34, ], "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] ), - synchronicity = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity, - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$synchronicity + mails = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$mails[0, ] ), pasta = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta, "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$pasta + ), + synchronicity = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = 
"Data for ranges.") - }) ## * activity-based -------------------------------------------------------- -## -## Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows -## +# +# Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows +# test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE).", { @@ -404,10 +438,11 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -432,11 +467,11 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], @@ -444,25 +479,32 @@ test_that("Split a data object 
activity-based (activity.type = 'commits', slidin "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, 
function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") @@ -486,25 +528,29 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin commits = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29), ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") @@ -530,29 +576,34 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], "2016-07-12 16:06:20-2016-07-12 
16:06:33" = data$commits[5:8, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") @@ -591,10 +642,11 @@ test_that("Split a 
data object activity-based (activity.type = 'commits', slidin data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -621,12 +673,12 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], @@ -635,12 +687,12 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, - 
"2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, @@ -648,14 +700,22 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, 
expected.data, info = "Data for ranges (activity.amount).") @@ -677,10 +737,11 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -717,17 +778,17 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], - "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$mails[rownames(data$mails) %in% 2:4, ], - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], - "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$mails[rownames(data$mails) %in% 5:7, ], - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], - "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$mails[rownames(data$mails) %in% 8:10, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], - "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[rownames(data$mails) %in% c(11:12, 14), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 15:17, ] + commit.messages = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commit.messages, + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$commit.messages, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = 
data$commit.messages, + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$commit.messages, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commit.messages, + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$commit.messages, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commit.messages, + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commit.messages, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commit.messages, + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], @@ -741,17 +802,17 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, - "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$synchronicity, - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, - "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$synchronicity, - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, - "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$synchronicity, - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, - "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$synchronicity, - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$mails[rownames(data$mails) %in% 2:4, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$mails[rownames(data$mails) %in% 5:7, ], + "2010-07-12 12:05:41-2010-07-12 
12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$mails[rownames(data$mails) %in% 8:10, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[rownames(data$mails) %in% c(11:12, 14), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 15:17, ] ), pasta = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, @@ -764,14 +825,27 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$pasta, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$synchronicity, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$synchronicity, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$synchronicity, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$synchronicity, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) 
cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -795,25 +869,29 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. commits = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails + commit.messages = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails ), pasta = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") @@ -839,29 +917,34 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. 
"2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] + commit.messages = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commit.messages, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity + mails = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] ), pasta = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) 
expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") @@ -895,10 +978,11 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding project.data = ProjectData$new(proj.conf) data = list( commits = project.data$get.commits(), - mails = project.data$get.mails(), + commit.messages = project.data$get.commit.messages(), issues = project.data$get.issues(), - synchronicity = project.data$get.synchronicity(), - pasta = project.data$get.pasta() + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) ## split data @@ -929,14 +1013,14 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commits[0, ], "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] ), - mails = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], - "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$mails[rownames(data$mails) %in% 14:15, ], - "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + commit.messages = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$commit.messages, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$commit.messages, + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$commit.messages, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$commit.messages, + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commit.messages, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 
06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], @@ -947,14 +1031,14 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 30:34),], "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] ), - synchronicity = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, - "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$synchronicity, - "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$synchronicity, - "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$synchronicity, - "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$synchronicity, - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$mails[rownames(data$mails) %in% 14:15, ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, @@ -964,14 +1048,24 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$pasta, "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$pasta, "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$synchronicity, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = 
data$synchronicity, + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$synchronicity, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$synchronicity, + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$synchronicity, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges.") @@ -995,25 +1089,29 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding commits = list( "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commits ), - mails = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + commit.messages = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues ), - synchronicity = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] ), pasta = list( "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) 
cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") @@ -1039,29 +1137,34 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$commits[1:2, ], "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$commits[3:8, ] ), - mails = list( - "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] + commit.messages = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$commit.messages, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28), ], "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36), ] ), - synchronicity = list( - "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$synchronicity, - "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$synchronicity + mails = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] ), pasta = list( "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$pasta, "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 
23:52:09-2016-07-12 16:02:30" = data$synchronicity, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( commits = lapply(results, function(cf.data) cf.data$get.commits()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), issues = lapply(results, function(cf.data) cf.data$get.issues()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") From c052dfbad0dbd1238028178e3afd4ba118d1f8b0 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Tue, 26 Jan 2021 10:02:38 +0100 Subject: [PATCH 37/43] Fix minor comment issue in 'test-split-sliding-window.R' See #193 Signed-off-by: Niklas Schneider --- tests/test-split-sliding-window.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-split-sliding-window.R b/tests/test-split-sliding-window.R index e665ed13..258e6f2c 100644 --- a/tests/test-split-sliding-window.R +++ b/tests/test-split-sliding-window.R @@ -423,9 +423,9 @@ test_that("Split a data object time-based (bins = ... , sliding.window = TRUE)." 
## * activity-based -------------------------------------------------------- -# -# Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows -# +## +## Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows +## test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE).", { From d3bbae0672d6b63f2d68970085c69ad9c0a3b6b4 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sat, 30 Jan 2021 11:48:42 +0100 Subject: [PATCH 38/43] Add new cleanup functions for commit messages and synchronicity These functions act like cleanup.pasta, they remove rows from the data that are not part of a range data object anymore. See #193 Signed-off-by: Niklas Schneider --- util-data.R | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/util-data.R b/util-data.R index 711be016..96413789 100644 --- a/util-data.R +++ b/util-data.R @@ -806,6 +806,22 @@ ProjectData = R6::R6Class("ProjectData", } }, + #' Remove lines in the commit message data that contain message ids or commit hashes + #' that don't appear in the commit data. + cleanup.commit.message.data = function() { + logging::loginfo("Cleaning up commit message data") + + ## remove commit hashes that don't appear in the commit data + if (!is.null(private$commits)) { + commit.message.hashes = unlist(private$commit.messages[["hash"]]) + commit.message.hashes.contained = unlist(private$commit.messages[["hash"]]) %in% private$commits[["hash"]] + commit.hashes.to.eliminate = commit.message.hashes[!commit.message.hashes.contained] + commit.hashes.to.eliminate = commit.hashes.to.eliminate[!is.na(commit.hashes.to.eliminate)] + rows.to.remove = unlist(private$commit.messages[["hash"]]) %in% commit.hashes.to.eliminate + private$commit.messages = private$commit.messages[!rows.to.remove, ] + } + }, + #' Get the synchronicity data. 
If it is not already stored in the ProjectData, this function triggers a read in #' from disk. #' @@ -862,6 +878,22 @@ ProjectData = R6::R6Class("ProjectData", } }, + #' Remove lines in the synchronicity data that contain message ids or commit hashes + #' that don't appear in the commit data. + cleanup.synchronicity.data = function() { + logging::loginfo("Cleaning up synchronicity data") + + ## remove commit hashes that don't appear in the commit data + if (!is.null(private$commits)) { + synchronicity.hashes = unlist(private$synchronicity[["hash"]]) + synchronicity.hashes.contained = unlist(private$synchronicity[["hash"]]) %in% private$commits[["hash"]] + commit.hashes.to.eliminate = commit.message.hashes[!synchronicity.hashes.contained] + commit.hashes.to.eliminate = commit.hashes.to.eliminate[!is.na(commit.hashes.to.eliminate)] + rows.to.remove = unlist(private$synchronicity[["hash"]]) %in% commit.hashes.to.eliminate + private$synchronicity = private$synchronicity[!rows.to.remove, ] + } + }, + #' Get the PaStA data. If it is not already stored in the ProjectData, this function triggers a read in #' from disk. #' From 93850840ac9e27cc10e9236e44d98217abadf2f9 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Sat, 30 Jan 2021 12:07:24 +0100 Subject: [PATCH 39/43] Fix wrong variable name in 'cleanup.synchronicity' Fix a copy-paste error where a variable name was not changed from a prior code snippet. 
See #193 Signed-off-by: Niklas Schneider --- util-data.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util-data.R b/util-data.R index 96413789..bc21189a 100644 --- a/util-data.R +++ b/util-data.R @@ -887,7 +887,7 @@ ProjectData = R6::R6Class("ProjectData", if (!is.null(private$commits)) { synchronicity.hashes = unlist(private$synchronicity[["hash"]]) synchronicity.hashes.contained = unlist(private$synchronicity[["hash"]]) %in% private$commits[["hash"]] - commit.hashes.to.eliminate = commit.message.hashes[!synchronicity.hashes.contained] + commit.hashes.to.eliminate = synchronicity.hashes[!synchronicity.hashes.contained] commit.hashes.to.eliminate = commit.hashes.to.eliminate[!is.na(commit.hashes.to.eliminate)] rows.to.remove = unlist(private$synchronicity[["hash"]]) %in% commit.hashes.to.eliminate private$synchronicity = private$synchronicity[!rows.to.remove, ] From 63b6f791c8a9dc829beab85030f83ed5f48e2a39 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 1 Feb 2021 17:11:56 +0100 Subject: [PATCH 40/43] Add cleanup functions to NEWS.md See #193 Signed-off-by: Niklas Schneider --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ebb86686..5f9ef5d3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ ### Added - Add functionality to read and process commit messages in order to merge them to the commit data (see issue #180). 
Three values are available for the new attribute `commit.messages` in `ProjectConf`: `none`, `title` and `messages` (PR #193, 85b1d0572c0fb9f4c062bceb1363b0398f98b85f, fdc414ade1a640f533e809a25cfe012e42b3cffa, 43e1894998e18faff3a65114fa65ee54e1d2f66e) - +- Add functions `cleanup.commit.message.data` and `cleanup.synchronicity.data` to remove commit hashes that are not any more present in the commit data from the commit message data or synchronicity data (PR #193, 98e83b037ecc88d9a29e8e4ca93598a9978e85a2) ### Changed/Improved - Add `.drone.yml` to enable running our CI pipelines on drone.io (PR #191, 1c5804b59c582cf34af6970b435add51452fbd11) From c63a25a6b88bbab23dceed99d2bc0ea4b2ee1b24 Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 1 Feb 2021 17:16:02 +0100 Subject: [PATCH 41/43] Remove unnecessary function calls and add logging output Warn user when updating commit messages or synchronicity that they should call the corresponding cleanup method. Remove calls to unlist in those cleanup functions. Fix copy-pasted comments. See #193 Signed-off-by: Niklas Schneider --- util-data.R | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/util-data.R b/util-data.R index bc21189a..2528c282 100644 --- a/util-data.R +++ b/util-data.R @@ -245,6 +245,9 @@ ProjectData = R6::R6Class("ProjectData", ## adjust the column order private$commits = private$commits[col.names] } + + logging::logwarn("There might be commit message data that does not appear in the commit data. + To clean this up you can call the function 'cleanup.commit.message.data()'.") }, ## * * PaStA data -------------------------------------------------- @@ -447,6 +450,8 @@ ProjectData = R6::R6Class("ProjectData", } + logging::logwarn("There might be synchronicity data that does not appear in the commit data. 
+ To clean this up you can call the function 'cleanup.synchronicity.data()'.") logging::logdebug("update.synchronicity.data: finished.") }, @@ -806,18 +811,18 @@ ProjectData = R6::R6Class("ProjectData", } }, - #' Remove lines in the commit message data that contain message ids or commit hashes + #' Remove lines in the commit message data that contain commit hashes #' that don't appear in the commit data. cleanup.commit.message.data = function() { logging::loginfo("Cleaning up commit message data") ## remove commit hashes that don't appear in the commit data if (!is.null(private$commits)) { - commit.message.hashes = unlist(private$commit.messages[["hash"]]) - commit.message.hashes.contained = unlist(private$commit.messages[["hash"]]) %in% private$commits[["hash"]] + commit.message.hashes = private$commit.messages[["hash"]] + commit.message.hashes.contained = private$commit.messages[["hash"]] %in% private$commits[["hash"]] commit.hashes.to.eliminate = commit.message.hashes[!commit.message.hashes.contained] commit.hashes.to.eliminate = commit.hashes.to.eliminate[!is.na(commit.hashes.to.eliminate)] - rows.to.remove = unlist(private$commit.messages[["hash"]]) %in% commit.hashes.to.eliminate + rows.to.remove = private$commit.messages[["hash"]] %in% commit.hashes.to.eliminate private$commit.messages = private$commit.messages[!rows.to.remove, ] } }, @@ -878,18 +883,18 @@ ProjectData = R6::R6Class("ProjectData", } }, - #' Remove lines in the synchronicity data that contain message ids or commit hashes + #' Remove lines in the synchronicity data that contain commit hashes #' that don't appear in the commit data. 
cleanup.synchronicity.data = function() { logging::loginfo("Cleaning up synchronicity data") ## remove commit hashes that don't appear in the commit data if (!is.null(private$commits)) { - synchronicity.hashes = unlist(private$synchronicity[["hash"]]) - synchronicity.hashes.contained = unlist(private$synchronicity[["hash"]]) %in% private$commits[["hash"]] + synchronicity.hashes = private$synchronicity[["hash"]] + synchronicity.hashes.contained = private$synchronicity[["hash"]] %in% private$commits[["hash"]] commit.hashes.to.eliminate = synchronicity.hashes[!synchronicity.hashes.contained] commit.hashes.to.eliminate = commit.hashes.to.eliminate[!is.na(commit.hashes.to.eliminate)] - rows.to.remove = unlist(private$synchronicity[["hash"]]) %in% commit.hashes.to.eliminate + rows.to.remove = private$synchronicity[["hash"]] %in% commit.hashes.to.eliminate private$synchronicity = private$synchronicity[!rows.to.remove, ] } }, From e1e1ba82a31bda663a3ad9e04481ce534e1282ed Mon Sep 17 00:00:00 2001 From: Niklas Schneider Date: Mon, 1 Feb 2021 17:37:34 +0100 Subject: [PATCH 42/43] Fix regex when filtering out spaces and change data frame assignment Fix the regex that removes spaces at the end of a commit message. Change the assignment of the 'commit.message.data' data frame such that no new data frame is instantiated anymore. 
See #193 Signed-off-by: Niklas Schneider --- util-read.R | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/util-read.R b/util-read.R index e752372c..57d4559a 100644 --- a/util-read.R +++ b/util-read.R @@ -480,7 +480,7 @@ read.commit.messages = function(data.path) { ## remove spaces before first line lines = gsub("^\\s+", "", lines) ## remove spaces at the end of the message - lines = gsub("$\\s+", "", lines) + lines = gsub("\\s+$", "", lines) ## set title and message empty in case there was no actual commit message or it was consisting of spaces only title = "" @@ -504,13 +504,10 @@ read.commit.messages = function(data.path) { message.split.df = data.table::rbindlist(message.split.df) ## create a data frame containing all four necessary columns - commit.message.data = data.frame(commit.message.data[["commit.id"]], # commit.id - commit.message.data[["hash"]], # hash - message.split.df[["title"]], # title - message.split.df[["message"]]) # message - - ## set all the column names - colnames(commit.message.data) = COMMIT.MESSAGE.LIST.COLUMNS + commit.message.data["title"] = message.split.df[["title"]] # title + commit.message.data["message"] = message.split.df[["message"]] # message + ## reorder columns because they are added alphabetically + commit.message.data = commit.message.data[, COMMIT.MESSAGE.LIST.COLUMNS] ## Make commit.id have numeric type and set row names commit.message.data[["commit.id"]] = format.commit.ids(commit.message.data[["commit.id"]]) From 18843a8d1b639982d2674022422319aa5971bb13 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Wed, 3 Feb 2021 01:17:32 +0100 Subject: [PATCH 43/43] Fix problems in CI pipeline for R-3.3 Our CI pipeline does currently not work for "R-3.3" (but for all the other R versions in the pipeline). The reason for that is the following: As of 2021-01-26, there is a new version of package `memoise` (2.0.0), which is imported by `RSQLite`, which in turn is imported by `sqldf`. 
The new version of `memoise` imports package `cachem`, which in turn imports package `fastmap`. Using all these packages is not a problem per se, they are compatible with R 3.3.3. However, the docker container `r-base:3.3.3`, which we use in our pipeline "R-3.3", uses g++ version 6.3.0-9, which contains a bug. Due to the bug in the compiler, `fastmap` cannot be compiled and installed in the `r-base:3.3.3` docker container, causing the CI pipeline to fail. To circumvent this problem but still keep "R-3.3" in our CI pipeline, we now use docker image `r-base:3.3.2` instead, as there is another g++ version installed in this docker image. With that, the CI pipeline should succeed for "R-3.3" again. Signed-off-by: Thomas Bock --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 1122a2ca..77084d71 100644 --- a/.drone.yml +++ b/.drone.yml @@ -64,7 +64,7 @@ steps: - name: R-3.3 pull: if-not-exists - image: r-base:3.3.3 + image: r-base:3.3.2 commands: *runTests depends_on: [clone]