From 1a7c8322ff09dcdf31298cf9e59095d9a9e0cd8a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 15 Aug 2023 21:40:52 +0100 Subject: [PATCH 01/81] fix failure to merge BODS and NR gtfs files because data type of join columns not being explicitly set - and some fields assumed to be present in route table (route_desc) are not present in BODS data --- R/gtfs_merge.R | 7 +++++-- R/gtfs_read.R | 47 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index a1f1b3e..cf08bfe 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -131,8 +131,11 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { route_id$route_id_new <- seq(1, nrow(route_id)) routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id")) - routes <- routes[, c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")] - names(routes) <- c("route_id", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") + + columns_to_select <- c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") + columns_to_select <- columns_to_select[columns_to_select %in% colnames(routes)] + routes <- routes[, columns_to_select] + names(routes) <- columns_to_select } diff --git a/R/gtfs_read.R b/R/gtfs_read.R index 5f58818..089f2a5 100644 --- a/R/gtfs_read.R +++ b/R/gtfs_read.R @@ -20,20 +20,24 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){ gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"), - col_types = readr::cols(agency_id = readr::col_character()), + col_types = readr::cols(agency_id = readr::col_character(), + agency_noc = readr::col_character()), show_col_types = FALSE, lazy = FALSE) } else { warning("Unable to find required file: agency.txt") } - if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){ gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"), col_types = readr::cols(stop_id = readr::col_character(), stop_code = readr::col_character(), stop_name = readr::col_character(), stop_lat = readr::col_number(), - stop_lon = readr::col_number()), + stop_lon = readr::col_number(), + wheelchair_boarding = readr::col_logical(), + location_type = readr::col_integer(), + parent_station = readr::col_character(), + platform_code = readr::col_character()), lazy = FALSE, show_col_types = FALSE) @@ -46,7 +50,8 @@ gtfs_read <- function(path){ col_types = readr::cols(route_id = readr::col_character(), agency_id = readr::col_character(), route_short_name = readr::col_character(), - route_long_name = readr::col_character()), + route_long_name = readr::col_character(), + route_type = readr::col_integer()), show_col_types = FALSE, lazy = FALSE) } else { @@ -56,7 +61,12 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"trips.txt"))){ gtfs$trips <- readr::read_csv(file.path(tmp_folder,"trips.txt"), col_types = readr::cols(trip_id = readr::col_character(), - route_id = readr::col_character()), + route_id = readr::col_character(), + service_id = readr::col_character(), + block_id = readr::col_character(), + shape_id = readr::col_character(), + wheelchair_accessible = readr::col_logical() + ), show_col_types = FALSE, lazy = FALSE) } else { @@ -66,8 +76,14 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"stop_times.txt"))){ gtfs$stop_times <- 
readr::read_csv(file.path(tmp_folder,"stop_times.txt"), col_types = readr::cols(trip_id = readr::col_character(), + stop_id = readr::col_character(), + stop_sequence = readr::col_integer(), departure_time = readr::col_character(), - arrival_time = readr::col_character()), + arrival_time = readr::col_character(), + shape_dist_traveled = readr::col_number(), + timepoint = readr::col_logical(), + pickup_type = readr::col_integer(), + drop_off_type = readr::col_integer()), show_col_types = FALSE, lazy = FALSE) gtfs$stop_times$arrival_time <- lubridate::hms(gtfs$stop_times$arrival_time) @@ -79,7 +95,15 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){ gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"), - col_types = readr::cols(start_date = readr::col_date(format = "%Y%m%d"), + col_types = readr::cols(service_id = readr::col_character(), + monday = readr::col_logical(), + tuesday = readr::col_logical(), + wednesday = readr::col_logical(), + thursday = readr::col_logical(), + friday = readr::col_logical(), + saturday = readr::col_logical(), + sunday = readr::col_logical(), + start_date = readr::col_date(format = "%Y%m%d"), end_date = readr::col_date(format = "%Y%m%d")), show_col_types = FALSE, lazy = FALSE) @@ -90,7 +114,9 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"calendar_dates.txt"))){ gtfs$calendar_dates <- readr::read_csv(file.path(tmp_folder,"calendar_dates.txt"), - col_types = readr::cols(date = readr::col_date(format = "%Y%m%d")), + col_types = readr::cols(service_id = readr::col_character(), + date = readr::col_date(format = "%Y%m%d"), + exception_type = readr::col_integer()), show_col_types = FALSE, lazy = FALSE) } else { @@ -115,6 +141,11 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){ gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"), + col_types = readr::cols(shape_id = readr::col_character(), + shape_pt_lat = readr::col_number(), + shape_pt_lon = readr::col_number(), + shape_pt_sequence = readr::col_integer(), + shape_dist_traveled = readr::col_number()), show_col_types = FALSE, lazy = FALSE) } else { From b31dfb76689add03f59dc155543009a3879d5a8a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:05:43 +0100 Subject: [PATCH 02/81] trip id is a varchar in BODS data - remove assumption that it's an int --- R/gtfs_subset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/gtfs_subset.R b/R/gtfs_subset.R index b824899..c26ff8d 100644 --- a/R/gtfs_subset.R +++ b/R/gtfs_subset.R @@ -36,7 +36,7 @@ gtfs_clip <- function(gtfs, bounds) { gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% stops_inc, ] # Check for single stop trips n_stops <- table(gtfs$stop_times$trip_id) - single_stops <- as.integer(names(n_stops[n_stops == 1])) + single_stops <- names(n_stops[n_stops == 1]) gtfs$stop_times <- gtfs$stop_times[!gtfs$stop_times$trip_id %in% single_stops, ] # Check for any unused stops From 074826c931844c152a848dfa1d4de37eab43759a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:07:04 +0100 Subject: [PATCH 03/81] exception_type is an int - tests failing after tightening up data type definitions --- R/transxchange_export.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/transxchange_export.R b/R/transxchange_export.R index 8517cc5..acae130 100644 --- a/R/transxchange_export.R +++ b/R/transxchange_export.R @@ -499,12 +499,12 @@ 
transxchange_export <- function(obj,
     calendar_dates <- data.frame(
       trip_id = character(),
       date = character(),
-      exception_type = character(),
+      exception_type = integer(),
       stringsAsFactors = FALSE
     )
     calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek)
   } else {
-    # remove calendar_dates for trips that have been competly removed
+    # remove calendar_dates for trips that have been completely removed
     calendar_dates <- calendar_dates[calendar_dates$trip_id %in% calendar$trip_id, ]
 
     calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek)

From 8d4c2d84d00d8677ab14ff65b55881f976fe69f4 Mon Sep 17 00:00:00 2001
From: oweno-tfwm
Date: Thu, 24 Aug 2023 22:10:03 +0100
Subject: [PATCH 04/81] code was assuming a set of tables, and hence dumping
 tables like 'frequencies' and 'feed_info' when reading / writing BODS data.
 Removed assumptions about table names, all additional tables encountered in
 zip are read and written. Also a performance improvement when writing.

---
 R/gtfs_read.R  | 63 ++++++++++++++++++--------------------------
 R/write_gtfs.R | 46 +++++++++++++++---------------------
 2 files changed, 41 insertions(+), 68 deletions(-)

diff --git a/R/gtfs_read.R b/R/gtfs_read.R
index 089f2a5..fcd585d 100644
--- a/R/gtfs_read.R
+++ b/R/gtfs_read.R
@@ -16,7 +16,6 @@ gtfs_read <- function(path){
   files <- list.files(tmp_folder, pattern = ".txt")
 
   gtfs <- list()
-  message_log <- c("Unable to find optional files: ")
 
   if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){
     gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"),
@@ -27,6 +26,7 @@ gtfs_read <- function(path){
   } else {
     warning("Unable to find required file: agency.txt")
   }
+
   if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){
     gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"),
                                   col_types = readr::cols(stop_id = readr::col_character(),
@@ -34,7 +34,7 @@ gtfs_read <- function(path){
                                   stop_name = readr::col_character(),
                                   stop_lat = readr::col_number(),
                                   stop_lon = readr::col_number(),
-                                  wheelchair_boarding = readr::col_logical(),
+                                  wheelchair_boarding = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
                                   location_type = readr::col_integer(),
                                   parent_station = readr::col_character(),
                                   platform_code = readr::col_character()),
@@ -65,7 +65,7 @@ gtfs_read <- function(path){
                                   service_id = readr::col_character(),
                                   block_id = readr::col_character(),
                                   shape_id = readr::col_character(),
-                                  wheelchair_accessible = readr::col_logical()
+                                  wheelchair_accessible = readr::col_integer() #boolean but treat as integer so 0|1 written to file
                                   ),
                                   show_col_types = FALSE,
                                   lazy = FALSE)
@@ -81,7 +81,7 @@ gtfs_read <- function(path){
                     departure_time = readr::col_character(),
                     arrival_time = readr::col_character(),
                     shape_dist_traveled = readr::col_number(),
-                    timepoint = readr::col_logical(),
+                    timepoint = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
                     pickup_type = readr::col_integer(),
                     drop_off_type = readr::col_integer()),
                     show_col_types = FALSE,
@@ -96,13 +96,13 @@ gtfs_read <- function(path){
   if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){
     gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"),
                                      col_types = readr::cols(service_id = readr::col_character(),
monday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + tuesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + wednesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + thursday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + friday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + saturday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + sunday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file start_date = readr::col_date(format = "%Y%m%d"), end_date = readr::col_date(format = "%Y%m%d")), show_col_types = FALSE, @@ -123,22 +123,6 @@ gtfs_read <- function(path){ message("Unable to find conditionally required file: calendar_dates.txt") } - if(checkmate::test_file_exists(file.path(tmp_folder,"fare_attributes.txt"))){ - gtfs$fare_attributes <- readr::read_csv(file.path(tmp_folder,"fare_attributes.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "fare_attributes.txt") - } - - if(checkmate::test_file_exists(file.path(tmp_folder,"fare_rules.txt"))){ - gtfs$fare_rules <- readr::read_csv(file.path(tmp_folder,"fare_rules.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "fare_rules.txt") - } - if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){ gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"), col_types = readr::cols(shape_id = readr::col_character(), @@ -148,25 +132,24 @@ gtfs_read <- function(path){ shape_dist_traveled = readr::col_number()), show_col_types = FALSE, lazy = FALSE) - } else { - message_log <- c(message_log, "shapes.txt") - } - - if(checkmate::test_file_exists(file.path(tmp_folder,"transfers.txt"))){ - gtfs$transfers <- readr::read_csv(file.path(tmp_folder,"transfers.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "transfers.txt") } - unlink(tmp_folder, recursive = TRUE) + #load any other tables in the .zip file + filenamesOnly <- tools::file_path_sans_ext(basename(files)) + notLoadedFiles = setdiff( filenamesOnly, names(gtfs) ) - if(length(message_log) > 0){ - message(paste(message_log, collapse = " ")) + for (fileName in notLoadedFiles) + { + table <- readr::read_csv(file.path( tmp_folder, paste0( fileName, ".txt" ) ), + show_col_types = FALSE, + lazy = FALSE) + gtfs[[fileName]] <- table } + #remove temp directory + unlink(tmp_folder, recursive = TRUE) + return(gtfs) } diff --git a/R/write_gtfs.R b/R/write_gtfs.R index 6e2e5d9..2142f7f 100644 --- a/R/write_gtfs.R +++ b/R/write_gtfs.R @@ -1,6 +1,6 @@ #' Write GTFS #' -#' Takes a list of data frames represneting the GTFS fromat and saves them as GTFS +#' Takes a list of data frames representing the GTFS format and saves them as GTFS #' Zip file. 
#' #' @param gtfs named list of data.frames @@ -19,6 +19,7 @@ gtfs_write <- function(gtfs, stripTab = TRUE, stripNewline = TRUE, quote = FALSE) { + if (stripComma) { for (i in seq_len(length(gtfs))) { gtfs[[i]] <- stripCommas(gtfs[[i]]) @@ -33,7 +34,6 @@ gtfs_write <- function(gtfs, #Format Dates - if(class(gtfs$calendar$start_date) == "Date"){ gtfs$calendar$start_date <- format(gtfs$calendar$start_date, "%Y%m%d") } @@ -57,24 +57,20 @@ gtfs_write <- function(gtfs, dir.create(paste0(tempdir(), "/gtfs_temp")) - data.table::fwrite(gtfs$calendar, paste0(tempdir(), "/gtfs_temp/calendar.txt"), row.names = FALSE, quote = quote) - if (nrow(gtfs$calendar_dates) > 0) { - data.table::fwrite(gtfs$calendar_dates, paste0(tempdir(), "/gtfs_temp/calendar_dates.txt"), row.names = FALSE, quote = quote) - } - data.table::fwrite(gtfs$routes, paste0(tempdir(), "/gtfs_temp/routes.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$stop_times, paste0(tempdir(), "/gtfs_temp/stop_times.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$trips, paste0(tempdir(), "/gtfs_temp/trips.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$stops, paste0(tempdir(), "/gtfs_temp/stops.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$agency, paste0(tempdir(), "/gtfs_temp/agency.txt"), row.names = FALSE, quote = quote) - if ("transfers" %in% names(gtfs)) { - data.table::fwrite(gtfs$transfers, paste0(tempdir(), "/gtfs_temp/transfers.txt"), row.names = FALSE, quote = quote) - } - if ("shapes" %in% names(gtfs)) { - data.table::fwrite(gtfs$shapes, paste0(tempdir(), "/gtfs_temp/shapes.txt"), row.names = FALSE, quote = quote) + + for ( tableName in names(gtfs) ) + { + table <- gtfs[[tableName]] + + if ( !is.null(table) & nrow(table) > 0 ) + { + data.table::fwrite(table, file.path(tempdir(), "gtfs_temp", paste0(tableName, ".txt")), row.names = FALSE, quote = quote) + } } + zip::zipr(paste0(folder, "/", name, ".zip"), list.files(paste0(tempdir(), "/gtfs_temp"), full.names = TRUE), recurse = FALSE) + unlink(paste0(tempdir(), "/gtfs_temp"), recursive = TRUE) - message(paste0(folder, "/", name, ".zip")) } @@ -126,9 +122,11 @@ stripTabs <- function(df, stripNewline) { #' Convert Period to GTFS timestamps +#' When writing a 400mb (zipped) file, we spend nearly 4 minutes in this fn(), about 10x longer than writing the files to the filesystem. +#' profiler reports this being mostly nchar(), so we optimise down to one sprintf which reduces the time to 1 minute +#' .format() is about 7x slower than sprintf() #' -#' -#' @param x peridos +#' @param x periods #' @noRd #' period2gtfs <- function(x) { @@ -139,14 +137,6 @@ period2gtfs <- function(x) { stop("Days detected in period objects, incorectly formatted period object") } - hrs <- as.character(lubridate::hour(x)) - min <- as.character(lubridate::minute(x)) - sec <- as.character(lubridate::second(x)) - - hrs <- ifelse(nchar(hrs) == 1,paste0("0",hrs), hrs) - min <- ifelse(nchar(min) == 1,paste0("0",min), min) - sec <- ifelse(nchar(sec) == 1,paste0("0",sec), sec) - - return(paste0(hrs,":",min,":",sec)) + return( sprintf("%02d:%02d:%02d", lubridate::hour(x), lubridate::minute(x), lubridate::second(x)) ) } From ccd451e6f0795aa16d8ef0a3d842f9465eade36e Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:12:42 +0100 Subject: [PATCH 05/81] merge was expecting a fixed set of tables and columns and throwing away anything else that it wasn't expecting. 
OK for the GTFS tables generated by this tool from CIF files, but caused
 data loss when using BODS data. Modified to keep all encountered columns
 and tables - passes through if there isn't any specific code for the table
 in question.

---
 R/gtfs_merge.R | 225 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 147 insertions(+), 78 deletions(-)

diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R
index cf08bfe..cf3f1de 100644
--- a/R/gtfs_merge.R
+++ b/R/gtfs_merge.R
@@ -1,5 +1,11 @@
 #' merge a list of gtfs files
 #'
+#' !WARNING! only the tables:
+#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes
+#' are processed, any other tables in the input timetables are passed through
+#'
+#' if duplicate IDs are detected then completely new ID for all rows will be generated in the output.
+#'
 #' @param gtfs_list a list of gtfs objects to be merged
 #' @param force logical, if TRUE duplicated values are merged taking the fist
 #' @param quiet logical, if TRUE less messages
@@ -8,40 +14,62 @@
 #' @export
 gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
 
-  # remove any NULLS
+  # remove any empty input tables
   gtfs_list <- gtfs_list[lengths(gtfs_list) != 0]
+  flattened <- unlist(gtfs_list, recursive = FALSE)
+  rm(gtfs_list)
 
-  # Split out lists
-  agency <- sapply(gtfs_list, "[", "agency")
-  stops <- sapply(gtfs_list, "[", "stops")
-  routes <- sapply(gtfs_list, "[", "routes")
-  trips <- sapply(gtfs_list, "[", "trips")
-  stop_times <- sapply(gtfs_list, "[", "stop_times")
-  calendar <- sapply(gtfs_list, "[", "calendar")
-  calendar_dates <- sapply(gtfs_list, "[", "calendar_dates")
+  #get unique input table names
+  tableNames <- unique(names(flattened))
+
+  grouped_list <- list()
+
+  # Loop through table names and group data frames
+  for (tableName in tableNames) {
+
+    matched <- purrr::imap( flattened, function( item, name ) {
+      if (name == tableName) {
+        return(item)
+      }
+    })
+
+    #remove empty input tables
+    matched <- matched[lengths(matched) != 0]
+
+    #assign each instance of the input table a unique number
+    names(matched) <- seq(1, length(matched))
 
-  # bind together
-  names(agency) <- seq(1, length(agency))
-  suppressWarnings(agency <- dplyr::bind_rows(agency, .id = "file_id"))
+    #add a column to the data frame containing this unique number
+    suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id"))
 
-  names(stops) <- seq(1, length(stops))
-  suppressWarnings(stops <- dplyr::bind_rows(stops, .id = "file_id"))
+    #if("calendar_dates"==tableName)
+    #{
+    #  #don't understand what this is doing ? comment would be nice. 
+ # calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] + # #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] + #} - names(routes) <- seq(1, length(routes)) - suppressWarnings(routes <- dplyr::bind_rows(routes, .id = "file_id")) + #add to map + grouped_list[[tableName]] <- matched + } - names(trips) <- seq(1, length(trips)) - suppressWarnings(trips <- dplyr::bind_rows(trips, .id = "file_id")) + rm(flattened) - names(stop_times) <- seq(1, length(stop_times)) - suppressWarnings(stop_times <- dplyr::bind_rows(stop_times, .id = "file_id")) + # Split out lists + agency <- grouped_list$agency + stops <- grouped_list$stops + routes <- grouped_list$routes + trips <- grouped_list$trips + stop_times <- grouped_list$stop_times + calendar <- grouped_list$calendar + calendar_dates <- grouped_list$calendar_dates + shapes <- grouped_list$shapes + frequencies <- grouped_list$frequencies - names(calendar) <- seq(1, length(calendar)) - suppressWarnings(calendar <- dplyr::bind_rows(calendar, .id = "file_id")) + #remove items from map. + grouped_list <- grouped_list[setdiff(names(grouped_list), + c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes", "frequencies" ))] - names(calendar_dates) <- seq(1, length(calendar_dates)) - calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] - suppressWarnings(calendar_dates <- dplyr::bind_rows(calendar_dates, .id = "file_id")) # fix typo agency$agency_name <- as.character(agency$agency_name) @@ -112,49 +140,54 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { } else { stop("Duplicated Stop IDS") } - - } # routes if (any(duplicated(routes$route_id))) { if(!quiet){message("De-duplicating route_id")} - route_id <- routes[, c("file_id", "route_id")] - if (any(duplicated(route_id))) { + + retainedColumnNames <- colnames(routes)[!(colnames(routes) %in% c("route_id", "file_id"))] + + new_route_id <- routes[, c("file_id", "route_id")] + if (any(duplicated(new_route_id))) { if(force){ - routes <- routes[!duplicated(route_id), ] - route_id <- routes[, c("file_id", "route_id")] + routes <- routes[!duplicated(new_route_id), ] + new_route_id <- routes[, c("file_id", "route_id")] } else { stop("Duplicated route_id within the same GTFS file, try using force = TRUE") } } - route_id$route_id_new <- seq(1, nrow(route_id)) - routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id")) + new_route_id$route_id_new <- seq(1, nrow(new_route_id)) + routes <- dplyr::left_join(routes, new_route_id, by = c("file_id", "route_id")) - columns_to_select <- c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") - columns_to_select <- columns_to_select[columns_to_select %in% colnames(routes)] - routes <- routes[, columns_to_select] - names(routes) <- columns_to_select + routes <- routes[, c("route_id_new", retainedColumnNames)] + routes <- routes %>% dplyr::rename(route_id = route_id_new) } # calendar if (any(duplicated(calendar$service_id))) { if(!quiet){message("De-duplicating service_id")} - service_id <- calendar[, c("file_id", "service_id")] - if (any(duplicated(service_id))) { + + new_service_id <- calendar[, c("file_id", "service_id")] + if (any(duplicated(new_service_id))) { stop("Duplicated service_id within the same GTFS file") } - service_id$service_id_new <- seq(1, nrow(service_id)) - calendar <- dplyr::left_join(calendar, 
service_id, by = c("file_id", "service_id")) - calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] - names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date") + + new_service_id$service_id_new <- seq(1, nrow(new_service_id)) + + retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] + calendar <- dplyr::left_join(calendar, new_service_id, by = c("file_id", "service_id")) + calendar <- calendar[, c("service_id_new", retainedColumnNames)] + names(calendar) <- c("service_id", retainedColumnNames) if (nrow(calendar_dates) > 0) { - calendar_dates <- dplyr::left_join(calendar_dates, service_id, by = c("file_id", "service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")] - names(calendar_dates) <- c("service_id", "date", "exception_type") + retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] + + calendar_dates <- dplyr::left_join(calendar_dates, new_service_id, by = c("file_id", "service_id")) + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)] + calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) } } @@ -162,45 +195,60 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { # Trips if (any(duplicated(trips$trip_id))) { if(!quiet){message("De-duplicating trip_id")} - trip_id <- trips[, c("file_id", "trip_id")] - if (any(duplicated(trip_id))) { + + new_trip_id <- trips[, c("file_id", "trip_id")] + if (any(duplicated(new_trip_id))) { if(force){ trips <- unique(trips) stop_times <- unique(stop_times) - trip_id <- trips[, c("file_id", "trip_id")] + new_trip_id <- trips[, c("file_id", "trip_id")] } else{ stop("Duplicated trip_id within the same GTFS file") } } - trip_id$trip_id_new <- seq(1, nrow(trip_id)) - trips <- dplyr::left_join(trips, trip_id, by = c("file_id", "trip_id")) - trips <- trips[, c("route_id", "service_id", "trip_id_new", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") - - - stop_times <- dplyr::left_join(stop_times, trip_id, by = c("file_id", "trip_id")) - stop_times <- stop_times[, c("trip_id_new", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint")] - names(stop_times) <- c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint") + new_trip_id$trip_id_new <- seq(1, nrow(new_trip_id)) + + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("trip_id"))] + trips <- dplyr::left_join(trips, new_trip_id, by = c("file_id", "trip_id")) + trips <- trips[, c("trip_id_new", retainedColumnNames)] + trips <- trips %>% dplyr::rename(trip_id = trip_id_new) + + retainedColumnNames <- colnames(stop_times)[!(colnames(stop_times) %in% c("trip_id", "file_id"))] + stop_times <- dplyr::left_join(stop_times, new_trip_id, by = c("file_id", "trip_id")) + stop_times <- stop_times[, c("trip_id_new", retainedColumnNames)] + stop_times <- stop_times %>% dplyr::rename(trip_id = trip_id_new) + + if ( length(frequencies) > 0 ) + { + retainedColumnNames <- colnames(frequencies)[!(colnames(frequencies) %in% c("trip_id", "file_id"))] + frequencies <- dplyr::left_join(frequencies, new_trip_id, by = c("file_id", "trip_id")) + frequencies <- frequencies[, c("trip_id_new", retainedColumnNames)] + frequencies <- frequencies %>% dplyr::rename(trip_id 
= trip_id_new) + } } - if (exists("service_id")) { - trips <- dplyr::left_join(trips, service_id, by = c("file_id", "service_id")) - trips <- trips[, c("route_id", "service_id_new", "trip_id", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") + + if (exists("new_service_id")) { + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id"))] + trips <- dplyr::left_join(trips, new_service_id, by = c("file_id", "service_id")) + trips <- trips[, c(retainedColumnNames, "service_id_new")] + trips <- trips %>% dplyr::rename(service_id = service_id_new) } - if (exists("route_id")) { - trips <- dplyr::left_join(trips, route_id, by = c("file_id", "route_id")) - trips <- trips[, c("route_id_new", "service_id", "trip_id", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") + + if (exists("new_route_id")) { + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("route_id"))] + trips <- dplyr::left_join(trips, new_route_id, by = c("file_id", "route_id")) + trips <- trips[, c("route_id_new", retainedColumnNames)] + trips <- trips %>% dplyr::rename(route_id = route_id_new) } - trips <- trips[, c("route_id", "service_id", "trip_id")] - names(trips) <- c("route_id", "service_id", "trip_id") + trips$file_id <- NULL # Condense Duplicate Service patterns if (nrow(calendar_dates) > 0) { if(!quiet){message("Condensing duplicated service patterns")} + calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id) if(class(calendar_dates_summary$date) == "Date"){ calendar_dates_summary <- dplyr::summarise(calendar_dates_summary, @@ -221,27 +269,48 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary) calendar_summary <- calendar_summary[, c("service_id_new", "service_id")] + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id", "route_id"))] trips <- dplyr::left_join(trips, calendar_summary, by = c("service_id")) - trips <- trips[, c("route_id", "service_id_new", "trip_id")] - names(trips) <- c("route_id", "service_id", "trip_id") + trips <- trips[, c("route_id", "service_id_new", retainedColumnNames)] + trips <- trips %>% dplyr::rename(service_id = service_id_new) + retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] calendar <- dplyr::left_join(calendar, calendar_summary, by = c("service_id")) - calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] - names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date") + calendar <- calendar[, c("service_id_new", retainedColumnNames)] + calendar <- calendar %>% dplyr::rename(service_id = service_id_new) calendar <- calendar[!duplicated(calendar$service_id), ] - + retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] calendar_dates <- dplyr::left_join(calendar_dates, calendar_summary, by = c("service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")] - names(calendar_dates) <- c("service_id", "date", "exception_type") + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)] + calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ] } + # shapes 
are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object + composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#") + if (any(duplicated(composite_key))) { + if(force){ + shapes <- shapes[!duplicated(composite_key),] + } else { + stop("Duplicated Shapes IDS") + } + } + + shapes$file_id <- NULL stop_times$file_id <- NULL routes$file_id <- NULL calendar$file_id <- NULL + res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies) + names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies") + + #for tables we don't explicitly process - hope items are unique + for (item in grouped_list) { + item$file_id <- NULL + } + + #remove nulls (e.g. tables that are often empty like frequencies) + res_final <- Filter(Negate(is.null), res_final) - res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates) - names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates") - return(res_final) + return (c(res_final, grouped_list)) } From c9f7cce8da16710a0a7518e380675c7697a37d42 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:29:38 +0100 Subject: [PATCH 06/81] added FOC YG + MV (appearing in NR timetable extracts) renamed ZZ to reflect it's a generic obfuscated freight, not a specific operator --- data/atoc_agency.rda | Bin 2055 -> 2133 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/atoc_agency.rda b/data/atoc_agency.rda index 93c47cad3b74f115a6b19afb7c8bdd3d7cd81550..97495dad84e4893b5dc671509aefc7d12b9e8329 100644 GIT binary patch literal 2133 zcmV-b2&(rViwFP!000002JKqia}!4rcOay|fC-Qg5=fZj5*MjD+uGHqQ}^O1{yHSX zdgTvzQ7x;nw6V0Sn%%YRd%FMX{=4IPc4m5K*0LO4@{kjSOw;q5{_dW}X8*V2^5XI0 z!otGMg->tXSh#r$$c0-6ovoF|0&w6jpDcX3a0mXp>m`xj_4+jQF97}+Sf9b?bNK8^ z$c>GCq>dETY$COTl&h%CHc~n7Kv7$GcRQ!{kV0+SsBQaLQ9HjPg?D#4NFi>wrKr|# zNNpmug%n!adW{rX&_e55sH2Ti+J}nT!_&RDigGrPLYp15*+Huul;@yT&NfmQUFV-j zy+R7TbkGk6Bk7<=&Ksm0q|i$Ty>!qI2eaj%mk#E`$-R81C>Q;3(NY&}anTkRZE?{8 z7bSPlqYmOa`6=o>L`#oQ$`Sf;jB!53)3+GYckc=net%;)47|{%WZw&hlKFUpvEa-j z9p(jLOpd%Ejpflz-}WW>usH~Z31wk+b>dwp%u|;#KO#-I){8=rPTV1-W6I?D7n@$3 zkao};dSNe?TIAX7&6GuP;wkKt!+<5JHzb@trXcH$mkbp8%dLQcsHQg#62LU0Fr zT-#$51u1F}bxNyJVwm&wc5 z;R;BDD78I*09M97tH`z@KN>|za7GCf%30uJaXpf<55itS_+p!~m|n2FM(-VZ%o_!< zUbpvET6N~PA|Y?Vw0ICOO4?Dx;zDA6G>nI!_lT2v45AgU9kD3vla8wz>#2ct(>SK* z=O;~+&~BixgZ_$Hv@C2IIp%?l!t!)!Z=qQxy5ME zAMlcsuVXmj;Ah)Z~3$!#^sYrtgg zD;poU6EB3duF&GQd7Wu<;i1r$wT&_N7CQrPMB`x)o~niv;~7w4{Dy`hM6WQ9p{LK& zn7r&gheW(ONRjj9E>tlJ={3+AQ{34r9{;cXaa z!BTN;!t)t7VRR-~FtKE z4&M!uWcz$tDoynX;BSHU=;aS0}+~Fs8l0Ta8$MjsI<} zu(N!l&|LJveyhFfy1RSZAW{`_dA7W)+`4kB>6)v89)Lc7x_z6 zOY%jibVYhm4Pd{TmfmTirr4|pH0HyD`#(+Ird|LtTv0k65B*G>_Z}{<;T0wVy0l4jb~zeym|l`&7w*9 z$80}D#o`mz&6Z77xkPUfWl1RB$jkjZN6k@l)SrR+sW{l>UMd$06Ku^M0PcXEoCI{( z>lU}y7nii?8b(u1j)#7P`>l>zxyA~y0vDc4eMa(CX!g!9^NG`e3GT~N+|iedWSqaM zSG+_a-=n!7{CvbWXbJlJY=`A~-8I(hwb^&X^4;aDc2{4%ugP$ZSK(FqD%V^Ti+lGc zyVkw5xa;ev&v$rV?C{fq|AW$_J%gjDM`O+qtAw5=RZN3y?H|Cd{_%}c_^zS&Zk$}x z=FB;B&YUyn%)dCZ@WW9W&Z#-|m!ixHzgL1^M1GLhs8mUSfhBoL7U3 zRbthu$SjvZSPec^l$asw#Iiv|y)jeuALR*i%9Tdd&NTzCWY()REmCRJ>K_^3%<#HxgNcUWRg2V1HmIpv$kc9^ zj96woEy7nRs*@^YqcLaxUol06l%Oqbdq#m|rDPEsdUze`?&Ie~J!4d%baT~$M-1jw z!y{`ztA+)?1nmp;os?moSaw;nvdkmKd6m2^%VitdA9}Nv zUpLy){BL2mLjK(h`4W64nRmp06YB=*yU}+ggM%e&QL<8|e$e(na*|*~|CRqZUELje zVP82ZS>pkR6%6u+UaSmK%*CFUc&jIjt2g0ySOouqmpZNWZm literal 2055 
zcmV+i2>ACxT4*^jL0KkKS)Y2IIsgkEf5`uT|NsAI|MWlq|M0*6-|#>H03ZMX;0S+3 z$$D6&ZM)q!PTdC}06GR3qzQrv>Uv4&o}kcpQKp`ugwPrdJx^28c|8px4^wIY9*{JX zLV5ujo}uj#Jw}ZH007VdkOM#sG%$bxrkYJ8%4%qOo`?bJ2dD#020@?z000d!44MEX zGDtN+r1dm<_DYY6j9--zApkFcbA%;+j&K1o6})1Nwf6m8#j-)Lmh5r?IyPohiJi^7`h5IyGB@zApJ1 z$}ibrLbP_ciMwRI1OU@V-CCF6hRGsGhjDlD-5)wY+0S2vRyw;9fkQXGk$o_EPjVy;0Fk^W5rxKdHx;6yxKn)^+qO?PS z3NBXx7uz+*mz}OyQ4^`{AK@VrtlG(K?W!XEvn`|B4iPhiC}SJ+G1ix{UYyIBszU)0 zx4HM$(%4ARaOx6@EM?*iUQPppT#FNj$re`F;NV~evBt%SgkreQrE_aL>=at~EJfjU zA|Pm4xE$J^-ET>mzA7wp$0is%(?|-zB#qNcCa(IJ2;Byhj1CdGz=aL5yc#sJP0l`q zkQ;*(kPR&BZnTaYDbi>BcqB9lZ#*p>{-^{V7K9Nj-76HSmXc8g1Q8p&ZpmO7?ZiTE z#M_*_EJH~RwIq`eew33bPlj$!^*b>NqY2^HeH)z6DqPWI;9`y&U|>XHX!6S@WQIBo zboytaBD8;M5~flHe^-bb!!scK!gM`rMB3~S&h)K0u<;V zijX*^oLdPzc7oc|K-vZYMIZwP{lPt02MWcj!9um&tGh0{UG1tyA#p~F5lAF0uGC$K zwNFV_nkihSZ>qt=sf37t2dN5#9dW+MobW;#L5S^@gG(<@fq9=>jan(^6U1i|8h257 z3Vvt1`?jHsK^g=%00IJG>lrXDQftj9B@~7LH$iuw2n*G(Ki1OIEh}F#N(qMiD(@Yb z&tN=4%4Z(}baA+_Ip?1o#H`Q(DOyGrM}Ag&A^%z=nt>!1#fV{py%kDq3ep1Ipve1+ zSyHUVA!DEAx^#V)L8oVvgV&i>JU!p@A!OG^>*ReSBK^93`fuNNCoSR7ZJ|a62pAuh zrt{)h;($Z*ZPv-Vu5V?t!+|C^*PkOWJ$`UnnFdl1s9T+?dTTm*MDRZQ2+!A zD5z9KC0NMR9)^LXhUmmA;h1x65wHtZ$+Uqb7>rDdZp$}s+`7TBjjE^_fX-ZCU(=q1 zyOfUytAPSx++N|fCDncZxS29WO6;7*_kmIXf1!UzAzU{J7Fau@#(F%9v8Utgppn=B9#NrX z>4s&L%0Wbzq=Ix~(vehp(TceG#*Wau^5u_VL<{bKDNEiGCIXXDNPvNiZH<6wXvv`_ zm`iR9?P%MEbK<+UP`g$}b0E|BD`H%-(p8~SM!~TUV0tvPFxP-cp3;nMD z8QV z(nqs>Wye?0IWoXiBa`zSMf4^@;nvf2Rb6}n{3fw2m##C=<_K81fM!za0D?RRH#E%% znflp(UW}yKtO%1xks;_9F}UL=_&M+2q=Bep5Q;*xL=!E-W}+c$Vkqp?GR&p~1nv!lf>8tw zb2ZN5{(Vi&V>H;=C?T}&LoYfRC5pNSj}Vf*phYW;wcQq>iKw9?OtmC{nL|wrl~4+H zJuVIR8KEQL$4cRAzB|qi(Fit__9}(9aKbuu5PL{NV1sRr;uQ30A`OWf5+MMrfhizM ln^w?irp?>4Wt=ofT~s6)l$G!I&>+9!?ntK!5) Date: Fri, 25 Aug 2023 15:41:50 +0100 Subject: [PATCH 07/81] fix null columns being inserted in calendar_dates key field when a calendar_date exists with no calendar (rare but valid data configuration) --- R/gtfs_merge.R | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index cf3f1de..4926c20 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -1,20 +1,21 @@ #' merge a list of gtfs files #' #' !WARNING! only the tables: -#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes +#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies #' are processed, any other tables in the input timetables are passed through #' -#' if duplicate IDs are detected then completely new ID for all rows will be generated in the output. +#' if duplicate IDs are detected then completely new IDs for all rows will be generated in the output. 
#' #' @param gtfs_list a list of gtfs objects to be merged #' @param force logical, if TRUE duplicated values are merged taking the fist -#' @param quiet logical, if TRUE less messages #' instance to be the correct instance, in most cases this is ok, but may #' cause some errors +#' @param quiet logical, if TRUE less messages +#' @param condenseServicePatterns logical, if TRUE service patterns across all routes are condensed into a unique set of patterns #' @export -gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { +gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) { - # remove any empty input tables + # remove any empty input GTFS objects gtfs_list <- gtfs_list[lengths(gtfs_list) != 0] flattened <- unlist(gtfs_list, recursive = FALSE) rm(gtfs_list) @@ -33,7 +34,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { } }) - #remove empty input tables + #remove input tables not matching tableName matched <- matched[lengths(matched) != 0] #assign each instance of the input table a unique number @@ -41,10 +42,11 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { #add a column to the data frame containing this unique number suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id")) + matched$file_id <- as.integer(matched$file_id) #if("calendar_dates"==tableName) #{ - # #don't understand what this is doing ? comment would be nice. + # #don't understand what this complex line is doing ? comment would be nice. # calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] # #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] #} @@ -167,7 +169,9 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { # calendar - if (any(duplicated(calendar$service_id))) { + calendar_dates_key <- paste(calendar_dates$service_id, calendar_dates$date, calendar_dates$exception_type, sep="#") + + if (any(duplicated(calendar$service_id)) || any(duplicated(calendar_dates_key))) { if(!quiet){message("De-duplicating service_id")} new_service_id <- calendar[, c("file_id", "service_id")] @@ -175,6 +179,10 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { stop("Duplicated service_id within the same GTFS file") } + # it is valid to have calendar_dates with no associated calendar (see comments further down) + # so create the distinct set of service_id in both calendar and calendar_dates + new_service_id <- dplyr::union(unique(new_service_id), unique(calendar_dates[, c("file_id", "service_id")])) + new_service_id$service_id_new <- seq(1, nrow(new_service_id)) retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] @@ -246,9 +254,18 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { trips$file_id <- NULL # Condense Duplicate Service patterns - if (nrow(calendar_dates) > 0) { + + # in an ideal world we should not have a trip without a service pattern, and not have calendar_dates with no associated calendar, + # but the real world data isn't that tidy. + # In a typical all GB BODS extract Around 0.2% of trips have a calendar ID but no row in calendar, + # 0.2% of calendar_dates have no trips, 0.1% of calendar_dates have no corresponding calendar. 
+  # we need to guard against this to make sure we don't end up putting null values into any key fields
+  # This documentation https://gtfs.org/schedule/reference/#calendar_datestxt specifically mentions calendar dates without calendars
+  # as being a legitimate way to construct the data.
+  if (condenseServicePatterns && nrow(calendar_dates) > 0) {
     if(!quiet){message("Condensing duplicated service patterns")}
 
+    #find every unique combination of calendar_dates and calendar values
     calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id)
     if(class(calendar_dates_summary$date) == "Date"){
       calendar_dates_summary <- dplyr::summarise(calendar_dates_summary,
@@ -260,12 +277,15 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) {
       )
     }
 
-    calendar_summary <- dplyr::left_join(calendar, calendar_dates_summary, by = "service_id")
+    #we want to keep all rows in calendar_dates even if they don't have a row in calendar
+    calendar_summary <- dplyr::full_join(calendar, calendar_dates_summary, by = "service_id")
     calendar_summary <- dplyr::group_by(
       calendar_summary,
       start_date, end_date, monday, tuesday, wednesday, thursday,
       friday, saturday, sunday, pattern
     )
+
+    #give every unique combination of dates / days / exceptions a new distinct service ID
     calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary)
     calendar_summary <- calendar_summary[, c("service_id_new", "service_id")]
@@ -287,7 +307,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) {
     calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ]
   }
 
-  # shapes are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object
+
+  # shapes in a BODS extract are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object
   composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#")
   if (any(duplicated(composite_key))) {
     if(force){
@@ -301,6 +322,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) {
   stop_times$file_id <- NULL
   routes$file_id <- NULL
   calendar$file_id <- NULL
+  frequencies$file_id <- NULL
 
   res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies)
   names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies")

From bf8f460acbf624f594215cbea94132c602a924de Mon Sep 17 00:00:00 2001
From: oweno-tfwm
Date: Fri, 25 Aug 2023 15:42:38 +0100
Subject: [PATCH 08/81] correct comment - column is a 3 valued enum, not bool

---
 R/gtfs_read.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/gtfs_read.R b/R/gtfs_read.R
index fcd585d..ead852c 100644
--- a/R/gtfs_read.R
+++ b/R/gtfs_read.R
@@ -34,7 +34,7 @@ gtfs_read <- function(path){
                                   stop_name = readr::col_character(),
                                   stop_lat = readr::col_number(),
                                   stop_lon = readr::col_number(),
-                                  wheelchair_boarding = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                  wheelchair_boarding = readr::col_integer(), #enum value 2 is valid but rarely seen outside the spec document
                                   location_type = readr::col_integer(),
                                   parent_station = readr::col_character(),
                                   platform_code = readr::col_character()),
@@ -65,7 +65,7 @@ gtfs_read <- function(path){
                                   service_id = readr::col_character(),
                                   block_id = readr::col_character(),
                                   shape_id = readr::col_character(),
-                                  wheelchair_accessible = readr::col_integer() #boolean but treat as integer so 0|1 written to file
+                                  wheelchair_accessible = 
readr::col_integer() #enum value 2 is valid but rarely seen outside the spec document ), show_col_types = FALSE, lazy = FALSE) From f04ae8b4231bf5e3a41bc84a20708dc0ea43b7d6 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:43:35 +0100 Subject: [PATCH 09/81] pass 'silent' value down the call stack and give some level of warning about merge() failing --- R/transxchange.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/transxchange.R b/R/transxchange.R index 5cf8172..b6c6c7a 100644 --- a/R/transxchange.R +++ b/R/transxchange.R @@ -208,11 +208,11 @@ transxchange2gtfs <- function(path_in, if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))} - gtfs_merged <- try(gtfs_merge(gtfs_all, force = force_merge)) + gtfs_merged <- try(gtfs_merge(gtfs_all, force=force_merge, quiet=silent)) if (class(gtfs_merged) == "try-error") { - message("Merging failed, returing unmerged GFTS object for analysis") - return(gtfs_all) + warning("Merging failed, returing unmerged GFTS object for analysis") + return(gtfs_all) #this is not helpful - caller has no idea there was an error and ploughs on, causing strange errors much later on } return(gtfs_merged) } From b4b5651ebf7205883e4b516a23cc05fd678710f6 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:44:12 +0100 Subject: [PATCH 10/81] format frequencies table columns correctly --- R/write_gtfs.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/R/write_gtfs.R b/R/write_gtfs.R index 2142f7f..a56faa8 100644 --- a/R/write_gtfs.R +++ b/R/write_gtfs.R @@ -55,6 +55,16 @@ gtfs_write <- function(gtfs, gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time) } + if("frequencies" %in% names(gtfs)) + { + if("difftime" %in% class(gtfs$frequencies$start_time)){ + gtfs$frequencies$start_time <- format(gtfs$frequencies$start_time, format = "%H:%M:%S") + } + + if("difftime" %in% class(gtfs$frequencies$end_time)){ + gtfs$frequencies$end_time <- format(gtfs$frequencies$end_time, format = "%H:%M:%S") + } + } dir.create(paste0(tempdir(), "/gtfs_temp")) From 4048f2774cb082d1f0964331faf85c81eb84e983 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:44:56 +0100 Subject: [PATCH 11/81] exercise more code paths with test by producing more verbose output --- tests/testthat/test_transxchange.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_transxchange.R b/tests/testthat/test_transxchange.R index 112cc7b..92f3578 100644 --- a/tests/testthat/test_transxchange.R +++ b/tests/testthat/test_transxchange.R @@ -26,7 +26,8 @@ test_that("test transxchange2gtfs singlecore", { naptan = naptan, ncores = 1, try_mode = FALSE, - force_merge = TRUE) + force_merge = TRUE, + silent = FALSE) gtfs_write(gtfs,folder = file_path, name = "txc_gtfs2") expect_true(file.exists(file.path(file_path,"txc_gtfs2.zip"))) @@ -40,7 +41,8 @@ if(.Platform$OS.type == "unix") { naptan = naptan, ncores = 2, try_mode = FALSE, - force_merge = TRUE) + force_merge = TRUE, + silent = FALSE) gtfs_write(gtfs,folder = file_path, name = "txc_gtfs") expect_true(file.exists(file.path(file_path,"txc_gtfs.zip"))) From 8391315238b1dfbdd5956fa716d27d531057077c Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 29 Aug 2023 17:10:08 +0100 Subject: [PATCH 12/81] stop errors in merge() being swallowed silently, with the only clue to the caller being a different shaped structure being passed back --- R/transxchange.R | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 
deletions(-) diff --git a/R/transxchange.R b/R/transxchange.R index b6c6c7a..ddd5cca 100644 --- a/R/transxchange.R +++ b/R/transxchange.R @@ -8,13 +8,14 @@ #' @param cal Calendar object from get_bank_holidays() #' @param naptan Naptan stop locations from get_naptan() #' @param scotland character, should Scottish bank holidays be used? Can be -#' "auto" (defualt), "yes", "no". If "auto" and path_in ends with "S.zip" +#' "auto" (default), "yes", "no". If "auto" and path_in ends with "S.zip" #' Scottish bank holidays will be used, otherwise England and Wales bank #' holidays are used. #' @param try_mode Logical, if TRUE import and conversion are wrapped in try #' calls thus a failure on a single file will not cause the whole process to #' fail. Warning this could result in a GTFS file with missing routes. #' @param force_merge Logical, passed to gtfs_merge(force), default FALSE +#' @param merge Logical, if results are merged into one GTFS object by calling gtfs_merge, default TRUE #' @return A GTFS named list #' @details #' @@ -38,7 +39,8 @@ transxchange2gtfs <- function(path_in, naptan = get_naptan(), scotland = "auto", try_mode = TRUE, - force_merge = FALSE) { + force_merge = FALSE, + merge = TRUE) { # Check inputs checkmate::assert_numeric(ncores) checkmate::assert_logical(silent) @@ -206,13 +208,16 @@ transxchange2gtfs <- function(path_in, message("All files converted") } - if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))} + if(merge) + { + if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))} - gtfs_merged <- try(gtfs_merge(gtfs_all, force=force_merge, quiet=silent)) + gtfs_merged <- gtfs_merge(gtfs_all, force=force_merge, quiet=silent) - if (class(gtfs_merged) == "try-error") { - warning("Merging failed, returing unmerged GFTS object for analysis") - return(gtfs_all) #this is not helpful - caller has no idea there was an error and ploughs on, causing strange errors much later on + return(gtfs_merged) + } + else + { + return (gtfs_all) } - return(gtfs_merged) } From baf5346bd8d8b567f07c17fe05419399a326b428 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 29 Aug 2023 17:11:19 +0100 Subject: [PATCH 13/81] add additional combinations of activity codes arising from reading entries in LO table --- data/activity_codes.rda | Bin 260 -> 263 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/activity_codes.rda b/data/activity_codes.rda index 3dceea321ed7aad383c2315e4be5cdaf4fd5cc7c..24a65f9772e156b42e0966ec48e3b4172ee094af 100644 GIT binary patch literal 263 zcmV+i0r>tOiwFP!0000027Qswio!4u#wSe++C>q4jjory?74&L$2GVe3aTo|o7nUAdy15J) zvNge$V1>)1@8CX|?hJHf@Hp5L&DP1!&f%6P!2k6&v%x=j-|DM-{{Gpy7Dx0sdjC=t z>Bmo%G{I_Z)H1K}yg~Z7F8`KvkFHtggiG7DXAGBa NUI7vrtJ| zQ%_L!4?v9o10W3#MEz6L4K#>G)OwhJXwYep00w}PDKY_|05lAm007bQB1#*o6o?V# zFax3iL*#{l&?qDV2tAeAp?E0~M2YD80!W35A#o%|z_d#)s5VV%S(Qq83ye`4*-^4t zgoZ)}2X|l{fhS}eL4Z?{Swzx82+QV`cm20jTJKiX#;q(9V%oA~YA`e%1TDiM4c1MX z%9NcXPlb-JW3>^dl#Ecx7h2v-A%X?A7RoX6h}fOq+K7xK<44*XC~bTJ`k_L Date: Tue, 29 Aug 2023 18:25:20 +0100 Subject: [PATCH 14/81] fix activity not being read from LO change to use data.table (likely a breaking change) performance improvements add setting of route_long_name "Train from..." 
to change based on mode --- NAMESPACE | 6 ++ R/atoc_export.R | 188 +++++++++++++++++++----------------- R/atoc_import.R | 239 ++++++++++++++++++++++++++++++---------------- R/atoc_main.R | 89 ++++++++++------- R/atoc_nr.R | 16 +++- R/gtfs_cleaning.R | 77 ++++++++++----- R/gtfs_read.R | 226 ++++++++++++++++++++++++++++--------------- R/write_gtfs.R | 2 +- 8 files changed, 530 insertions(+), 313 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 3c0b662..d1b340c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,3 +31,9 @@ export(nr2gtfs) export(station2stops) export(transxchange2gtfs) export(transxchange_import) +import(data.table) +importFrom(data.table,":=") +importFrom(data.table,.alf) +importFrom(data.table,Import) +importFrom(data.table,file) +importFrom(data.table,the) diff --git a/R/atoc_export.R b/R/atoc_export.R index 82b21dc..65f00d3 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -132,7 +132,8 @@ station2transfers <- function(station, flf, path_out) { return(transfers) } -#' split overlapping start and end dates# +#' split overlapping start and end dates +#' this function is performance critical - profile any changes #' #' @param cal cal object #' @details split overlapping start and end dates @@ -140,18 +141,16 @@ station2transfers <- function(station, flf, path_out) { splitDates <- function(cal) { - # get all the dates that + # get a vector of all the start and end dates together from all base & overlay timetables dates <- c(cal$start_date, cal$end_date) dates <- dates[order(dates)] # create all unique pairs - dates.df <- data.frame( + dates.dt <- unique( data.table( start_date = dates[seq(1, length(dates) - 1)], end_date = dates[seq(2, length(dates))] - ) + ) ) - cal.new <- dplyr::right_join(cal, dates.df, - by = c( "start_date", "end_date" ) - ) + cal.new <- cal[dates.dt, on = c("start_date", "end_date")] if ("P" %in% cal$STP) { match <- "P" @@ -163,34 +162,26 @@ splitDates <- function(cal) { # fill in the original missing schedule for (j in seq(1, nrow(cal.new))) { if (is.na(cal.new$UID[j])) { - st_tmp <- cal.new$start_date[j] - ed_tmp <- cal.new$end_date[j] - new.UID <- cal$UID[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.Days <- cal$Days[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.roWID <- cal$rowID[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.ATOC <- cal$`ATOC Code`[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.Retail <- cal$`Retail Train ID`[cal$STP == match & - cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.head <- cal$Headcode[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.Status <- cal$`Train Status`[cal$STP == match & - cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - if (length(new.UID) == 1) { - cal.new$UID[j] <- new.UID - cal.new$Days[j] <- new.Days - cal.new$rowID[j] <- new.roWID - cal.new$`ATOC Code`[j] <- new.ATOC - cal.new$`Retail Train ID`[j] <- new.Retail - cal.new$`Train Status`[j] <- new.Status - cal.new$Headcode[j] <- new.head + + + matches = (cal$STP == match + & cal$start_date <= cal.new$start_date[j] + & cal$end_date >= cal.new$end_date[j]) + + sumM = sum(matches) + + if (sumM == 1) { + + #cal.new[j, `:=`(UID = cal$UID[ matches ], + # Days = cal$Days[matches], + # rowID = cal$rowID[matches], + # STP = match)] + + cal.new$UID[j] <- cal$UID[ matches ] + cal.new$Days[j] <- cal$Days[ matches ] + cal.new$rowID[j] <- cal$rowID[ matches ] cal.new$STP[j] <- match - } else if 
(length(new.UID) > 1) { + } else if (sumM > 1) { message("Going From") print(cal) message("To") @@ -205,9 +196,9 @@ splitDates <- function(cal) { cal.new <- cal.new[!is.na(cal.new$UID), ] # remove duplicated rows - cal.new <- cal.new[!duplicated(cal.new), ] + cal.new <- cal.new[!duplicated(cal.new), ] #this is expensive - is it needed ? - # modify end and start dates + # modify end and start dates on base timetable so they don't overlap the overlay dates. for (j in seq(1, nrow(cal.new))) { if (cal.new$STP[j] == "P") { # check if end date need changing @@ -302,7 +293,6 @@ checkrows <- function(tmp) { } } -# TODO: make mode affect name #' internal function for constructing longnames of routes #' #' @details @@ -317,12 +307,10 @@ longnames <- function(routes, stop_times) { stop_times_sub <- dplyr::summarise(stop_times_sub, schedule = unique(schedule), stop_a = stop_id[stop_sequence == 1], - # seq = min(stop_sequence), stop_b = stop_id[stop_sequence == max(stop_sequence)] ) - stop_times_sub$route_long_name <- paste0("Train from ", - stop_times_sub$stop_a, + stop_times_sub$route_long_name <- paste0(stop_times_sub$stop_a, " to ", stop_times_sub$stop_b) stop_times_sub <- stop_times_sub[!duplicated(stop_times_sub$schedule), ] @@ -331,6 +319,10 @@ longnames <- function(routes, stop_times) { routes <- dplyr::left_join(routes, stop_times_sub, by = c("rowID" = "schedule")) + routes[`Train Category` == "SS", route_long_name := paste("Ship from",route_long_name)] + routes[`Train Category` %in% c("BS", "BR"), route_long_name := paste("Bus from",route_long_name)] + routes[!(`Train Category` %in% c("SS", "BS", "BR")), route_long_name := paste("Train from",route_long_name)] + return(routes) } @@ -345,29 +337,20 @@ longnames <- function(routes, stop_times) { #' makeCalendar <- function(schedule, ncores = 1) { # prep the inputs - calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", - "Days Run", "STP indicator", "rowID", "Headcode", - "ATOC Code", "Retail Train ID", "Train Status")] + calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", "Days Run", "STP indicator", "rowID" )] + names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", "rowID" ) + calendar$`STP indicator` <- as.character(calendar$`STP indicator`) - # calendar = calendar[order(-calendar$`STP indicator`),] - names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", - "rowID", "Headcode", "ATOC Code", - "Retail Train ID", "Train Status") calendar$duration <- calendar$end_date - calendar$start_date + 1 # UIDs = unique(calendar$UID) # length_todo = length(UIDs) message(paste0(Sys.time(), " Constructing calendar and calendar_dates")) - calendar_split <- split(calendar, calendar$UID) - + calendar$UID2 <- calendar$UID + calendar_split <- calendar[, .(list(.SD)), by = UID2][,V1] if (ncores > 1) { cl <- parallel::makeCluster(ncores) - # parallel::clusterExport( - # cl = cl, - # varlist = c("calendar", "UIDs"), - # envir = environment() - # ) parallel::clusterEvalQ(cl, { loadNamespace("UK2GTFS") }) @@ -375,8 +358,6 @@ makeCalendar <- function(schedule, ncores = 1) { res <- pbapply::pblapply(calendar_split, # 1:length_todo, makeCalendar.inner, - # UIDs = UIDs, - # calendar = calendar, cl = cl ) parallel::stopCluster(cl) @@ -385,9 +366,7 @@ makeCalendar <- function(schedule, ncores = 1) { res <- pbapply::pblapply( calendar_split, # 1:length_todo, - makeCalendar.inner # , - # UIDs = UIDs, - # calendar = calendar + makeCalendar.inner ) } @@ -415,7 +394,7 @@ makeCalendar <- function(schedule, ncores 
@@ -415,7 +394,7 @@ makeCalendar <- function(schedule, ncores = 1) {
 
   #res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar)))
   #performance - doing this split on 500k rows takes 60s - longer than the parallel execution below and consumes 3gb memory.
-  res.calendar.days <- res.calendar[,CHECKROWS_NAME_VECTOR]
+  res.calendar.days <- res.calendar[, ..CHECKROWS_NAME_VECTOR]
   res.calendar.days <- data.table::transpose(res.calendar.days)
   #transpose on the same size runs in around 3s, but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer.
 
@@ -452,48 +431,61 @@ makeCalendar.inner <- function(calendar.sub) { # i, UIDs, calendar){
     return(list(calendar.sub, NA))
   } else {
     # check duration and types
+
+    #get durations of overlays
     dur <- as.numeric(calendar.sub$duration[calendar.sub$STP != "P"])
+
+    #get vector of types of overlays
     typ <- calendar.sub$STP[calendar.sub$STP != "P"]
+
+    #get vector of all timetable types including base timetable
     typ.all <- calendar.sub$STP
 
-    if (all(dur == 1) & all(typ == "C") & length(typ) > 0 &
-      length(typ.all) == 2) {
-      # One Day cancellations
+
+    #if every overlay is a one day cancellation (and there is only one of them)
+    if (all(dur == 1) & all(typ == "C") & length(typ) > 0 & length(typ.all) == 2) {
+      # Modify in the calendar_dates.txt
       return(list(
         calendar.sub[calendar.sub$STP == "P", ],
         calendar.sub[calendar.sub$STP != "P", ]
       ))
+
     } else {
-      # check for identical day pattern
-      if (length(unique(calendar.sub$Days)) == 1 &
-        sum(typ.all == "P") == 1) {
+      # if the day patterns are all identical, and we have only one base timetable
+      if (length(unique(calendar.sub$Days)) == 1 & sum(typ.all == "P") == 1) {
+
         calendar.new <- splitDates(calendar.sub)   #calendar.new <- UK2GTFS:::splitDates(calendar.sub)
         return(list(calendar.new, NA))
+
       } else {
+
         # split by day pattern
         splits <- list()
         daypatterns <- unique(calendar.sub$Days)
+
         for (k in seq(1, length(daypatterns))) {
           # select for each pattern but include cancellations with a
           # different day pattern
-          calendar.sub.day <- calendar.sub[calendar.sub$Days == daypatterns[k] |
-            calendar.sub$STP == "C", ]
+          calendar.sub.day <- calendar.sub[calendar.sub$Days == daypatterns[k] | calendar.sub$STP == "C", ]
 
           if (all(calendar.sub.day$STP == "C")) {
-            # ignore cases of only cancelled
+            # ignore cases where everything is cancelled
            splits[[k]] <- NULL
-          } else {
+          }
+          else {
            calendar.new.day <- splitDates(calendar.sub.day)   #calendar.new.day <- UK2GTFS:::splitDates(calendar.sub.day)
            # rejects nas
-            if (class(calendar.new.day) == "data.frame") {
+            if (inherits(calendar.new.day, "data.frame")) {
              calendar.new.day$UID <- paste0(calendar.new.day$UID, k)
              splits[[k]] <- calendar.new.day
            }
          }
        }
+
        splits <- data.table::rbindlist(splits, use.names=FALSE) # dplyr::bind_rows(splits)
+
        return(list(splits, NA))
      }
    }
@@ -519,7 +511,10 @@ duplicate.stop_times_alt <- function(calendar, stop_times, ncores = 1) {
   stop_times <- dplyr::left_join(stop_times, rowID.unique,
     by = c("schedule" = "Var1")
   )
-  stop_times_split <- split(stop_times, stop_times$schedule)
+
+
+  stop_times$schedule2 <- stop_times$schedule
+  stop_times_split <- stop_times[, .(list(.SD)), by = "schedule2"][,V1]
 
   # TODO: The could handle cases of non duplicated stoptimes within duplicate.stop_times.int
   # rather than splitting and rejoining, would bring code tidyness and speed improvements
@@ -543,6 +538,9 @@ duplicate.stop_times_alt <- function(calendar, stop_times, ncores = 1) {
     stop_times.dup <- pbapply::pblapply(stop_times_split, duplicate.stop_times.int)
   } else {
     cl <- parallel::makeCluster(ncores)
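+    # load the UK2GTFS namespace on each worker so internal helpers can be found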
+    parallel::clusterEvalQ(cl, {
+      loadNamespace("UK2GTFS")
+    })
     stop_times.dup <- pbapply::pblapply(stop_times_split,
       duplicate.stop_times.int,
       cl = cl
@@ -551,13 +549,14 @@ duplicate.stop_times_alt <- function(calendar, stop_times, ncores = 1) {
     rm(cl)
   }
 
-  stop_times.dup <- dplyr::bind_rows(stop_times.dup)
+  #stop_times.dup <- dplyr::bind_rows(stop_times.dup)  # replaced with rbindlist for performance
+  stop_times.dup <- data.table::rbindlist(stop_times.dup, use.names=FALSE)
 
   # stop_times.dup$index <- NULL
   # Join on the nonduplicated trip_ids
   trip.ids.nodup <- calendar.nodup[, c("rowID", "trip_id")]
   stop_times <- dplyr::left_join(stop_times, trip.ids.nodup, by = c("schedule" = "rowID"))
-  stop_times <- stop_times[!is.na(stop_times$trip_id), ] # when routes are cancled their stop times are left without valid trip_ids
+  stop_times <- stop_times[!is.na(stop_times$trip_id), ] # when routes are cancelled their stop times are left without valid trip_ids
 
   # join on the duplicated trip_ids
   calendar2 <- dplyr::group_by(calendar, rowID)
@@ -585,21 +584,21 @@ duplicate.stop_times_alt <- function(calendar, stop_times, ncores = 1) {
 
 
 
-#' fix times for jounrneys that run past midnight
+#' fix times for journeys that run past midnight
 #'
 #' @details
-#' When train runs over midnight GTFS requries the stop times to be in
+#' When a train runs over midnight GTFS requires the stop times to be in
 #' 24h+ e.g. 26:30:00
 #'
 #' @param stop_times stop_times data.frame
 #' @param safe logical (default = TRUE) should the check for trains
-#' running more than 24h be perfomed?
+#' running more than 24h be performed?
 #'
 #' @details
 #' Not running the 24 check is faster, if the check is run a warning
 #' is returned, but the error is not fixed. As the longest train
-#' jounrey in the UK is 13 hours (Aberdeen to Penzance) this is
-#' unlikley to be a problem.
+#' journey in the UK is 13 hours (Aberdeen to Penzance) this is
+#' unlikely to be a problem.
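
+#'
+#' For example a trip that departs 23:50:00 and reaches its final stop at
+#' 00:10:00 the next morning must be written with an arrival_time of 24:10:00.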
#' @noRd
 #'
 afterMidnight <- function(stop_times, safe = TRUE) {
@@ -632,9 +631,8 @@ afterMidnight <- function(stop_times, safe = TRUE) {
   }
 
   numb2time2 <- function(numb){
-    numb <- stringr::str_pad(as.character(numb), 4, pad = "0")
-    numb <- paste0(substr(numb,1,2),":",substr(numb,3,4),":00")
-    numb
+    #performance, substr is relatively expensive
+    # e.g. 945 -> "09:45:00", 2330 -> "23:30:00"
+    numb <- sprintf("%02d:%02d:00", numb %/% 100, numb %% 100)
   }
 
   stop_times$arrival_time <- numb2time2(stop_times$arvfinal)
@@ -656,7 +654,7 @@ afterMidnight <- function(stop_times, safe = TRUE) {
 #'
 #' @noRd
 #'
-clean_activities2 <- function(x) {
+clean_activities2 <- function(x, public_only = TRUE) {
 
   #x <- strsplit(x," ")
   #x <- lapply(x, function(y){
@@ -665,14 +663,24 @@ clean_activities2 <- function(x) {
   #x <- unlist(x)
 
   x <- data.frame(activity = x, stringsAsFactors = FALSE)
-  x <- dplyr::left_join(x, activity_codes, by = c("activity"))
-  if (anyNA(x$pickup_type)) {
-    mss <- unique(x$activity[is.na(x$pickup_type)])
-    message("Unknown Activity codes '", paste(unique(mss), collapse = "' '"), "' please report these codes as a GitHub Issue")
-    x$pickup_type[is.na(x$pickup_type)] <- 0
-    x$drop_off_type[is.na(x$drop_off_type)] <- 0
+
+  if (public_only)
+  {
+    x <- dplyr::left_join(x, activity_codes, by = c("activity"))
+    if (anyNA(x$pickup_type)) {
+      mss <- unique(x$activity[is.na(x$pickup_type)])
+      warning("Unknown Activity codes '", paste(unique(mss), collapse = "' '"), "' please report these codes as a GitHub Issue")
+      x$pickup_type[is.na(x$pickup_type)] <- 0
+      x$drop_off_type[is.na(x$drop_off_type)] <- 0
+    }
+    x <- x[, c("pickup_type", "drop_off_type")]
+  }
+  else #set all of the stops on a route to be passenger boarding / alighting from a GTFS perspective
+  {
+    x$pickup_type <- 0
+    x$drop_off_type <- 0
+    x <- x[, c("pickup_type", "drop_off_type", "activity")]
   }
-  x <- x[, c("pickup_type", "drop_off_type")]
 
   return(x)
 }
diff --git a/R/atoc_import.R b/R/atoc_import.R
index f3eb9a6..a1005e0 100644
--- a/R/atoc_import.R
+++ b/R/atoc_import.R
@@ -1,3 +1,6 @@
+#' @import data.table
+#' @importFrom data.table ":="
+
 #' Import the .alf file
 #'
 #' @details
@@ -17,7 +20,7 @@ importALF <- function(file) {
     stringsAsFactors = FALSE
   )
 
-  # Now Fix Misaigned Values
+  # Now Fix Misaligned Values
   # Check each column for misalignments
   checkCol <- function(x, val) {
     checkCol.inner <- function(x, val) {
@@ -142,7 +145,7 @@ importMSN <- function(file, silent = TRUE) {
     col_types = rep("character", 17 - 1),
     widths = c(1, 4, 26 + 4, 1, 7, 3, 3, 3, 5, 1, 5, 2, 1, 1, 11, 3)
   )
-
+  station <- data.table(station)
   names(station) <- c(
     "Record Type", "Reserved1", "Station Name", "CATE Interchange status",
     "TIPLOC Code", "CRS Reference Code",
@@ -162,7 +165,7 @@ importMSN <- function(file, silent = TRUE) {
   station <- strip_whitespace(station)
 
   # convert to SF object
-  # for some reasonf the coordinates are mangled
+  # for some reason the coordinates are mangled
   station$`Ordnance Survey Grid Ref East` <- as.numeric(station$`Ordnance Survey Grid Ref East`)
   station$`Ordnance Survey Grid Ref North` <- as.numeric(station$`Ordnance Survey Grid Ref North`)
   station$`Ordnance Survey Grid Ref East` <- station$`Ordnance Survey Grid Ref East` * 100 - 1e6
@@ -190,7 +193,7 @@ importMSN <- function(file, silent = TRUE) {
     col_types = rep("character", 5 - 1),
     widths = c(1, 4, 26 + 4, 45)
   )
-
+  timetable <- data.table(timetable)
   names(timetable) <- c(
     "Record Type", "Reserved1", "Station Name", "GBTT numbers"
   )
@@ -213,7 +216,7 @@ importMSN <- function(file, silent = TRUE) {
     col_types = rep("character", 
2), widths = c(1, 79)
   )
-
+  comment <- data.table(comment)
   names(comment) <- c("Record Type", "Comment")
   comment$`Record Type` <- NULL
@@ -229,7 +232,7 @@ importMSN <- function(file, silent = TRUE) {
     col_types = rep("character", 6 - 1),
     widths = c(1, 4, 26 + 5, 26, 20)
   )
-
+  alias <- data.table(alias)
   names(alias) <- c(
     "Record Type", "Reserved1", "Station Name", "Station Alias", "Reserved3"
   )
@@ -250,13 +253,14 @@ importMSN <- function(file, silent = TRUE) {
 #' Strip White Space
 #'
 #' @details
-#' Strips whitespace from a dataframe of charters vectors and returns
-#' the data frame
+#' Strips trailing whitespace from a dataframe of character vectors;
+#' empty values are converted to NA.
+#' Returns the data frame.
 #'
 #' @param df data frame
 #' @noRd
 #'
-strip_whitespace <- function(df) {
+strip_whitespace_df <- function(df) {
   sws <- function(val) {
     val <- trimws(val, which = "right")
     val[val == ""] <- NA
@@ -266,6 +270,109 @@ strip_whitespace <- function(df) {
   return(df)
 }
 
+#' Strip White Space
+#'
+#' @details
+#' Strips trailing whitespace from all char columns in a data.table;
+#' empty values are converted to NA.
+#' Returns the data.table.
+#'
+#' @param dt data table
+#' @noRd
+#'
+strip_whitespace <- function(dt) {
+
+  char_cols <- sapply(dt, is.character)
+  char_col_names <- names(char_cols[char_cols])
+
+  return ( dt[, (char_col_names) := lapply(.SD, function(val) {
+    val <- trimws(val, which = "right")
+    val[val == ""] <- NA
+    return(val)
+  }), .SDcols = char_col_names] )
+}
+
+
+
+
+# populate `Arrival Time` / `Departure Time` from the WTT or public columns;
+# the trailing "H" (the CIF half-minute marker) is stripped, truncating to the minute
+process_times <- function(dt, working_timetable) {
+  if (working_timetable) {
+    if ("Scheduled Arrival Time" %in% colnames(dt)) {
+      dt[, `Arrival Time` := gsub("H", "", `Scheduled Arrival Time`)]
+    }
+
+    if ("Scheduled Departure Time" %in% colnames(dt)) {
+      dt[, `Departure Time` := gsub("H", "", `Scheduled Departure Time`)]
+    }
+  } else {
+    if ("Public Arrival Time" %in% colnames(dt)) {
+      dt[, `Arrival Time` := gsub("H", "", `Public Arrival Time`)]
+    }
+
+    if ("Public Departure Time" %in% colnames(dt)) {
+      dt[, `Departure Time` := gsub("H", "", `Public Departure Time`)]
+    }
+  }
+
+  return(dt)
+}
+
+
+# Process Activity Codes
+process_activity <- function(dt, public_only) {
+
+  # split the packed Activity field into its 2-character CIF activity codes
+  dt[, Activity := strsplit(Activity, "(?<=.{2})", perl=TRUE)]
+
+  if (public_only) {
+    # Filter to stops for passengers
+    #see https://wiki.openraildata.com/index.php?title=Activity_codes for definitions
+    acts <- c(
+      "TB", # Train Starts
+      "T ", # Stops to take up and set down passengers
+      "D ", # Stops to set down passengers
+      "U ", # Stops to take up passengers
+      "R ", # Request stop
+      "TF"  # Train Finishes
+    )
+
+    clean_activity3 <- function(x) {
+      x <- x[x %in% acts]
+      if (length(x) > 0) {
+        x <- paste(x, collapse = ",")
+        return(x)
+      } else {
+        return("Other")
+      }
+    }
+  } else {
+
+
+    clean_activity3 <- function(x) {
+
+      #remove empty elements
+      x <- x[x != " "]
+
+      if (length(x) > 0) {
+        x <- paste(x, collapse = ",")
+        return(x)
+      } else {
+        return("Other")
+      }
+    }
+  }
+
+  dt[, Activity := lapply(Activity, clean_activity3)]
+
+  dt <- dt[Activity != "Other"]
+
+  dt[, Activity := gsub("\\s+", "", Activity)]
+
+  return(dt)
+}
+
+
+
+
 #' Import the .mca file
 #'
 #' @details
 #' Imports the .mca file and returns data.frame
 #'
 #' @param file Path to .mca file
 #' @param silent logical, should messages be displayed
-#' @param ncores number of cores to use when paralell processing
+#' @param ncores number of cores to use when parallel processing
 #' @param full_import import all data, default FALSE
 #' @param working_timetable use 
rail industry scheduling times instead of public times +#' @param public_only only return calls that are for public passenger pick up/set down #' @export #' importMCA <- function(file, silent = TRUE, ncores = 1, full_import = FALSE, - working_timetable = FALSE) { + working_timetable = FALSE, + public_only = TRUE) { # see https://wiki.openraildata.com/index.php/CIF_File_Format if (!silent) { @@ -311,6 +420,7 @@ importMCA <- function(file, 6, 1, 1, 1, 1, 4, 4, 1, 1 ) ) + BS <- data.table(BS) names(BS) <- c( "Record Identity", "Transaction Type", "Train UID", "Date Runs From", "Date Runs To", "Days Run", "Bank Holiday Running", "Train Status", @@ -350,6 +460,7 @@ importMCA <- function(file, col_types = rep("character", 8), widths = c(2, 4, 5, 2, 1, 8, 1, 57) ) + BX <- data.table(BX) names(BX) <- c( "Record Identity", "Traction Class", "UIC Code", "ATOC Code", "Applicable Timetable Code", "Retail Train ID", "Source", "Spare" @@ -362,6 +473,8 @@ importMCA <- function(file, # Add the rowid BX$rowID <- seq(from = 1, to = length(types))[types == "BX"] + + # Origin Station if (!silent) { message(paste0(Sys.time(), " importing Origin Station")) @@ -372,28 +485,26 @@ importMCA <- function(file, col_types = rep("character", 12), widths = c(2, 7, 1, 5, 4, 3, 3, 2, 2, 12, 2, 37) ) + LO <- data.table(LO) names(LO) <- c( "Record Identity", "Location", "Suffix", "Scheduled Departure Time", "Public Departure Time", "Platform", "Line", "Engineering Allowance", - "Pathing Allowance", "Pathing Allowance", "Performance Allowance", + "Pathing Allowance", "Activity", "Performance Allowance", "Spare" ) LO$Spare <- NULL LO$`Record Identity` <- NULL - LO <- strip_whitespace(LO) + # Add the rowid + LO$rowID <- seq(from = 1, to = length(types))[types == "LO"] - if(working_timetable){ - LO$`Departure Time` <- gsub("H", "", - LO$`Scheduled Departure Time`) - }else{ - LO$`Departure Time` <- gsub("H", "", - LO$`Public Departure Time`) - } + LO <- process_activity(LO, public_only) - LO <- LO[, c("Location", "Departure Time")] + LO <- process_times( LO, working_timetable ) + + LO <- LO[, c("rowID", "Location", "Activity", "Departure Time" )] + + LO <- strip_whitespace(LO) - # Add the rowid - LO$rowID <- seq(from = 1, to = length(types))[types == "LO"] # Intermediate Station if (!silent) { @@ -405,6 +516,7 @@ importMCA <- function(file, col_types = rep("character", 16), widths = c(2, 7, 1, 5, 5, 5, 4, 4, 3, 3, 3, 12, 2, 2, 2, 20) ) + LI <- data.table(LI) names(LI) <- c( "Record Identity", "Location", "Suffix", "Scheduled Arrival Time", "Scheduled Departure Time", "Scheduled Pass", "Public Arrival Time", @@ -414,57 +526,16 @@ importMCA <- function(file, ) LI$Spare <- NULL LI$`Record Identity` <- NULL - - # Process Activity Codes - activity <- strsplit(LI$Activity, "(?<=.{2})", perl=TRUE) - - clean_activity3 <- function(x){ - # Filter to stops for passengers - acts <- c( - "TB", # Train Starts - "T ", # Stops to take up and set down passengers - "D ", # Stops to set down passengers - "U ", # Stops to take up passengers - "R ", # Request stop - "TF" # Train Finishes - ) - x <- x[x %in% acts] - x <- gsub(" ","",x) - if(length(x) > 0){ - x <- paste(x, collapse = ",") - return(x) - } else { - return("Other") - } - } - - LI$Activity <- unlist(lapply(activity, clean_activity3)) - - LI <- strip_whitespace(LI) - # Add the rowid LI$rowID <- seq(from = 1, to = length(types))[types == "LI"] - LI <- LI[LI$Activity != "Other",] - # Check for errors in the times + LI <- process_activity(LI, public_only) + LI <- process_times( LI, 
working_timetable ) - if(working_timetable){ - LI$`Arrival Time` <- gsub("H", "", - LI$`Scheduled Arrival Time`) - LI$`Departure Time` <- gsub("H", "", - LI$`Scheduled Departure Time`) - }else{ - LI$`Arrival Time` <- gsub("H", "", - LI$`Public Arrival Time`) - LI$`Departure Time` <- gsub("H", "", - LI$`Public Departure Time`) - } + LI <- LI[, c("rowID", "Location", "Activity", "Arrival Time", "Departure Time" )] - LI <- LI[, c( - "Location", "Arrival Time", - "Departure Time", "Activity", "rowID" - )] + LI <- strip_whitespace(LI) @@ -478,30 +549,24 @@ importMCA <- function(file, col_types = rep("character", 9), widths = c(2, 7, 1, 5, 4, 3, 3, 12, 43) ) + LT <- data.table(LT) names(LT) <- c( "Record Identity", "Location", "Suffix", "Scheduled Arrival Time", "Public Arrival Time", "Platform", "Path", "Activity", "Spare" ) LT$Spare <- NULL LT$`Record Identity` <- NULL + # Add the rowid + LT$rowID <- seq(from = 1, to = length(types))[types == "LT"] - # Process Activity Codes - activity <- strsplit(LT$Activity, "(?<=.{2})", perl=TRUE) - LT$Activity <- unlist(lapply(activity, clean_activity3)) + LT <- process_activity(LT, public_only) - LT <- strip_whitespace(LT) - LT$`Scheduled Arrival Time` <- gsub("H", "", LT$`Scheduled Arrival Time`) + LT <- process_times( LT, working_timetable ) - if(working_timetable){ - LT$`Arrival Time` <- gsub("H", "", LT$`Scheduled Arrival Time`) - }else{ - LT$`Arrival Time` <- gsub("H", "", LT$`Public Arrival Time`) - } + LT <- LT[, c("rowID", "Location", "Activity", "Arrival Time" )] - LT <- LT[, c("Location", "Arrival Time", "Activity")] + LT <- strip_whitespace(LT) - # Add the rowid - LT$rowID <- seq(from = 1, to = length(types))[types == "LT"] # TIPLOC Insert if (full_import) { @@ -518,6 +583,7 @@ importMCA <- function(file, 4, 4, 5, 8, 5 ) ) + CR <- data.table(CR) names(CR) <- c( "Record Identity", "Location", "Train Category", "Train Identity", "Headcode", "Course Indicator", @@ -543,6 +609,7 @@ importMCA <- function(file, col_types = rep("character", 11), widths = c(2, 7, 2, 6, 1, 26, 5, 4, 3, 16, 8) ) + TI <- data.table(TI) names(TI) <- c( "Record Identity", "TIPLOC code", "Capitals", "NALCO", "NLC Check Character", "TPS Description", @@ -565,6 +632,7 @@ importMCA <- function(file, col_types = rep("character", 12), widths = c(2, 7, 2, 6, 1, 26, 5, 4, 3, 16, 7, 1) ) + TA <- data.table(TA) names(TA) <- c( "Record Identity", "TIPLOC code", "Capitals", "NALCO", "NLC Check Character", "TPS Description", "STANOX", "PO MCP Code", @@ -587,6 +655,7 @@ importMCA <- function(file, col_types = rep("character", 3), widths = c(2, 7, 71) ) + TD <- data.table(TD) names(TD) <- c("Record Identity", "TIPLOC code", "Spare") TD$Spare <- NULL TD$`Record Identity` <- NULL @@ -608,6 +677,7 @@ importMCA <- function(file, col_types = rep("character", 16), widths = c(2, 1, 6, 6, 6, 6, 7, 2, 1, 7, 1, 1, 1, 1, 31, 1) ) + AA <- data.table(AA) names(AA) <- c( "Record Identity", "Transaction Type", "Base UID", "Assoc UID", "Assoc Start date", "Assoc End date", "Assoc Days", "Assoc Cat", @@ -646,6 +716,7 @@ importMCA <- function(file, col_types = rep("character", 2), widths = c(2, 78) ) + ZZ <- data.table(ZZ) names(ZZ) <- c("Record Identity", "Spare") ZZ$Spare <- NULL ZZ <- strip_whitespace(ZZ) @@ -657,8 +728,12 @@ importMCA <- function(file, if (!silent) { message(paste0(Sys.time(), " Preparing Imported Data")) } + stop_times <- dplyr::bind_rows(list(LO, LI, LT)) stop_times <- stop_times[order(stop_times$rowID), ] + + #the BS record is followed by the LO, LI, LT records relating to it + #- 
so we effectively group by the BS record and apply the BS row ID to the 'schedule' column of the stop times relating to it.
   stop_times$schedule <- as.integer(as.character(cut(stop_times$rowID,
     c(BS$rowID, ZZ$rowID[1]),
     labels = BS$rowID
@@ -666,6 +741,8 @@ importMCA <- function(file,
 
   stop_times$stop_sequence <- sequence(rle(stop_times$schedule)$lengths)
 
+  # the BX record appears on the row after the BS record, so its rowID is one more than that of its corresponding BS record.
+  # use this to join the two records together.
   BX$rowIDm1 <- BX$rowID - 1
   BX$rowID <- NULL
   schedule <- dplyr::left_join(BS, BX, by = c("rowID" = "rowIDm1"))
diff --git a/R/atoc_main.R b/R/atoc_main.R
index 32e779a..c2cdcdf 100644
--- a/R/atoc_main.R
+++ b/R/atoc_main.R
@@ -7,9 +7,10 @@
 #' @param schedule list of dataframes
 #' @param silent logical
 #' @param ncores number of cores to use
+#' @param public_only filters to services / calls that are public pickup/set down only
 #' @noRd
 #'
-schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1) {
+schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, public_only=TRUE) {
 
 
   ### SECTION 1: ###############################################################################
@@ -18,9 +19,6 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1) {
     message(paste0(Sys.time(), " Building stop_times"))
   }
 
-  # Convert Activity to pickup_type and drop_off_type
-  stop_times$Activity[is.na(stop_times$Activity) & stop_times$stop_sequence == 1] <- "TB" # No activity specified at start
-
   # Fix arrival_time / departure_time being 0000 for pick up only or drop off only trains
   stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "0000" & stop_times$Activity == "D",
     stop_times$arrival_time,
     stop_times$departure_time)
   stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "0000" & stop_times$Activity == "U",
     stop_times$departure_time,
     stop_times$arrival_time)
 
-  upoffs <- clean_activities2(stop_times$Activity)
+  # Convert Activity to pickup_type and drop_off_type
+  upoffs <- clean_activities2(stop_times$Activity, public_only=public_only)
   stop_times <- cbind(stop_times, upoffs)
 
+  #fix missing arrival / departure times by copying from the other time.
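+  # e.g. a call read with arrival "2154" but no departure leaves this step
+  # with arrival "2154" and departure "2154"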
stop_times$arrival_time[is.na(stop_times$arrival_time)] <- stop_times$departure_time[is.na(stop_times$arrival_time)] stop_times$departure_time[is.na(stop_times$departure_time)] <- stop_times$arrival_time[is.na(stop_times$departure_time)] - stop_times <- stop_times[, c("arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type", "rowID", "schedule")] - - stop_times <- stop_times[!(stop_times$pickup_type == 1 & stop_times$drop_off_type == 1), ] + stop_times <- stop_times[, c("arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type", "rowID", "schedule")] + if (public_only) + { + stop_times <- stop_times[!(stop_times$pickup_type == 1 & stop_times$drop_off_type == 1), ] + } ### SECTION 2: ############################################################################### @@ -47,22 +49,28 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1) { message(paste0(Sys.time(), " Building calendar and calendar_dates")) } - - schedule <- schedule[, c( - "Train UID", "Date Runs From", "Date Runs To", "Days Run", "Bank Holiday Running", "Train Status", "Train Category", - "Headcode", "STP indicator", "rowID", "ATOC Code", "Retail Train ID" - )] - # build the calendar file res <- makeCalendar(schedule = schedule, ncores = ncores) calendar <- res[[1]] calendar_dates <- res[[2]] - # rm(res) + rm(res) + + #remove columns we don't need any more + schedule <- schedule[, c( + "Train UID", "Train Status", "Train Category", + "Headcode", "rowID", "ATOC Code", "Retail Train ID", "Power Type", "Train Identity" + )] + + #making the calendar will duplicate rows where the base timetable has been layered over with cancellations etc. + #join back to the original to get the extra columns not returned from makeCalendar + calendar <- dplyr::left_join(calendar, schedule, by = c("rowID")) + + gc() calendar$trip_id <- 1:nrow(calendar) # not sure why this was here, but used in duplicate.stop_times - # calendar$service_id = 1:nrow(calendar) # For this purpose the serive and the trip are always the same + # calendar$service_id = 1:nrow(calendar) # For this purpose the service and the trip are always the same - # clean calednars + # clean calendars # calendar = calendar[,c("UID","monday","tuesday","wednesday","thursday","friday","saturday","sunday", # "start_date","end_date","rowID","trip_id")] names(calendar)[names(calendar) == "UID"] <- "service_id" @@ -86,45 +94,48 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1) { message(paste0(Sys.time(), " Duplicating necessary stop times")) } - + #TODO find out why this hangs if ncores > 1 stop_times <- duplicate.stop_times_alt(calendar = calendar, stop_times = stop_times, ncores = 1) ### SECTION 5: ############################################################################### - # make make the trips.txt file by matching the calnedar to the stop_times + # make make the trips.txt file by matching the calendar to the stop_times - trips <- calendar[, c("service_id", "trip_id", "rowID", "ATOC Code", "Train Status")] + trips <- calendar[, c("service_id", "trip_id", "rowID", "ATOC Code", "Train Status", "Train Category", "Power Type", "Train Identity")] trips <- longnames(routes = trips, stop_times = stop_times) ### SECTION 4: ############################################################################### # make make the routes.txt # a route is all the trips with a common start and end - # i.e. scheduels original UID + # i.e. 
schedules original UID if (!silent) { message(paste0(Sys.time(), " Building routes.txt")) } - routes <- trips - routes <- dplyr::group_by(routes, `ATOC Code`, route_long_name, `Train Status`) - routes <- dplyr::summarise(routes) - routes$route_id <- 1:nrow(routes) - - trips <- dplyr::left_join(trips, routes, by = c("ATOC Code" = "ATOC Code", "route_long_name" = "route_long_name", "Train Status" = "Train Status")) - - train_status <- data.frame( + #do the conversion to route_type before grouping because several status map to the same route_type and we get 'duplicate' routes that look the same. + train_status <- data.table( train_status = c("B", "F", "P", "S", "T", "1", "2", "3", "4", "5"), - route_type = c(3, NA, 2, 4, NA, 2, NA, NA, 4, 3), + route_type = c( 3, NA, 2, 4, NA, 2, NA, NA, 4, 3), stringsAsFactors = FALSE ) - routes$`Train Status` <- as.character(routes$`Train Status`) - routes <- dplyr::left_join(routes, train_status, by = c("Train Status" = "train_status")) + trips$`Train Status` <- as.character(trips$`Train Status`) + trips <- dplyr::left_join(trips, train_status, by = c("Train Status" = "train_status")) rm(train_status) - routes <- routes[, c("route_id", "route_type", "ATOC Code", "route_long_name")] - names(routes) <- c("route_id", "route_type", "agency_id", "route_long_name") + routes <- trips + + routes <- dplyr::group_by(routes, `ATOC Code`, route_long_name, `Train Category`, route_type ) + routes <- dplyr::summarise(routes) + routes$route_id <- 1:nrow(routes) + + trips <- dplyr::left_join(trips, routes, by = c("ATOC Code", "route_long_name", "Train Category", "route_type")) + + routes <- routes[, c("route_id", "route_type", "ATOC Code", "route_long_name", "Train Category" )] + names(routes) <- c("route_id", "route_type", "agency_id", "route_long_name", "train_category" ) routes$route_short_name <- routes$route_id - routes$route_type[routes$agency_id == "LT"] <- 1 # London Underground is Metro + routes$route_type[routes$agency_id == "LT" & routes$route_type == 2 ] <- 1 + # London Underground is Metro (unless already identified as a bus/ship etc) ### Section 6: ####################################################### # Final Checks @@ -132,9 +143,13 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1) { # Fix Times stop_times <- afterMidnight(stop_times) + + #gtfs systems tend to be tolerant of additional fields, so expose the train_category and power_type so that the consumer can do more analysis on them if they wish. + #e.g. 
filter out ECS moves # Ditch unneeded columns - routes <- routes[, c("route_id", "agency_id", "route_short_name", "route_long_name", "route_type")] - trips <- trips[, c("trip_id", "route_id", "service_id")] + routes <- routes[, c("route_id", "agency_id", "route_short_name", "route_long_name", "route_type", "train_category")] + trips <- trips[, c("trip_id", "route_id", "service_id", "Train Identity", "Power Type")] + names(trips) <- c("trip_id", "route_id", "service_id", "train_identity", "power_type") stop_times <- stop_times[, c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type")] calendar <- calendar[, c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] diff --git a/R/atoc_nr.R b/R/atoc_nr.R index c8544f3..213ad34 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -9,6 +9,8 @@ #' @param locations where to get tiploc locations (see details) #' @param agency where to get agency.txt (see details) #' @param shapes Logical, should shapes.txt be generated (default FALSE) +#' @param working_timetable Logical, should WTT times be used instead of public times (default FALSE) +#' @param public_only Logical, only return calls/services that are for public passenger pickup/set down (default TRUE) #' @family main #' @return A gtfs list #' @@ -26,7 +28,7 @@ #' Agency #' #' The ATOC files do not contain the necessary information to build the -#' agency.txt file. Therfore this data is provided with the package. You can +#' agency.txt file. Therefore this data is provided with the package. You can #' also pass your own data frame of agency information. #' #' @@ -37,7 +39,9 @@ nr2gtfs <- function(path_in, ncores = 1, locations = tiplocs, agency = atoc_agency, - shapes = FALSE) { + shapes = FALSE, + working_timetable = FALSE, + public_only = TRUE) { # checkmate checkmate::assert_character(path_in, len = 1) checkmate::assert_file_exists(path_in) @@ -59,7 +63,10 @@ nr2gtfs <- function(path_in, # Read In each File mca <- importMCA( file = path_in, - silent = silent, ncores = 1 + silent = silent, + ncores = 1, + working_timetable = working_timetable, + public_only = public_only ) @@ -108,7 +115,8 @@ nr2gtfs <- function(path_in, stop_times = stop_times, schedule = schedule, silent = silent, - ncores = ncores + ncores = ncores, + public_only = public_only ) rm(schedule) diff --git a/R/gtfs_cleaning.R b/R/gtfs_cleaning.R index 8d305a6..7dc709c 100644 --- a/R/gtfs_cleaning.R +++ b/R/gtfs_cleaning.R @@ -197,53 +197,86 @@ gtfs_fast_stops <- function(gtfs, maxspeed = 83) { #' Clean simple errors from GTFS files #' #' @param gtfs gtfs list +#' @param public_only Logical, only return calls/services that are for public passenger pickup/set down (default FALSE) #' @details #' Task done: #' #' 0. Remove stops with no coordinates #' 1. Remove stops with no location information -#' 2. Remove stops that are never used -#' 3. Replace missing agency names with "MISSINGAGENCY" -#' 4. If service is not public and removeNonPublic=TRUE then remove it (freight, 'trips' aka charters) +#' 2. Remove trips with less than two stops +#' 3. Remove stops that are never used +#' 4. Replace missing agency names with "MISSINGAGENCY" +#' 5. 
If service is not public and public_only=TRUE then remove it (freight, 'trips' aka charters)
 #'    (these have a null route_type, so loading into OpenTripPlanner fails if these are present)
 #'
 #' @export
-gtfs_clean <- function(gtfs, removeNonPublic = FALSE) {
+gtfs_clean <- function(gtfs, public_only = FALSE) {
+
   # 0 Remove stops with no coordinates
   gtfs$stops <- gtfs$stops[!is.na(gtfs$stops$stop_lon) & !is.na(gtfs$stops$stop_lat), ]
 
   # 1 Remove stop times with no locations
   gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% unique(gtfs$stops$stop_id), ]
 
-  # 2 Remove stops that are never used
+  # 2 Remove trips with less than two stops
+  stop_count <- gtfs$stop_times[, .N, by = "trip_id"]
+  gtfs$trips <- gtfs$trips[!(trip_id %in% stop_count[N<2]$trip_id)]
+
+  # 3 Remove stops that are never used
   gtfs$stops <- gtfs$stops[gtfs$stops$stop_id %in% unique(gtfs$stop_times$stop_id), ]
 
-  # 3 Replace "" agency_id with dummy name
+  # 4 Replace "" agency_id with dummy name
   gtfs$agency$agency_id[gtfs$agency$agency_id == ""] <- "MISSINGAGENCY"
   gtfs$routes$agency_id[gtfs$routes$agency_id == ""] <- "MISSINGAGENCY"
   gtfs$agency$agency_name[gtfs$agency$agency_name == ""] <- "MISSINGAGENCY"
 
-  # 4 remove calls, trips and routes that have an empty route_type (non public services)
-  if (removeNonPublic)
+  # 5 remove calls, trips and routes that have an empty route_type (non public services)
+  # in addition to all the previous filtering - ECS moves were still making it into the output GTFS file, this removes them
+  if (public_only)
   {
     joinedTrips <- merge(gtfs$trips, gtfs$routes, by = "route_id", all.x = TRUE)
     joinedCalls <- merge(gtfs$stop_times, joinedTrips, by = "trip_id", all.x = TRUE)
 
-    filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type ), ]
-    gtfs$stop_times <- filteredCalls[, names( gtfs$stop_times )]
-
-    joinedCalendar <- merge(gtfs$calendar, joinedTrips, by = "service_id", all.x = TRUE)
-    filteredCalendar <- joinedCalendar[ !is.na( joinedCalendar$route_type ), ]
-    gtfs$calendar <- filteredCalendar[, names( gtfs$calendar )]
-
-    joinedCalendarDates <- merge(gtfs$calendar_dates, joinedTrips, by = "service_id", all.x = TRUE)
-    filteredCalendarDates <- joinedCalendarDates[ !is.na( joinedCalendarDates$route_type ), ]
-    gtfs$calendar_dates <- filteredCalendarDates[, names( gtfs$calendar_dates )]
-
-    filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ), ]
-    gtfs$trips <- filteredTrips[, names( gtfs$trips )]
 
-    gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ), ]
+    if ("train_category" %in% names(joinedCalls) )
+    {
+      filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type) &
+                        joinedCalls$train_category %in% c("OL", "OU", "OO", "OW", "XC", "XD", "XI",
+                                                          "XR", "XU", "XX", "XZ", "BR", "BS", "SS" ), ]
+    }
+    else
+    {
+      filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type), ]
+    }
+
+    gtfs$stop_times <- filteredCalls[, names( gtfs$stop_times ), with=FALSE]
+
+    #after merging GTFS files we may have compressed the calendar and calendar_dates so a service pattern is used by
+    #multiple trips - so don't remove calendar and calendar_dates that link to routes with NA route_type in case
+    #it's in use by multiple trips/routes
+
+    if ("train_category" %in% names(joinedTrips) )
+    {
+      filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ) &
+                        joinedTrips$train_category %in% c("OL", "OU", "OO", "OW", "XC", "XD", "XI",
+                                                          "XR", "XU", "XX", "XZ", "BR", "BS", "SS" ), ]
+    }
+    else
+    {
+      filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ), ]
+    }
+
+    gtfs$trips <- 
filteredTrips[, names( gtfs$trips ), with=FALSE] + + if ("train_category" %in% names(gtfs$routes) ) + { + gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ) & + gtfs$routes$train_category %in% c("OL", "OU", "OO", "OW", "XC", "XD", "XI", + "XR", "XU", "XX", "XZ", "BR", "BS", "SS" ), ] + } + else + { + gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ), ] + } } return(gtfs) diff --git a/R/gtfs_read.R b/R/gtfs_read.R index ead852c..a5894e2 100644 --- a/R/gtfs_read.R +++ b/R/gtfs_read.R @@ -18,74 +18,112 @@ gtfs_read <- function(path){ gtfs <- list() if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){ - gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"), - col_types = readr::cols(agency_id = readr::col_character(), - agency_noc = readr::col_character()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$agency <- fread( + file.path(tmp_folder, "agency.txt"), + colClasses = c( + agency_id = "character", + agency_noc = "character" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: agency.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){ - gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"), - col_types = readr::cols(stop_id = readr::col_character(), - stop_code = readr::col_character(), - stop_name = readr::col_character(), - stop_lat = readr::col_number(), - stop_lon = readr::col_number(), - wheelchair_boarding = readr::col_integer(), #enum value 2 is valid but rarely seen outside the spec document - location_type = readr::col_integer(), - parent_station = readr::col_character(), - platform_code = readr::col_character()), - - - lazy = FALSE, show_col_types = FALSE) + + gtfs$stops <- fread( + file.path(tmp_folder, "stops.txt"), + colClasses = c( + stop_id = "character", + stop_code = "character", + stop_name = "character", + stop_lat = "numeric", + stop_lon = "numeric", + wheelchair_boarding = "integer", #enum value 2 is valid but rarely seen outside the spec document + location_type = "integer", + parent_station = "character", + platform_code = "character" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: stops.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"routes.txt"))){ - gtfs$routes <- readr::read_csv(file.path(tmp_folder,"routes.txt"), - col_types = readr::cols(route_id = readr::col_character(), - agency_id = readr::col_character(), - route_short_name = readr::col_character(), - route_long_name = readr::col_character(), - route_type = readr::col_integer()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$routes <- fread( + file.path(tmp_folder, "routes.txt"), + colClasses = c( + route_id = "character", + agency_id = "character", + route_short_name = "character", + route_long_name = "character", + route_type = "integer" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: routes.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"trips.txt"))){ - gtfs$trips <- readr::read_csv(file.path(tmp_folder,"trips.txt"), - col_types = readr::cols(trip_id = readr::col_character(), - route_id = readr::col_character(), - service_id = readr::col_character(), - block_id = readr::col_character(), - shape_id = readr::col_character(), - wheelchair_accessible = readr::col_integer() #enum value 2 is valid but rarely seen outside the spec document - ), - show_col_types 
= FALSE, - lazy = FALSE) + + gtfs$trips <- fread( + file.path(tmp_folder, "trips.txt"), + colClasses = c( + trip_id = "character", + route_id = "character", + service_id = "character", + block_id = "character", + shape_id = "character", + wheelchair_accessible = "integer" #enum value 2 is valid but rarely seen outside the spec document + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: trips.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"stop_times.txt"))){ - gtfs$stop_times <- readr::read_csv(file.path(tmp_folder,"stop_times.txt"), - col_types = readr::cols(trip_id = readr::col_character(), - stop_id = readr::col_character(), - stop_sequence = readr::col_integer(), - departure_time = readr::col_character(), - arrival_time = readr::col_character(), - shape_dist_traveled = readr::col_number(), - timepoint = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - pickup_type = readr::col_integer(), - drop_off_type = readr::col_integer()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$stop_times <- fread( + file.path(tmp_folder, "stop_times.txt"), + colClasses = c( + trip_id = "character", + stop_id = "character", + stop_sequence = "integer", + departure_time = "character", + arrival_time = "character", + shape_dist_traveled = "numeric", + timepoint = "integer", + pickup_type = "integer", + drop_off_type = "integer" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + gtfs$stop_times$arrival_time <- lubridate::hms(gtfs$stop_times$arrival_time) gtfs$stop_times$departure_time <- lubridate::hms(gtfs$stop_times$departure_time) @@ -94,44 +132,71 @@ gtfs_read <- function(path){ } if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){ - gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"), - col_types = readr::cols(service_id = readr::col_character(), - monday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - tuesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - wednesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - thursday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - friday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - saturday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - sunday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file - start_date = readr::col_date(format = "%Y%m%d"), - end_date = readr::col_date(format = "%Y%m%d")), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$calendar <- fread( + file.path(tmp_folder, "calendar.txt"), + colClasses = c( + service_id = "character", + monday = "integer", + tuesday = "integer", + wednesday = "integer", + thursday = "integer", + friday = "integer", + saturday = "integer", + sunday = "integer", + start_date = "character", + end_date = "character" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + + gtfs$calendar[, start_date := as.IDate(start_date, "%Y%m%d")] + gtfs$calendar[, end_date := as.IDate(end_date, "%Y%m%d")] } else { message("Unable to find conditionally required file: calendar.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"calendar_dates.txt"))){ - gtfs$calendar_dates <- readr::read_csv(file.path(tmp_folder,"calendar_dates.txt"), - col_types = readr::cols(service_id = readr::col_character(), - date = 
readr::col_date(format = "%Y%m%d"), - exception_type = readr::col_integer()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$calendar_dates <- fread( + file.path(tmp_folder, "calendar_dates.txt"), + colClasses = c( + service_id = "character", + date = "character", + exception_type = "integer" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + gtfs$calendar_dates[, date := as.IDate(date, "%Y%m%d")] + } else { message("Unable to find conditionally required file: calendar_dates.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){ - gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"), - col_types = readr::cols(shape_id = readr::col_character(), - shape_pt_lat = readr::col_number(), - shape_pt_lon = readr::col_number(), - shape_pt_sequence = readr::col_integer(), - shape_dist_traveled = readr::col_number()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$shapes <- data.table::fread( + file.path(tmp_folder, "shapes.txt"), + colClasses = c( + shape_id = "character", + shape_pt_lat = "numeric", + shape_pt_lon = "numeric", + shape_pt_sequence = "integer", + shape_dist_traveled = "numeric" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } @@ -141,9 +206,14 @@ gtfs_read <- function(path){ for (fileName in notLoadedFiles) { - table <- readr::read_csv(file.path( tmp_folder, paste0( fileName, ".txt" ) ), - show_col_types = FALSE, - lazy = FALSE) + table <- fread( + file.path(tmp_folder, paste0(fileName, ".txt")), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + gtfs[[fileName]] <- table } diff --git a/R/write_gtfs.R b/R/write_gtfs.R index b3ccb97..9be6d92 100644 --- a/R/write_gtfs.R +++ b/R/write_gtfs.R @@ -134,7 +134,7 @@ stripTabs <- function(df, stripNewline) { #' Convert Period to GTFS timestamps #' When writing a 400mb (zipped) file, we spend nearly 4 minutes in this fn(), about 10x longer than writing the files to the filesystem. #' profiler reports this being mostly nchar(), so we optimise down to one sprintf which reduces the time to 1 minute -#' .format() is about 7x slower than sprintf() +#' .format() is about 7x slower than sprintf() in this situation. 
Performance may be different on a data.table #' #' @param x periods #' @noRd From 77aa421117db3528177c5184913f9469bcbb99ba Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 29 Aug 2023 18:26:54 +0100 Subject: [PATCH 15/81] rebuild documentation --- UK2GTFS.Rproj | 1 + man/gtfs_clean.Rd | 12 +++++++++--- man/gtfs_force_valid.Rd | 8 ++++++-- man/gtfs_merge.Rd | 22 +++++++++++++++++----- man/gtfs_write.Rd | 2 +- man/importMCA.Rd | 7 +++++-- man/nr2gtfs.Rd | 10 ++++++++-- man/transxchange2gtfs.Rd | 7 +++++-- 8 files changed, 52 insertions(+), 17 deletions(-) diff --git a/UK2GTFS.Rproj b/UK2GTFS.Rproj index 497f8bf..270314b 100644 --- a/UK2GTFS.Rproj +++ b/UK2GTFS.Rproj @@ -18,3 +18,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/man/gtfs_clean.Rd b/man/gtfs_clean.Rd index 67c8d74..87d5ef0 100644 --- a/man/gtfs_clean.Rd +++ b/man/gtfs_clean.Rd @@ -4,10 +4,12 @@ \alias{gtfs_clean} \title{Clean simple errors from GTFS files} \usage{ -gtfs_clean(gtfs) +gtfs_clean(gtfs, public_only = FALSE) } \arguments{ \item{gtfs}{gtfs list} + +\item{public_only}{Logical, only return calls/services that are for public passenger pickup/set down (default FALSE)} } \description{ Clean simple errors from GTFS files @@ -15,7 +17,11 @@ Clean simple errors from GTFS files \details{ Task done: +0. Remove stops with no coordinates 1. Remove stops with no location information -2. Remove stops that are never used -3. Replace missing agency names with "MISSINGAGENCY" +2. Remove trips with less than two stops +3. Remove stops that are never used +4. Replace missing agency names with "MISSINGAGENCY" +5. If service is not public and public_only=TRUE then remove it (freight, 'trips' aka charters) + (these have a null route_type, so loading into OpenTripPlanner fails if these are present) } diff --git a/man/gtfs_force_valid.Rd b/man/gtfs_force_valid.Rd index 2a32e9d..e103b8c 100644 --- a/man/gtfs_force_valid.Rd +++ b/man/gtfs_force_valid.Rd @@ -15,6 +15,10 @@ Force a GTFS to be valid by removing problems \details{ Actions performed 1. Remove stops with missing location -2. Remove stops from stop_times that are not in stops -3. Remove trips from stop_times that are not in trips +2. Remove routes that don't exist in agency +3. Remove trips that don't exist in routes +4. Remove stop_times(calls) that don't exist in trips +5. Remove stop_times(calls) that don't exist in stops +6. Remove Calendar that have service_id that doesn't exist in trips +7. 
Remove Calendar_dates that have service_id that doesn't exist in trips
 }
diff --git a/man/gtfs_merge.Rd b/man/gtfs_merge.Rd
index 08fa680..3fe236d 100644
--- a/man/gtfs_merge.Rd
+++ b/man/gtfs_merge.Rd
@@ -4,17 +4,29 @@
 \alias{gtfs_merge}
 \title{merge a list of gtfs files}
 \usage{
-gtfs_merge(gtfs_list, force = FALSE, quiet = TRUE)
+gtfs_merge(
+  gtfs_list,
+  force = FALSE,
+  quiet = TRUE,
+  condenseServicePatterns = TRUE
+)
 }
 \arguments{
 \item{gtfs_list}{a list of gtfs objects to be merged}
 
-\item{force}{logical, if TRUE duplicated values are merged taking the fist}
-
-\item{quiet}{logical, if TRUE less messages
+\item{force}{logical, if TRUE duplicated values are merged taking the first
+instance to be the correct instance, in most cases this is ok, but may cause some errors}
+
+\item{quiet}{logical, if TRUE less messages}
+
+\item{condenseServicePatterns}{logical, if TRUE service patterns across all routes are condensed into a unique set of patterns}
 }
 \description{
-merge a list of gtfs files
+!WARNING! only the tables:
+agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies
+are processed, any other tables in the input timetables are passed through
+}
+\details{
+if duplicate IDs are detected then completely new IDs for all rows will be generated in the output.
 }
diff --git a/man/gtfs_write.Rd b/man/gtfs_write.Rd
index b76902e..aeb65f2 100644
--- a/man/gtfs_write.Rd
+++ b/man/gtfs_write.Rd
@@ -30,6 +30,6 @@ gtfs_write(
 \item{quote}{logical, should strings be quoted, default = FALSE, passed to data.table::fwrite}
 }
 \description{
-Takes a list of data frames represneting the GTFS fromat and saves them as GTFS
+Takes a list of data frames representing the GTFS format and saves them as GTFS
 Zip file.
 }
diff --git a/man/importMCA.Rd b/man/importMCA.Rd
index 4e80e56..0eef08f 100644
--- a/man/importMCA.Rd
+++ b/man/importMCA.Rd
@@ -9,7 +9,8 @@ importMCA(
   silent = TRUE,
   ncores = 1,
   full_import = FALSE,
-  working_timetable = FALSE
+  working_timetable = FALSE,
+  public_only = TRUE
 )
 }
 \arguments{
@@ -17,11 +18,13 @@ importMCA(
 
 \item{silent}{logical, should messages be displayed}
 
-\item{ncores}{number of cores to use when paralell processing}
+\item{ncores}{number of cores to use when parallel processing}
 
 \item{full_import}{import all data, default FALSE}
 
 \item{working_timetable}{use rail industry scheduling times instead of public times}
+
+\item{public_only}{only return calls that are for public passenger pick up/set down}
 }
 \description{
 Import the .mca file
diff --git a/man/nr2gtfs.Rd b/man/nr2gtfs.Rd
index c5a4882..820cd9d 100644
--- a/man/nr2gtfs.Rd
+++ b/man/nr2gtfs.Rd
@@ -10,7 +10,9 @@ nr2gtfs(
   ncores = 1,
   locations = tiplocs,
   agency = atoc_agency,
-  shapes = FALSE
+  shapes = FALSE,
+  working_timetable = FALSE,
+  public_only = TRUE
 )
 }
 \arguments{
@@ -26,6 +28,10 @@ nr2gtfs(
 \item{agency}{where to get agency.txt (see details)}
 
 \item{shapes}{Logical, should shapes.txt be generated (default FALSE)}
+
+\item{working_timetable}{Logical, should WTT times be used instead of public times (default FALSE)}
+
+\item{public_only}{Logical, only return calls/services that are for public passenger pickup/set down (default TRUE)}
 }
 \value{
 A gtfs list
@@ -48,7 +54,7 @@ in the ATOC data or provide an SF data frame of your own.
 Agency
 
 The ATOC files do not contain the necessary information to build the
-agency.txt file. Therfore this data is provided with the package. You can
+agency.txt file. Therefore this data is provided with the package. You can

also pass your own data frame of agency information.
 }
 \seealso{
diff --git a/man/transxchange2gtfs.Rd b/man/transxchange2gtfs.Rd
index 56d8744..2309e65 100644
--- a/man/transxchange2gtfs.Rd
+++ b/man/transxchange2gtfs.Rd
@@ -12,7 +12,8 @@ transxchange2gtfs(
   naptan = get_naptan(),
   scotland = "auto",
   try_mode = TRUE,
-  force_merge = FALSE
+  force_merge = FALSE,
+  merge = TRUE
 )
 }
 \arguments{
@@ -27,7 +28,7 @@ transxchange2gtfs(
 \item{naptan}{Naptan stop locations from get_naptan()}
 
 \item{scotland}{character, should Scottish bank holidays be used? Can be
-"auto" (defualt), "yes", "no". If "auto" and path_in ends with "S.zip"
+"auto" (default), "yes", "no". If "auto" and path_in ends with "S.zip"
 Scottish bank holidays will be used, otherwise England and Wales bank
 holidays are used.}
 
@@ -36,6 +37,8 @@ calls thus a failure on a single file will not cause the whole process to
 fail. Warning this could result in a GTFS file with missing routes.}
 
 \item{force_merge}{Logical, passed to gtfs_merge(force), default FALSE}
+
+\item{merge}{Logical, if results are merged into one GTFS object by calling gtfs_merge, default TRUE}
 }
 \value{
 A GTFS named list
From 1c5558450a9e19915aa597d0e1c1242b9d77a0e8 Mon Sep 17 00:00:00 2001
From: oweno-tfwm
Date: Tue, 29 Aug 2023 19:48:01 +0100
Subject: [PATCH 16/81] workaround roxygen bug

---
 NAMESPACE        |  4 ----
 R/atoc_import.R  |  3 ++-
 man/importALF.Rd | 17 -----------------
 3 files changed, 2 insertions(+), 22 deletions(-)
 delete mode 100644 man/importALF.Rd

diff --git a/NAMESPACE b/NAMESPACE
index d1b340c..0d00103 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -33,7 +33,3 @@ export(transxchange2gtfs)
 export(transxchange_import)
 import(data.table)
 importFrom(data.table,":=")
-importFrom(data.table,.alf)
-importFrom(data.table,Import)
-importFrom(data.table,file)
-importFrom(data.table,the)
diff --git a/R/atoc_import.R b/R/atoc_import.R
index a1005e0..851c8bc 100644
--- a/R/atoc_import.R
+++ b/R/atoc_import.R
@@ -1,7 +1,8 @@
 #' @import data.table
 #' @importFrom data.table ":="
 
-#' Import the .alf file
+
+# Import the .alf file
 #'
 #' @details
 #' Imports the .alf file and returns data.frame
diff --git a/man/importALF.Rd b/man/importALF.Rd
deleted file mode 100644
index e43a260..0000000
--- a/man/importALF.Rd
+++ /dev/null
@@ -1,17 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/atoc_import.R
-\name{importALF}
-\alias{importALF}
-\title{Import the .alf file}
-\usage{
-importALF(file)
-}
-\arguments{
-\item{file}{Path to .alf file}
-}
-\description{
-Import the .alf file
-}
-\details{
-Imports the .alf file and returns data.frame
-}
From 3078c815fa517b74f919f4f432f530e3fd889582 Mon Sep 17 00:00:00 2001
From: oweno-tfwm
Date: Tue, 29 Aug 2023 19:48:25 +0100
Subject: [PATCH 17/81] add unit test for overlay code

---
 tests/testthat/test_atoc.R | 123 +++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R
index fdbc787..564cc94 100644
--- a/tests/testthat/test_atoc.R
+++ b/tests/testthat/test_atoc.R
@@ -1,10 +1,131 @@
+context("Running unit tests before system tests")
+
+
+fixDates <- function( df )
+{
+  df$start_date <- as.Date(df$start_date, format = "%d-%m-%Y")
+  df$end_date <- as.Date(df$end_date, format = "%d-%m-%Y")
+  df$duration <- df$end_date - df$start_date + 1
+
+  return (df)
+}
+
+test_that("test makeCalendar.inner:1", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "08-01-2023", 
"01-03-2023", "11-01-2023", "08-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0011000", "0011000", "1000000" ), + STP=c( "P", "P", "P", "O", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixDates( testData ) + + res <- makeCalendar.inner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] + + #this is what the code produces - it is wrong.not applying the overlay correctly + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 a2", "uid1 b2", "uid1 a3", "uid1 b3", "uid1 a4"), + start_date=c("02-01-2023", "24-01-2023", "08-01-2023", "24-01-2023", "01-03-2023", "10-03-2023", "11-01-2023"), + end_date=c( "22-01-2023", "04-02-2023", "22-01-2023", "05-02-2023", "07-03-2023", "31-03-2023", "19-01-2023"), + Days=c( "1111110", "1111110", "0000001", "0000001", "0011100", "0011100", "0011000"), + STP=c( "P", "P", "P", "P", "P", "P", "O"), + rowID=c( 1, 1, 2, 2, 3, 3, 4)) + + expectedResult <- fixDates( expectedResult ) + + expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) +}) + + + +test_that("test makeCalendar.inner:2", { +browser() + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "16-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0011000", "0011000", "1000000" ), + STP=c( "P", "P", "P", "O", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixDates( testData ) + + res <- makeCalendar.inner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] + + #this is what the code produces - it is wrong. e.g 10/3 service is missing + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 a2", "uid1 b2", "uid1 a3", "uid1 b3", "uid1 a4"), + start_date=c("02-01-2023", "24-01-2023", "08-01-2023", "24-01-2023", "01-03-2023", "17-03-2023", "11-01-2023"), + end_date=c( "22-01-2023", "04-02-2023", "22-01-2023", "05-02-2023", "07-03-2023", "31-03-2023", "19-01-2023"), + Days=c( "1111110", "1111110", "0000001", "0000001", "0011100", "0011100", "0011000"), + STP=c( "P", "P", "P", "P", "P", "P", "O"), + rowID=c( 1, 1, 2, 2, 3, 3, 4)) + + expectedResult <- fixDates( expectedResult ) + + expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) +}) + + + +test_that("test makeCalendar.inner:3", { + + browser() + #by convention Sunday service timetables are Sunday only + #the 'from' date should be the first day the timetable has effect (i.e. 
should have a 1 in the relevant day column) + #(and I assume the same is true of the last) + + #mon-sat timetable + #sun different operating hours on sunday + #engineering works means having to berth in a different platform for a couple of weeks wed-fri + #cancel mondays for 2 weeks + #cancel sundays for 2 weeks + #mon-sat sun -march- wed-fri platform cancel mon cancel sun + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-01-2023", "15-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "27-01-2023", "16-01-2023", "22-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0011100", "1000000", "0000001" ), + STP=c( "P", "P", "P", "O", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixDates( testData ) + + res <- makeCalendar.inner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] + + expectedResult = data.table(UID=c( "uid1 a1", "uid1 a2", "uid1 a3", "uid1 b3", "uid1 c3", "uid1 a4"), + start_date=c("02-01-2023", "01-01-2023", "01-03-2023", "10-03-2023", "21-03-2023", "07-01-2023"), + end_date=c( "08-01-2023", "31-01-2023", "06-03-2023", "19-03-2023", "31-03-2023", "14-01-2023"), + Days=c( "1111100", "0000011", "0011100", "0011100", "0011100", "0011000"), + STP=c( "P", "P", "P", "P", "P", "O"), + rowID=c( 1, 2, 3, 3, 3, 4)) + + expectedResult <- fixDates( expectedResult ) + + expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) +}) + + + + context("Get the example atoc files") file_path <- file.path(tempdir(),"uk2gtfs_tests") dir.create(file_path) data_path <- file.path(tempdir(),"uk2gtfs_data") dir.create(data_path) + + test_that("test atoc data is there", { expect_true(dl_example_file(data_path, "atoc")) expect_true(file.exists(file.path(data_path, "atoc.zip"))) @@ -20,6 +141,8 @@ test_that("test atoc2gtfs singlecore", { }) + + context("Test the main atoc function, with different settings") test_that("test atoc2gtfs singlecore", { From d0218c7df35cadbd55337a87d07618d8bdf17a81 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 29 Aug 2023 20:49:51 +0100 Subject: [PATCH 18/81] add unit test - the test currently passes - but the results look all wrong, overlays not being applied correctly --- tests/testthat/test_atoc.R | 90 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R index fdbc787..740183a 100644 --- a/tests/testthat/test_atoc.R +++ b/tests/testthat/test_atoc.R @@ -1,3 +1,93 @@ +context("Running unit tests before system tests") + +library(data.table) + +fixDates <- function( df ) +{ + df$start_date <- as.Date(df$start_date, format = "%d-%m-%Y") + df$end_date <- as.Date(df$end_date, format = "%d-%m-%Y") + df$duration <- df$end_date - df$start_date + 1 + + return (df) +} + +test_that("test makeCalendar.inner:1", { + + testData = data.frame(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0011000", "0011000", "1000000" ), + STP=c( "P", "P", "P", "O", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixDates( testData ) + + res <- makeCalendar.inner( testData ) + + res.calendar <- res[[1]] 
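
+  # element 2 of the makeCalendar.inner result holds any one-day cancellations destined for calendar_dates (NA when none)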
+ res.calendar_dates <- res[[2]] + res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] + + #this is what the code produces - it is wrong. not applying the overlay correctly + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 a2", "uid1 b2", "uid1 a3", "uid1 b3", "uid1 a4"), + start_date=c("02-01-2023", "23-01-2023", "08-01-2023", "23-01-2023", "01-03-2023", "09-03-2023", "11-01-2023"), + end_date=c( "22-01-2023", "04-02-2023", "22-01-2023", "05-02-2023", "08-03-2023", "31-03-2023", "19-01-2023"), + Days=c( "1111110", "1111110", "0000001", "0000001", "0011100", "0011100", "0011000"), + STP=c( "P", "P", "P", "P", "P", "P", "O"), + rowID=c( 1, 1, 2, 2, 3, 3, 4)) + + expectedResult <- fixDates( expectedResult ) + + if (!identical(expectedResult,res.calendar)) + { + comparison <- sapply(1:nrow(expectedResult), function(i) all.equal(expectedResult[i, ], res.calendar[i, ])) + print(comparison) + } + + expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) +}) + + + +test_that("test makeCalendar.inner:2", { + + testData = data.frame(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "16-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0011000", "0011000", "1000000" ), + STP=c( "P", "P", "P", "O", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixDates( testData ) + + res <- makeCalendar.inner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] + + #this is what the code produces - it is wrong. e.g 10/3 service is missing + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 a2", "uid1 b2", "uid1 a3", "uid1 b3", "uid1 a4"), + start_date=c("02-01-2023", "23-01-2023", "08-01-2023", "23-01-2023", "01-03-2023", "16-03-2023", "11-01-2023"), + end_date=c( "22-01-2023", "04-02-2023", "22-01-2023", "05-02-2023", "08-03-2023", "31-03-2023", "19-01-2023"), + Days=c( "1111110", "1111110", "0000001", "0000001", "0011100", "0011100", "0011000"), + STP=c( "P", "P", "P", "P", "P", "P", "O"), + rowID=c( 1, 1, 2, 2, 3, 3, 4)) + + expectedResult <- fixDates( expectedResult ) + + if (!identical(expectedResult,res.calendar)) + { + comparison <- sapply(1:nrow(expectedResult), function(i) all.equal(expectedResult[i, ], res.calendar[i, ])) + print(comparison) + } + + expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) +}) + + + + context("Get the example atoc files") file_path <- file.path(tempdir(),"uk2gtfs_tests") From 618ffa671fe6284b614abe50890fbb75ba682de2 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 22:29:37 +0100 Subject: [PATCH 19/81] performance improvement - do evaluation outside data.table then insert afterwards --- R/atoc_import.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/atoc_import.R b/R/atoc_import.R index 851c8bc..a566415 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -295,7 +295,7 @@ strip_whitespace <- function(dt) { - +#TODO update this to handle seconds instead of just truncating them (public TT is to nearest minute, WTT more accurate) process_times <- function(dt, working_timetable) { if (working_timetable) { if ("Scheduled Arrival Time" %in% colnames(dt)) { @@ -322,7 +322,8 @@ process_times <- function(dt, working_timetable) { # Process Activity 
Codes process_activity <- function(dt, public_only) { - dt[, Activity := strsplit(Activity, "(?<=.{2})", perl=TRUE)] + #performance, runs about twice as fast if we do processing outside data.table then insert it later + splitActivity = stringi::stri_extract_all_regex(dt$Activity, ".{2}") if (public_only) { # Filter to stops for passengers @@ -362,7 +363,8 @@ process_activity <- function(dt, public_only) { } } - dt[, Activity := lapply(Activity, clean_activity3)] + + dt$Activity = lapply(splitActivity, clean_activity3) dt <- dt[Activity != "Other"] From 8b6990d6883b4baa8d111f780c92759363b30bac Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 22:30:53 +0100 Subject: [PATCH 20/81] make code more compact so more of function can be seen on one screen --- R/atoc_nr.R | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/R/atoc_nr.R b/R/atoc_nr.R index 213ad34..bd50d98 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -50,11 +50,9 @@ nr2gtfs <- function(path_in, checkmate::assert_logical(shapes) if (ncores == 1) { - message(paste0( - Sys.time(), - " This will take some time, make sure you use 'ncores' to enable multi-core processing" - )) + message(paste0(Sys.time(), " This will take some time, make sure you use 'ncores' to enable multi-core processing")) } + # Is input a zip or a folder if (!grepl(".gz", path_in)) { stop("path_in is not a .gz file") @@ -75,14 +73,8 @@ nr2gtfs <- function(path_in, # load("data/tiplocs.RData") stops <- cbind(locations, sf::st_coordinates(locations)) stops <- as.data.frame(stops) - stops <- stops[, c( - "stop_id", "stop_code", "stop_name", - "Y", "X" - )] - names(stops) <- c( - "stop_id", "stop_code", "stop_name", - "stop_lat", "stop_lon" - ) + stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )] + names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" ) stops$stop_lat <- round(stops$stop_lat, 5) stops$stop_lon <- round(stops$stop_lon, 5) } else { @@ -97,15 +89,9 @@ nr2gtfs <- function(path_in, # rm(alf, flf, mca, msn) stop_times <- stop_times[, c( - "Arrival Time", - "Departure Time", - "Location", "stop_sequence", - "Activity", "rowID", "schedule" - )] + "Arrival Time", "Departure Time", "Location", "stop_sequence", "Activity", "rowID", "schedule")] names(stop_times) <- c( - "arrival_time", "departure_time", "stop_id", - "stop_sequence", "Activity", "rowID", "schedule" - ) + "arrival_time", "departure_time", "stop_id", "stop_sequence", "Activity", "rowID", "schedule") # remove any unused stops stops <- stops[stops$stop_id %in% stop_times$stop_id, ] From 14c2d456c9c80d4e2781dd07f4ebb1c8c5b004ab Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 22:33:10 +0100 Subject: [PATCH 21/81] move formatting to helper function which generically processes all tables, change default options to highest performing write combination --- R/write_gtfs.R | 116 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 31 deletions(-) diff --git a/R/write_gtfs.R b/R/write_gtfs.R index 9be6d92..c4bb114 100644 --- a/R/write_gtfs.R +++ b/R/write_gtfs.R @@ -15,10 +15,10 @@ gtfs_write <- function(gtfs, folder = getwd(), name = "gtfs", - stripComma = TRUE, - stripTab = TRUE, - stripNewline = TRUE, - quote = FALSE) { + stripComma = FALSE, + stripTab = FALSE, + stripNewline = FALSE, + quote = TRUE) { if (stripComma) { for (i in seq_len(length(gtfs))) { @@ -33,36 +33,26 @@ gtfs_write <- function(gtfs, } - #Format Dates - if(inherits(gtfs$calendar$start_date, 
"Date")){ - gtfs$calendar$start_date <- format(gtfs$calendar$start_date, "%Y%m%d") - } - - if(inherits(gtfs$calendar$end_date, "Date")){ - gtfs$calendar$end_date <- format(gtfs$calendar$end_date, "%Y%m%d") - } - - if(inherits(gtfs$calendar_dates$date, "Date")){ - gtfs$calendar_dates$date <- format(gtfs$calendar_dates$date, "%Y%m%d") - } - - #Format times - if(inherits(gtfs$stop_times$arrival_time, "Period")){ - gtfs$stop_times$arrival_time <- period2gtfs(gtfs$stop_times$arrival_time) - } - - if(inherits(gtfs$stop_times$departure_time, "Period")){ - gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time) - } - - if("frequencies" %in% names(gtfs)) + if (FALSE) { - if("difftime" %in% class(gtfs$frequencies$start_time)){ - gtfs$frequencies$start_time <- format(gtfs$frequencies$start_time, format = "%H:%M:%S") + #Format times + if(inherits(gtfs$stop_times$arrival_time, "Period")){ + gtfs$stop_times$arrival_time <- period2gtfs(gtfs$stop_times$arrival_time) + } + + if(inherits(gtfs$stop_times$departure_time, "Period")){ + gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time) } - if("difftime" %in% class(gtfs$frequencies$end_time)){ - gtfs$frequencies$end_time <- format(gtfs$frequencies$end_time, format = "%H:%M:%S") + if("frequencies" %in% names(gtfs)) + { + if("difftime" %in% class(gtfs$frequencies$start_time)){ + gtfs$frequencies$start_time <- format(gtfs$frequencies$start_time, format = "%H:%M:%S") + } + + if("difftime" %in% class(gtfs$frequencies$end_time)){ + gtfs$frequencies$end_time <- format(gtfs$frequencies$end_time, format = "%H:%M:%S") + } } } @@ -72,6 +62,8 @@ gtfs_write <- function(gtfs, { table <- gtfs[[tableName]] + table <- formatAttributesToGtfsSchema( table ) + if ( !is.null(table) & nrow(table) > 0 ) { data.table::fwrite(table, file.path(tempdir(), "gtfs_temp", paste0(tableName, ".txt")), row.names = FALSE, quote = quote) @@ -150,3 +142,65 @@ period2gtfs <- function(x) { return( sprintf("%02d:%02d:%02d", lubridate::hour(x), lubridate::minute(x), lubridate::second(x)) ) } + +formatAttributesToGtfsSchema <- function(dt) +{ + + { + periodColumnsToFormat <- names(dt)[ sapply(dt, function(x){ inherits(x, "Period") }) ] + + if (length(periodColumnsToFormat) > 0) + { + dt[, (periodColumnsToFormat) := lapply(.SD, period2gtfs), .SDcols = periodColumnsToFormat] + } + } + + { + diffTimeColumnsToFormat <- names(dt)[ sapply(dt, function(x){ "difftime" %in% class( x ) }) ] + + if (length(diffTimeColumnsToFormat) > 0) + { + dt[, (diffTimeColumnsToFormat) := lapply(.SD, format, format = "%H:%M:%S"), .SDcols = diffTimeColumnsToFormat] + } + } + + { + dateColumnsToFormat <- names(dt)[ sapply(dt, function(x){ inherits(x, "Date") }) ] + + if (length(dateColumnsToFormat) > 0) + { + dt[, (dateColumnsToFormat) := lapply(.SD, + function(d){ gsub("-", "", as.character(d) ) } ), + .SDcols = dateColumnsToFormat] + } + } + + { + factorColumnsToFormat <- names(dt)[ sapply(dt, function(x){ is.factor( x ) }) ] + + if (length(factorColumnsToFormat) > 0) + { + dt[, (factorColumnsToFormat) := lapply(.SD, as.character), .SDcols = factorColumnsToFormat] + } + } + + + { + logicalColumnsToFormat <- names(dt)[ sapply(dt, function(x){ is.logical( x ) }) ] + + if (length(logicalColumnsToFormat) > 0) + { + dt[, (logicalColumnsToFormat) := lapply(.SD, as.integer), .SDcols = logicalColumnsToFormat] + } + } + + return (dt) +} + + + + + + + + From 79a175e014e6fe060d97e0a70c377d6d9c58e89c Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 22:35:06 +0100 Subject: 
[PATCH 22/81] major overhaul to fix issues in timetable overlay rule
 processing - has also been extensively profiled and a reasonable level of
 optimisation applied

---
 R/atoc_export.R | 1307 +++++++++++++++++++++++++++++++++++++++--------
 R/atoc_main.R   |   76 +--
 2 files changed, 1122 insertions(+), 261 deletions(-)

diff --git a/R/atoc_export.R b/R/atoc_export.R
index 65f00d3..ef9008a 100644
--- a/R/atoc_export.R
+++ b/R/atoc_export.R
@@ -132,119 +132,233 @@ station2transfers <- function(station, flf, path_out) {
   return(transfers)
 }
+
+
+NOT_NEEDED <- c("__NOT_NEEDED_MARKER__~@$$%&*((")
+
+
+#this function is massively performance critical - profile any changes to it. It makes up 30% of the whole makeCalendar process
+selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex)
+{
+  #if we have two adjacent complete items e.g. ....end 13th Jan, start 14th Jan.....
+  #then it's not a real gap, just an artefact of the algorithm used to generate the dates
+  if( rowIndex>1 && rowIndex<nrow(calNew)
+      && !is.na(calNew$UID[rowIndex-1]) && !is.na(calNew$UID[rowIndex+1]) )
+  {
+    calNew$UID[rowIndex] <- NOT_NEEDED
+    return (calNew)
+  }
+
+  #find the base timetable(s) whose date range completely spans the gap being filled
+  baseTimetableIndexes = cal[ start_date<=calNew$start_date[rowIndex] & end_date>= calNew$end_date[rowIndex],,which=TRUE]
+
+  #are we in a gap between two base timetables with no overlays
+  if ( length(baseTimetableIndexes)<=0 )
+  {
+    calNew$UID[rowIndex] <- NOT_NEEDED
+    return (calNew)
+  }
+
+
+  # apply timetable overlay selection logic - pick highest priority timetable type
+  # as per https://wiki.openraildata.com/index.php/SCHEDULE
+  # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'."
+
+  #pick the lowest alphabetic STP (highest priority), and just in case there is more than one, the shortest duration one.
+
+  #priorityTimetable <- baseTimetables[order(STP, duration), head(.SD, 1)]
+  #performance: we pre-sort all the entries by the priority & duration
+  #this speeds things up when we look up the required priority overlay **SEE_NOTE**
+  #so we don't need to sort again here, just pick the top filtered result
+
+  #stash the generated start & end dates
+  #performance - copying to separate variables seems to be fastest
+  start_date = calNew$start_date[rowIndex]
+  end_date = calNew$end_date[rowIndex]
+
+  calNew[rowIndex,] <- cal[baseTimetableIndexes[1],] #this is the most time consuming line in this fn. takes about 10x longer than the
+                                                     #single variable copy below
+  calNew$start_date[rowIndex] = start_date
+  calNew$end_date[rowIndex] = end_date
+
+  return (calNew)
+}
+
+
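+# An illustrative sketch of the STP priority rule above (hypothetical values,
+# not part of the pipeline): given one row per timetable layer of a service,
+#   x <- data.table(STP = c("P","O","C"), duration = c(90, 5, 1))
+#   setkey(x, STP, duration)
+# "C" sorts ahead of "O" and "P", so x[1] is always the lowest-alphabetical
+# (highest priority) layer, with the shortest duration as the tie-break.
+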
+#' #' this function is performance critical - profile any changes #' -#' @param cal cal object +#' THIS ONLY WORKS ON ITEMS WHERE THE DAY PATTERNS ARE ALL THE SAME +#' (or are only 1 day DURATION) +#' +#' @param cal calendar object #' @details split overlapping start and end dates #' @noRd splitDates <- function(cal) { - # get a vector of all the start and end dates together from all base & overlay timetables + # get a vector of all the start and end dates together from all base & overlay timetables and sort them dates <- c(cal$start_date, cal$end_date) dates <- dates[order(dates)] - # create all unique pairs + + # create all unique pairs so we know how to chop the dates up into non-overlapping periods dates.dt <- unique( data.table( start_date = dates[seq(1, length(dates) - 1)], end_date = dates[seq(2, length(dates))] ) ) - cal.new <- cal[dates.dt, on = c("start_date", "end_date")] + #left join back to the source data so we can see which (if any) date segments we have already covered, and which we need to replicate + calNew <- cal[dates.dt, on = c("start_date", "end_date")] - if ("P" %in% cal$STP) { - match <- "P" - } else { - match <- cal$STP[cal$STP != "C"] - match <- match[1] - } + #some dates may already be overlapping + calNew <- fixOverlappingDates( calNew ) - # fill in the original missing schedule - for (j in seq(1, nrow(cal.new))) { - if (is.na(cal.new$UID[j])) { + # fill in the missing schedule parts from the original + # the filled in parts should (if the data is correctly layered) be the highest priority part of the timetable + # we make multiple passes over the timetable working our way outwards from completed items to NA items - matches = (cal$STP == match - & cal$start_date <= cal.new$start_date[j] - & cal$end_date >= cal.new$end_date[j]) + rowCount = nrow(calNew) - sumM = sum(matches) + for (i in seq(1,10)) #should really be a max of 3 passes + { + #forwards + for (j in seq(1, rowCount)) { - if (sumM == 1) { + #if we are not valid & the next item is already valid, fill in our details and adjust our end date + if (j 1) { - message("Going From") - print(cal) - message("To") - print(cal.new) - stop() - # readline(prompt="Press [enter] to continue")print() + #if previous item valid adjust our start date + if(j>1 && !is.na(calNew$UID[j-1]) && NOT_NEEDED != calNew$UID[j-1] ) + { + calNew$start_date[j] <- calNew$end_date[j-1] +1 + } } } - } - # remove any gaps - cal.new <- cal.new[!is.na(cal.new$UID), ] + #backwards + for (j in seq(rowCount, 1)) { - # remove duplicated rows - cal.new <- cal.new[!duplicated(cal.new), ] #this is expensive - is it needed ? + #if we are not valid & the previous item is already valid, fill in our details and adjust our start date + if (j>1 && is.na(calNew$UID[j]) && !is.na(calNew$UID[j-1]) ) + { + calNew <- selectOverlayTimeableAndCopyAttributes(cal, calNew, j) - # modify end and start dates on base timetable so they don't overlap the overlay dates. 
- for (j in seq(1, nrow(cal.new))) { - if (cal.new$STP[j] == "P") { - # check if end date need changing - if (j < nrow(cal.new)) { - if (cal.new$end_date[j] == cal.new$start_date[j + 1]) { - cal.new$end_date[j] <- (cal.new$end_date[j] - 1) + if ( NOT_NEEDED != calNew$UID[j-1]) + { + calNew$start_date[j] <- calNew$end_date[j-1] +1 } - } - # check if start date needs changing - if (j > 1) { - if (cal.new$start_date[j] == cal.new$end_date[j - 1]) { - cal.new$start_date[j] <- (cal.new$start_date[j] + 1) + + #if next item valid adjust our start date + if(j 0, ] +# calNew <- calNew[calNew$duration > 0, ] + + #performance, do all subsets in one go + #calNew <- calNew[!is.na(UID) & UID != NOT_NEEDED & STP != "C" & duration > 0] + calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0), ] # Append UID to note the changes - if (nrow(cal.new) > 0) { - if (nrow(cal.new) < 27) { - cal.new$UID <- paste0(cal.new$UID, " ", letters[1:nrow(cal.new)]) + if (nrow(calNew) > 0) { + if (nrow(calNew) <= 26) { + calNew$UID <- paste0(calNew$UID, " ", letters[1:nrow(calNew)]) } else { # Cases where we need extra letters, gives upto 676 ids lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) - cal.new$UID <- paste0(cal.new$UID, " ", lett[1:nrow(cal.new)]) + calNew$UID <- paste0(calNew$UID, " ", lett[1:nrow(calNew)]) } } else { - cal.new <- NA + calNew <- NA } + return(calNew) +} + + +# triggered by test case "10:test makeCalendarInner" +# when we have a 1 day overlay sitting on the start/end data of a base timetable +# the dates overlap - fix it +fixOverlappingDates <- function( cal ) +{ + rowCount = nrow(cal) + + #forwards + for (j in seq(1, rowCount)) { + + #adjust our end date if next item a higher priority overlay + if (j1 && !is.na(cal$UID[j-1]) && cal$STP[j-1] < cal$STP[j] ) + { + cal$start_date[j] <- cal$end_date[j-1] +1 + } + } + } + + #backwards + for (j in seq(rowCount, 1)) { + + #adjust our end date if previous item a higher priority overlay + if (j>1 && !is.na(cal$UID[j]) && !is.na(cal$UID[j-1]) ) + { + if ( cal$STP[j-1] < cal$STP[j] ) + { + cal$start_date[j] <- cal$end_date[j-1] +1 + } + + if(j= 7) + { + return (TRUE) + } - days.valid <- weekdays(seq.POSIXt( - from = as.POSIXct.Date( as.Date(tmp[START_DATE_INDEX], DATE_EPOC) ), - to = as.POSIXct.Date( as.Date(tmp[END_DATE_INDEX], DATE_EPOC) ), - by = "DSTday" - )) - days.valid <- tolower(days.valid) + days.valid <- weekdays(seq.POSIXt( + from = as.POSIXct.Date( as.Date(tmp[START_DATE_INDEX], DATE_EPOC) ), + to = as.POSIXct.Date( as.Date(tmp[END_DATE_INDEX], DATE_EPOC) ), + by = "DSTday" + )) + days.valid <- tolower(days.valid) - #get a vector of names of days of week that the timetable is valid on - days.match <- tmp[MONDAY_INDEX:SUNDAY_INDEX] - days.match <- WEEKDAY_NAME_VECTOR[ 1==days.match ] + #get a vector of names of days of week that the timetable is valid on + days.match <- tmp[MONDAY_INDEX:SUNDAY_INDEX] + days.match <- WEEKDAY_NAME_VECTOR[ 1==days.match ] - if (any(days.valid %in% days.match)) { - return(TRUE) - } else { - return(FALSE) + return (any(days.valid %in% days.match)) +} + + +checkOperatingDayActive <- function(calendar) { + + if (all(calendar$duration >= 7)) + { + return (calendar$Days!="0000000") + } + + #get a list of days of week that the timetable is valid on + opDays <- splitBitmaskMat( calendar$Days, asInteger=FALSE ) + opDays <- split(opDays, row(opDays)) + + checkValid <- function(dur, sd, ed, od ){ + + if (dur >= 7) + { + return (any(od)) } - } else { - return(TRUE) + + 
dayNumbers <- lubridate::wday( seq.Date(from = sd, to = ed, by = "day"), label = FALSE, week_start=1 )
+
+    return ( any(od[dayNumbers]) )
+  }
+
+  validCalendars <- mapply( checkValid, calendar$duration,
+                            calendar$start_date, calendar$end_date,
+                            opDays, SIMPLIFY = TRUE )
+  return (validCalendars)
+}
+
+
+
+
+
 #' internal function for constructing longnames of routes
 #'
 #' @details
 #' @noRd
 #'
 longnames <- function(routes, stop_times) {
+
   stop_times_sub <- dplyr::group_by(stop_times, trip_id)
   stop_times_sub <- dplyr::summarise(stop_times_sub,
     schedule = unique(schedule),
@@ -326,6 +471,122 @@ longnames <- function(routes, stop_times) {
   return(routes)
 }

+
+START_PATTERN_VECTOR = c("1","01","001","0001","00001","000001","0000001")
+END_PATTERN_VECTOR = c("1000000","100000","10000","1000","100","10","1")
+
+#calendars should start on the first day they are effective, and end on the last day.
+#i.e. if the first day in the day bitmask is Tuesday - then the start date should be Tuesday, not some other day.
+validateCalendarDates <- function( calendar )
+{
+  start_day_number = lubridate::wday( calendar$start_date, label = FALSE, week_start=1 )
+  end_day_number = lubridate::wday( calendar$end_date, label = FALSE, week_start=1 )
+
+  startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1, start_day_number)
+  endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7)
+
+  return (startOk & endOk)
+}
+
+
+
+
+#' split and rebind bitmask
+#'
+#' @details
+#' splits 'Days' bitmask into individual logical fields called monday, tuesday, etc...
+#'
+#' @param calendar data.table of calendar items
+#' @noRd
+#'
+splitAndRebindBitmask <- function( calendar )
+{
+  return (cbind( calendar, splitBitmaskDt( calendar$Days, FALSE ) ) )
+}
+
+#this function gets expensive if you call it a lot, creating data.table takes a while
+splitBitmaskDt <- function( bitmaskVector, asInteger=FALSE )
+{
+  return (as.data.table(splitBitmaskMat( bitmaskVector, asInteger=asInteger )))
+}
+
+splitBitmaskMat <- function( bitmaskVector, asInteger=FALSE )
+{
+  splitDays = splitBitmask( bitmaskVector, asInteger=asInteger )
+
+  return (matrix(splitDays, ncol=7, byrow=TRUE, dimnames=list(NULL,WEEKDAY_NAME_VECTOR)))
+}
+
+splitBitmask <- function( bitmask, asInteger=FALSE )
+{
+  duff = which( nchar(bitmask) != 7 )
+
+  bitmask[duff] = "       " #seven blanks, so malformed masks unpack to seven NAs
+
+  splitDays = strsplit(bitmask, "")
+
+  splitDays = as.integer(unlist(splitDays))
+
+  if (!asInteger)
+  {
+    splitDays = as.logical(splitDays)
+  }
+
+  return (splitDays)
+}
+
+
+
+#' allocate Cancellations Across Calendars
+#'
+#' @details
+#' expects input calendar items to have been separated out into non-overlapping dates
+#' and 'Days' bitmask unpacked into separate int or logical attributes
+#'
+#' "originalUID" is used to identify where the cancellations originally came from
+#' after allocating across the split calendar items the cancellations will have an updated
+#' "UID" that says which calendar they are now associated with
+#'
+#' @param calendar data.table of calendar items that are NOT cancellations (that has 'Days' bitmask unpacked )
+#' @param cancellations data.table of calendar items that ARE cancellations (that has 'Days' bitmask unpacked )
+#' @noRd
+#'
+allocateCancellationsAcrossCalendars <- function( calendar, cancellations )
+{
+  tempNames = names(calendar)
+
+  #stash some join fields away because we want to keep the data from the cancellations rather than the
calendar table + #which otherwise get over-written by the join process + cancellations$start_date2 <- cancellations$start_date + cancellations$end_date2 <- cancellations$end_date + cancellations$UID <- NULL #we want the new UID from the calendar entries + + #left join cancellations to calendar by the original service ID + #and the date of the cancellation lying inside the period of the calendar + #and the day of the cancellation is an operating day of the calendar item + joined = cancellations[calendar, on = .(originalUID==originalUID, + start_date>=start_date, + end_date<=end_date)][ + ((i.monday&monday) | (i.tuesday&tuesday) | (i.wednesday&wednesday) + | (i.thursday&thursday) | (i.friday&friday) | (i.saturday&saturday) | (i.sunday&sunday)), ] + #revert the stashed (join) fields + joined$start_date <- joined$start_date2 + joined$start_date2 <- NULL + joined$end_date <- joined$end_date2 + joined$end_date2 <- NULL + + #remove joined fields we don't need + joined <- joined[, .SD, .SDcols=tempNames] + + #belt and braces - fix any NA fields by reverting from the original UID + joined[is.na(UID), UID := originalUID] + + return( joined ) +} + + + + #' make calendar #' #' @details @@ -340,14 +601,33 @@ makeCalendar <- function(schedule, ncores = 1) { calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", "Days Run", "STP indicator", "rowID" )] names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", "rowID" ) - calendar$`STP indicator` <- as.character(calendar$`STP indicator`) + calendar$STP <- as.character(calendar$STP) calendar$duration <- calendar$end_date - calendar$start_date + 1 + if ( !all(validateCalendarDates( calendar ) ) ) + { + warning(paste0(Sys.time(), " Some calendar dates had incorrect start or end dates that did not align with operating day bitmask")) + #TODO be more verbose about which ones + } + + + #we're going to be splitting and replicating calendar entries - stash the original UID so we can join back on it later + calendar$originalUID <- calendar$UID + + #brutal, but makes code later on simpler, make all cancellations one day cancellations by splitting + #TODO don't split up into one day cancellations if all the operating day patterns on a service are identical + cancellations <- makeAllOneDay( calendar[calendar$STP == "C", ] ) + calendar <- calendar[calendar$STP != "C", ] + + #calendar$start_date = as.integer( calendar$start_date ) + #calendar$end_date = as.integer( calendar$end_date ) +#test treating date as int: seem to be about twice as fast on the critical line when selecting base timetable + # UIDs = unique(calendar$UID) # length_todo = length(UIDs) message(paste0(Sys.time(), " Constructing calendar and calendar_dates")) - calendar$UID2 <- calendar$UID - calendar_split <- calendar[, .(list(.SD)), by = UID2][,V1] + calendar$`__TEMP__` <- calendar$UID + calendar_split <- calendar[, .(list(.SD)), by = `__TEMP__`][,V1] if (ncores > 1) { cl <- parallel::makeCluster(ncores) @@ -356,8 +636,7 @@ makeCalendar <- function(schedule, ncores = 1) { }) pbapply::pboptions(use_lb = TRUE) res <- pbapply::pblapply(calendar_split, - # 1:length_todo, - makeCalendar.inner, + makeCalendarInner, cl = cl ) parallel::stopCluster(cl) @@ -365,17 +644,50 @@ makeCalendar <- function(schedule, ncores = 1) { } else { res <- pbapply::pblapply( calendar_split, - # 1:length_todo, - makeCalendar.inner + makeCalendarInner ) } res.calendar <- lapply(res, `[[`, 1) - res.calendar <- data.table::rbindlist(res.calendar, use.names=FALSE) #performance, was taking 10 minutes to 
execute bind_rows - res.calendar_dates <- lapply(res, `[[`, 2) - res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] - res.calendar_dates <- data.table::rbindlist(res.calendar_dates, use.names=FALSE) + res.calendar <- data.table::rbindlist(res.calendar, use.names=FALSE) #performance, takes 10 minutes to execute bind_rows on full GB daily timetable + + res.cancellation_dates <- lapply(res, `[[`, 2) + res.cancellation_dates <- res.cancellation_dates[!is.na(res.cancellation_dates)] + res.cancellation_dates <- data.table::rbindlist(res.cancellation_dates, use.names=FALSE) + stopifnot( 0==nrow(res.cancellation_dates) ) + rm(res.cancellation_dates) + #since we didn't pass in any cancellations we should no longer get any back + + res.calendar = splitAndRebindBitmask( res.calendar ) + cancellations = splitAndRebindBitmask( cancellations ) + + #associate the split up cancellations with the (new) calendar they are associated with + #(only works because cancellations are all one day duration) + cancellations = allocateCancellationsAcrossCalendars( res.calendar, cancellations ) + + #no longer need the field that was used to associate the original and replicated calendars together + cancellations$originalUID <- NULL + res.calendar$originalUID <- NULL + + return(list(res.calendar, cancellations)) + + + + + + + + + + if (FALSE) #don't think we need any of this code any more ? + { + message(paste0( + Sys.time(), + " Removing trips that only occur on days of the week that are outside the timetable validity period" + )) + + #unpack the days bitmask into a vector of int days <- lapply(res.calendar$Days, function(x) { as.integer(substring(x, 1:7, 1:7)) }) @@ -383,21 +695,18 @@ makeCalendar <- function(schedule, ncores = 1) { days <- as.data.frame(days) names(days) <- WEEKDAY_NAME_VECTOR + #attach unpacked bits back onto source calendar res.calendar <- cbind(res.calendar, days) res.calendar$Days <- NULL - message(paste0( - Sys.time(), - " Removing trips that only occur on days of the week that are outside the timetable validity period" - )) - #res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar))) - #performance - doing this split on 500k rows takes 60s - longer than the parallel execution below and consumes 3gb memory. res.calendar.days <- res.calendar[, ..CHECKROWS_NAME_VECTOR] res.calendar.days <- data.table::transpose(res.calendar.days) - #transpose on the same size runs in around 3s, but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer. - + #res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar))) + #transpose runs in around 3s (compared to 60s for split() on a data.frame), + #but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer. + #TODO see if data.table performs as well but with simpler code. 
if (ncores > 1) { cl <- parallel::makeCluster(ncores) @@ -414,176 +723,706 @@ makeCalendar <- function(schedule, ncores = 1) { } res.calendar <- res.calendar[keep, ] + } + +} + + + + + + +makeAllOneDay0 <- function( cal ) +{ + duration <- cal$end_date - cal$start_date + 1 + + if ( 0==nrow(cal) || all(1 == duration)) + { + #nothing to do + return (cal) + } + + start_day_number = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) + + # we want to rotate the day pattern so that the pattern aligns with the start date, + # then we can replicate it the required number of times + #TODO made this too complex, leave the pattern where it is and just work out an offset + + #create all possible rotations of day pattern + allDayPatterns <- c( cal$Days, + (paste0(substr(cal$Days, 2, 7),substr(cal$Days, 1, 1))), + (paste0(substr(cal$Days, 3, 7),substr(cal$Days, 1, 2))), + (paste0(substr(cal$Days, 4, 7),substr(cal$Days, 1, 3))), + (paste0(substr(cal$Days, 5, 7),substr(cal$Days, 1, 4))), + (paste0(substr(cal$Days, 6, 7),substr(cal$Days, 1, 5))), + (paste0(substr(cal$Days, 7, 7),substr(cal$Days, 1, 6)))) + + dayPatternMatrix <- matrix( allDayPatterns, ncol=7 ) + + #create logical matrix with the pattern we want selected + cols <- col(dayPatternMatrix) + toSelect <- cols == start_day_number + + #pull out the desired pattern (need to transpose both matrices otherwise the unwind into a vector is in the wrong order) + selectedRotation <- t(dayPatternMatrix)[ t(toSelect) ] + + numWeeks <- as.integer(ceiling(as.integer(cal$duration) / 7)) + + + # replicate the pattern, truncate to number of days + # create a sequence of dates - then return the dates selected in the pattern + makeDates <- function(rot, w, d, start, end){ + + selectedDaysLogical <- as.logical(as.integer(strsplit(rot, "")[[1]])) + + selectedDays <- rep(selectedDaysLogical, times = w) + + truncated <- selectedDays[ 1:d ] + + dateSequence <- seq.Date(from = start, to = end, by = "day") + + selectedDates <- dateSequence[ truncated ] + } + + selectedDates <- mapply( + makeDates, + selectedRotation, numWeeks, duration, cal$start_date, cal$end_date + ) + + #replicate the calendar rows the appropriate number of times + repetitions <- sapply(selectedDates, length) + replicatedcal <- cal[rep(seq_len(.N), times = repetitions)] + + #set the start and end date for each calender item to the single day identified earlier + all_dates <- as.Date(unlist(selectedDates), origin = "1970-01-01") + replicatedcal$end_date <- replicatedcal$start_date <- all_dates + + #tidy up the values so they are correct for the spilt items + replicatedcal$duration <- 1 + replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ lubridate::wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] + + return (replicatedcal) +} + + + + + + + + +SINGLE_DAY_PATTERN_VECTOR = c("1000000","0100000","0010000","0001000","0000100","0000010","0000001") + +SINGLE_DAY_PATTERN_LIST = list(c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), + c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE), + c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE), + c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), + c(FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE), + c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE), + c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)) + + +makeReplicationDates <- function(cal, startDayNum, endDayNum){ + + #make a sequences of dates, offsetting the start date so it's always monday (aligning with bitmask start day) + # and the end date so it's always sunday + firstDate = 
min(cal$start_date) - 7
+  lastDate = max(cal$end_date) + 7
+  allDates = seq.Date(from = firstDate, to = lastDate, by = "day")
+
+  offset = as.integer(cal$start_date)-startDayNum+2-as.integer(firstDate)
+  end = as.integer(cal$end_date)+8-endDayNum-as.integer(firstDate)
+
+  dates <- Map(function(o, e) allDates[o:e], offset, end)
+
+  return ( as.Date( unlist(dates), origin = DATE_EPOC ) )
+}
+
+
+
+#' replicates the input calendar objects into single day duration calendar objects
+#' calendar objects should NOT have had the 'Days' bitmask field unpacked
+#' (will still produce an output but the unpacked monday, tuesday etc fields will no longer be consistent with the packed 'Days' bitmask)
+#'
+#' @param cal data.table containing all the calendars to be split up into individual days
+#' @noRd
+#'
+makeAllOneDay <- function( cal )
+{
+  duration <- cal$end_date - cal$start_date + 1
+
+  if ( 0==nrow(cal) || all(1 == duration))
+  {
+    #nothing to do
+    return (cal)
+  }
+
+  #make a list of dates for each object being replicated
+  startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 )
+  endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 )
+  dateSequence = makeReplicationDates( cal, startDayNum, endDayNum )
+
+  #work out how many times we need to replicate each item: number of operating days in week * num weeks
+  bitmaskMat = splitBitmaskMat( cal$Days, asInteger=FALSE )
+  dayCount = rowSums(bitmaskMat)
+  numWeeks <- ceiling(as.integer(cal$duration) / 7)
+  repetitions = dayCount * numWeeks
+
+  #replicate the calendar rows the appropriate number of times
+  replicatedcal <- cal[rep(seq_len(.N), times = repetitions)]
+
+  #get a mask of operating days
+  operatingDayLogical <- rep( split(bitmaskMat, row(bitmaskMat)), times = numWeeks)
+
+  #set the start and end date for each calendar item to the single day identified earlier
+  selectedDates = dateSequence[unlist(operatingDayLogical)]
+  replicatedcal$end_date <- replicatedcal$start_date <- selectedDates
+
+  #tidy up the values so they are correct for the split items
+  replicatedcal$duration <- 1
+  replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ lubridate::wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ]

-  return(list(res.calendar, res.calendar_dates))
+  return (replicatedcal)
 }
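+
+# worked example (an illustrative sketch of the expected behaviour, not a test):
+#   a single row spanning Mon 2023-01-02 .. Sun 2023-01-08 with Days "1010000"
+#   comes back as two one-day rows, 2023-01-02 (Days "1000000") and
+#   2023-01-04 (Days "0010000"), each with duration 1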
+
+
+
+#' along a similar line to 'makeAllOneDay' duplicates input calendar objects into single WEEK duration calendar objects
+#'
+#' @param cal data.table containing all the calendars to be split up into individual weeks
+#' @noRd
+#'
+expandAllWeeks <- function( cal )
+{
+  if ( 0==nrow(cal) )
+  {
+    #nothing to do
+    return (cal)
+  }
+
+  #duration <- cal$end_date - cal$start_date + 1
+
+  #make a list of dates for each object being replicated
+  startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 )
+  endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 )
+  dateSequence = makeReplicationDates( cal, startDayNum, endDayNum )
+
+  numWeeks <- ceiling(as.integer(cal$duration) / 7)
+
+  #replicate a logical vector for the start date and use that to select the relevant dates from the date sequence
+  startDayLogical <- SINGLE_DAY_PATTERN_LIST[startDayNum]
+  startDays <- rep(startDayLogical, times = numWeeks)
+  startDates <- dateSequence[ unlist(startDays) ]
+
+  #replicate a logical vector for the end date and use that to select the relevant dates from the date sequence
+  endDayLogical <- SINGLE_DAY_PATTERN_LIST[endDayNum]
+  endDays <- rep(endDayLogical, times = numWeeks)
+  endDates <- dateSequence[ unlist(endDays) ]
+
+  #replicate the calendar rows the appropriate number of times
+  replicatedcal <- cal[rep(seq_len(.N), times = numWeeks)]
+
+  #set the start and end date for each calendar item
+  replicatedcal$start_date <- startDates
+  replicatedcal$end_date <- endDates
+
+  #tidy up the values so they are correct for the split items
+  replicatedcal$duration <- replicatedcal$end_date - replicatedcal$start_date + 1
+
+  return (replicatedcal)
+}
+
+
+
+
 #' make calendar helper function
+#' this originally expected and dealt with cancellations too. This worked ok for single day duration cancellations
+#' but had problems with multi-day cancellations when combined with overlays.
+#' the code hasn't been changed to reject / avoid cancellations, but results may not be predictable / tested scenarios
-#' @param i row number to do
+#' @param calendarSub data.table containing all the calendars (aka CIF operating patterns) for a single service
 #' @noRd
 #'
-makeCalendar.inner <- function(calendar.sub) { # i, UIDs, calendar){
-  # UIDs.sub = UIDs[i]
-  # calendar.sub = calendar[calendar$UID == UIDs.sub,]
-  # calendar.sub = schedule[schedule$`Train UID` == UIDs.sub,]
-  if (nrow(calendar.sub) == 1) {
+makeCalendarInner <- function(calendarSub) {
+
+  if ( 1 == nrow(calendarSub) )
+  {
     # make into a single entry
-    return(list(calendar.sub, NA))
+    res = list(calendarSub, NA)
+  }
+  else
+  {
+    if (length(unique(calendarSub$UID)) > 1)
+    {
+      stop(paste("Error: makeCalendarInner was passed more than one service to work on. service=", unique(calendarSub$UID)))
+    }
+
+    # check duration and types
+    allTypes <- calendarSub$STP

-    #get durations of overlays
-    dur <- as.numeric(calendar.sub$duration[calendar.sub$STP != "P"])
+    # as per https://wiki.openraildata.com/index.php/SCHEDULE
+    # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'."
+    baseType = max(allTypes)   #usually we expect 'P' to be the base timetable...
but it can also be STP service in which case it will be 'N' - #get vector of types of overlays - typ <- calendar.sub$STP[calendar.sub$STP != "P"] + overlayDurations <- as.numeric(calendarSub$duration[calendarSub$STP != baseType]) + overlayTypes <- calendarSub$STP[calendarSub$STP != baseType] - #get vector of all timetable types including base timetable - typ.all <- calendar.sub$STP + if( length(overlayDurations) <= 0 ) + { + #assume the input data is good and the base timetables don't break any of the overlaying /operating day rules + res = list(calendarSub, NA) + } + #if every overlay is a one day cancellation (and only one base timetable) + else if (all(overlayDurations == 1) && all(overlayTypes == "C") && sum(allTypes == baseType) == 1 ) + { + warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") + + # Apply the cancellation via entries in calendar_dates.txt + res = list( calendarSub[calendarSub$STP != "C", ], + calendarSub[calendarSub$STP == "C", ]) + } + else + { + uniqueDayPatterns <- unique(calendarSub$Days[calendarSub$STP != "C"]) + + # if the day patterns are all identical + if (length(uniqueDayPatterns) <= 1 ) + { + #performance pre-sort all the entries by the priority + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #calendarSub = calendarSub[ order(STP, duration), ] + setkey( calendarSub, STP, duration ) + setindex( calendarSub, start_date, end_date) + + calendar_new <- makeCalendarsUnique( splitDates(calendarSub) ) + res = list(calendar_new, NA) + } + else # split by day pattern + { + #this works if the day patterns don't overlap any operating days. + if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1) ) + { + #this scenario DOES exist in the downloaded ATOC test data + #stop(paste("Scenario with overlay pattern not matching base pattern is not currently handled. 
service=", unique(calendarSub$UID))) + res = makeCalendarForDifferentDayPatterns( calendarSub ) + } + else + { + res = makeCalendarForDayPatterns( uniqueDayPatterns, calendarSub ) + } + } + } + } - #if every overlay is a one day cancellation (and there is only one of them) - if (all(dur == 1) & all(typ == "C") & length(typ) > 0 & length(typ.all) == 2) { + #stopifnot( is.list(res) ) + return (res) +} - # Modify in the calendar_dates.txt - return(list( - calendar.sub[calendar.sub$STP == "P", ], - calendar.sub[calendar.sub$STP != "P", ] - )) - } else { - # if the day patterns are all identical, and we have only one base timetable - if (length(unique(calendar.sub$Days)) == 1 & sum(typ.all == "P") == 1) { - - calendar.new <- splitDates(calendar.sub) - #calendar.new <- UK2GTFS:::splitDates(calendar.sub) - return(list(calendar.new, NA)) - - } else { - - # split by day pattern - splits <- list() - daypatterns <- unique(calendar.sub$Days) - - for (k in seq(1, length(daypatterns))) { - # select for each pattern but include cancellations with a - # different day pattern - calendar.sub.day <- calendar.sub[calendar.sub$Days == daypatterns[k] | calendar.sub$STP == "C", ] - - if (all(calendar.sub.day$STP == "C")) { - # ignore cases of everything is cancelled - splits[[k]] <- NULL - } - else { - calendar.new.day <- splitDates(calendar.sub.day) - #calendar.new.day <- UK2GTFS:::splitDates(calendar.sub.day) - # rejects nas - if (inherits(calendar.new.day, "data.frame")) { - calendar.new.day$UID <- paste0(calendar.new.day$UID, k) - splits[[k]] <- calendar.new.day - } - } - } - splits <- data.table::rbindlist(splits, use.names=FALSE) # dplyr::bind_rows(splits) - return(list(splits, NA)) +CALENDAR_UNIQUE_CHECK_COLUMN_NAMES <- c("originalUID","start_date","end_date","Days","STP","duration" ) + + +# We have lots of complex logic, which means that when we have multiple base timetables that are separated +# in the temporal domain e.g. march, april - we end up duplicating the overlays +# +# this is a bit of a gluey hack that could be fixed by looking in the temporal domain when deciding what overlaps +# see test case no.10 ("10:test makeCalendarInner") that triggered addition of this logic +# +makeCalendarsUnique <- function ( calendar ) +{ + calendar <- calendar[ !duplicated( calendar, by=CALENDAR_UNIQUE_CHECK_COLUMN_NAMES ) ] + + return( calendar ) +} + + + + +countIntersectingDayPatterns <- function( dayPatterns ) +{ + unpacked = splitBitmaskMat( dayPatterns, asInteger = TRUE ) + sums = colSums(unpacked) #add up number of intersections for monday etc... 
+ names(sums) <- NULL #makes unit test construction easier + return ( sums ) +} + +intersectingDayPattern <- function( dayPattern1, dayPattern2 ) +{ + return (any( countIntersectingDayPatterns( c(dayPattern1,dayPattern2) ) > 1) ) +} + + +intersectingDayPatterns <- function( dayPatternBase, dayPatternOverlay ) +{ + if (is.null(dayPatternOverlay) || is.null(dayPatternBase) ) return (NULL) + + unpackedOverlay = splitBitmaskMat( dayPatternOverlay, asInteger = FALSE ) + unpackedBase = splitBitmaskMat( dayPatternBase, asInteger = FALSE ) + + #repeat the base for every Overlay + unpackedBaseRep = rep( unpackedBase, length(dayPatternOverlay) ) + unpackedBaseRepmat = matrix(unpackedBaseRep, ncol=7, byrow=TRUE) + + intersects = unpackedBaseRepmat & unpackedOverlay + + res <- apply(intersects, 1, any) + + return ( res ) +} + + +intersectingDayPatterns0 <- function( dayPatternBase, dayPatternOverlay ) +{ + if (is.null(dayPatternOverlay) || is.null(dayPatternBase) ) return (NULL) + + intersectingDayPattern_vec <- Vectorize(intersectingDayPattern, vectorize.args = c("dayPattern2")) + + res = intersectingDayPattern_vec( dayPatternBase, dayPatternOverlay ) + names(res) <- NULL #makes unit test construction easier +stopifnot(is.logical(res)) + return ( res ) +} + + + +makeCalendarForDayPatterns <- function( dayPatterns, calendar ) +{ + splits <- list() + + #performance pre-sort all the entries by the priority + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #calendar = calendar[ order(STP, duration), ] + setkey( calendar, STP, duration ) + setindex( calendar, start_date, end_date) + + for (k in seq(1, length(dayPatterns))) { + # select for each pattern but include cancellations with a + # different day pattern + calendarDay <- calendar[calendar$Days == dayPatterns[k] | calendar$STP == "C", ] + # TODO cancellations now handled elsewhere - remove this once code stable + + if (all(calendarDay$STP == "C")) { + # ignore cases of everything is cancelled + splits[[k]] <- NULL + warning("unexpected item in the makeCalendarForDayPatterns-ing area, cancellations should now be handled at a higher level") + } + else { + calendarNewDay <- splitDates(calendarDay) + + # rejects NAs + if (inherits(calendarNewDay, "data.frame")) { + # further differentiate the UID by appending a number to the end for each different days pattern + calendarNewDay$UID <- paste0(calendarNewDay$UID, k) + splits[[k]] <- calendarNewDay } } } + + splits <- data.table::rbindlist(splits, use.names=FALSE) + + splits <- makeCalendarsUnique( splits ) + + # after all this faffing about and splitting and joining, it's quite likely we've created some + # small fragments of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) + splits <- splits[ checkOperatingDayActive( splits ) ] + + return(list(splits, NA)) } -#' Duplicate stop_times + +# this is a complex case where the overlays don't have the same day pattern as the base timetable +# +# e.g base is mon-sat, and we have some engineering work for 3 weeks tue-thur +# +# the approach we take is to duplicate the overlay timetables for every week they are in effect, then overlay them. +# +# aha but the complexity isn't finished. If the overlay is tue+thur then wed is the base timetable. +# +# when we get to this latter complexity we just split the overlay into individual days and apply it that way. 
+# +makeCalendarForDifferentDayPatterns <- function( calendar ) +{ + baseType = max(calendar$STP) + baseTimetables = calendar[calendar$STP == baseType] + overlayTimetables = calendar[calendar$STP != baseType] + + gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ] + continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ] + + gappyOverlays = makeAllOneDay( gappyOverlays ) + continiousOverlays = expandAllWeeks( continiousOverlays ) + + overlays = data.table::rbindlist( list(continiousOverlays,gappyOverlays), use.names=FALSE) + + + splits <- list() + + distinctBasePatterns = unique( baseTimetables$Days ) + + for (k in seq(1, length(distinctBasePatterns))) { + + thisBase = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] + + thisOverlay = overlays[ intersectingDayPatterns( distinctBasePatterns[k], overlays$Days ) ] + + if (nrow(thisOverlay) <= 0) + { + splits[[k]] <- thisBase + } + else + { + timetablesForThisPattern = data.table::rbindlist( list( thisBase, thisOverlay ), use.names=FALSE) + + #performance pre-sort all the entries by the priority + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #timetablesForThisPattern = timetablesForThisPattern[ order(STP, duration), ] + setkey( timetablesForThisPattern, STP, duration ) + setindex( timetablesForThisPattern, start_date, end_date) + + thisSplit <- splitDates( timetablesForThisPattern ) + + # rejects NAs + if (inherits(thisSplit, "data.frame")) { + # further differentiate the UID by appending a number to the end for each different days pattern + thisSplit$UID <- paste0(thisSplit$UID, k) + splits[[k]] <- thisSplit + } + } + } + + splits <- data.table::rbindlist(splits, use.names=FALSE) + + splits <- makeCalendarsUnique( splits ) + + # after all this faffing about and splitting and joining, it's quite likely we've created some + # small fragments of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) + splits <- splits[ checkOperatingDayActive( splits ) ] + + return(list(splits, NA)) +} + + +# in a week bitmask, if there are non-operating days between the first and last operating day of the week - will return TRUE +# e.g. 
0010000 = FALSE   0011100 = FALSE   0101000 = TRUE
+hasGapInOperatingDays <- function( daysBitmask )
+{
+  firstDay = stringi::stri_locate_first( daysBitmask, fixed = "1" )[,1]
+  lastDay = stringi::stri_locate_last( daysBitmask, fixed = "1" )[,1]
+
+  operatingDayCount = stringi::stri_count( daysBitmask, fixed = "1" )
+
+  res = ( lastDay-firstDay+1 != operatingDayCount )
+
+  res[is.na(res)] <- FALSE #shouldn't really get this, probably operating days are '0000000'
+
+  return( res )
+}
+
+
+
+
-#' Duplicate stop_times
-#'
-#' @details
-#' Function that duplicates top times for trips that have been split into
-#' multiple trips
-#'
-#' @param calendar calendar data.frame
-#' @param stop_times stop_times data.frame
-#' @param ncores number of processes for parallel processing (default = 1)
-#' @noRd
-#'
-duplicate.stop_times_alt <- function(calendar, stop_times, ncores = 1) {
-  calendar.nodup <- calendar[!duplicated(calendar$rowID), ]
-  calendar.dup <- calendar[duplicated(calendar$rowID), ]
-  rowID.unique <- as.data.frame(table(calendar.dup$rowID))
-  rowID.unique$Var1 <- as.integer(as.character(rowID.unique$Var1))
-  stop_times <- dplyr::left_join(stop_times, rowID.unique,
-    by = c("schedule" = "Var1")
-  )
-
-  stop_times$schedule2 <- stop_times$schedule
-  stop_times_split <- stop_times[, .(list(.SD)), by = "schedule2"][,V1]
-
-  # TODO: The could handle cases of non duplicated stoptimes within duplicate.stop_times.int
-  # rather than splitting and rejoining, would bring code tidyness and speed improvements
-  duplicate.stop_times.int <- function(stop_times.tmp) {
-    # message(i)
-    # stop_times.tmp = stop_times[stop_times$schedule == rowID.unique$Var1[i],]
-    # reps = rowID.unique$Freq[i]
-    reps <- stop_times.tmp$Freq[1]
-    if (is.na(reps)) {
-      return(NULL)
-    } else {
-      index <- rep(seq(1, reps), nrow(stop_times.tmp))
-      index <- index[order(index)]
-      stop_times.tmp <- stop_times.tmp[rep(seq(1, nrow(stop_times.tmp)), reps), ]
-      stop_times.tmp$index <- index
-      return(stop_times.tmp)
-    }
-  }
-  if (ncores == 1) {
-    stop_times.dup <- pbapply::pblapply(stop_times_split, duplicate.stop_times.int)
-  } else {
-    cl <- parallel::makeCluster(ncores)
-    parallel::clusterEvalQ(cl, {
-      loadNamespace("UK2GTFS")
-    })
-    stop_times.dup <- pbapply::pblapply(stop_times_split,
-      duplicate.stop_times.int,
-      cl = cl
-    )
-    parallel::stopCluster(cl)
-    rm(cl)
-  }
-  #stop_times.dup <- dplyr::bind_rows(stop_times.dup) performance
-  stop_times.dup <- data.table::rbindlist(stop_times.dup, use.names=FALSE)
-  # stop_times.dup$index <- NULL
-
-  # Join on the nonduplicated trip_ids
-  trip.ids.nodup <- calendar.nodup[, c("rowID", "trip_id")]
-  stop_times <- dplyr::left_join(stop_times, trip.ids.nodup, by = c("schedule" = "rowID"))
-  stop_times <- stop_times[!is.na(stop_times$trip_id), ] # when routes are cancelled their stop times are left without valid trip_ids
-
-  # join on the duplicated trip_ids
-  calendar2 <- dplyr::group_by(calendar, rowID)
-  calendar2 <- dplyr::mutate(calendar2, Index = seq(1, dplyr::n()))
-
-  stop_times.dup$index2 <- as.integer(stop_times.dup$index + 1)
-  trip.ids.dup <- calendar2[, c("rowID", "trip_id", "Index")]
-  trip.ids.dup <- as.data.frame(trip.ids.dup)
-  stop_times.dup <- dplyr::left_join(stop_times.dup, trip.ids.dup, by = c("schedule" = "rowID", "index2" = "Index"))
-  stop_times.dup <- stop_times.dup[, c(
-    "arrival_time", "departure_time", "stop_id", "stop_sequence",
-    "pickup_type", "drop_off_type", "rowID", "schedule", "trip_id"
-  )]
-  stop_times <- stop_times[, c(
-    "arrival_time", "departure_time", "stop_id", "stop_sequence",
-    "pickup_type", "drop_off_type", "rowID", "schedule", "trip_id"
-  )]
-
-  # stop_times.dup = stop_times.dup[order(stop_times.dup$rowID),]
-  stop_times.comb <- data.table::rbindlist(list(stop_times, stop_times.dup), use.names=FALSE)
-  return(stop_times.comb)
-}
+#' duplicateItem
+#'
+#' @details
+#' Function that duplicates a data.table, adding an "index" column to all rows in the output indicating which
+#' instance of the duplication the row is associated with
+#'
+#' @param dt data.table
+#' @param reps number of duplicates to be created
+#' @param indexStart starting number for the "index" value added to the item
+#' @noRd
+#'
+duplicateItem <- function( dt, reps, indexStart=1 )
+{
+  if ( is.na(reps) | reps<1 ) return (NULL)
+
+  #replicate all the rows in dt times=reps
+  duplicates <- dt[rep(seq(1, nrow(dt)), reps), ]
+
+  #create and apply indexes to the created rows - each group of replicated rows gets the same index number.
+  index <- rep(seq( indexStart, indexStart-1+reps ), nrow(dt))
+
+  duplicates$index <- index[order(index)]
+
+  return(duplicates)
+}
+
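+# behaviour sketch (illustrative values only): duplicateItem(dt, 3) on a
+# two-row dt returns 6 rows - the whole of dt repeated 3 times - with
+# dt$index coming out as 1,1,2,2,3,3, i.e. one index value per replica
+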
+#' duplicateItems
+#'
+#' @details
+#' Function that duplicates a very large data.table, adding an "index" column to all rows in the output indicating which
+#' instance of the duplication the row is associated with
+#'
+#' requires a column called "_reps" on the object to determine how many times it is to be duplicated
+#'
+#' @param dt data.table
+#' @param split_attribute name of attribute to split the items between worker tasks
+#' @param ncores number of processes for parallel processing (default = 1)
+#' @param indexStart starting number for the "index" value added to the item
+#' @noRd
+#'
+duplicateItems <- function( dt, split_attribute, ncores=1, indexStart=1 )
+{
+  #add an additional column before splitting on it - so that the value we're really splitting on still appears in the output.
+  dt[, `_TEMP_` := get(split_attribute) ]
+  dt_split <- dt[, .(list(.SD)), by = `_TEMP_`][,V1]
+  dt$`_TEMP_` <- NULL
+
+
+  duplicate_int <- function(dta) {
+    rep <- dta$`_reps`[1]
+    return ( duplicateItem( dta, rep, indexStart ) )
+  }
+
+  if (ncores == 1) {
+    duplicates <- pbapply::pblapply(dt_split, duplicate_int)
+  } else {
+    cl <- parallel::makeCluster(ncores)
+    parallel::clusterEvalQ(cl, {
+      loadNamespace("UK2GTFS")
+    })
+
+    duplicates <- pbapply::pblapply(dt_split,
+                                    duplicate_int,
+                                    cl = cl)
+    parallel::stopCluster(cl)
+    rm(cl)
+  }
+
+  duplicates <- data.table::rbindlist(duplicates, use.names=FALSE)
+
+  duplicates$`_reps` <- NULL #performance, putting this inside duplicate_int roughly doubles the execution time
+
+  return (duplicates)
+}
+
+
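+# usage sketch (illustrative values only) - callers must supply the `_reps`
+# column themselves:
+#   dt <- data.table(schedule = c(10L, 11L), `_reps` = c(2L, 3L))
+#   duplicateItems(dt, "schedule")
+# returns the schedule-10 row twice (index 1,2) and the schedule-11 row three
+# times (index 1,2,3), with the `_reps` column dropped from the output
+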
+#' Duplicate stop_times
+#'
+#' @details
+#' Function that duplicates stop times for trips that have been split into
+#' multiple trips and sets the new trip id on the duplicated stop_times
+#'
+#' @param calendar calendar data.frame
+#' @param stop_times stop_times data.frame
+#' @param ncores number of processes for parallel processing (default = 1)
+#' @noRd
+#'
+duplicate_stop_times <- function(calendar, stop_times, ncores = 1) {
+
+  outputColumnNames = c(
+    "trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence",
+    "pickup_type", "drop_off_type", "schedule"
+  )
+
+  #it's pretty marginal doing this on multiple threads. With a typical number,
+  #doing the split takes 2.4s and the duplication 7.8s (on one thread)
+  #TODO look at avoiding the split if threads=1
+
+  return ( duplicate_related_items( calendar, stop_times,
+                                    original_join_field = "schedule",
+                                    new_join_field = "trip_id",
+                                    outputColumnNames = outputColumnNames,
+                                    ncores=ncores ) )
+}
+
+
+#' Duplicate related items
+#'
+#' @details
+#' Function that duplicates items that are related to calendar.
+#' The expected input is calendar items that have been duplicated but retain the same (now duplicate) 'rowID' -
+#' this tells us which objects to duplicate and how many are required.
+#' The related_items have an attribute, named by original_join_field, which joins back to 'rowID' on the calendar items
+#'
+#' After duplication the duplicated items are joined back onto the input calendar items
+#' to create an additional attribute on the output objects
+#'
+#' The calendar item attribute forms the new relation between the calendar items and
+#' related items, so must be unique.
+#'
+#' @param calendar calendar data.frame
+#' @param related_items data.frame of items to be replicated
+#' @param original_join_field name of the attribute on related_items that joins to 'rowID' on calendar
+#' @param new_join_field name of the calendar attribute that forms the new relation on the output items
+#' @param outputColumnNames names of the columns to retain in the output
+#' @param ncores number of processes for parallel processing (default = 1) (currently hangs/crashes if >1)
+#' @noRd
+#'
+duplicate_related_items <- function(calendar, related_items, original_join_field, new_join_field, outputColumnNames, ncores = 1) {
+
+  calendar.dup <- calendar[duplicated(calendar$rowID), ]
+
+  if( nrow(calendar.dup) <= 0 )
+  {
+    #no duplicating to do
+    warning("duplicate_related_items: there were no duplicates detected. In real data this may indicate there has been an error earlier in the processing.")
+    related_items_dup = data.table()
+  }
+  else
+  {
+    #create a count of the number of each duplicate of rowID
+    rowID.unique <- as.data.frame(table(calendar.dup$rowID))
+    rowID.unique$Var1 <- as.integer(as.character(rowID.unique$Var1))
+
+    #join the count of number of duplicates required to the stop times (so we can retrieve it later when doing the duplication)
+    related_items <- dplyr::left_join(related_items, rowID.unique,
+                                      by = setNames("Var1",original_join_field) )
+
+    #set the number of duplications required
+    related_items$`_reps` <- related_items$Freq
+
+    # TODO: This could handle cases of non duplicated stoptimes within duplicate.stop_times.int
+    # rather than splitting and rejoining, would bring code tidiness and speed improvements
+    related_items_dup <- duplicateItems( related_items, original_join_field, ncores=ncores, indexStart=1 )
+
+
+    # join via rowID+index to get new de-duplicated trip_id
+
+    #create index on the table we want to join to - group by the rowId, index runs from 0..count()-1 of group size
+    #we start at zero so we don't affect the original stop_times rows and just join in the duplicated rows
+    new_join_ids <- dplyr::group_by(calendar, rowID)
+    new_join_ids <- dplyr::mutate(new_join_ids, Index = seq(0, dplyr::n()-1))
+    new_join_ids <- as.data.frame( new_join_ids[, c("rowID", new_join_field, "Index")] )
+
+    related_items_dup <- dplyr::left_join(related_items_dup, new_join_ids, by = setNames(c("rowID","Index"),c(original_join_field,"index")) )
+
+    #select columns required
+    related_items_dup <- related_items_dup[, outputColumnNames, with=FALSE]
+  }
+
+  calendar.nodup <- calendar[!duplicated(calendar$rowID), ]
+
+  # when routes are cancelled their stop times are left without valid trip_ids - remove those rows
+  # this only applies to the non-duplicated rows
+  # Join via rowID to determine the trip_id
+  related_ids_nodup <- calendar.nodup[, c("rowID", new_join_field),
with=FALSE] + related_items_no_dup <- dplyr::left_join(related_items, related_ids_nodup, by = setNames("rowID",original_join_field)) + related_items_no_dup <- related_items_no_dup[!is.na(related_items_no_dup[[new_join_field]]), ] + + + #select columns required, join output results together + related_items_no_dup <- related_items_no_dup[, outputColumnNames, with=FALSE] + + related_items_comb <- data.table::rbindlist(list(related_items_no_dup, related_items_dup), use.names=FALSE) + + return(related_items_comb) +} + + + + + + + #' fix times for journeys that run past midnight #' #' @details @@ -646,6 +1485,26 @@ afterMidnight <- function(stop_times, safe = TRUE) { + +fixStopTimeData <- function(stop_times) +{ + # Fix arrival_time / departure_time being 0000 for pick up only or drop off only trains + stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "0000" & stop_times$Activity == "D", + stop_times$arrival_time, + stop_times$departure_time) + stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "0000" & stop_times$Activity == "U", + stop_times$departure_time, + stop_times$arrival_time) + + #fix missing arrival / departure times by copying from the other time. + stop_times$arrival_time[is.na(stop_times$arrival_time)] <- stop_times$departure_time[is.na(stop_times$arrival_time)] + stop_times$departure_time[is.na(stop_times$departure_time)] <- stop_times$arrival_time[is.na(stop_times$departure_time)] + + return (stop_times) +} + + + #' Clean Activities #' @param x character activities #' @details @@ -656,12 +1515,6 @@ afterMidnight <- function(stop_times, safe = TRUE) { #' clean_activities2 <- function(x, public_only = TRUE) { - #x <- strsplit(x," ") - #x <- lapply(x, function(y){ - # y <- paste(y[order(y, decreasing = TRUE)], collapse = " ") - #}) - #x <- unlist(x) - x <- data.frame(activity = x, stringsAsFactors = FALSE) if (public_only) diff --git a/R/atoc_main.R b/R/atoc_main.R index c2cdcdf..d1333ed 100644 --- a/R/atoc_main.R +++ b/R/atoc_main.R @@ -19,26 +19,17 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub message(paste0(Sys.time(), " Building stop_times")) } - # Fix arrival_time / departure_time being 0000 for pick up only or drop off only trains - stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "0000" & stop_times$Activity == "D", - stop_times$arrival_time, - stop_times$departure_time) - stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "0000" & stop_times$Activity == "U", - stop_times$departure_time, - stop_times$arrival_time) + stop_times <- fixStopTimeData( stop_times ) # Convert Activity to pickup_type and drop_off_type upoffs <- clean_activities2(stop_times$Activity, public_only=public_only) stop_times <- cbind(stop_times, upoffs) - #fix missing arrival / departure times by copying from the other time. 
- stop_times$arrival_time[is.na(stop_times$arrival_time)] <- stop_times$departure_time[is.na(stop_times$arrival_time)] - stop_times$departure_time[is.na(stop_times$departure_time)] <- stop_times$arrival_time[is.na(stop_times$departure_time)] - stop_times <- stop_times[, c("arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type", "rowID", "schedule")] if (public_only) { + #remove calls that don't allow the public to board or alight (type==1) stop_times <- stop_times[!(stop_times$pickup_type == 1 & stop_times$drop_off_type == 1), ] } @@ -52,7 +43,7 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub # build the calendar file res <- makeCalendar(schedule = schedule, ncores = ncores) calendar <- res[[1]] - calendar_dates <- res[[2]] + cancellation_dates <- res[[2]] rm(res) #remove columns we don't need any more @@ -67,35 +58,43 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub gc() - calendar$trip_id <- 1:nrow(calendar) # not sure why this was here, but used in duplicate.stop_times - # calendar$service_id = 1:nrow(calendar) # For this purpose the service and the trip are always the same + # In CIF land, one service ID can have multiple operating patterns, expressed as multiple BS records (and their associated LO,LI,LT records) having the same UID + # that's the opposite of the GTFS concept where a single calendar (operating pattern) can be used by multiple trips, but a single trip can only have one operating pattern. + # The CIF concept of a single UID maps more closely onto GTFS routes. + # + # so we maintain a 1:1 relationship between Trips and Calendars, duplicating CIF service IDs as required to represent the multiplicity of possible operating patterns. + # + # see merging code where there is functionality to de-duplicate calendar patterns down to a unique set. 
(condenseServicePatterns) + # clean calendars - # calendar = calendar[,c("UID","monday","tuesday","wednesday","thursday","friday","saturday","sunday", - # "start_date","end_date","rowID","trip_id")] names(calendar)[names(calendar) == "UID"] <- "service_id" - calendar$start_date <- as.character(calendar$start_date) - calendar$start_date <- gsub("-", "", calendar$start_date) - calendar$end_date <- as.character(calendar$end_date) - calendar$end_date <- gsub("-", "", calendar$end_date) + calendar <- formatAttributesToGtfsSchema( calendar ) - calendar_dates <- calendar_dates[, c("UID", "start_date")] - names(calendar_dates) <- c("service_id", "date") - calendar_dates$date <- as.character(calendar_dates$date) - calendar_dates$date <- gsub("-", "", calendar_dates$date) - calendar_dates$exception_type <- 2 # all events passed to calendar_dates are single day cancellations + cancellation_dates <- cancellation_dates[, c("UID", "start_date")] + names(cancellation_dates) <- c("service_id", "date") + cancellation_dates <- formatAttributesToGtfsSchema( cancellation_dates ) + cancellation_dates$exception_type <- 2 # all events passed to cancellation_dates are single day cancellations ### SECTION 3: ############################################################################### - # When splitting the calendar roWIDs are duplicated + + calendar$trip_id <- 1:nrow(calendar) + # calendar$service_id = 1:nrow(calendar) + + + + # When splitting the calendar rowIDs are duplicated # so create new system of trip_ids and duplicate the relevant stop_times if (!silent) { message(paste0(Sys.time(), " Duplicating necessary stop times")) } - #TODO find out why this hangs if ncores > 1 - stop_times <- duplicate.stop_times_alt(calendar = calendar, stop_times = stop_times, ncores = 1) + #TODO find out why this hangs if ncores > 1 + #also sets the trip_id on stop_times + stop_times <- duplicate_stop_times(calendar = calendar, stop_times = stop_times, ncores = 1) + ### SECTION 5: ############################################################################### # make make the trips.txt file by matching the calendar to the stop_times @@ -103,6 +102,11 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub trips <- calendar[, c("service_id", "trip_id", "rowID", "ATOC Code", "Train Status", "Train Category", "Power Type", "Train Identity")] trips <- longnames(routes = trips, stop_times = stop_times) + + # Fix Times (and remove some fields) + stop_times <- afterMidnight(stop_times) + + ### SECTION 4: ############################################################################### # make make the routes.txt # a route is all the trips with a common start and end @@ -111,7 +115,7 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub message(paste0(Sys.time(), " Building routes.txt")) } - #do the conversion to route_type before grouping because several status map to the same route_type and we get 'duplicate' routes that look the same. + #do the conversion to route_type before grouping because several statuses map to the same route_type and we get 'duplicate' routes that look the same. 
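+  #A minimal illustrative sketch (hypothetical data, kept as a comment so nothing extra runs):
+  #statuses "P" and "1" both map to route_type 2, so grouping on the raw status first would
+  #have produced two rail routes between the same stops that look identical in the GTFS output.
+  #  demo <- data.table(train_status = c("P", "1"), from = "A", to = "B")
+  #  nrow(unique(demo[, c("train_status", "from", "to")]))  # 2 'duplicate' routes before conversion
+  #  demo$route_type <- 2
+  #  nrow(unique(demo[, c("route_type", "from", "to")]))    # 1 route after conversion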
train_status <- data.table( train_status = c("B", "F", "P", "S", "T", "1", "2", "3", "4", "5"), route_type = c( 3, NA, 2, 4, NA, 2, NA, NA, 4, 3), @@ -136,17 +140,17 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub routes$route_type[routes$agency_id == "LT" & routes$route_type == 2 ] <- 1 # London Underground is Metro (unless already identified as a bus/ship etc) + #TODO look at what this causes LizPurpCrossRailElizabethLine to be categorised as. + #TODO move to longnames() ### Section 6: ####################################################### # Final Checks - # Fix Times - stop_times <- afterMidnight(stop_times) - #gtfs systems tend to be tolerant of additional fields, so expose the train_category and power_type so that the consumer can do more analysis on them if they wish. + #gtfs systems tend to be tolerant of additional fields, so expose the train_category and power_type so that the consumer can do analysis on them. #e.g. filter out ECS moves - # Ditch unneeded columns + # Ditch unnecessary columns routes <- routes[, c("route_id", "agency_id", "route_short_name", "route_long_name", "route_type", "train_category")] trips <- trips[, c("trip_id", "route_id", "service_id", "Train Identity", "Power Type")] names(trips) <- c("trip_id", "route_id", "service_id", "train_identity", "power_type") @@ -155,7 +159,11 @@ schedule2routes <- function(stop_times, schedule, silent = TRUE, ncores = 1, pub # end of function - timetables <- list(calendar, calendar_dates, routes, stop_times, trips) + timetables <- list(calendar, cancellation_dates, routes, stop_times, trips) names(timetables) <- c("calendar", "calendar_dates", "routes", "stop_times", "trips") + return(timetables) } + + + From 2c7cd1f236f388da35e055c2daa9409c26a41ee7 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 22:35:58 +0100 Subject: [PATCH 23/81] move basic unit tests to their own file - add large number of unit tests for revised overlay functionality --- tests/testthat/test_aa_unit.R | 1179 +++++++++++++++++++++++++++++++++ tests/testthat/test_atoc.R | 121 +--- 2 files changed, 1180 insertions(+), 120 deletions(-) create mode 100644 tests/testthat/test_aa_unit.R diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R new file mode 100644 index 0000000..c686021 --- /dev/null +++ b/tests/testthat/test_aa_unit.R @@ -0,0 +1,1179 @@ + +context("Running basic unit tests") + + +fixCalendarDates <- function( df, fixDurations = TRUE, createOriginalUID = TRUE ) +{ + df$start_date <- as.Date(df$start_date, format = "%d-%m-%Y") + df$end_date <- as.Date(df$end_date, format = "%d-%m-%Y") + if (fixDurations) df$duration <- df$end_date - df$start_date + 1 + if (createOriginalUID && "UID" %in% names(df) && !"originalUID" %in% names(df)) df$originalUID <- df$UID + + return (df) +} + +removeOriginalUidField <- function( df ) +{ + df$originalUID = NULL + return (df) +} + + +printDifferencesDf <- function( df1, df2 ) +{ + if (!identical(df1,df2)) + { + comparison <- sapply(1:nrow(df1), function(i) all.equal(df1[i, ], df2[i, ])) + print(comparison) + } +} + +printDifferences <- function( v1, v2 ) +{ + if (!identical(v1,v2)) + { + comparison <- all.equal( v1, v2 ) + print(comparison) + } +} + + +test_that("test countIntersectingDayPatterns:1", { + + OK = TRUE + + { + patterns = c("0000001", "1000000", "1000001") + expectedCounts = c(2,0,0,0,0,0,2) + + counts <- countIntersectingDayPatterns( patterns ) + + printDifferences( counts, expectedCounts) + OK = OK & identical(counts, 
expectedCounts) + } + { + patterns = c("0000001", "1000000", "0000001") + expectedCounts = c(1,0,0,0,0,0,2) + + counts <- countIntersectingDayPatterns( patterns ) + + printDifferences( counts, expectedCounts) + OK = OK & identical(counts, expectedCounts) + } + { + patterns = c("4000001", "1001100", "0000001") + expectedCounts = c(5,0,0,1,1,0,2) + + counts <- countIntersectingDayPatterns( patterns ) + + printDifferences( counts, expectedCounts) + OK = OK & identical(counts, expectedCounts) + } + + expect_true( OK ) +}) + + + +test_that("test intersectingDayPatterns:1", { + + OK = TRUE + + { + base = c("0000001") + overlay = c("0000001", "1000000", "1000001", "0000010", "0000000") + expectedResult = c(TRUE, FALSE, TRUE, FALSE, FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("0000000") + overlay = c("0000001", "1000000", "1000001") + expectedResult = c(FALSE, FALSE, FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("1111111") + overlay = c("0000001", "1000000", "1000001", "0000000") + expectedResult = c(TRUE, TRUE, TRUE, FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("1010101") + overlay = c("0000001", "1000000", "0101010", "0000000") + expectedResult = c(TRUE, TRUE, FALSE, FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("0000000") + overlay = c("0000000") + expectedResult = c(FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("0000000") + overlay = c("") + expectedResult = c(FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("0000000") + overlay = c("","0000000","1111111") + expectedResult = c(FALSE, FALSE, FALSE) + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c("0000000") + overlay = c() + expectedResult = NULL + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c() + overlay = c("0000000") + expectedResult = NULL + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + { + base = c() + overlay = c() + expectedResult = NULL + + res = intersectingDayPatterns( base, overlay ) + + printDifferences( res, expectedResult) + OK = OK & identical(res, expectedResult) + } + + expect_true( OK ) +}) + + + + + +test_that("test checkOperatingDayActive:1", { + + OK = TRUE + + testData = data.table( + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "22-01-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "05-02-2023", "31-03-2023", "23-01-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "1000000", "0001000" )) + expectedResult = c(TRUE, TRUE, TRUE, TRUE, TRUE) + + testData = rbind(testData, data.table( + start_date=c("02-09-2023", "14-09-2023", "15-09-2023", "11-09-2023", "20-09-2023" ), + end_date=c( "03-09-2023", 
"15-09-2023", "20-09-2023", "17-09-2023", "27-09-2023" ), + Days=c( "1111100", "0010010", "0001000", "0000000", "0000000" ))) + #not valid data but edge case to cover off + expectedResult = c(expectedResult, + c(FALSE, FALSE, FALSE, FALSE, FALSE)) + + testData <- fixCalendarDates( testData ) + + for (i in seq(1, length(expectedResult) ) ) + { + OK = OK & expectedResult[i] == checkOperatingDayActive( testData[i,] ) + } + + OK = OK & all(expectedResult == checkOperatingDayActive( testData )) + + expect_true( OK ) +}) + + + + +test_that("test intersectingDayPattern:1", { + + OK = TRUE + + pattern1 = c("0000001", "1000000", "1000001", "0000000", "1000001") + pattern2 = c("0000001", "1000000", "1001001", "0000000", "0100010") + expectedResult = c(TRUE, TRUE, TRUE, FALSE, FALSE) + + for (i in seq(1, length(pattern1) ) ) + { + OK = OK & expectedResult[i] == intersectingDayPattern( pattern1[i], pattern2[i] ) + } + + expect_true( OK ) +}) + + + +test_that("test intersectingDayPatterns:1", { + + patternOverlay = c("0000001", "1000000", "1000001", "0000000", "1000001", "0110110", "0100000") + patternBase = c("1001001") + expectedResult = c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE) + + res = intersectingDayPatterns( patternBase, patternOverlay ) + + names(res) <- NULL + + printDifferences(expectedResult,res) + + expect_true( identical(expectedResult,res ) ) +}) + + + + + + +test_that("test duplicate_stop_times:1", { + + testCalendar = data.table( + rowID=c( 1, 1, 1, 2, 2, 3), #row id identifies original data + trip_id=c( 11, 12, 13, 21, 22, 31))#trip_id is the new value assigned to duplicated stop_times + + testStopTimes = data.table(schedule=c( 1, 1, 1, 2, 2, 4), #schedule joins to row id in calendar + stop_sequence=c( 1, 2, 3, 1, 2, 1), #all the other columns just need to exist + stop_id=c( 0, 0, 0, 0, 0, 0), + pickup_type=c( 0, 0, 0, 0, 0, 0), + drop_off_type=c( 0, 0, 0, 0, 0, 0), + arrival_time=c( 0, 0, 0, 0, 0, 0), + departure_time=c(0, 0, 0, 0, 0, 0)) + + duplicates <- duplicate_stop_times(testCalendar, testStopTimes, ncores = 1) #hangs / crashes with more than one thread + + expectedResult = data.table(trip_id=c( 11, 11, 11, 21, 21, 12, 12, 12, 13, 13, 13, 22, 22), + arrival_time=c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + departure_time=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + stop_id=c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + stop_sequence=c( 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2), + pickup_type=c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + drop_off_type=c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + schedule=c( 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2)) + + printDifferencesDf(expectedResult,duplicates) + + expect_true( identical(expectedResult,duplicates) ) +}) + + + + +test_that("test validateCalendarDates:1", { + + ok = TRUE + + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "23-01-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "05-02-2023", "31-03-2023", "23-01-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "1000000", "0001000" )) + testData <- fixCalendarDates( testData ) + + ok = ok & all(validateCalendarDates( testData )) + + + #uid4 wrong + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "22-01-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "05-02-2023", "31-03-2023", "23-01-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "1000000", "0001000" )) + testData <- fixCalendarDates( 
testData ) + + ok = ok & !all(validateCalendarDates( testData )) + + #uid2 wrong + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "23-01-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "04-02-2023", "31-03-2023", "23-01-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "1000000", "0001000" )) + testData <- fixCalendarDates( testData ) + + ok = ok & !all(validateCalendarDates( testData )) + + #uid1 wrong + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "23-01-2023", "26-01-2023" ), + end_date=c( "31-01-2023", "05-02-2023", "31-03-2023", "23-01-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "1000000", "0001000" )) + testData <- fixCalendarDates( testData ) + + ok = ok & !all(validateCalendarDates( testData )) + + + expect_true( ok ) +}) + + + + +test_that("test makeAllOneDay:1", { + + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "23-01-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "05-02-2023", "31-03-2023", "23-01-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "1000000", "0001000" ), + STP=c( "P", "C", "P", "C", "C" ), + rowID=c( 1, 2, 3, 6, 7)) + testData <- fixCalendarDates( testData ) + + res <- makeAllOneDay( testData ) + + #TODO check the contents more thoroughly + + ok = TRUE + + ok = ok & all(res$start_date == res$end_date) + + summary <- as.data.frame( res %>% + dplyr::group_by(UID) %>% + dplyr::summarise(count = dplyr::n()) ) + + expectedResult = data.frame(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + count=c( 15, 10, 15, 1, 1)) + expectedResult$count <- as.integer( expectedResult$count ) + + printDifferencesDf(expectedResult,summary) + + expect_true( identical(expectedResult,summary) & ok ) +}) + + +#bizarrely had to add special glue to make it work correctly when duplicating one object +test_that("test makeAllOneDay:2", { + + testData = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "18-01-2023"), + Days=c( "1110000"), + STP=c( "P"), + rowID=c( 3)) + testData <- fixCalendarDates( testData ) + + res <- makeAllOneDay( testData ) + + ok = TRUE + + ok = ok & all(res$start_date == res$end_date) + + + expectedResult = data.table( + UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "03-01-2023", "04-01-2023", "09-01-2023", "10-01-2023", "11-01-2023"), + end_date=c( "02-01-2023", "03-01-2023", "04-01-2023", "09-01-2023", "10-01-2023", "11-01-2023"), + Days=c( "1000000", "0100000", "0010000", "1000000", "0100000", "0010000"), + STP=c( "P", "P", "P", "P", "P", "P"), + rowID=c( 3, 3, 3, 3, 3, 3)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "uid1", "uid1", "uid1"), + start_date=c("16-01-2023", "17-01-2023", "18-01-2023"), + end_date=c( "16-01-2023", "17-01-2023", "18-01-2023"), + Days=c( "1000000", "0100000", "0010000"), + STP=c( "P", "P", "P"), + rowID=c( 3, 3, 3))) + + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res) + + ok = ok & identical(expectedResult,res) + + + summary <- as.data.frame( res %>% + dplyr::group_by(UID) %>% + dplyr::summarise(count = dplyr::n()) ) + + expectedCount = data.frame(UID=c( "uid1"), + count=c( 9)) + expectedCount$count <- as.integer( expectedCount$count ) + + printDifferencesDf(expectedCount,summary) + + expect_true( 
identical(expectedCount,summary) & identical(expectedResult,res) & ok ) +}) + + + + +test_that("test expandAllWeeks:1", { + + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "23-01-2023"), + end_date=c( "18-01-2023", "29-01-2023", "03-03-2023", "23-01-2023"), + Days=c( "1110000", "0001001", "0011100", "1000000"), + STP=c( "P", "C", "P", "C"), + rowID=c( 1, 2, 3, 6)) + testData <- fixCalendarDates( testData ) + + res <- expandAllWeeks( testData ) + + expectedResult = data.table(UID=c( "uid1", "uid1", "uid1", "uid2", "uid2", "uid2", "uid2", "uid3", "uid4"), + start_date=c("02-01-2023", "09-01-2023", "16-01-2023", "05-01-2023", "12-01-2023", "19-01-2023", "26-01-2023", "01-03-2023", "23-01-2023"), + end_date=c( "04-01-2023", "11-01-2023", "18-01-2023", "08-01-2023", "15-01-2023", "22-01-2023", "29-01-2023", "03-03-2023", "23-01-2023"), + Days=c( "1110000", "1110000", "1110000", "0001001", "0001001", "0001001", "0001001", "0011100", "1000000"), + STP=c( "P", "P", "P", "C", "C", "C", "C", "P", "C"), + rowID=c( 1, 1, 1, 2, 2, 2, 2, 3, 6)) + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res) + + expect_true( identical(expectedResult,res) ) +}) + + + +test_that("test expandAllWeeks:2", { + + testData = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "18-01-2023"), + Days=c( "1110000"), + STP=c( "P"), + rowID=c( 1)) + testData <- fixCalendarDates( testData ) + + res <- expandAllWeeks( testData ) + + expectedResult = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "09-01-2023", "16-01-2023"), + end_date=c( "04-01-2023", "11-01-2023", "18-01-2023"), + Days=c( "1110000", "1110000", "1110000"), + STP=c( "P", "P", "P"), + rowID=c( 1, 1, 1)) + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res) + + expect_true( identical(expectedResult,res) ) +}) + + + +test_that("test hasGapInOperatingDays:1", { + + testData = c("0000000", "1000000", "0000001", "0100000", "1100000", "0000011", "0011100", "0101000", "1000001", "0001001") + expectedResult = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE) + + res = hasGapInOperatingDays( testData ) + + expect_true( identical(expectedResult,res) ) +}) + + + +#when running for real, this hangs if ncores>1 having trouble reproducing +test_that("test duplicateItems:1", { + + sourceDuplication = 99 + repetitions = 110 + expectedCount=sourceDuplication * repetitions + + testData = data.table(UID=c( "uid1", "uid2", "uid3"), + start_date=c("02-01-2023", "09-01-2023", "16-01-2023"), + end_date=c( "04-01-2023", "11-01-2023", "18-01-2023"), + Days=c( "1110000", "1110000", "1110000"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + testData <- testData[rep(seq_len(.N), times = sourceDuplication)] + + testData$`_reps` = repetitions + + res = duplicateItems( testData, "UID", ncores=4 ) + + summary <- as.data.frame( res %>% + dplyr::group_by(UID) %>% + dplyr::summarise(count = dplyr::n()) ) + + expectedResult = data.frame(UID=c( "uid1", "uid2", "uid3"), + count=c( expectedCount, expectedCount, expectedCount)) + expectedResult$count <- as.integer( expectedResult$count ) + + printDifferencesDf(expectedResult,summary) + + + expect_true( identical(expectedResult,summary) ) +}) + + + + + + + + +context("Running calendar overlay unit tests") + + +test_that("0:fixOverlappingDates -based on priority", { + #TODO add more test 
cases
+
+  testData = data.table(
+        UID=c(       "uid1",       "uid1",       "uid1",       "uid1",       "uid1",        NA,          "uid1"),
+        STP=c(       "P",          "C",          "O",          "P",          "P",          "C",          "N"),
+        start_date=c("02-01-2023", "11-01-2023", "11-01-2023", "20-01-2023", "25-01-2023", "27-01-2023", "20-02-2023" ),
+        end_date=c(  "11-01-2023", "11-01-2023", "20-01-2023", "25-02-2023", "27-01-2023", "20-03-2023", "31-03-2023"))
+  testData <- fixCalendarDates( testData )
+
+  res = fixOverlappingDates(testData)
+
+  expectedResult = data.table( #equal priority but overlapping
+                               #current behaviour is to leave unchanged
+        UID=c(       "uid1",       "uid1",       "uid1",       "uid1",       "uid1",        NA,          "uid1"),
+        STP=c(       "P",          "C",          "O",          "P",          "P",          "C",          "N"),
+        start_date=c("02-01-2023", "11-01-2023", "12-01-2023", "21-01-2023", "25-01-2023", "27-01-2023", "20-02-2023" ),
+        end_date=c(  "10-01-2023", "11-01-2023", "20-01-2023", "25-02-2023", "27-01-2023", "20-03-2023", "31-03-2023" ))
+  expectedResult <- fixCalendarDates( expectedResult )
+  expectedResult$duration = testData$duration
+
+  printDifferencesDf(expectedResult,res)
+
+  expect_true( identical(expectedResult,res) )
+})
+
+
+
+test_that("1:test allocateCancellationsAcrossCalendars", {
+
+  calendar = data.table(UID=c( "uid1 a",     "uid1 b",     "uid2 c",     "uid2 d",     "uid3 e"),
+              originalUID=c("uid1",       "uid1",       "uid2",       "uid2",       "uid3"),
+              start_date=c( "02-01-2023", "05-01-2023", "01-03-2023", "07-03-2023", "26-01-2023" ),
+              end_date=c(   "01-02-2023", "05-02-2023", "31-03-2023", "26-03-2023", "26-01-2023" ),
+              Days=c(       "1110000",    "0001001",    "0011100",    "0100001",    "0001000" ),
+              STP=c(        "P",          "C",          "P",          "C",          "C" ),
+              rowID=c(       1,            2,            3,            4,            5))
+  calendar <- fixCalendarDates( calendar )
+  calendar <- splitAndRebindBitmask( calendar )
+
+  cancellations = data.table(
+              UID=c(        "aaaaa",      "bbbbbb",     "ccccccc",    "ddddddd",    "eeeeee",     "ffffff"),  #this column gets removed; needs six values to match the other columns
+              originalUID=c("uid1",       "uid1",       "uid1",       "uid2",       "uid2",       "uid4"),
+              start_date=c( "02-01-2023", "03-01-2023", "06-01-2023", "14-03-2023", "15-01-2023", "26-01-2023" ),
+              end_date=c(   "02-01-2023", "03-01-2023", "06-01-2023", "14-03-2023", "15-01-2023", "26-01-2023" ),
+              Days=c(       "1000000",    "0100000",    "0000100",    "0100000",    "0010000",    "0001000" ),
+              STP=c(        "C",          "C",          "C",          "C",          "C",          "C" ),
+              rowID=c(       6,            7,            8,            9,            10,           11))
+  cancellations <- fixCalendarDates( cancellations )
+  cancellations <- splitAndRebindBitmask( cancellations )
+
+  res <- allocateCancellationsAcrossCalendars( calendar, cancellations )
+
+  expectedResult = data.table(
+    UID=c(        "uid1 a",     "uid1 a",     "uid2 d"),
+    #originalUID=c("uid1",       "uid1",       "uid2"),
+    start_date=c( "02-01-2023", "03-01-2023", "14-03-2023"),
+    end_date=c(   "02-01-2023", "03-01-2023", "14-03-2023"),
+    Days=c(       "1000000",    "0100000",    "0100000"),
+    STP=c(        "C",          "C",          "C"),
+    rowID=c(       6,            7,            9),
+    duration=c(    1,            1,            1),
+    monday=c(      TRUE,         FALSE,        FALSE),
+    tuesday=c(     FALSE,        TRUE,         TRUE),
+    wednesday=c(   FALSE,        FALSE,        FALSE),
+    thursday=c(    FALSE,        FALSE,        FALSE),
+    friday=c(      FALSE,        FALSE,        FALSE),
+    saturday=c(    FALSE,        FALSE,        FALSE),
+    sunday=c(      FALSE,        FALSE,        FALSE)
+    )
+  expectedResult <- fixCalendarDates( expectedResult, createOriginalUID=FALSE )
+
+  printDifferencesDf(expectedResult,res)
+
+  expect_true( identical(expectedResult,res) )
+})
+
+
+test_that("2:test makeCalendarInner:one calendar entry for service", {
+
+  testData = data.table(UID=c("uid1"),
+              start_date=c("02-01-2023"),
+              end_date=c(  "04-02-2023"),
+              Days=c(      "1111110"),
+              STP=c(       "P"),
+              rowID=c(      1))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- 
res[[2]] + + expectedResult = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "04-02-2023"), + Days=c( "1111110"), + STP=c( "P"), + rowID=c( 1)) + + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + + +test_that("1.1:test makeCalendarInner:all identical patterns - more than single day overlay", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "09-01-2023", "09-01-2023"), + end_date=c( "04-02-2023", "28-01-2023", "14-01-2023"), + Days=c( "1111110", "1111110", "1111110"), + STP=c( "P", "O", "C"), + rowID=c( 1, 2, 3)) + + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table( + UID=c( "uid1 a", "uid1 b", "uid1 c"), + start_date=c("02-01-2023", "15-01-2023", "29-01-2023"), + end_date=c( "08-01-2023", "28-01-2023", "04-02-2023"), + Days=c( "1111110", "1111110", "1111110"), + STP=c( "P", "O", "P"), + rowID=c( 1, 2, 1)) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + + +test_that("3:test makeCalendarInner:one base: one day cancellations", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0010000", "0001000", "1000000" ), + STP=c( "P", "C", "C", "C" ), + rowID=c( 1, 4, 5, 6)) + + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "04-02-2023"), + Days=c( "1111110"), + STP=c( "P"), + rowID=c( 1)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + + expectedResultDates = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "0010000", "0001000", "1000000" ), + STP=c( "C", "C", "C" ), + rowID=c( 4, 5, 6)) + expectedResultDates <- fixCalendarDates( expectedResultDates ) + + res.calendar_dates = removeOriginalUidField( res.calendar_dates ) + expectedResultDates = removeOriginalUidField( expectedResultDates ) + + printDifferencesDf(expectedResultDates,res.calendar_dates) + + + expect_true(identical(expectedResult,res.calendar) + & identical(expectedResultDates,res.calendar_dates)) +}) + + + + + +test_that("4:test makeCalendarInner:one day cancellations(old)", { + + #there are multiple valid ways to process this - because of cancellations being handled at a higher level this + #test case no longer applies - but quite a bit of work to create the test case, so keep it for now. 
+ expect_true(TRUE) + + if(FALSE) + { + #all overlays 1 day cancellations + + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0010000", "0001000", "1000000" ), + STP=c( "P", "P", "P", "C", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023"), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023"), + Days=c( "1111110", "0000001", "0011100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + + expectedResultDates = data.table(UID=c("uid1", "uid1", "uid1"), + start_date=c("11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "0010000", "0001000", "1000000" ), + STP=c( "C", "C", "C" ), + rowID=c( 4, 5, 6)) + expectedResultDates <- fixCalendarDates( expectedResultDates ) + + res.calendar_dates = removeOriginalUidField( res.calendar_dates ) + expectedResultDates = removeOriginalUidField( expectedResultDates ) + + expect_true(identical(expectedResult,res.calendar) + & identical(expectedResultDates,res.calendar_dates)) + } + +}) + + + + + +test_that("5:test makeCalendarInner:one day cancellations(current)", { + + # all overlays 1 day cancellations + # this method splits up the base timetable, leaving gaps where there are cancellation days + + # this can create schedule entries which are by the CIF rules incorrect, because we don't + # validate that the new start/end dates align with the day pattern bitmask + + # while the cancellation part is no longer current, this is still a good test for all the date setting logic + + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0010000", "0001000", "1000000" ), + STP=c( "P", "P", "C", "C", "C" ), + rowID=c( 1, 2, 4, 5, 6)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 a2", "uid1 b2", "uid1 c2"), + start_date=c("02-01-2023", "12-01-2023", "24-01-2023", "08-01-2023", "12-01-2023", "24-01-2023"), + end_date=c( "10-01-2023", "22-01-2023", "04-02-2023", "10-01-2023", "22-01-2023", "05-02-2023"), + Days=c( "1111110", "1111110", "1111110", "0000001", "0000001", "0000001"), + STP=c( "P", "P", "P", "P", "P", "P"), + rowID=c( 1, 1, 1, 2, 2, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + + + 
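+
+# A minimal sketch of the missing validation referred to in test 5 above: a schedule
+# that is correct by the CIF rules should start (and end) on a day that is a '1' in
+# its Days bitmask. 'startsOnOperatingDay' is a hypothetical helper written for
+# illustration only - it is not part of the package API.
+startsOnOperatingDay <- function(start_date, days) {
+  #wday with week_start=1 gives Monday=1 .. Sunday=7, matching the bitmask positions
+  dayNumber <- lubridate::wday(start_date, label = FALSE, week_start = 1)
+  return ("1" == substr(days, dayNumber, dayNumber))
+}
+
+test_that("sketch: a split calendar can start on a non-operating day", {
+  #Monday 02-01-2023 against a Mon-Sat pattern is aligned
+  expect_true( startsOnOperatingDay(as.Date("2023-01-02"), "1111110") )
+  #Sunday 08-01-2023 against the same pattern is not
+  expect_false( startsOnOperatingDay(as.Date("2023-01-08"), "1111110") )
+})
+
+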
+test_that("6:test makeCalendarInner:overlay -matching base pattern", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "09-01-2023"), + end_date=c( "04-02-2023", "05-02-2023", "21-01-2023"), + Days=c( "1111110", "0000001", "1111110"), + STP=c( "P", "P", "O"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 a2"), + start_date=c("02-01-2023", "09-01-2023", "22-01-2023", "08-01-2023"), + end_date=c( "08-01-2023", "21-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "1111110", "1111110", "1111110", "0000001"), + STP=c( "P", "O", "P", "P"), + rowID=c( 1, 3, 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) + +}) + + +test_that("6.1:test makeCalendarInner:bases with different patterns, no overlay", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("22-05-2023", "25-09-2023", "02-10-2023"), + end_date=c( "22-09-2023", "26-09-2023", "13-10-2023"), + Days=c( "1111100", "1100000", "1111100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + res.calendar = removeOriginalUidField( res.calendar ) + testData = removeOriginalUidField( testData ) + + printDifferencesDf(testData,res.calendar) + + expect_true(identical(testData,res.calendar) & is.na(res.calendar_dates)) +}) + + +test_that("6.2:test makeCalendarInner:base is N (STP) with different patterns, no overlay", { + + testData = data.table(UID=c( "uid1", "uid1"), + start_date=c("26-06-2023", "31-07-2023"), + end_date=c( "29-07-2023", "03-08-2023"), + Days=c( "1111110", "1111000"), + STP=c( "N", "N"), + rowID=c( 1, 2)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + res.calendar = removeOriginalUidField( res.calendar ) + testData = removeOriginalUidField( testData ) + + printDifferencesDf(testData,res.calendar) + + expect_true(identical(testData,res.calendar) & is.na(res.calendar_dates)) +}) + + + +test_that("7:test makeCalendarInner:overlay -different to base pattern", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "10-01-2023"), + end_date=c( "04-02-2023", "05-02-2023", "21-01-2023"), + Days=c( "1111110", "0000001", "0111110"), + STP=c( "P", "P", "O"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1"), + start_date=c("02-01-2023", "10-01-2023", "15-01-2023", "17-01-2023", "22-01-2023", "08-01-2023"), + end_date=c( "09-01-2023", "14-01-2023", "16-01-2023", "21-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "1111110", "0111110", "1111110", "0111110", "1111110", "0000001"), + STP=c( "P", "O", "P", "O", "P", "P"), + rowID=c( 1, 3, 1, 3, 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + 
res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + +test_that("8:test makeCalendarInner:overlay -different to base pattern-gap in pattern", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "10-01-2023"), + end_date=c( "04-02-2023", "05-02-2023", "20-01-2023"), + Days=c( "1111110", "0000001", "0110100"), + STP=c( "P", "P", "O"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table( + UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 f1"), + start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023", "14-01-2023"), + end_date=c( "09-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023", "16-01-2023"), + Days=c( "1111110", "0100000", "0010000", "1111110", "0000100", "1111110"), + STP=c( "P", "O", "O", "P", "O", "P"), + rowID=c( 1, 3, 3, 1, 3, 1)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "uid1 g1", "uid1 h1", "uid1 i1", "uid1 j1", "uid1 k1", "uid1"), + start_date=c("17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "21-01-2023", "08-01-2023"), + end_date=c( "17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "0100000", "0010000", "1111110", "0000100", "1111110", "0000001"), + STP=c( "O", "O", "P", "O", "P", "P"), + rowID=c( 3, 3, 1, 3, 1, 2))) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + +test_that("9:test makeCalendarInner:overlay -different to base pattern-gap in pattern -creating base fragments to be skipped", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "10-01-2023"), + end_date=c( "03-02-2023", "05-02-2023", "20-01-2023"), + Days=c( "0111100", "0000001", "0110100"), + STP=c( "P", "P", "O"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table( + UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1"), #the 'f' calendar gets thrown away + start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023"), + end_date=c( "09-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023"), + Days=c( "0111100", "0100000", "0010000", "0111100", "0000100"), + STP=c( "P", "O", "O", "P", "O"), + rowID=c( 1, 3, 3, 1, 3)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "uid1 g1", "uid1 h1", "uid1 i1", "uid1 j1", "uid1 k1", "uid1"), + start_date=c("17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "21-01-2023", "08-01-2023"), + end_date=c( "17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "03-02-2023", "05-02-2023"), + Days=c( "0100000", "0010000", "0111100", "0000100", "0111100", "0000001"), + STP=c( "O", "O", "P", "O", "P", "P"), + rowID=c( 3, 3, 1, 3, 1, 2))) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( 
res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+
+
+test_that("10:test makeCalendarInner", {
+
+  testData = data.table(UID=c("uid1", "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"),
+              start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "12-01-2023", "08-03-2023", "23-01-2023" ),
+              end_date=c(  "03-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "12-01-2023", "09-03-2023", "23-01-2023" ),
+              Days=c(      "1111100",    "0000001",    "0011100",    "0011000",    "0001000",    "0011000",    "1000000" ),
+              STP=c(       "P",          "P",          "P",          "O",          "C",          "C",          "C" ),
+              rowID=c(      1,            2,            3,            4,            5,            6,            7))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  #this is a more complex expansion than strictly necessary - could add more logic to see if the base / overlay patterns
+  #actually overlap in time. currently we just test if the patterns collide from an operating day mask perspective,
+  #but not if they overlap for operating period.
+  #e.g. if we have base timetables for march and april with different operating patterns it will expand on a week-by-week basis
+  #     instead of going 'oh that's fine, march and april don't overlap'
+
+  expectedResult = data.table(
+    UID=c(       "uid1 a1",    "uid1 b1",    "uid1 c1",    "uid1 d1",    "uid1 e1",    "uid1 f1"),
+    start_date=c("02-01-2023", "11-01-2023", "13-01-2023", "18-01-2023", "20-01-2023", "24-01-2023"),
+    end_date=c(  "10-01-2023", "11-01-2023", "17-01-2023", "19-01-2023", "22-01-2023", "03-02-2023"),
+    Days=c(      "1111100",    "0011000",    "1111100",    "0011000",    "1111100",    "1111100"),
+    STP=c(       "P",          "O",          "P",          "O",          "P",          "P"),
+    rowID=c(      1,            4,            1,            4,            1,            1))
+
+  expectedResult = rbind(expectedResult, data.table(
+    UID=c(       "uid1",       "uid1 c3",    "uid1 d3" ),
+    start_date=c("08-01-2023", "01-03-2023", "10-03-2023" ),
+    end_date=c(  "05-02-2023", "07-03-2023", "31-03-2023" ),
+    Days=c(      "0000001",    "0011100",    "0011100" ),
+    STP=c(       "P",          "P",          "P" ),
+    rowID=c(      2,            3,            3 )))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+
+
+test_that("11:test makeCalendarInner: overlay matching pattern of a base that is offset temporally", {
+
+  testData = data.table(UID=c("uid1", "uid1", "uid1", "uid1", "uid1"),
+              start_date=c("04-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023"),
+              end_date=c(  "02-02-2023", "05-02-2023", "30-03-2023", "19-01-2023", "16-03-2023"),
+              Days=c(      "0011000",    "0000001",    "0011000",    "0011000",    "0011000"),
+              STP=c(       "P",          "P",          "P",          "O",          "C"),
+              rowID=c(      1,            2,            3,            4,            5))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  #this is what the code produces - it is wrong. 
e.g 10/3 service is missing + expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 a2"), + start_date=c("04-01-2023", "11-01-2023", "20-01-2023", "01-03-2023", "17-03-2023", "08-01-2023"), + end_date=c( "10-01-2023", "19-01-2023", "02-02-2023", "07-03-2023", "30-03-2023", "05-02-2023"), + Days=c( "0011000", "0011000", "0011000", "0011000", "0011000", "0000001"), + STP=c( "P", "O", "P", "P", "P", "P"), + rowID=c( 1, 4, 1, 3, 3, 2)) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + +test_that("12: test makeCalendarInner", { + + #by convention Sunday service timetables are Sunday only + #the 'from' date should be the first day the timetable has effect (i.e. should have a 1 in the relevant day column) + #(and I assume the same is true of the last) + + #mon-sat timetable + #sun different operating hours on sunday + #engineering works means having to berth in a different platform for a couple of weeks wed-fri + #cancel mondays for 2 weeks + #cancel sundays for 2 weeks + #mon-sat sun -march- wed-fri platform cancel mon cancel sun + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-01-2023", "15-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "27-01-2023", "16-01-2023", "22-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0011100", "1000000", "0000001" ), + STP=c( "P", "P", "P", "O", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) + + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table( + UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 f1"), + start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "14-01-2023", "17-01-2023", "18-01-2023"), + end_date=c( "08-01-2023", "10-01-2023", "13-01-2023", "15-01-2023", "17-01-2023", "20-01-2023"), + Days=c( "1111110", "1111110", "0011100", "1111110", "1111110", "0011100"), + STP=c( "P", "P", "O", "P", "P", "O"), + rowID=c( 1, 1, 4, 1, 1, 4)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "uid1 g1", "uid1 h1", "uid1 i1", "uid1 a2", "uid1 c2", "uid1 d3"), + start_date=c("21-01-2023", "25-01-2023", "28-01-2023", "08-01-2023", "23-01-2023", "01-03-2023"), + end_date=c( "24-01-2023", "27-01-2023", "04-02-2023", "14-01-2023", "05-02-2023", "31-03-2023"), + Days=c( "1111110", "0011100", "1111110", "0000001", "0000001", "0011100"), + STP=c( "P", "O", "P", "P", "P", "P"), + rowID=c( 1, 4, 1, 2, 2, 3))) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R index 564cc94..4b7ff21 100644 --- a/tests/testthat/test_atoc.R +++ b/tests/testthat/test_atoc.R @@ -1,123 +1,3 @@ - -context("Running unit tests before system tests") - - -fixDates <- function( df ) -{ - df$start_date <- as.Date(df$start_date, format = "%d-%m-%Y") - df$end_date <- as.Date(df$end_date, 
format = "%d-%m-%Y") - df$duration <- df$end_date - df$start_date + 1 - - return (df) -} - -test_that("test makeCalendar.inner:1", { - - testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), - start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023", "23-01-2023" ), - end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "09-03-2023", "23-01-2023" ), - Days=c( "1111110", "0000001", "0011100", "0011000", "0011000", "1000000" ), - STP=c( "P", "P", "P", "O", "C", "C" ), - rowID=c( 1, 2, 3, 4, 5, 6)) - - testData <- fixDates( testData ) - - res <- makeCalendar.inner( testData ) - - res.calendar <- res[[1]] - res.calendar_dates <- res[[2]] - res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] - - #this is what the code produces - it is wrong.not applying the overlay correctly - expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 a2", "uid1 b2", "uid1 a3", "uid1 b3", "uid1 a4"), - start_date=c("02-01-2023", "24-01-2023", "08-01-2023", "24-01-2023", "01-03-2023", "10-03-2023", "11-01-2023"), - end_date=c( "22-01-2023", "04-02-2023", "22-01-2023", "05-02-2023", "07-03-2023", "31-03-2023", "19-01-2023"), - Days=c( "1111110", "1111110", "0000001", "0000001", "0011100", "0011100", "0011000"), - STP=c( "P", "P", "P", "P", "P", "P", "O"), - rowID=c( 1, 1, 2, 2, 3, 3, 4)) - - expectedResult <- fixDates( expectedResult ) - - expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) -}) - - - -test_that("test makeCalendar.inner:2", { -browser() - testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), - start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023", "23-01-2023" ), - end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "16-03-2023", "23-01-2023" ), - Days=c( "1111110", "0000001", "0011100", "0011000", "0011000", "1000000" ), - STP=c( "P", "P", "P", "O", "C", "C" ), - rowID=c( 1, 2, 3, 4, 5, 6)) - - testData <- fixDates( testData ) - - res <- makeCalendar.inner( testData ) - - res.calendar <- res[[1]] - res.calendar_dates <- res[[2]] - res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] - - #this is what the code produces - it is wrong. e.g 10/3 service is missing - expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 a2", "uid1 b2", "uid1 a3", "uid1 b3", "uid1 a4"), - start_date=c("02-01-2023", "24-01-2023", "08-01-2023", "24-01-2023", "01-03-2023", "17-03-2023", "11-01-2023"), - end_date=c( "22-01-2023", "04-02-2023", "22-01-2023", "05-02-2023", "07-03-2023", "31-03-2023", "19-01-2023"), - Days=c( "1111110", "1111110", "0000001", "0000001", "0011100", "0011100", "0011000"), - STP=c( "P", "P", "P", "P", "P", "P", "O"), - rowID=c( 1, 1, 2, 2, 3, 3, 4)) - - expectedResult <- fixDates( expectedResult ) - - expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) -}) - - - -test_that("test makeCalendar.inner:3", { - - browser() - #by convention Sunday service timetables are Sunday only - #the 'from' date should be the first day the timetable has effect (i.e. 
should have a 1 in the relevant day column) - #(and I assume the same is true of the last) - - #mon-sat timetable - #sun different operating hours on sunday - #engineering works means having to berth in a different platform for a couple of weeks wed-fri - #cancel mondays for 2 weeks - #cancel sundays for 2 weeks - #mon-sat sun -march- wed-fri platform cancel mon cancel sun - testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), - start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-01-2023", "15-01-2023" ), - end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "27-01-2023", "16-01-2023", "22-01-2023" ), - Days=c( "1111110", "0000001", "0011100", "0011100", "1000000", "0000001" ), - STP=c( "P", "P", "P", "O", "C", "C" ), - rowID=c( 1, 2, 3, 4, 5, 6)) - - testData <- fixDates( testData ) - - res <- makeCalendar.inner( testData ) - - res.calendar <- res[[1]] - res.calendar_dates <- res[[2]] - res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] - - expectedResult = data.table(UID=c( "uid1 a1", "uid1 a2", "uid1 a3", "uid1 b3", "uid1 c3", "uid1 a4"), - start_date=c("02-01-2023", "01-01-2023", "01-03-2023", "10-03-2023", "21-03-2023", "07-01-2023"), - end_date=c( "08-01-2023", "31-01-2023", "06-03-2023", "19-03-2023", "31-03-2023", "14-01-2023"), - Days=c( "1111100", "0000011", "0011100", "0011100", "0011100", "0011000"), - STP=c( "P", "P", "P", "P", "P", "O"), - rowID=c( 1, 2, 3, 3, 3, 4)) - - expectedResult <- fixDates( expectedResult ) - - expect_true(identical(expectedResult,res.calendar) & 0==length(res.calendar_dates)) -}) - - - - context("Get the example atoc files") file_path <- file.path(tempdir(),"uk2gtfs_tests") dir.create(file_path) @@ -134,6 +14,7 @@ test_that("test atoc data is there", { context("Test the main atoc function") test_that("test atoc2gtfs singlecore", { + gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1) From 47def4b35823f5a37a7bfc41981c0d49ec4bd132 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 22:40:11 +0100 Subject: [PATCH 24/81] rename write_gtfs to gtfs_write so it's consistent with other files --- R/{write_gtfs.R => gtfs_write.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{write_gtfs.R => gtfs_write.R} (100%) diff --git a/R/write_gtfs.R b/R/gtfs_write.R similarity index 100% rename from R/write_gtfs.R rename to R/gtfs_write.R From fa0e8c0ae28d5a752887394c102f5af2dd394d6d Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 23:31:29 +0100 Subject: [PATCH 25/81] split atoc_export up because it's now far too long and my finger aches from too much scrolling --- R/atoc_export.R | 974 +--------------------------------- man/gtfs_write.Rd | 10 +- tests/testthat/test_aa_unit.R | 2 +- 3 files changed, 11 insertions(+), 975 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index ef9008a..645255f 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -134,308 +134,6 @@ station2transfers <- function(station, flf, path_out) { -NOT_NEEDED <- c("__NOT_NEEDED_MARKER__~@$$%&*((") - - -#this function is massively performance critical - profile any changes to it.makes up 30% of the whole makeCalendar process -selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex) -{ - #if we have two adjacent complete items e.g. ....end 13th Jan start 14th jan..... 
- #then it's not a real gap and just an artefact of the algorithm use to generate the dates - if( rowIndex>1 && rowIndex= calNew$end_date[rowIndex],,which=TRUE] - - #are we in a gap between two base timetables with no overlays - if ( length(baseTimetableIndexes)<=0 ) - { - calNew$UID[rowIndex] <- NOT_NEEDED - return (calNew) - } - - - # apply timetable overlay selection logic - pick highest priority timetable type - # as per https://wiki.openraildata.com/index.php/SCHEDULE - # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'." - - #pick the lowest alphabetic STP (highest priority), and just in case there is more than one, the shortest duration one. - - #priorityTimetable <- baseTimetables[order(STP, duration), head(.SD, 1)] - #performance we pre-sort all the entries by the priority & duration - #this speeds things up when we look up the required priority overlay **SEE_NOTE** - #so we don't need to sort again here, just pick the top filtered result - - #stash the generated start & end dates - #performance - copying to separate variables seems to be fastest - start_date = calNew$start_date[rowIndex] - end_date = calNew$end_date[rowIndex] - - calNew[rowIndex,] <- cal[baseTimetableIndexes[1],] #this is the most time consuming line in this fn. takes about 10x longer than the - #single variable copy below - calNew$start_date[rowIndex] = start_date - calNew$end_date[rowIndex] = end_date - - return (calNew) -} - - - -#' split overlapping start and end dates -#' duplicated items have the same rowId as the original but a new UID with an alpha character appended to it. -#' -#' this function is performance critical - profile any changes -#' -#' THIS ONLY WORKS ON ITEMS WHERE THE DAY PATTERNS ARE ALL THE SAME -#' (or are only 1 day DURATION) -#' -#' @param cal calendar object -#' @details split overlapping start and end dates -#' @noRd - -splitDates <- function(cal) { - - # get a vector of all the start and end dates together from all base & overlay timetables and sort them - dates <- c(cal$start_date, cal$end_date) - dates <- dates[order(dates)] - - # create all unique pairs so we know how to chop the dates up into non-overlapping periods - dates.dt <- unique( data.table( - start_date = dates[seq(1, length(dates) - 1)], - end_date = dates[seq(2, length(dates))] - ) ) - - #left join back to the source data so we can see which (if any) date segments we have already covered, and which we need to replicate - calNew <- cal[dates.dt, on = c("start_date", "end_date")] - - #some dates may already be overlapping - calNew <- fixOverlappingDates( calNew ) - - # fill in the missing schedule parts from the original - # the filled in parts should (if the data is correctly layered) be the highest priority part of the timetable - - # we make multiple passes over the timetable working our way outwards from completed items to NA items - - rowCount = nrow(calNew) - - for (i in seq(1,10)) #should really be a max of 3 passes - { - #forwards - for (j in seq(1, rowCount)) { - - #if we are not valid & the next item is already valid, fill in our details and adjust our end date - if (j1 && !is.na(calNew$UID[j-1]) && NOT_NEEDED != calNew$UID[j-1] ) - { - calNew$start_date[j] <- calNew$end_date[j-1] +1 - } - } - } - - #backwards - for (j in seq(rowCount, 1)) { - - #if we are not valid & the previous item is already valid, fill in our details and adjust our start date - if (j>1 && is.na(calNew$UID[j]) && !is.na(calNew$UID[j-1]) ) - { - calNew <- 
selectOverlayTimeableAndCopyAttributes(cal, calNew, j) - - if ( NOT_NEEDED != calNew$UID[j-1]) - { - calNew$start_date[j] <- calNew$end_date[j-1] +1 - } - - #if next item valid adjust our start date - if(j 0, ] - - #performance, do all subsets in one go - #calNew <- calNew[!is.na(UID) & UID != NOT_NEEDED & STP != "C" & duration > 0] - calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0), ] - - # Append UID to note the changes - if (nrow(calNew) > 0) { - if (nrow(calNew) <= 26) { - calNew$UID <- paste0(calNew$UID, " ", letters[1:nrow(calNew)]) - } else { - # Cases where we need extra letters, gives upto 676 ids - lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) - calNew$UID <- paste0(calNew$UID, " ", lett[1:nrow(calNew)]) - } - } else { - calNew <- NA - } - - return(calNew) -} - - -# triggered by test case "10:test makeCalendarInner" -# when we have a 1 day overlay sitting on the start/end data of a base timetable -# the dates overlap - fix it -fixOverlappingDates <- function( cal ) -{ - rowCount = nrow(cal) - - #forwards - for (j in seq(1, rowCount)) { - - #adjust our end date if next item a higher priority overlay - if (j1 && !is.na(cal$UID[j-1]) && cal$STP[j-1] < cal$STP[j] ) - { - cal$start_date[j] <- cal$end_date[j-1] +1 - } - } - } - - #backwards - for (j in seq(rowCount, 1)) { - - #adjust our end date if previous item a higher priority overlay - if (j>1 && !is.na(cal$UID[j]) && !is.na(cal$UID[j-1]) ) - { - if ( cal$STP[j-1] < cal$STP[j] ) - { - cal$start_date[j] <- cal$end_date[j-1] +1 - } - - if(j= 7) - { - return (TRUE) - } - - days.valid <- weekdays(seq.POSIXt( - from = as.POSIXct.Date( as.Date(tmp[START_DATE_INDEX], DATE_EPOC) ), - to = as.POSIXct.Date( as.Date(tmp[END_DATE_INDEX], DATE_EPOC) ), - by = "DSTday" - )) - days.valid <- tolower(days.valid) - - #get a vector of names of days of week that the timetable is valid on - days.match <- tmp[MONDAY_INDEX:SUNDAY_INDEX] - days.match <- WEEKDAY_NAME_VECTOR[ 1==days.match ] - - return (any(days.valid %in% days.match)) -} - - -checkOperatingDayActive <- function(calendar) { - - if (all(calendar$duration >= 7)) - { - return (calendar$Days!="0000000") - } - - #get a list of days of week that the timetable is valid on - opDays <- splitBitmaskMat( calendar$Days, asInteger=FALSE ) - opDays <- split(opDays, row(opDays)) - - checkValid <- function(dur, sd, ed, od ){ - - if (dur >= 7) - { - return (any(od)) - } - - dayNumbers <- lubridate::wday( seq.Date(from = sd, to = ed, by = "day"), label = FALSE, week_start=1 ) - - return ( any(od[dayNumbers]) ) - } - - validCalendars <- mapply( checkValid, calendar$duration, - calendar$start_date, calendar$end_date, - opDays, SIMPLIFY = TRUE ) - return (validCalendars) -} - - - - - - #' internal function for constructing longnames of routes #' @@ -467,125 +165,12 @@ longnames <- function(routes, stop_times) { routes[`Train Category` == "SS", route_long_name := paste("Ship from",route_long_name)] routes[`Train Category` %in% c("BS", "BR"), route_long_name := paste("Bus from",route_long_name)] routes[!(`Train Category` %in% c("SS", "BS", "BR")), route_long_name := paste("Train from",route_long_name)] + #TODO reflect the London Transport services being set to metro/underground in this naming code return(routes) } -START_PATTERN_VECTOR = c("1","01","001","0001","00001","000001","0000001") -END_PATTERN_VECTOR = c("1000000","100000","10000","1000","100","10","1") - -#calendars should start on the first day they are effective, and end on the last day. 
-#i.e. if the first day in the day bitmask is Tuesday - then the start date should be Tuesday, not some other day. -validateCalendarDates <- function( calendar ) -{ - start_day_number = lubridate::wday( calendar$start_date, label = FALSE, week_start=1 ) - end_day_number = lubridate::wday( calendar$end_date, label = FALSE, week_start=1 ) - - startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1, start_day_number) - endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7) - - return (startOk & endOk) -} - - - - -#' split and rebind bitmask -#' -#' @details -#' splits 'Days' bitmask into individual logical fields called monday, tuesday, etc... -#' -#' @param calendar data.table of calendar items -#' @noRd -#' -splitAndRebindBitmask <- function( calendar ) -{ - return (cbind( calendar, splitBitmaskDt( calendar$Days, FALSE ) ) ) -} - -#this function gets expensive if you call it a lot, creating data.table takes a while -splitBitmaskDt <- function( bitmaskVector, asInteger=FALSE ) -{ - return (as.data.table(splitBitmaskMat( bitmaskVector, asInteger=asInteger ))) -} - -splitBitmaskMat <- function( bitmaskVector, asInteger=FALSE ) -{ - splitDays = splitBitmask( bitmaskVector, asInteger=asInteger ) - - return (matrix(splitDays, ncol=7, byrow=TRUE, dimnames=list(NULL,WEEKDAY_NAME_VECTOR))) -} - -splitBitmask <- function( bitmask, asInteger=FALSE ) -{ - duff = which( nchar(bitmask) != 7 ) - - bitmask[duff] = " " - - splitDays = strsplit(bitmask, "") - - splitDays = as.integer(unlist(splitDays)) - - if (!asInteger) - { - splitDays = as.logical(splitDays) - } - - return (splitDays) -} - - - -#' allocate Cancellations Across Calendars -#' -#' @details -#' expects input calendar items to have been separated out into non-overlapping dates -#' and 'Days' bitmask unpacked into separate int or logical attributes -#' -#' "originalUID" is used to identify where the cancellations originally came from -#' after allocating across the split calender items the cancellations will have an updated -#' "UID" that says which calender they are now associated with -#' -#' @param calendar data.table of calendar items that are NOT cancellations (that has 'Days' bitmask unpacked ) -#' @param cancellations data.table of calender items that ARE cancellations (that has 'Days' bitmask unpacked ) -#' @noRd -#' -allocateCancellationsAcrossCalendars <- function( calendar, cancellations ) -{ - tempNames = names(calendar) - - #stash some join fields away because we want to keep the data from the cancellations rather than the calendar table - #which otherwise get over-written by the join process - cancellations$start_date2 <- cancellations$start_date - cancellations$end_date2 <- cancellations$end_date - cancellations$UID <- NULL #we want the new UID from the calendar entries - - #left join cancellations to calendar by the original service ID - #and the date of the cancellation lying inside the period of the calendar - #and the day of the cancellation is an operating day of the calendar item - joined = cancellations[calendar, on = .(originalUID==originalUID, - start_date>=start_date, - end_date<=end_date)][ - ((i.monday&monday) | (i.tuesday&tuesday) | (i.wednesday&wednesday) - | (i.thursday&thursday) | (i.friday&friday) | (i.saturday&saturday) | (i.sunday&sunday)), ] - #revert the stashed (join) fields - joined$start_date <- joined$start_date2 - joined$start_date2 <- NULL - joined$end_date <- joined$end_date2 - joined$end_date2 <- NULL - - #remove joined 
fields we don't need - joined <- joined[, .SD, .SDcols=tempNames] - - #belt and braces - fix any NA fields by reverting from the original UID - joined[is.na(UID), UID := originalUID] - - return( joined ) -} - - - #' make calendar #' @@ -622,6 +207,7 @@ makeCalendar <- function(schedule, ncores = 1) { #calendar$start_date = as.integer( calendar$start_date ) #calendar$end_date = as.integer( calendar$end_date ) #test treating date as int: seem to be about twice as fast on the critical line when selecting base timetable +#TODO add package option to switch between processing as date/int otherwise debugging is too hard # UIDs = unique(calendar$UID) # length_todo = length(UIDs) @@ -670,561 +256,11 @@ makeCalendar <- function(schedule, ncores = 1) { res.calendar$originalUID <- NULL return(list(res.calendar, cancellations)) - - - - - - - - - - - if (FALSE) #don't think we need any of this code any more ? - { - message(paste0( - Sys.time(), - " Removing trips that only occur on days of the week that are outside the timetable validity period" - )) - - #unpack the days bitmask into a vector of int - days <- lapply(res.calendar$Days, function(x) { - as.integer(substring(x, 1:7, 1:7)) - }) - days <- matrix(unlist(days), ncol = 7, byrow = TRUE) - days <- as.data.frame(days) - names(days) <- WEEKDAY_NAME_VECTOR - - #attach unpacked bits back onto source calendar - res.calendar <- cbind(res.calendar, days) - res.calendar$Days <- NULL - - - - res.calendar.days <- res.calendar[, ..CHECKROWS_NAME_VECTOR] - res.calendar.days <- data.table::transpose(res.calendar.days) - #res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar))) - #transpose runs in around 3s (compared to 60s for split() on a data.frame), - #but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer. - #TODO see if data.table performs as well but with simpler code. 
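(Aside, for illustration only - not part of the patch: the transpose()-versus-split() trade-off
described in the comment above can be seen with a toy table; the table and its columns below are
invented for this sketch.)

library(data.table)
toy <- data.table(monday = c(1L, 0L), UID = c("A", "B"))   # invented example data
rows_fast <- data.table::transpose(toy)      # fast, but each row comes back as an unnamed vector
rows_fast[[1]]                               # "1" "A" - names dropped, values coerced to one type
rows_slow <- split(toy, seq_len(nrow(toy)))  # keeps names and column types,
rows_slow[[1]]                               # but is far slower on large tables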
- - if (ncores > 1) { - cl <- parallel::makeCluster(ncores) - parallel::clusterEvalQ(cl, { - loadNamespace("UK2GTFS") - }) - keep <- pbapply::pbsapply(res.calendar.days, checkrows, - cl = cl - ) - parallel::stopCluster(cl) - rm(cl) - } else { - keep <- pbapply::pbsapply(res.calendar.days, checkrows) - } - - res.calendar <- res.calendar[keep, ] - } - -} - - - - - - -makeAllOneDay0 <- function( cal ) -{ - duration <- cal$end_date - cal$start_date + 1 - - if ( 0==nrow(cal) || all(1 == duration)) - { - #nothing to do - return (cal) - } - - start_day_number = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) - - # we want to rotate the day pattern so that the pattern aligns with the start date, - # then we can replicate it the required number of times - #TODO made this too complex, leave the pattern where it is and just work out an offset - - #create all possible rotations of day pattern - allDayPatterns <- c( cal$Days, - (paste0(substr(cal$Days, 2, 7),substr(cal$Days, 1, 1))), - (paste0(substr(cal$Days, 3, 7),substr(cal$Days, 1, 2))), - (paste0(substr(cal$Days, 4, 7),substr(cal$Days, 1, 3))), - (paste0(substr(cal$Days, 5, 7),substr(cal$Days, 1, 4))), - (paste0(substr(cal$Days, 6, 7),substr(cal$Days, 1, 5))), - (paste0(substr(cal$Days, 7, 7),substr(cal$Days, 1, 6)))) - - dayPatternMatrix <- matrix( allDayPatterns, ncol=7 ) - - #create logical matrix with the pattern we want selected - cols <- col(dayPatternMatrix) - toSelect <- cols == start_day_number - - #pull out the desired pattern (need to transpose both matrices otherwise the unwind into a vector is in the wrong order) - selectedRotation <- t(dayPatternMatrix)[ t(toSelect) ] - - numWeeks <- as.integer(ceiling(as.integer(cal$duration) / 7)) - - - # replicate the pattern, truncate to number of days - # create a sequence of dates - then return the dates selected in the pattern - makeDates <- function(rot, w, d, start, end){ - - selectedDaysLogical <- as.logical(as.integer(strsplit(rot, "")[[1]])) - - selectedDays <- rep(selectedDaysLogical, times = w) - - truncated <- selectedDays[ 1:d ] - - dateSequence <- seq.Date(from = start, to = end, by = "day") - - selectedDates <- dateSequence[ truncated ] - } - - selectedDates <- mapply( - makeDates, - selectedRotation, numWeeks, duration, cal$start_date, cal$end_date - ) - - #replicate the calendar rows the appropriate number of times - repetitions <- sapply(selectedDates, length) - replicatedcal <- cal[rep(seq_len(.N), times = repetitions)] - - #set the start and end date for each calender item to the single day identified earlier - all_dates <- as.Date(unlist(selectedDates), origin = "1970-01-01") - replicatedcal$end_date <- replicatedcal$start_date <- all_dates - - #tidy up the values so they are correct for the spilt items - replicatedcal$duration <- 1 - replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ lubridate::wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] - - return (replicatedcal) } - - - - -SINGLE_DAY_PATTERN_VECTOR = c("1000000","0100000","0010000","0001000","0000100","0000010","0000001") - -SINGLE_DAY_PATTERN_LIST = list(c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), - c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE), - c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE), - c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), - c(FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE), - c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE), - c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)) - - -makeReplicationDates <- function(cal, startDayNum, endDayNum){ - - 
#make a sequences of dates, offsetting the start date so it's always monday (aligning with bitmask start day) - # and the end date so it's always sunday - firstDate = min(cal$start_date) - 7 - lastDate = max(cal$end_date) + 7 - allDates = seq.Date(from = firstDate, to = lastDate, by = "day") - - offset = as.integer(cal$start_date)-startDayNum+2-as.integer(firstDate) - end = as.integer(cal$end_date)+8-endDayNum-as.integer(firstDate) - - dates <- Map(function(o, e) allDates[o:e], offset, end) - - return ( as.Date( unlist(dates), origin = DATE_EPOC ) ) -} - - - -#' replicates the input calendar objects into single day duration calendar objects -#' calender objects should NOT have had the 'days' bitmask field unpacked -#' (will still produce an output but the unpacked monday, tuesday etc fields will no longer be consistent with the packed 'Days' bitmask) -#' -#' @param cal data.table containing all the calendars to be split up into individual days -#' @noRd -#' -makeAllOneDay <- function( cal ) -{ - duration <- cal$end_date - cal$start_date + 1 - - if ( 0==nrow(cal) || all(1 == duration)) - { - #nothing to do - return (cal) - } - - #make a list of dates for each object being replicated - startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) - endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 ) - dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) - - #work out how many time we need to replicate each item: number of operating days in week * num weeks - bitmaskMat = splitBitmaskMat( cal$Days, asInteger=FALSE ) - dayCount = rowSums(bitmaskMat) - numWeeks <- ceiling(as.integer(cal$duration) / 7) - repetitions = dayCount * numWeeks - - #replicate the calendar rows the appropriate number of times - replicatedcal <- cal[rep(seq_len(.N), times = repetitions)] - - #get a mask of operating days - operatingDayLogical <- rep( split(bitmaskMat, row(bitmaskMat)), times = numWeeks) - - #set the start and end date for each calender item to the single day identified earlier - selectedDates = dateSequence[unlist(operatingDayLogical)] - replicatedcal$end_date <- replicatedcal$start_date <- selectedDates - - #tidy up the values so they are correct for the spilt items - replicatedcal$duration <- 1 - replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ lubridate::wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] - - return (replicatedcal) -} - - - - -#' along a similar line to 'makeAllOneDay' duplicates input calendar objects into single WEEK duration calendar objects -#' -#' @param cal data.table containing all the calendars to be split up into individual weeks -#' @noRd -#' -expandAllWeeks <- function( cal ) -{ - if ( 0==nrow(cal) ) - { - #nothing to do - return (cal) - } - - #duration <- cal$end_date - cal$start_date + 1 - - #make a list of dates for each object being replicated - startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) - endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 ) - dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) - - numWeeks <- ceiling(as.integer(cal$duration) / 7) - - #replicate a logical vector for the start date and use that to select the relevant dates from the date sequence - startDayLogical <- SINGLE_DAY_PATTERN_LIST[startDayNum] - startDays <- rep(startDayLogical, times = numWeeks) - startDates <- dateSequence[ unlist(startDays) ] - - #replicate a logical vector for the end date and use that to select the relevant dates from the date sequence - 
endDayLogical <- SINGLE_DAY_PATTERN_LIST[endDayNum] - endDays <- rep(endDayLogical, times = numWeeks) - endDates <- dateSequence[ unlist(endDays) ] - - #replicate the calendar rows the appropriate number of times - replicatedcal <- cal[rep(seq_len(.N), times = numWeeks)] - - #set the start and end date for each calender item - replicatedcal$start_date <- startDates - replicatedcal$end_date <- endDates - - #tidy up the values so they are correct for the spilt items - replicatedcal$duration <- replicatedcal$end_date - replicatedcal$start_date + 1 - - return (replicatedcal) -} - - - - - -#' make calendar helper function -#' this originally expected and dealt with cancellations too. This worked ok for single day duration cancellations -#' but had problems with multi-day cancellations when combined with overlays -#' code hasn't been changed to reject / avoid cancellations but results may not be predictable / tested scenarios -#' @param calendarSub data.table containing all the calendars (aka CIF operating patterns) for a single service -#' @noRd -#' -makeCalendarInner <- function(calendarSub) { - - if ( 1 == nrow(calendarSub) ) - { - # make into an single entry - res = list(calendarSub, NA) - } - else - { - if (length(unique(calendarSub$UID)) > 1) - { - stop(paste("Error: makeCalendarInner was passed more than one service to work on. service=", unique(calendarSub$UID))) - } - - # check duration and types - allTypes <- calendarSub$STP - - # as per https://wiki.openraildata.com/index.php/SCHEDULE - # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'." - baseType = max(allTypes) #usually we expect 'P' to be the base timetable... but it can also be STP service in which case it will be 'N' - - overlayDurations <- as.numeric(calendarSub$duration[calendarSub$STP != baseType]) - overlayTypes <- calendarSub$STP[calendarSub$STP != baseType] - - if( length(overlayDurations) <= 0 ) - { - #assume the input data is good and the base timetables don't break any of the overlaying /operating day rules - res = list(calendarSub, NA) - } - #if every overlay is a one day cancellation (and only one base timetable) - else if (all(overlayDurations == 1) && all(overlayTypes == "C") && sum(allTypes == baseType) == 1 ) - { - warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") - - # Apply the cancellation via entries in calendar_dates.txt - res = list( calendarSub[calendarSub$STP != "C", ], - calendarSub[calendarSub$STP == "C", ]) - } - else - { - uniqueDayPatterns <- unique(calendarSub$Days[calendarSub$STP != "C"]) - - # if the day patterns are all identical - if (length(uniqueDayPatterns) <= 1 ) - { - #performance pre-sort all the entries by the priority - #this speeds things up when we look up the required priority overlay **SEE_NOTE** - #calendarSub = calendarSub[ order(STP, duration), ] - setkey( calendarSub, STP, duration ) - setindex( calendarSub, start_date, end_date) - - calendar_new <- makeCalendarsUnique( splitDates(calendarSub) ) - res = list(calendar_new, NA) - } - else # split by day pattern - { - #this works if the day patterns don't overlap any operating days. - if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1) ) - { - #this scenario DOES exist in the downloaded ATOC test data - #stop(paste("Scenario with overlay pattern not matching base pattern is not currently handled. 
service=", unique(calendarSub$UID))) - res = makeCalendarForDifferentDayPatterns( calendarSub ) - } - else - { - res = makeCalendarForDayPatterns( uniqueDayPatterns, calendarSub ) - } - } - } - } - - #stopifnot( is.list(res) ) - return (res) -} - - - - -CALENDAR_UNIQUE_CHECK_COLUMN_NAMES <- c("originalUID","start_date","end_date","Days","STP","duration" ) - - -# We have lots of complex logic, which means that when we have multiple base timetables that are separated -# in the temporal domain e.g. march, april - we end up duplicating the overlays -# -# this is a bit of a gluey hack that could be fixed by looking in the temporal domain when deciding what overlaps -# see test case no.10 ("10:test makeCalendarInner") that triggered addition of this logic -# -makeCalendarsUnique <- function ( calendar ) -{ - calendar <- calendar[ !duplicated( calendar, by=CALENDAR_UNIQUE_CHECK_COLUMN_NAMES ) ] - - return( calendar ) -} - - - - -countIntersectingDayPatterns <- function( dayPatterns ) -{ - unpacked = splitBitmaskMat( dayPatterns, asInteger = TRUE ) - sums = colSums(unpacked) #add up number of intersections for monday etc... - names(sums) <- NULL #makes unit test construction easier - return ( sums ) -} - -intersectingDayPattern <- function( dayPattern1, dayPattern2 ) -{ - return (any( countIntersectingDayPatterns( c(dayPattern1,dayPattern2) ) > 1) ) -} - - -intersectingDayPatterns <- function( dayPatternBase, dayPatternOverlay ) -{ - if (is.null(dayPatternOverlay) || is.null(dayPatternBase) ) return (NULL) - - unpackedOverlay = splitBitmaskMat( dayPatternOverlay, asInteger = FALSE ) - unpackedBase = splitBitmaskMat( dayPatternBase, asInteger = FALSE ) - - #repeat the base for every Overlay - unpackedBaseRep = rep( unpackedBase, length(dayPatternOverlay) ) - unpackedBaseRepmat = matrix(unpackedBaseRep, ncol=7, byrow=TRUE) - - intersects = unpackedBaseRepmat & unpackedOverlay - - res <- apply(intersects, 1, any) - - return ( res ) -} - - -intersectingDayPatterns0 <- function( dayPatternBase, dayPatternOverlay ) -{ - if (is.null(dayPatternOverlay) || is.null(dayPatternBase) ) return (NULL) - - intersectingDayPattern_vec <- Vectorize(intersectingDayPattern, vectorize.args = c("dayPattern2")) - - res = intersectingDayPattern_vec( dayPatternBase, dayPatternOverlay ) - names(res) <- NULL #makes unit test construction easier -stopifnot(is.logical(res)) - return ( res ) -} - - - -makeCalendarForDayPatterns <- function( dayPatterns, calendar ) -{ - splits <- list() - - #performance pre-sort all the entries by the priority - #this speeds things up when we look up the required priority overlay **SEE_NOTE** - #calendar = calendar[ order(STP, duration), ] - setkey( calendar, STP, duration ) - setindex( calendar, start_date, end_date) - - for (k in seq(1, length(dayPatterns))) { - # select for each pattern but include cancellations with a - # different day pattern - calendarDay <- calendar[calendar$Days == dayPatterns[k] | calendar$STP == "C", ] - # TODO cancellations now handled elsewhere - remove this once code stable - - if (all(calendarDay$STP == "C")) { - # ignore cases of everything is cancelled - splits[[k]] <- NULL - warning("unexpected item in the makeCalendarForDayPatterns-ing area, cancellations should now be handled at a higher level") - } - else { - calendarNewDay <- splitDates(calendarDay) - - # rejects NAs - if (inherits(calendarNewDay, "data.frame")) { - # further differentiate the UID by appending a number to the end for each different days pattern - calendarNewDay$UID <- 
paste0(calendarNewDay$UID, k) - splits[[k]] <- calendarNewDay - } - } - } - - splits <- data.table::rbindlist(splits, use.names=FALSE) - - splits <- makeCalendarsUnique( splits ) - - # after all this faffing about and splitting and joining, it's quite likely we've created some - # small fragments of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) - splits <- splits[ checkOperatingDayActive( splits ) ] - - return(list(splits, NA)) -} - - -# this is a complex case where the overlays don't have the same day pattern as the base timetable -# -# e.g base is mon-sat, and we have some engineering work for 3 weeks tue-thur -# -# the approach we take is to duplicate the overlay timetables for every week they are in effect, then overlay them. -# -# aha but the complexity isn't finished. If the overlay is tue+thur then wed is the base timetable. -# -# when we get to this latter complexity we just split the overlay into individual days and apply it that way. -# -makeCalendarForDifferentDayPatterns <- function( calendar ) -{ - baseType = max(calendar$STP) - baseTimetables = calendar[calendar$STP == baseType] - overlayTimetables = calendar[calendar$STP != baseType] - - gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ] - continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ] - - gappyOverlays = makeAllOneDay( gappyOverlays ) - continiousOverlays = expandAllWeeks( continiousOverlays ) - - overlays = data.table::rbindlist( list(continiousOverlays,gappyOverlays), use.names=FALSE) - - - splits <- list() - - distinctBasePatterns = unique( baseTimetables$Days ) - - for (k in seq(1, length(distinctBasePatterns))) { - - thisBase = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] - - thisOverlay = overlays[ intersectingDayPatterns( distinctBasePatterns[k], overlays$Days ) ] - - if (nrow(thisOverlay) <= 0) - { - splits[[k]] <- thisBase - } - else - { - timetablesForThisPattern = data.table::rbindlist( list( thisBase, thisOverlay ), use.names=FALSE) - - #performance pre-sort all the entries by the priority - #this speeds things up when we look up the required priority overlay **SEE_NOTE** - #timetablesForThisPattern = timetablesForThisPattern[ order(STP, duration), ] - setkey( timetablesForThisPattern, STP, duration ) - setindex( timetablesForThisPattern, start_date, end_date) - - thisSplit <- splitDates( timetablesForThisPattern ) - - # rejects NAs - if (inherits(thisSplit, "data.frame")) { - # further differentiate the UID by appending a number to the end for each different days pattern - thisSplit$UID <- paste0(thisSplit$UID, k) - splits[[k]] <- thisSplit - } - } - } - - splits <- data.table::rbindlist(splits, use.names=FALSE) - - splits <- makeCalendarsUnique( splits ) - - # after all this faffing about and splitting and joining, it's quite likely we've created some - # small fragments of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) - splits <- splits[ checkOperatingDayActive( splits ) ] - - return(list(splits, NA)) -} - - -# in a week bitmask, if there are non-operating days between the first and last operating day of the week - will return TRUE -# e.g. 
0010000 = FALSE 0011100 = FALSE 0101000 = TRUE -hasGapInOperatingDays <- function( daysBitmask ) -{ - firstDay = stringi::stri_locate_first( daysBitmask, fixed = "1" )[,1] - lastDay = stringi::stri_locate_last( daysBitmask, fixed = "1" )[,1] - - operatingDayCount = stringi::stri_count( daysBitmask, fixed = "1" ) - - res = ( lastDay-firstDay+1 != operatingDayCount ) - - res[is.na(res)] <- FALSE #shouldn't really get this, probably operating days are '0000000' - - return( res ) -} - - - - - #' duplicateItem #' #' @details @@ -1257,7 +293,7 @@ duplicateItem <- function( dt, reps, indexStart=1 ) #' duplicateItems #' #' @details -#' Function that duplicates a very large data.table, adding a "index" column to all rows in the output indicating which +#' Function that duplicates a large data.table, adding a "index" column to all rows in the output indicating which #' instance of the duplication the row is associated with #' #' requires a column called "_reps" on the object to determine how many times it is to be duplicated @@ -1325,7 +361,7 @@ duplicate_stop_times <- function(calendar, stop_times, ncores = 1) { "pickup_type", "drop_off_type", "schedule" ) - #it's pretty marginal doing this on multiple threads. With a typical number, + #it's pretty marginal doing this on multiple threads. With a typical sized day all GB file, #doing the split takes 2.4s and the duplication 7.8s (on one thread) #TODO look at avoiding the split if threads=1 @@ -1528,7 +564,7 @@ clean_activities2 <- function(x, public_only = TRUE) { } x <- x[, c("pickup_type", "drop_off_type")] } - else #set all of the stops on a route to be passenger boarding / alighting from a GTFS perspective + else #set all of the stops on a route to be valid for passenger boarding / alighting from a GTFS perspective { x$pickup_type <- 0 x$drop_off_type <- 0 diff --git a/man/gtfs_write.Rd b/man/gtfs_write.Rd index aeb65f2..4a9beab 100644 --- a/man/gtfs_write.Rd +++ b/man/gtfs_write.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/write_gtfs.R +% Please edit documentation in R/gtfs_write.R \name{gtfs_write} \alias{gtfs_write} \title{Write GTFS} @@ -8,10 +8,10 @@ gtfs_write( gtfs, folder = getwd(), name = "gtfs", - stripComma = TRUE, - stripTab = TRUE, - stripNewline = TRUE, - quote = FALSE + stripComma = FALSE, + stripTab = FALSE, + stripNewline = FALSE, + quote = TRUE ) } \arguments{ diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index c686021..7c1c2f1 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -593,7 +593,7 @@ test_that("1:test allocateCancellationsAcrossCalendars", { expectedResult = data.table( UID=c( "uid1 a", "uid1 a", "uid2 d"), - #originalUID=c("uid1", "uid1", "uid2"), + originalUID=c("uid1", "uid1", "uid2"), start_date=c("02-01-2023", "03-01-2023", "14-03-2023"), end_date=c( "02-01-2023", "03-01-2023", "14-03-2023"), Days=c( "1000000", "0100000", "0100000"), From a40c02abc937ed7068c9cbedcf4842624d71bb15 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 23:32:10 +0100 Subject: [PATCH 26/81] add file split from atoc_export --- R/atoc_overlay.R | 773 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 773 insertions(+) create mode 100644 R/atoc_overlay.R diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R new file mode 100644 index 0000000..22634fa --- /dev/null +++ b/R/atoc_overlay.R @@ -0,0 +1,773 @@ +#functions relating to processing timetable overlay rules + + + + +# in a week bitmask, if there are 
non-operating days between the first and last operating day of the week - will return TRUE +# e.g. 0010000 = FALSE 0011100 = FALSE 0101000 = TRUE +hasGapInOperatingDays <- function( daysBitmask ) +{ + firstDay = stringi::stri_locate_first( daysBitmask, fixed = "1" )[,1] + lastDay = stringi::stri_locate_last( daysBitmask, fixed = "1" )[,1] + + operatingDayCount = stringi::stri_count( daysBitmask, fixed = "1" ) + + res = ( lastDay-firstDay+1 != operatingDayCount ) + + res[is.na(res)] <- FALSE #shouldn't really get this, probably operating days are '0000000' + + return( res ) +} + + + +DATE_EPOC <- as.Date(lubridate::origin) # 01/01/1970 +WEEKDAY_NAME_VECTOR <- c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday") +START_PATTERN_VECTOR = c("1","01","001","0001","00001","000001","0000001") +END_PATTERN_VECTOR = c("1000000","100000","10000","1000","100","10","1") + +#calendars should start on the first day they are effective, and end on the last day. +#i.e. if the first day in the day bitmask is Tuesday - then the start date should be Tuesday, not some other day. +validateCalendarDates <- function( calendar ) +{ + start_day_number = lubridate::wday( calendar$start_date, label = FALSE, week_start=1 ) + end_day_number = lubridate::wday( calendar$end_date, label = FALSE, week_start=1 ) + + startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1, start_day_number) + endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7) + + return (startOk & endOk) +} + + + + +#' split and rebind bitmask +#' +#' @details +#' splits 'Days' bitmask into individual logical fields called monday, tuesday, etc... +#' +#' @param calendar data.table of calendar items +#' @noRd +#' +splitAndRebindBitmask <- function( calendar ) +{ + calMat = splitBitmaskMat( calendar$Days, asInteger=FALSE ) + + #this function gets expensive if you call it a lot, creating data.table takes a while + return (cbind( calendar, as.data.table(calMat) ) ) +} + +splitBitmaskMat <- function( bitmaskVector, asInteger=FALSE ) +{ + splitDays = splitBitmask( bitmaskVector, asInteger=asInteger ) + + return (matrix(splitDays, ncol=7, byrow=TRUE, dimnames=list(NULL,WEEKDAY_NAME_VECTOR))) +} + +splitBitmask <- function( bitmask, asInteger=FALSE ) +{ + duff = which( nchar(bitmask) != 7 ) + + bitmask[duff] = " " + + splitDays = strsplit(bitmask, "") + + splitDays = as.integer(unlist(splitDays)) + + if (!asInteger) + { + splitDays = as.logical(splitDays) + } + + return (splitDays) +} + + + +checkOperatingDayActive <- function(calendar) { + + if (all(calendar$duration >= 7)) + { + return (calendar$Days!="0000000") + } + + #get a list of days of week that the timetable is valid on + opDays <- splitBitmaskMat( calendar$Days, asInteger=FALSE ) + opDays <- split(opDays, row(opDays)) + + checkValid <- function(dur, sd, ed, od ){ + + if (dur >= 7) + { + return (any(od)) + } + + dayNumbers <- lubridate::wday( seq.Date(from = sd, to = ed, by = "day"), label = FALSE, week_start=1 ) + + return ( any(od[dayNumbers]) ) + } + + validCalendars <- mapply( checkValid, calendar$duration, + calendar$start_date, calendar$end_date, + opDays, SIMPLIFY = TRUE ) + return (validCalendars) +} + + + +CALENDAR_UNIQUE_CHECK_COLUMN_NAMES <- c("originalUID","start_date","end_date","Days","STP","duration" ) + +# We have lots of complex logic, which means that when we have multiple base timetables that are separated +# in the temporal domain e.g. 
march, april - we end up duplicating the overlays +# +# this is a bit of a gluey hack that could be fixed by looking in the temporal domain when deciding what overlaps +# see test case no.10 ("10:test makeCalendarInner") that triggered addition of this logic +# +makeCalendarsUnique <- function ( calendar ) +{ + calendar <- calendar[ !duplicated( calendar, by=CALENDAR_UNIQUE_CHECK_COLUMN_NAMES ) ] + + return( calendar ) +} + + + + +countIntersectingDayPatterns <- function( dayPatterns ) +{ + unpacked = splitBitmaskMat( dayPatterns, asInteger = TRUE ) + sums = colSums(unpacked) #add up number of intersections for monday etc... + names(sums) <- NULL #makes unit test construction easier + return ( sums ) +} + +intersectingDayPattern <- function( dayPattern1, dayPattern2 ) +{ + return (any( countIntersectingDayPatterns( c(dayPattern1,dayPattern2) ) > 1) ) +} + + +intersectingDayPatterns <- function( dayPatternBase, dayPatternOverlay ) +{ + if (is.null(dayPatternOverlay) || is.null(dayPatternBase) ) return (NULL) + + unpackedOverlay = splitBitmaskMat( dayPatternOverlay, asInteger = FALSE ) + unpackedBase = splitBitmaskMat( dayPatternBase, asInteger = FALSE ) + + #repeat the base for every Overlay + unpackedBaseRep = rep( unpackedBase, length(dayPatternOverlay) ) + unpackedBaseRepmat = matrix(unpackedBaseRep, ncol=7, byrow=TRUE) + + intersects = unpackedBaseRepmat & unpackedOverlay + + res <- apply(intersects, 1, any) + + return ( res ) +} + + + +SINGLE_DAY_PATTERN_VECTOR = c("1000000","0100000","0010000","0001000","0000100","0000010","0000001") + +SINGLE_DAY_PATTERN_LIST = list(c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), + c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE), + c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE), + c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), + c(FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE), + c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE), + c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)) + + +makeReplicationDates <- function(cal, startDayNum, endDayNum){ + + #make a sequences of dates, offsetting the start date so it's always monday (aligning with bitmask start day) + # and the end date so it's always sunday + firstDate = min(cal$start_date) - 7 + lastDate = max(cal$end_date) + 7 + allDates = seq.Date(from = firstDate, to = lastDate, by = "day") + + offset = as.integer(cal$start_date)-startDayNum+2-as.integer(firstDate) + end = as.integer(cal$end_date)+8-endDayNum-as.integer(firstDate) + + dates <- Map(function(o, e) allDates[o:e], offset, end) + + return ( as.Date( unlist(dates), origin = DATE_EPOC ) ) +} + + + +#' replicates the input calendar objects into single day duration calendar objects +#' calender objects should NOT have had the 'days' bitmask field unpacked +#' (will still produce an output but the unpacked monday, tuesday etc fields will no longer be consistent with the packed 'Days' bitmask) +#' +#' @param cal data.table containing all the calendars to be split up into individual days +#' @noRd +#' +makeAllOneDay <- function( cal ) +{ + duration <- cal$end_date - cal$start_date + 1 + + if ( 0==nrow(cal) || all(1 == duration)) + { + #nothing to do + return (cal) + } + + #make a list of dates for each object being replicated + startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) + endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 ) + dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) + + #work out how many time we need to replicate each item: number of operating days in 
week * num weeks + bitmaskMat = splitBitmaskMat( cal$Days, asInteger=FALSE ) + dayCount = rowSums(bitmaskMat) + numWeeks <- ceiling(as.integer(cal$duration) / 7) + repetitions = dayCount * numWeeks + + #replicate the calendar rows the appropriate number of times + replicatedcal <- cal[rep(seq_len(.N), times = repetitions)] + + #get a mask of operating days + operatingDayLogical <- rep( split(bitmaskMat, row(bitmaskMat)), times = numWeeks) + + #set the start and end date for each calender item to the single day identified earlier + selectedDates = dateSequence[unlist(operatingDayLogical)] + replicatedcal$end_date <- replicatedcal$start_date <- selectedDates + + #tidy up the values so they are correct for the spilt items + replicatedcal$duration <- 1 + replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ lubridate::wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] + + return (replicatedcal) +} + + + + +#' along a similar line to 'makeAllOneDay' duplicates input calendar objects into single WEEK duration calendar objects +#' +#' @param cal data.table containing all the calendars to be split up into individual weeks +#' @noRd +#' +expandAllWeeks <- function( cal ) +{ + if ( 0==nrow(cal) ) + { + #nothing to do + return (cal) + } + + #duration <- cal$end_date - cal$start_date + 1 + + #make a list of dates for each object being replicated + startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) + endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 ) + dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) + + numWeeks <- ceiling(as.integer(cal$duration) / 7) + + #replicate a logical vector for the start date and use that to select the relevant dates from the date sequence + startDayLogical <- SINGLE_DAY_PATTERN_LIST[startDayNum] + startDays <- rep(startDayLogical, times = numWeeks) + startDates <- dateSequence[ unlist(startDays) ] + + #replicate a logical vector for the end date and use that to select the relevant dates from the date sequence + endDayLogical <- SINGLE_DAY_PATTERN_LIST[endDayNum] + endDays <- rep(endDayLogical, times = numWeeks) + endDates <- dateSequence[ unlist(endDays) ] + + #replicate the calendar rows the appropriate number of times + replicatedcal <- cal[rep(seq_len(.N), times = numWeeks)] + + #set the start and end date for each calender item + replicatedcal$start_date <- startDates + replicatedcal$end_date <- endDates + + #tidy up the values so they are correct for the spilt items + replicatedcal$duration <- replicatedcal$end_date - replicatedcal$start_date + 1 + + return (replicatedcal) +} + + + +#' allocate Cancellations Across Calendars +#' +#' @details +#' expects input calendar items to have been separated out into non-overlapping dates +#' and 'Days' bitmask unpacked into separate int or logical attributes +#' +#' "originalUID" is used to identify where the cancellations originally came from +#' after allocating across the split calender items the cancellations will have an updated +#' "UID" that says which calender they are now associated with +#' +#' @param calendar data.table of calendar items that are NOT cancellations (that has 'Days' bitmask unpacked ) +#' @param cancellations data.table of calender items that ARE cancellations (that has 'Days' bitmask unpacked ) +#' @noRd +#' +allocateCancellationsAcrossCalendars <- function( calendar, cancellations ) +{ + tempNames = names(calendar) + + #stash some join fields away because we want to keep the data from the cancellations rather than the calendar 
table + #which otherwise get over-written by the join process + cancellations$start_date2 <- cancellations$start_date + cancellations$end_date2 <- cancellations$end_date + cancellations$UID <- NULL #we want the new UID from the calendar entries + + #left join cancellations to calendar by the original service ID + #and the date of the cancellation lying inside the period of the calendar + #and the day of the cancellation is an operating day of the calendar item + joined = cancellations[calendar, on = .(originalUID==originalUID, + start_date>=start_date, + end_date<=end_date)][ + ((i.monday&monday) | (i.tuesday&tuesday) | (i.wednesday&wednesday) + | (i.thursday&thursday) | (i.friday&friday) | (i.saturday&saturday) | (i.sunday&sunday)), ] + #revert the stashed (join) fields + joined$start_date <- joined$start_date2 + joined$start_date2 <- NULL + joined$end_date <- joined$end_date2 + joined$end_date2 <- NULL + + #remove joined fields we don't need + joined <- joined[, .SD, .SDcols=tempNames] + + #belt and braces - fix any NA fields by reverting from the original UID + joined[is.na(UID), UID := originalUID] + + return( joined ) +} + + + + +NOT_NEEDED <- c("__NOT_NEEDED_MARKER__~@$$%&*((") + + +#this function is massively performance critical - profile any changes to it +selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex) +{ + #if we have two adjacent complete items e.g. ....end 13th Jan start 14th jan..... + #then it's not a real gap and just an artefact of the algorithm use to generate the dates + if( rowIndex>1 && rowIndex= calNew$end_date[rowIndex],,which=TRUE] + + #are we in a gap between two base timetables with no overlays + if ( length(baseTimetableIndexes)<=0 ) + { + calNew$UID[rowIndex] <- NOT_NEEDED + return (calNew) + } + + + # apply timetable overlay selection logic - pick highest priority timetable type + # as per https://wiki.openraildata.com/index.php/SCHEDULE + # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'." + + #pick the lowest alphabetic STP (highest priority), and just in case there is more than one, the shortest duration one. + + #priorityTimetable <- baseTimetables[order(STP, duration), head(.SD, 1)] + #performance we pre-sort all the entries by the priority & duration + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #so we don't need to sort again here, just pick the top filtered result + + #stash the generated start & end dates + #performance - copying to separate variables seems to be fastest + start_date = calNew$start_date[rowIndex] + end_date = calNew$end_date[rowIndex] + + calNew[rowIndex,] <- cal[baseTimetableIndexes[1],] + #this is the most time consuming line in this fn. 
takes about 10x longer than the single variable copy below + + calNew$start_date[rowIndex] = start_date + calNew$end_date[rowIndex] = end_date + + return (calNew) +} + + + +# triggered by test case "10:test makeCalendarInner" +# when we have a 1 day overlay sitting on the start/end data of a base timetable +# the dates overlap - fix it +fixOverlappingDates <- function( cal ) +{ + rowCount = nrow(cal) + + #forwards + for (j in seq(1, rowCount)) { + + #adjust our end date if next item a higher priority overlay + if (j1 && !is.na(cal$UID[j-1]) && cal$STP[j-1] < cal$STP[j] ) + { + cal$start_date[j] <- cal$end_date[j-1] +1 + } + } + } + + #backwards + for (j in seq(rowCount, 1)) { + + #adjust our end date if previous item a higher priority overlay + if (j>1 && !is.na(cal$UID[j]) && !is.na(cal$UID[j-1]) ) + { + if ( cal$STP[j-1] < cal$STP[j] ) + { + cal$start_date[j] <- cal$end_date[j-1] +1 + } + + if(j1 && !is.na(calNew$UID[j-1]) && NOT_NEEDED != calNew$UID[j-1] ) + { + calNew$start_date[j] <- calNew$end_date[j-1] +1 + } + } + } + + #backwards + for (j in seq(rowCount, 1)) { + + #if we are not valid & the previous item is already valid, fill in our details and adjust our start date + if (j>1 && is.na(calNew$UID[j]) && !is.na(calNew$UID[j-1]) ) + { + calNew <- selectOverlayTimeableAndCopyAttributes(cal, calNew, j) + + if ( NOT_NEEDED != calNew$UID[j-1]) + { + calNew$start_date[j] <- calNew$end_date[j-1] +1 + } + + #if next item valid adjust our start date + if(j 0, ] + + #performance, do all subsets in one go + calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0), ] + + # Append UID to note the changes + if (nrow(calNew) > 0) { + if (nrow(calNew) <= 26) { + calNew$UID <- paste0(calNew$UID, " ", letters[1:nrow(calNew)]) + } else { + # Cases where we need extra letters, gives upto 676 ids + lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) + calNew$UID <- paste0(calNew$UID, " ", lett[1:nrow(calNew)]) + } + } else { + calNew <- NA + } + + return(calNew) +} + + + + +#' make calendar helper function +#' this originally expected and dealt with cancellations too. This worked ok for single day duration cancellations +#' but had problems with multi-day cancellations when combined with overlays +#' code hasn't been changed to reject / avoid cancellations but results may not be predictable / tested scenarios +#' @param calendarSub data.table containing all the calendars (aka CIF operating patterns) for a single service +#' @noRd +#' +makeCalendarInner <- function(calendarSub) { + + if ( 1 == nrow(calendarSub) ) + { + # make into an single entry + res = list(calendarSub, NA) + } + else + { + if (length(unique(calendarSub$UID)) > 1) + { + stop(paste("Error: makeCalendarInner was passed more than one service to work on. service=", unique(calendarSub$UID))) + } + + # check duration and types + allTypes <- calendarSub$STP + + # as per https://wiki.openraildata.com/index.php/SCHEDULE + # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'." + baseType = max(allTypes) #usually we expect 'P' to be the base timetable... 
but it can also be STP service in which case it will be 'N' + + overlayDurations <- as.numeric(calendarSub$duration[calendarSub$STP != baseType]) + overlayTypes <- calendarSub$STP[calendarSub$STP != baseType] + + if( length(overlayDurations) <= 0 ) + { + #assume the input data is good and the base timetables don't break any of the overlaying /operating day rules + res = list(calendarSub, NA) + } + #if every overlay is a one day cancellation (and only one base timetable) + else if (all(overlayDurations == 1) && all(overlayTypes == "C") && sum(allTypes == baseType) == 1 ) + { + warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") + + # Apply the cancellation via entries in calendar_dates.txt + res = list( calendarSub[calendarSub$STP != "C", ], + calendarSub[calendarSub$STP == "C", ]) + } + else + { + uniqueDayPatterns <- unique(calendarSub$Days[calendarSub$STP != "C"]) + + # if the day patterns are all identical + if (length(uniqueDayPatterns) <= 1 ) + { + #performance pre-sort all the entries by the priority + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #calendarSub = calendarSub[ order(STP, duration), ] + setkey( calendarSub, STP, duration ) + setindex( calendarSub, start_date, end_date) + + calendar_new <- makeCalendarsUnique( splitDates(calendarSub) ) + res = list(calendar_new, NA) + } + else # split by day pattern + { + #this works if the day patterns don't overlap any operating days. + if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1) ) + { + #this scenario DOES exist in the downloaded ATOC test data + #stop(paste("Scenario with overlay pattern not matching base pattern is not currently handled. service=", unique(calendarSub$UID))) + res = makeCalendarForDifferentDayPatterns( calendarSub ) + } + else + { + res = makeCalendarForDayPatterns( uniqueDayPatterns, calendarSub ) + } + } + } + } + + #stopifnot( is.list(res) ) + return (res) +} + + +#TODO see if makeCalendarForDifferentDayPatterns() covers this case too +#- if so, remove this so there are fewer code paths to test +makeCalendarForDayPatterns <- function( dayPatterns, calendar ) +{ + splits <- list() + + #performance pre-sort all the entries by the priority + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #calendar = calendar[ order(STP, duration), ] + setkey( calendar, STP, duration ) + setindex( calendar, start_date, end_date) + + for (k in seq(1, length(dayPatterns))) { + # select for each pattern but include cancellations with a + # different day pattern + calendarDay <- calendar[calendar$Days == dayPatterns[k] | calendar$STP == "C", ] + # TODO cancellations now handled elsewhere - remove this once code stable + + if (all(calendarDay$STP == "C")) { + # ignore cases of everything is cancelled + splits[[k]] <- NULL + warning("unexpected item in the makeCalendarForDayPatterns-ing area, cancellations should now be handled at a higher level") + } + else { + calendarNewDay <- splitDates(calendarDay) + + # rejects NAs + if (inherits(calendarNewDay, "data.frame")) { + # further differentiate the UID by appending a number to the end for each different days pattern + calendarNewDay$UID <- paste0(calendarNewDay$UID, k) + splits[[k]] <- calendarNewDay + } + } + } + + splits <- data.table::rbindlist(splits, use.names=FALSE) + + splits <- makeCalendarsUnique( splits ) + + # after all this faffing about and splitting and joining, it's quite likely we've created some + # small fragments 
of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) + splits <- splits[ checkOperatingDayActive( splits ) ] + + return(list(splits, NA)) +} + + +# this is a complex case where the overlays don't have the same day pattern as the base timetable +# +# e.g base is mon-sat, and we have some engineering work for 3 weeks tue-thur +# +# the approach we take is to duplicate the overlay timetables for every week they are in effect, then overlay them. +# +# aha but the complexity isn't finished. If the overlay is tue+thur then wed is the base timetable. +# +# when we get to this latter complexity we just split the overlay into individual days and apply it that way. +# +makeCalendarForDifferentDayPatterns <- function( calendar ) +{ + baseType = max(calendar$STP) + baseTimetables = calendar[calendar$STP == baseType] + overlayTimetables = calendar[calendar$STP != baseType] + + gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ] + continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ] + + gappyOverlays = makeAllOneDay( gappyOverlays ) + continiousOverlays = expandAllWeeks( continiousOverlays ) + + overlays = data.table::rbindlist( list(continiousOverlays,gappyOverlays), use.names=FALSE) + + + splits <- list() + + distinctBasePatterns = unique( baseTimetables$Days ) + + for (k in seq(1, length(distinctBasePatterns))) { + + thisBase = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] + + thisOverlay = overlays[ intersectingDayPatterns( distinctBasePatterns[k], overlays$Days ) ] + + if (nrow(thisOverlay) <= 0) + { + splits[[k]] <- thisBase + } + else + { + timetablesForThisPattern = data.table::rbindlist( list( thisBase, thisOverlay ), use.names=FALSE) + + #performance pre-sort all the entries by the priority + #this speeds things up when we look up the required priority overlay **SEE_NOTE** + #timetablesForThisPattern = timetablesForThisPattern[ order(STP, duration), ] + setkey( timetablesForThisPattern, STP, duration ) + setindex( timetablesForThisPattern, start_date, end_date) + + thisSplit <- splitDates( timetablesForThisPattern ) + + # rejects NAs + if (inherits(thisSplit, "data.frame")) { + # further differentiate the UID by appending a number to the end for each different days pattern + thisSplit$UID <- paste0(thisSplit$UID, k) + splits[[k]] <- thisSplit + } + } + } + + splits <- data.table::rbindlist(splits, use.names=FALSE) + + splits <- makeCalendarsUnique( splits ) + + # after all this faffing about and splitting and joining, it's quite likely we've created some + # small fragments of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) + splits <- splits[ checkOperatingDayActive( splits ) ] + + return(list(splits, NA)) +} + + + From b6ff79deee11b94ab248df1c2c48064690aee328 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 4 Sep 2023 23:57:54 +0100 Subject: [PATCH 27/81] up-rev version number, add author comment, fix warning in unit test --- DESCRIPTION | 6 ++++-- tests/testthat/test_aa_unit.R | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4d9f8bf..4bd98ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,12 @@ Package: UK2GTFS Type: Package Title: Converts UK transport timetable datasets to GTFS format -Version: 0.2 +Version: 1.0 Authors@R: c( person("Malcolm", "Morgan", email = "m.morgan1@leeds.ac.uk", role = c("aut","cre"), - comment = c(ORCID = "0000-0002-9488-9183")) + 
comment = c(ORCID = "0000-0002-9488-9183")), + person("Owen", "O'Neill", email = "owen@wvr.org.uk", role = c("aut"), + comment = c(ORCID = "0009-0008-0595-3042")) ) Maintainer: Malcolm Morgan Description: The UK uses a range of odd formats to store timetable data, this package converts them to the nice GTFS format. diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index 7c1c2f1..9091e9d 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -578,8 +578,10 @@ test_that("1:test allocateCancellationsAcrossCalendars", { calendar <- fixCalendarDates( calendar ) calendar <- splitAndRebindBitmask( calendar ) - cancellations = data.table( - UID=c( "aaaaa", "bbbbbb", "ccccccc", "ddddddd", "eeeeee"), #this column gets removed + #TODO - discuss. the GTFS spec allows cancellations/ additions with no associates calendar - we're currently + # filtering these out, which is probably the right thing to do ? + cancellations = data.table( #these columns get removed + UID=c( "aaaaa", "bbbbbb", "ccccccc", "ddddddd", "eeeeee", "fffffff"), originalUID=c("uid1", "uid1", "uid1", "uid2", "uid2", "uid4"), start_date=c("02-01-2023", "03-01-2023", "06-01-2023", "14-03-2023", "15-01-2023", "26-01-2023" ), end_date=c( "02-01-2023", "03-01-2023", "06-01-2023", "14-03-2023", "15-01-2023", "26-01-2023" ), From 1b5110d156497c86555b6682473a8a8e756142ee Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 00:02:10 +0100 Subject: [PATCH 28/81] spelling mistake --- tests/testthat/test_aa_unit.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index 9091e9d..794fa60 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -578,7 +578,7 @@ test_that("1:test allocateCancellationsAcrossCalendars", { calendar <- fixCalendarDates( calendar ) calendar <- splitAndRebindBitmask( calendar ) - #TODO - discuss. the GTFS spec allows cancellations/ additions with no associates calendar - we're currently + #TODO - discuss. the GTFS spec allows cancellations/ additions with no associated calendar - we're currently # filtering these out, which is probably the right thing to do ? cancellations = data.table( #these columns get removed UID=c( "aaaaa", "bbbbbb", "ccccccc", "ddddddd", "eeeeee", "fffffff"), From d56d912ea6efba48d4aa42dc4cf3b8be84947331 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 16:28:24 +0100 Subject: [PATCH 29/81] spelling --- R/atoc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/atoc.R b/R/atoc.R index d77e411..8088bed 100644 --- a/R/atoc.R +++ b/R/atoc.R @@ -31,7 +31,7 @@ #' Agency #' #' The ATOC files do not contain the necessary information to build the -#' agency.txt file. Therfore this data is provided with the package. You can +#' agency.txt file. Therefore this data is provided with the package. You can #' also pass your own data frame of agency information. #' #' From 9b67120e8cadac5dc475b1b2941c6806613ba03b Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 16:47:28 +0100 Subject: [PATCH 30/81] spelling --- R/get_cal.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_cal.R b/R/get_cal.R index a009b84..45492f9 100644 --- a/R/get_cal.R +++ b/R/get_cal.R @@ -7,7 +7,7 @@ #' @return data frame #' @details TransXchange records bank holidays by name (e.g. Christmas Day), #' some UK bank holidays move around, so this function downloads the official -#' bank holiday calendar. 
The offical feed only covers a short period of time +#' bank holiday calendar. The official feed only covers a short period of time #' so this may not be suitable for converting files from the past / future. #' @export #' From 07bce70d48786d63fc3f343a9f24e9b7f6fc9fe3 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 16:47:58 +0100 Subject: [PATCH 31/81] spelling --- R/nptdr_export.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/nptdr_export.R b/R/nptdr_export.R index dc0e9c2..2a09b08 100644 --- a/R/nptdr_export.R +++ b/R/nptdr_export.R @@ -108,7 +108,7 @@ nptdr_makeCalendar <- function(schedule, exceptions, historic_bank_holidays = hi calendar$bank_holiday <- NULL # TODO: Detect Scotland and NI - message("Unique Scotland and NI bank holidays not correctly handeled") + message("Unique Scotland and NI bank holidays are not correctly handled") bh <- historic_bank_holidays[historic_bank_holidays$date >= min(calendar$start_date, na.rm = TRUE),] bh <- bh[bh$date <= min(calendar$end_date, na.rm = TRUE), ] From 73727ecd1bd39a241023c8640645bfd97dbf803c Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 16:48:23 +0100 Subject: [PATCH 32/81] spelling --- R/transxchange.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/transxchange.R b/R/transxchange.R index ddd5cca..30d120b 100644 --- a/R/transxchange.R +++ b/R/transxchange.R @@ -19,12 +19,12 @@ #' @return A GTFS named list #' @details #' -#' This is a meta fucntion which aids TransXchange to GTFS conversion. It simple +#' This is a meta function which aids TransXchange to GTFS conversion. It simple #' runs transxchange_import(), transxchange_export(), gtfs_merge(), gtfs_write() #' #' Progress Bars #' -#' To minimise overall processing when using mulitple cores the fucntion works +#' To minimise overall processing when using multiple cores the function works #' from largest to smallest file.This can mean the progress bar sits a 0% for #' quite some time, before starting to move rapidly. #' From ba4074fb2988e34f1fb5ba7dbc3e873995404906 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 16:49:15 +0100 Subject: [PATCH 33/81] fix bug contributed by my cat. --- tests/testthat/test_atoc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R index 8096d04..4b7ff21 100644 --- a/tests/testthat/test_atoc.R +++ b/tests/testthat/test_atoc.R @@ -1,4 +1,4 @@ -ontext("Get the example atoc files") +context("Get the example atoc files") file_path <- file.path(tempdir(),"uk2gtfs_tests") dir.create(file_path) data_path <- file.path(tempdir(),"uk2gtfs_data") From ac957d7916744a16c374225d957b2eebd08f85db Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 16:53:15 +0100 Subject: [PATCH 34/81] spelling --- man/atoc2gtfs.Rd | 2 +- man/get_bank_holidays.Rd | 2 +- man/transxchange2gtfs.Rd | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/atoc2gtfs.Rd b/man/atoc2gtfs.Rd index 3220063..071b5e6 100644 --- a/man/atoc2gtfs.Rd +++ b/man/atoc2gtfs.Rd @@ -55,7 +55,7 @@ Locations Agency The ATOC files do not contain the necessary information to build the - agency.txt file. Therfore this data is provided with the package. You can + agency.txt file. Therefore this data is provided with the package. You can also pass your own data frame of agency information. 
}
\seealso{
diff --git a/man/get_bank_holidays.Rd b/man/get_bank_holidays.Rd
index 77d86d7..41f04f6 100644
--- a/man/get_bank_holidays.Rd
+++ b/man/get_bank_holidays.Rd
@@ -24,6 +24,6 @@ data.
 \details{
 TransXchange records bank holidays by name (e.g. Christmas Day),
 some UK bank holidays move around, so this function downloads the official
- bank holiday calendar. The offical feed only covers a short period of time
+ bank holiday calendar. The official feed only covers a short period of time
 so this may not be suitable for converting files from the past / future.
 }
diff --git a/man/transxchange2gtfs.Rd b/man/transxchange2gtfs.Rd
index 2309e65..539e099 100644
--- a/man/transxchange2gtfs.Rd
+++ b/man/transxchange2gtfs.Rd
@@ -49,12 +49,12 @@ TransXchange to GTFS
 \details{
 Convert transxchange files to GTFS
 
-This is a meta fucntion which aids TransXchange to GTFS conversion. It simple
+This is a meta function which aids TransXchange to GTFS conversion. It simple
 runs transxchange_import(), transxchange_export(), gtfs_merge(), gtfs_write()
 
 Progress Bars
 
-To minimise overall processing when using mulitple cores the fucntion works
+To minimise overall processing when using multiple cores the function works
 from largest to smallest file.This can mean the progress bar sits a 0% for
 quite some time, before starting to move rapidly.
 }

From 8a3315fcef5ce2b0b412e8b99a9c3d8fb54af7c9 Mon Sep 17 00:00:00 2001
From: oweno-tfwm
Date: Tue, 5 Sep 2023 16:54:21 +0100
Subject: [PATCH 35/81] 1) bug fix for duplicated schedule ID 2) additional
 test for above 3) added debugging option to make it easier to capture
 problem services

---
 NAMESPACE                                 |   1 +
 R/atoc_export.R                           |  15 ++-
 R/atoc_overlay.R                          | 112 ++++++++++++++-----
 R/globals.R                               |  25 +++++
 R/gtfs_merge.R                            |  56 ++++++----
 man/UK2GTFS_option_stopProcessingAtUid.Rd |  19 ++++
 tests/testthat/test_aa_unit.R             | 126 ++++++++++++++++++----
 7 files changed, 287 insertions(+), 67 deletions(-)
 create mode 100644 man/UK2GTFS_option_stopProcessingAtUid.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 0d00103..f870f94 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(ATOC_shapes)
+export(UK2GTFS_option_stopProcessingAtUid)
 export(atoc2gtfs)
 export(dl_example_file)
 export(get_bank_holidays)
diff --git a/R/atoc_export.R b/R/atoc_export.R
index 645255f..f3c279d 100644
--- a/R/atoc_export.R
+++ b/R/atoc_export.R
@@ -209,8 +209,9 @@ makeCalendar <- function(schedule, ncores = 1) {
   #test treating date as int: seem to be about twice as fast on the critical line when selecting base timetable
   #TODO add package option to switch between processing as date/int otherwise debugging is too hard
 
-  # UIDs = unique(calendar$UID)
-  # length_todo = length(UIDs)
+  #debugging option
+  set_STOP_PROCESSING_UID( getOption("UK2GTFS_opt_stopProcessingAtUid") )
+
   message(paste0(Sys.time(), " Constructing calendar and calendar_dates"))
   calendar$`__TEMP__` <- calendar$UID
   calendar_split <- calendar[, .(list(.SD)), by = `__TEMP__`][,V1]
@@ -255,6 +256,16 @@ makeCalendar <- function(schedule, ncores = 1) {
   cancellations$originalUID <- NULL
   res.calendar$originalUID <- NULL
 
+  #error checking
+  dups = duplicated( res.calendar$UID )
+  if( any(TRUE==dups) )
+  {
+    dups = unique( res.calendar$UID[ dups ] )
+
+    warning(paste(Sys.time(), "Duplicate UIDs were created by the makeCalendar() process; this is likely to cause downstream processing errors. ",
+                  "Please capture the data and raise a bug / create a test case. 
", dups)) + } + return(list(res.calendar, cancellations)) } diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index 22634fa..a9e97a8 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -3,6 +3,65 @@ +assign("STOP_PROCESSING_UID", NULL ) + +set_STOP_PROCESSING_UID <- function( value ) +{ + env <- asNamespace("UK2GTFS") + + unlockBinding("STOP_PROCESSING_UID", env) + + assign("STOP_PROCESSING_UID", value, envir = env) + + lockBinding("STOP_PROCESSING_UID", env) + + if(!is.null(value)) + { + message(paste0(Sys.time(), " Set STOP_PROCESSING_UID to [", get("STOP_PROCESSING_UID"), "]")) + } +} + + + + + +# Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique +appendLetterSuffix <- function( cal ) +{ + rows = nrow(cal) + + if (rows > 1) + { + if (rows <= 26) + { + cal$UID <- paste0(cal$UID, " ", letters[1:rows]) + } + else + { + # Cases where we need extra letters, gives up to 676 ids + lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) + cal$UID <- paste0(cal$UID, " ", lett[1:rows]) + } + } + + return (cal) +} + + +# Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique +appendNumberSuffix<-function( cal, numToAppend ) +{ + if( numToAppend>1 ) #don't need to append a new number if we only have one pattern + { + # further differentiate the UID by appending a number to the end for each different days pattern + cal$UID <- paste0(cal$UID, numToAppend) + } + + return (cal) +} + + + # in a week bitmask, if there are non-operating days between the first and last operating day of the week - will return TRUE # e.g. 0010000 = FALSE 0011100 = FALSE 0101000 = TRUE hasGapInOperatingDays <- function( daysBitmask ) @@ -321,7 +380,7 @@ allocateCancellationsAcrossCalendars <- function( calendar, cancellations ) #and the day of the cancellation is an operating day of the calendar item joined = cancellations[calendar, on = .(originalUID==originalUID, start_date>=start_date, - end_date<=end_date)][ + end_date<=end_date), nomatch = 0][ ((i.monday&monday) | (i.tuesday&tuesday) | (i.wednesday&wednesday) | (i.thursday&thursday) | (i.friday&friday) | (i.saturday&saturday) | (i.sunday&sunday)), ] #revert the stashed (join) fields @@ -550,15 +609,12 @@ splitDates <- function(cal) { calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0), ] # Append UID to note the changes - if (nrow(calNew) > 0) { - if (nrow(calNew) <= 26) { - calNew$UID <- paste0(calNew$UID, " ", letters[1:nrow(calNew)]) - } else { - # Cases where we need extra letters, gives upto 676 ids - lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) - calNew$UID <- paste0(calNew$UID, " ", lett[1:nrow(calNew)]) - } - } else { + if (nrow(calNew) > 0) + { + calNew <- appendLetterSuffix( calNew ) + } + else + { calNew <- NA } @@ -568,6 +624,7 @@ splitDates <- function(cal) { + #' make calendar helper function #' this originally expected and dealt with cancellations too. 
This worked ok for single day duration cancellations #' but had problems with multi-day cancellations when combined with overlays @@ -577,6 +634,15 @@ splitDates <- function(cal) { #' makeCalendarInner <- function(calendarSub) { + if ( !is.null(STOP_PROCESSING_UID) ) + { + if ( any( STOP_PROCESSING_UID==calendarSub$UID) ) + { + message(paste0(Sys.time(), " Reached STOP_PROCESSING_UID value [", unique(calendarSub$UID), "] length=", length(calendarSub$UID))) + stop("Option:UK2GTFS_option_stopProcessingAtUid has been set: Stopped processing at UID=", STOP_PROCESSING_UID) + } + } + if ( 1 == nrow(calendarSub) ) { # make into an single entry @@ -602,15 +668,15 @@ makeCalendarInner <- function(calendarSub) { if( length(overlayDurations) <= 0 ) { #assume the input data is good and the base timetables don't break any of the overlaying /operating day rules - res = list(calendarSub, NA) + res = list( appendLetterSuffix(calendarSub), NA) } - #if every overlay is a one day cancellation (and only one base timetable) + #if every overlay is a one day cancellation #TODO remove this condition on only one base - code works, removing just breaks some tests that would need fixing else if (all(overlayDurations == 1) && all(overlayTypes == "C") && sum(allTypes == baseType) == 1 ) { warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") # Apply the cancellation via entries in calendar_dates.txt - res = list( calendarSub[calendarSub$STP != "C", ], + res = list( appendLetterSuffix( calendarSub[calendarSub$STP != "C", ] ), calendarSub[calendarSub$STP == "C", ]) } else @@ -679,9 +745,7 @@ makeCalendarForDayPatterns <- function( dayPatterns, calendar ) # rejects NAs if (inherits(calendarNewDay, "data.frame")) { - # further differentiate the UID by appending a number to the end for each different days pattern - calendarNewDay$UID <- paste0(calendarNewDay$UID, k) - splits[[k]] <- calendarNewDay + splits[[k]] <- appendNumberSuffix( calendarNewDay, k ) } } } @@ -729,17 +793,17 @@ makeCalendarForDifferentDayPatterns <- function( calendar ) for (k in seq(1, length(distinctBasePatterns))) { - thisBase = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] + theseBases = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] - thisOverlay = overlays[ intersectingDayPatterns( distinctBasePatterns[k], overlays$Days ) ] + theseOverlays = overlays[ intersectingDayPatterns( distinctBasePatterns[k], overlays$Days ) ] - if (nrow(thisOverlay) <= 0) + if (nrow(theseOverlays) <= 0) { - splits[[k]] <- thisBase + splits[[k]] <- appendNumberSuffix( appendLetterSuffix( theseBases ), k ) } else { - timetablesForThisPattern = data.table::rbindlist( list( thisBase, thisOverlay ), use.names=FALSE) + timetablesForThisPattern = data.table::rbindlist( list( theseBases, theseOverlays ), use.names=FALSE) #performance pre-sort all the entries by the priority #this speeds things up when we look up the required priority overlay **SEE_NOTE** @@ -751,9 +815,8 @@ makeCalendarForDifferentDayPatterns <- function( calendar ) # rejects NAs if (inherits(thisSplit, "data.frame")) { - # further differentiate the UID by appending a number to the end for each different days pattern - thisSplit$UID <- paste0(thisSplit$UID, k) - splits[[k]] <- thisSplit + + splits[[k]] <- appendNumberSuffix( thisSplit, k ) } } } @@ -771,3 +834,4 @@ makeCalendarForDifferentDayPatterns <- function( calendar ) + diff --git a/R/globals.R b/R/globals.R index 6275df6..f6ab36b 100644 --- a/R/globals.R 
+++ b/R/globals.R @@ -15,3 +15,28 @@ utils::globalVariables(c( 'operator_code','route_id' )) + + +#' UK2GTFS option stopProcessingAtUid +#' @description sets/gets a UID value at which processing will stop - used for debugging +#' @param value option value to be set +#' @details If no value passed in will return the current setting of the option. (Usually NULL) +#' If value passed in, timetable build processing will stop in atoc_overlay.makeCalendarInner() +#' when an exact match for that value is encountered. +#' +#' THIS ONLY WORKS WITH ncores==1 +#' (probably some environment nonsense I don't understand) +#' +#' @export +UK2GTFS_option_stopProcessingAtUid <- function(value) +{ + if (missing(value)) + { + return( getOption("UK2GTFS_opt_stopProcessingAtUid") ) + } + else + { + return( options(UK2GTFS_opt_stopProcessingAtUid = value) ) + } +} + diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index 4926c20..05105a0 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -20,10 +20,19 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa flattened <- unlist(gtfs_list, recursive = FALSE) rm(gtfs_list) + #The Atoc code has moved from data.frame to data.table for performance reasons, but the transXchange code hasn't migrated yet. + #this is a breaking change for some items because the behaviour for data.table isn't the same as data.frame, despite extending data.frame. nice. + #least painful way to fix this for now is to convert to data.table if supplied data.frame + flattened <- lapply( flattened, function(item) + { + if ( inherits(item, "data.table" ) ) return (item) + return (data.table(item)) + } ) + #get unique input table names tableNames <- unique(names(flattened)) - grouped_list <- list() + grouped_list <- list() # Loop through table names names and group data frames for (tableName in tableNames) { @@ -125,8 +134,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa agency_lang = agency_lang[1] ) } else { - stop(paste0("Duplicated Agency IDs ", - paste(unique(agency.check$agency_id[duplicated(agency.check$agency_id)]), collapse = " "))) + stop("Duplicated Agency IDs: ", + paste(unique(agency.check$agency_id[duplicated(agency.check$agency_id)]), collapse = " ")) } } else { agency <- agency[!duplicated(agency$agency_id), ] @@ -140,7 +149,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa if(force){ stops <- stops[!duplicated(stops$stop_id),] } else { - stop("Duplicated Stop IDS") + stop("Duplicated Stop IDS: ", + paste( unique(stops$stop_id[duplicated(stops$stop_id)]), collapse = " ")) } } @@ -156,14 +166,15 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa routes <- routes[!duplicated(new_route_id), ] new_route_id <- routes[, c("file_id", "route_id")] } else { - stop("Duplicated route_id within the same GTFS file, try using force = TRUE") + stop("Duplicated route_id within the same GTFS file, try using force = TRUE ", + paste( unique(new_route_id$route_id[duplicated(new_route_id)]), collapse = " ")) } } new_route_id$route_id_new <- seq(1, nrow(new_route_id)) routes <- dplyr::left_join(routes, new_route_id, by = c("file_id", "route_id")) - routes <- routes[, c("route_id_new", retainedColumnNames)] + routes <- routes[, c("route_id_new", retainedColumnNames), with=FALSE] routes <- routes %>% dplyr::rename(route_id = route_id_new) } @@ -176,7 +187,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa new_service_id <- calendar[, 
c("file_id", "service_id")] if (any(duplicated(new_service_id))) { - stop("Duplicated service_id within the same GTFS file") + stop("Duplicated service_id within the same GTFS file: ", + paste( unique(new_service_id$service_id[duplicated(new_service_id)]), collapse = " ")) } # it is valid to have calendar_dates with no associated calendar (see comments further down) @@ -187,14 +199,14 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] calendar <- dplyr::left_join(calendar, new_service_id, by = c("file_id", "service_id")) - calendar <- calendar[, c("service_id_new", retainedColumnNames)] + calendar <- calendar[, c("service_id_new", retainedColumnNames), with=FALSE] names(calendar) <- c("service_id", retainedColumnNames) if (nrow(calendar_dates) > 0) { retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] calendar_dates <- dplyr::left_join(calendar_dates, new_service_id, by = c("file_id", "service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)] + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames), with=FALSE] calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) } } @@ -211,7 +223,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa stop_times <- unique(stop_times) new_trip_id <- trips[, c("file_id", "trip_id")] } else{ - stop("Duplicated trip_id within the same GTFS file") + stop(paste0("Duplicated trip_id within the same GTFS file", + paste( unique( new_trip_id$trip_id[duplicated(new_trip_id)]), collapse = " "))) } @@ -220,19 +233,19 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("trip_id"))] trips <- dplyr::left_join(trips, new_trip_id, by = c("file_id", "trip_id")) - trips <- trips[, c("trip_id_new", retainedColumnNames)] + trips <- trips[, c("trip_id_new", retainedColumnNames), with=FALSE] trips <- trips %>% dplyr::rename(trip_id = trip_id_new) retainedColumnNames <- colnames(stop_times)[!(colnames(stop_times) %in% c("trip_id", "file_id"))] stop_times <- dplyr::left_join(stop_times, new_trip_id, by = c("file_id", "trip_id")) - stop_times <- stop_times[, c("trip_id_new", retainedColumnNames)] + stop_times <- stop_times[, c("trip_id_new", retainedColumnNames), with=FALSE] stop_times <- stop_times %>% dplyr::rename(trip_id = trip_id_new) if ( length(frequencies) > 0 ) { retainedColumnNames <- colnames(frequencies)[!(colnames(frequencies) %in% c("trip_id", "file_id"))] frequencies <- dplyr::left_join(frequencies, new_trip_id, by = c("file_id", "trip_id")) - frequencies <- frequencies[, c("trip_id_new", retainedColumnNames)] + frequencies <- frequencies[, c("trip_id_new", retainedColumnNames), with=FALSE] frequencies <- frequencies %>% dplyr::rename(trip_id = trip_id_new) } } @@ -240,14 +253,14 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa if (exists("new_service_id")) { retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id"))] trips <- dplyr::left_join(trips, new_service_id, by = c("file_id", "service_id")) - trips <- trips[, c(retainedColumnNames, "service_id_new")] + trips <- trips[, c(retainedColumnNames, "service_id_new"), with=FALSE] trips <- trips %>% dplyr::rename(service_id = service_id_new) } if (exists("new_route_id")) { 
retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("route_id"))] trips <- dplyr::left_join(trips, new_route_id, by = c("file_id", "route_id")) - trips <- trips[, c("route_id_new", retainedColumnNames)] + trips <- trips[, c("route_id_new", retainedColumnNames), with=FALSE] trips <- trips %>% dplyr::rename(route_id = route_id_new) } @@ -265,9 +278,9 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa if (condenseServicePatterns && nrow(calendar_dates) > 0) { if(!quiet){message("Condensing duplicated service patterns")} - #find every unique combination of calendar_dates and calender values + #find every unique combination of calendar_dates and calendar values calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id) - if(class(calendar_dates_summary$date) == "Date"){ + if( inherits(calendar_dates_summary$date, "Date") ){ calendar_dates_summary <- dplyr::summarise(calendar_dates_summary, pattern = paste(c(as.character(date), exception_type), collapse = "") ) @@ -291,18 +304,18 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id", "route_id"))] trips <- dplyr::left_join(trips, calendar_summary, by = c("service_id")) - trips <- trips[, c("route_id", "service_id_new", retainedColumnNames)] + trips <- trips[, c("route_id", "service_id_new", retainedColumnNames), with=FALSE] trips <- trips %>% dplyr::rename(service_id = service_id_new) retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] calendar <- dplyr::left_join(calendar, calendar_summary, by = c("service_id")) - calendar <- calendar[, c("service_id_new", retainedColumnNames)] + calendar <- calendar[, c("service_id_new", retainedColumnNames), with=FALSE] calendar <- calendar %>% dplyr::rename(service_id = service_id_new) calendar <- calendar[!duplicated(calendar$service_id), ] retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] calendar_dates <- dplyr::left_join(calendar_dates, calendar_summary, by = c("service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)] + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames), with=FALSE] calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ] } @@ -314,7 +327,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa if(force){ shapes <- shapes[!duplicated(composite_key),] } else { - stop("Duplicated Shapes IDS") + stop(paste0("Duplicated Shapes IDS", + paste( unique( composite_key[duplicated(composite_key)]), collapse = " "))) } } diff --git a/man/UK2GTFS_option_stopProcessingAtUid.Rd b/man/UK2GTFS_option_stopProcessingAtUid.Rd new file mode 100644 index 0000000..1c69f21 --- /dev/null +++ b/man/UK2GTFS_option_stopProcessingAtUid.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/globals.R +\name{UK2GTFS_option_stopProcessingAtUid} +\alias{UK2GTFS_option_stopProcessingAtUid} +\title{UK2GTFS option stopProcessingAtUid} +\usage{ +UK2GTFS_option_stopProcessingAtUid(value) +} +\arguments{ +\item{value}{option value to be set} +} +\description{ +sets/gets a UID value at which processing will stop - used for debugging +} +\details{ +If no value passed in will return the current setting of the option. 
(Usually NULL) + If value passed in, timetable build processing will stop in atoc_overlay.makeCalendarInner() + when an exact match for that value is encountered. +} diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index 794fa60..a0c287a 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -38,6 +38,33 @@ printDifferences <- function( v1, v2 ) } + + + +test_that("test changing module level variable", { + + env <- asNamespace("UK2GTFS") + + current = get("STOP_PROCESSING_UID", envir=env) + + set_STOP_PROCESSING_UID( "xxxx") + + new = get("STOP_PROCESSING_UID", envir=env) + + set_STOP_PROCESSING_UID( "yyyy") + + new2 = get("STOP_PROCESSING_UID", envir=env) + + set_STOP_PROCESSING_UID( NULL ) + + new3 = get("STOP_PROCESSING_UID", envir=env) + + expect_true( "xxxx"==new && "yyyy"==new2 && is.null(new3) ) +}) + + + + test_that("test countIntersectingDayPatterns:1", { OK = TRUE @@ -820,7 +847,7 @@ test_that("5:test makeCalendarInner:one day cancellations(current)", { res.calendar <- res[[1]] res.calendar_dates <- res[[2]] - expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 a2", "uid1 b2", "uid1 c2"), + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 a2", "uid1 b2", "uid1 c2"), start_date=c("02-01-2023", "12-01-2023", "24-01-2023", "08-01-2023", "12-01-2023", "24-01-2023"), end_date=c( "10-01-2023", "22-01-2023", "04-02-2023", "10-01-2023", "22-01-2023", "05-02-2023"), Days=c( "1111110", "1111110", "1111110", "0000001", "0000001", "0000001"), @@ -855,7 +882,7 @@ test_that("6:test makeCalendarInner:overlay -matching base pattern", { res.calendar <- res[[1]] res.calendar_dates <- res[[2]] - expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 a2"), + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid12"), start_date=c("02-01-2023", "09-01-2023", "22-01-2023", "08-01-2023"), end_date=c( "08-01-2023", "21-01-2023", "04-02-2023", "05-02-2023"), Days=c( "1111110", "1111110", "1111110", "0000001"), @@ -868,7 +895,6 @@ test_that("6:test makeCalendarInner:overlay -matching base pattern", { printDifferencesDf(expectedResult,res.calendar) - expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) }) @@ -889,12 +915,20 @@ test_that("6.1:test makeCalendarInner:bases with different patterns, no overlay" res.calendar <- res[[1]] res.calendar_dates <- res[[2]] + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c"), + start_date=c("22-05-2023", "25-09-2023", "02-10-2023"), + end_date=c( "22-09-2023", "26-09-2023", "13-10-2023"), + Days=c( "1111100", "1100000", "1111100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + expectedResult <- fixCalendarDates( expectedResult ) + res.calendar = removeOriginalUidField( res.calendar ) - testData = removeOriginalUidField( testData ) + expectedResult = removeOriginalUidField( expectedResult ) - printDifferencesDf(testData,res.calendar) + printDifferencesDf(expectedResult,res.calendar) - expect_true(identical(testData,res.calendar) & is.na(res.calendar_dates)) + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) }) @@ -913,12 +947,20 @@ test_that("6.2:test makeCalendarInner:base is N (STP) with different patterns, n res.calendar <- res[[1]] res.calendar_dates <- res[[2]] + expectedResult = data.table(UID=c( "uid1 a", "uid1 b"), + start_date=c("26-06-2023", "31-07-2023"), + end_date=c( "29-07-2023", "03-08-2023"), + Days=c( "1111110", "1111000"), + STP=c( "N", "N"), + 
rowID=c( 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + res.calendar = removeOriginalUidField( res.calendar ) - testData = removeOriginalUidField( testData ) + expectedResult = removeOriginalUidField( expectedResult ) - printDifferencesDf(testData,res.calendar) + printDifferencesDf(expectedResult,res.calendar) - expect_true(identical(testData,res.calendar) & is.na(res.calendar_dates)) + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) }) @@ -938,7 +980,7 @@ test_that("7:test makeCalendarInner:overlay -different to base pattern", { res.calendar <- res[[1]] res.calendar_dates <- res[[2]] - expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1"), + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid12"), start_date=c("02-01-2023", "10-01-2023", "15-01-2023", "17-01-2023", "22-01-2023", "08-01-2023"), end_date=c( "09-01-2023", "14-01-2023", "16-01-2023", "21-01-2023", "04-02-2023", "05-02-2023"), Days=c( "1111110", "0111110", "1111110", "0111110", "1111110", "0000001"), @@ -971,7 +1013,7 @@ test_that("8:test makeCalendarInner:overlay -different to base pattern-gap in pa res.calendar_dates <- res[[2]] expectedResult = data.table( - UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 f1"), + UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid1 f"), start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023", "14-01-2023"), end_date=c( "09-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023", "16-01-2023"), Days=c( "1111110", "0100000", "0010000", "1111110", "0000100", "1111110"), @@ -979,7 +1021,7 @@ test_that("8:test makeCalendarInner:overlay -different to base pattern-gap in pa rowID=c( 1, 3, 3, 1, 3, 1)) expectedResult = rbind(expectedResult, data.table( - UID=c( "uid1 g1", "uid1 h1", "uid1 i1", "uid1 j1", "uid1 k1", "uid1"), + UID=c( "uid1 g", "uid1 h", "uid1 i", "uid1 j", "uid1 k", "uid12"), start_date=c("17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "21-01-2023", "08-01-2023"), end_date=c( "17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "04-02-2023", "05-02-2023"), Days=c( "0100000", "0010000", "1111110", "0000100", "1111110", "0000001"), @@ -1013,7 +1055,7 @@ test_that("9:test makeCalendarInner:overlay -different to base pattern-gap in pa res.calendar_dates <- res[[2]] expectedResult = data.table( - UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1"), #the 'f' calendar gets thrown away + UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e"), #the 'f' calendar gets thrown away start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023"), end_date=c( "09-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023"), Days=c( "0111100", "0100000", "0010000", "0111100", "0000100"), @@ -1021,7 +1063,7 @@ test_that("9:test makeCalendarInner:overlay -different to base pattern-gap in pa rowID=c( 1, 3, 3, 1, 3)) expectedResult = rbind(expectedResult, data.table( - UID=c( "uid1 g1", "uid1 h1", "uid1 i1", "uid1 j1", "uid1 k1", "uid1"), + UID=c( "uid1 g", "uid1 h", "uid1 i", "uid1 j", "uid1 k", "uid12"), start_date=c("17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "21-01-2023", "08-01-2023"), end_date=c( "17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "03-02-2023", "05-02-2023"), Days=c( "0100000", "0010000", "0111100", "0000100", "0111100", "0000001"), @@ -1063,7 +1105,7 @@ test_that("10:test makeCalendarInner", { # instead of going 'oh that's 
fine, march and april don't overlap expectedResult = data.table( - UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 f1"), + UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid1 f"), start_date=c("02-01-2023", "11-01-2023", "13-01-2023", "18-01-2023", "20-01-2023", "24-01-2023"), end_date=c( "10-01-2023", "11-01-2023", "17-01-2023", "19-01-2023", "22-01-2023", "03-02-2023"), Days=c( "1111100", "0011000", "1111100", "0011000", "1111100", "1111100"), @@ -1071,7 +1113,7 @@ test_that("10:test makeCalendarInner", { rowID=c( 1, 4, 1, 4, 1, 1)) expectedResult = rbind(expectedResult, data.table( - UID=c( "uid1", "uid1 c3", "uid1 d3" ), + UID=c( "uid12", "uid1 c3", "uid1 d3" ), start_date=c("08-01-2023", "01-03-2023", "10-03-2023" ), end_date=c( "05-02-2023", "07-03-2023", "31-03-2023" ), Days=c( "0000001", "0011100", "0011100" ), @@ -1107,8 +1149,7 @@ test_that("11:test makeCalendarInner: overlay matching pattern of a base that is res.calendar <- res[[1]] res.calendar_dates <- res[[2]] - #this is what the code produces - it is wrong. e.g 10/3 service is missing - expectedResult = data.table(UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 a2"), + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid12"), start_date=c("04-01-2023", "11-01-2023", "20-01-2023", "01-03-2023", "17-03-2023", "08-01-2023"), end_date=c( "10-01-2023", "19-01-2023", "02-02-2023", "07-03-2023", "30-03-2023", "05-02-2023"), Days=c( "0011000", "0011000", "0011000", "0011000", "0011000", "0000001"), @@ -1153,7 +1194,7 @@ test_that("12: test makeCalendarInner", { res.calendar_dates <- res[[2]] expectedResult = data.table( - UID=c( "uid1 a1", "uid1 b1", "uid1 c1", "uid1 d1", "uid1 e1", "uid1 f1"), + UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid1 f"), start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "14-01-2023", "17-01-2023", "18-01-2023"), end_date=c( "08-01-2023", "10-01-2023", "13-01-2023", "15-01-2023", "17-01-2023", "20-01-2023"), Days=c( "1111110", "1111110", "0011100", "1111110", "1111110", "0011100"), @@ -1161,7 +1202,7 @@ test_that("12: test makeCalendarInner", { rowID=c( 1, 1, 4, 1, 1, 4)) expectedResult = rbind(expectedResult, data.table( - UID=c( "uid1 g1", "uid1 h1", "uid1 i1", "uid1 a2", "uid1 c2", "uid1 d3"), + UID=c( "uid1 g", "uid1 h", "uid1 i", "uid1 a2", "uid1 c2", "uid1 d3"), start_date=c("21-01-2023", "25-01-2023", "28-01-2023", "08-01-2023", "23-01-2023", "01-03-2023"), end_date=c( "24-01-2023", "27-01-2023", "04-02-2023", "14-01-2023", "05-02-2023", "31-03-2023"), Days=c( "1111110", "0011100", "1111110", "0000001", "0000001", "0011100"), @@ -1179,3 +1220,48 @@ test_that("12: test makeCalendarInner", { }) + +test_that("11:test makeCalendarInner: overlay matching pattern of a base that is offset temporaly", { + + testData = data.table( + UID=c( "C09094", "C09094", "C09094", "C09094", "C09094", "C09094", "C09094"), + start_date=c("22-05-2023", "27-05-2023", "10-08-2023", "07-09-2023", "21-09-2023", "05-10-2023", "28-10-2023"), + end_date=c( "08-12-2023", "09-09-2023", "10-08-2023", "07-09-2023", "21-09-2023", "05-10-2023", "09-12-2023"), + Days=c( "1111100", "0000010", "0001000", "0001000", "0001000" , "0001000", "0000010"), + STP=c( "P", "P", "O", "O", "O", "O", "P"), + rowID=c( 440737, 1205259, 2856390, 4529108, 5368156, 5835925, 6097391)) + + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = 
data.table( + UID=c( "C09094 a", "C09094 b", "C09094 c", "C09094 d", "C09094 e", "C09094 f", "C09094 g"), + start_date=c("22-05-2023", "10-08-2023", "11-08-2023", "07-09-2023", "08-09-2023", "21-09-2023", "22-09-2023"), + end_date=c( "09-08-2023", "10-08-2023", "06-09-2023", "07-09-2023", "20-09-2023", "21-09-2023", "04-10-2023"), + Days=c( "1111100", "0001000", "1111100", "0001000", "1111100" , "0001000", "1111100"), + STP=c( "P", "O", "P", "O", "P", "O", "P"), + rowID=c( 440737, 2856390, 440737, 4529108, 440737, 5368156, 440737)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "C09094 h", "C09094 i", "C09094 a2", "C09094 b2"), + start_date=c("05-10-2023", "06-10-2023", "27-05-2023", "28-10-2023"), + end_date=c( "05-10-2023", "08-12-2023", "09-09-2023", "09-12-2023"), + Days=c( "0001000", "1111100", "0000010", "0000010"), + STP=c( "O", "P", "P", "P"), + rowID=c( 5835925, 440737, 1205259, 6097391))) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + From 8554bd5986e6effa7fdfebd3125fae3e65fbcb4a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 17:05:09 +0100 Subject: [PATCH 36/81] tidy up logging messages --- R/gtfs_merge.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index 05105a0..a2449a0 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -149,8 +149,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa if(force){ stops <- stops[!duplicated(stops$stop_id),] } else { - stop("Duplicated Stop IDS: ", - paste( unique(stops$stop_id[duplicated(stops$stop_id)]), collapse = " ")) + stop("Duplicated Stop IDS: ", paste( unique(stops$stop_id[duplicated(stops$stop_id)]), collapse = " ")) } } @@ -166,7 +165,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa routes <- routes[!duplicated(new_route_id), ] new_route_id <- routes[, c("file_id", "route_id")] } else { - stop("Duplicated route_id within the same GTFS file, try using force = TRUE ", + stop("Duplicated route_id within the same GTFS file, try using force = TRUE :", paste( unique(new_route_id$route_id[duplicated(new_route_id)]), collapse = " ")) } } @@ -223,8 +222,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa stop_times <- unique(stop_times) new_trip_id <- trips[, c("file_id", "trip_id")] } else{ - stop(paste0("Duplicated trip_id within the same GTFS file", - paste( unique( new_trip_id$trip_id[duplicated(new_trip_id)]), collapse = " "))) + stop("Duplicated trip_id within the same GTFS file :", + paste( unique( new_trip_id$trip_id[duplicated(new_trip_id)]), collapse = " ")) } @@ -327,8 +326,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa if(force){ shapes <- shapes[!duplicated(composite_key),] } else { - stop(paste0("Duplicated Shapes IDS", - paste( unique( composite_key[duplicated(composite_key)]), collapse = " "))) + stop("Duplicated Shapes IDS :", paste( unique( composite_key[duplicated(composite_key)]), collapse = " ")) } } From 92818e83f1fc0a9c80f31b42ea0d4f4503b3d537 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 19:00:24 +0100 Subject: [PATCH 37/81] updated comment after documentation build --- 
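Usage note, illustrative only and not part of this patch: the calls below use
the helper exactly as exported in R/globals.R above; the timetable .zip path is
a placeholder, and UID "C09094" is simply borrowed from the unit test added
above.

library(UK2GTFS)

UK2GTFS_option_stopProcessingAtUid("C09094")  # arm: the build stops when makeCalendarInner() meets this UID
UK2GTFS_option_stopProcessingAtUid()          # no argument: returns the current value, "C09094"

# gtfs <- atoc2gtfs(path_in = "ttis123.zip", ncores = 1)  # placeholder file; only honoured when ncores == 1

UK2GTFS_option_stopProcessingAtUid(NULL)      # clear the option when finished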
man/UK2GTFS_option_stopProcessingAtUid.Rd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/man/UK2GTFS_option_stopProcessingAtUid.Rd b/man/UK2GTFS_option_stopProcessingAtUid.Rd index 1c69f21..29c41b9 100644 --- a/man/UK2GTFS_option_stopProcessingAtUid.Rd +++ b/man/UK2GTFS_option_stopProcessingAtUid.Rd @@ -16,4 +16,7 @@ sets/gets a UID value at which processing will stop - used for debugging If no value passed in will return the current setting of the option. (Usually NULL) If value passed in, timetable build processing will stop in atoc_overlay.makeCalendarInner() when an exact match for that value is encountered. + + THIS ONLY WORKS WITH ncores==1 + (probably some environment nonsense I don't understand) } From 14af9d938d204a79c634d245ef70d155750822d1 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 19:21:47 +0100 Subject: [PATCH 38/81] fix warnings during tests --- R/gtfs_merge.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index a2449a0..5e1bbdb 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -330,10 +330,10 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePa } } + if ("file_id" %in% colnames(stop_times) ) stop_times$file_id <- NULL + if ("file_id" %in% colnames(calendar) ) calendar$file_id <- NULL shapes$file_id <- NULL - stop_times$file_id <- NULL routes$file_id <- NULL - calendar$file_id <- NULL frequencies$file_id <- NULL res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies) names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies") From 8a7b27f6a97cbb8362635675f60d567431e3cb9d Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 5 Sep 2023 22:21:24 +0100 Subject: [PATCH 39/81] don't think this should be under version control ? 
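The file is regenerated by download_data() in R/extdata.R whenever the package
data is refreshed, so nothing should depend on the committed copy. If it keeps
reappearing in local working trees, a .gitignore entry would cover it (a
sketch, assuming the path stays the same):

# .gitignore
inst/extdata/date.txt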
---
 inst/extdata/date.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 inst/extdata/date.txt

diff --git a/inst/extdata/date.txt b/inst/extdata/date.txt
deleted file mode 100644
index 4fe01a2..0000000
--- a/inst/extdata/date.txt
+++ /dev/null
@@ -1 +0,0 @@
-2023-08-30T15:51:50Z

From 67ba10a1cb33a80e5650cabdf4c5fda4ba45a9d4 Mon Sep 17 00:00:00 2001
From: oweno-tfwm
Date: Wed, 6 Sep 2023 00:55:34 +0100
Subject: [PATCH 40/81] update to read TIPLOC file in either lat/lon csv format
 or geometry file + more error checking that data files loaded ok

---
 R/atoc_nr.R | 70 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 25 deletions(-)

diff --git a/R/atoc_nr.R b/R/atoc_nr.R
index 5837058..f8bee4b 100644
--- a/R/atoc_nr.R
+++ b/R/atoc_nr.R
@@ -42,32 +42,57 @@ nr2gtfs <- function(path_in,
                     shapes = FALSE,
                     working_timetable = FALSE,
                     public_only = TRUE) {
+  # checkmate
+  checkmate::assert_character(path_in, len = 1)
+  checkmate::assert_file_exists(path_in)
+  checkmate::assert_logical(silent)
+  checkmate::assert_numeric(ncores, lower = 1)
+  checkmate::assert_logical(shapes)
 
-  if(inherits(locations,"character")){
-    if(locations == "tiplocs"){
-      load_data("tiplocs")
-      locations = tiplocs
-    }
+  if (ncores == 1) {
+    message(paste0(Sys.time(), " This will take some time, make sure you use 'ncores' to enable multi-core processing"))
   }
 
   if(inherits(agency,"character")){
     if(agency == "atoc_agency"){
       load_data("atoc_agency")
       agency = atoc_agency
+      if ( !inherits(agency, "data.frame") || 0==nrow(agency) ){ stop("failed to load atoc_agency data.") }
     }
   }
 
-
-  # checkmate
-  checkmate::assert_character(path_in, len = 1)
-  checkmate::assert_file_exists(path_in)
-  checkmate::assert_logical(silent)
-  checkmate::assert_numeric(ncores, lower = 1)
-  checkmate::assert_logical(shapes)
 
-  if (ncores == 1) {
-    message(paste0(Sys.time(), " This will take some time, make sure you use 'ncores' to enable multi-core processing"))
+  if(inherits(locations,"character")){
+    if(locations == "tiplocs"){
+      load_data("tiplocs")
+      locations = tiplocs
+      if ( !inherits(locations, "data.frame") || 0==nrow(locations) ){ stop("failed to load tiploc data.") }
+    }
+  }
+
+  # Get the Station Locations
+  if (inherits(locations, "data.frame"))
+  {
+    if (inherits(locations, "sf"))
+    {
+      stops <- cbind(locations, sf::st_coordinates(locations))
+      stops <- as.data.frame(stops)
+      stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )]
+    }
+    else
+    {
+      stops = locations
+    }
+
+    names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" )
+  } else {
+    stops <- utils::read.csv(locations, stringsAsFactors = FALSE)
   }
 
+  stops$stop_lat <- round(stops$stop_lat, 5)
+  stops$stop_lon <- round(stops$stop_lon, 5)
+
+
+
   # Is input a zip or a folder
   if (!grepl(".gz", path_in)) {
     stop("path_in is not a .gz file")
@@ -83,17 +108,6 @@ nr2gtfs <- function(path_in,
   )
 
 
-  # Get the Station Locations
-  if ("sf" %in% class(locations)) {
-    stops <- cbind(locations, sf::st_coordinates(locations))
-    stops <- as.data.frame(stops)
-    stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )]
-    names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" )
-    stops$stop_lat <- round(stops$stop_lat, 5)
-    stops$stop_lon <- round(stops$stop_lon, 5)
-  } else {
-    stops <- utils::read.csv(locations, stringsAsFactors = FALSE)
-  }
 
   # Construct the GTFS
   stop_times <- mca[["stop_times"]]
@@ -110,6 +124,12 @@ nr2gtfs <- function(path_in,
 
   # remove any unused stops
   stops <- stops[stops$stop_id %in% 
stop_times$stop_id, ] + if ( nrow(stops)<=0 ) + { + stop("Could not match any stops in input data to internal stop database.") + } + + # Main Timetable Build timetables <- schedule2routes( stop_times = stop_times, From 43d4766ef491e05fee0f4d338857b445f9978e69 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Wed, 6 Sep 2023 01:48:07 +0100 Subject: [PATCH 41/81] reduce double space to single space --- R/atoc_export.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index d996ec9..9d99243 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -180,9 +180,9 @@ longnames <- function(routes, stop_times, stops) { by = c("rowID" = "schedule")) - routes[`Train Category` == "SS", route_long_name := paste("Ship ",route_long_name)] - routes[`Train Category` %in% c("BS", "BR"), route_long_name := paste("Bus ",route_long_name)] - routes[!(`Train Category` %in% c("SS", "BS", "BR")), route_long_name := paste("Train ",route_long_name)] + routes[`Train Category` == "SS", route_long_name := paste("Ship",route_long_name)] + routes[`Train Category` %in% c("BS", "BR"), route_long_name := paste("Bus",route_long_name)] + routes[!(`Train Category` %in% c("SS", "BS", "BR")), route_long_name := paste("Train",route_long_name)] #TODO reflect the London Transport services being set to metro/underground in this naming code return(routes) From a548652f91f69cf8120c72a28107ae04c8a0f5ae Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Wed, 6 Sep 2023 02:27:03 +0100 Subject: [PATCH 42/81] swap lat and long round when reading from .rda file (was getting swapped by mistake) --- R/atoc_nr.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/atoc_nr.R b/R/atoc_nr.R index f8bee4b..037f866 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -77,14 +77,15 @@ nr2gtfs <- function(path_in, stops <- cbind(locations, sf::st_coordinates(locations)) stops <- as.data.frame(stops) stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )] + names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" ) } else { stops = locations } - - names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" ) - } else { + } + else + { stops <- utils::read.csv(locations, stringsAsFactors = FALSE) } From 77a7e3d7d508fae8b32769e3b3285f7addeb1b20 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 00:16:58 +0100 Subject: [PATCH 43/81] stop update code blowing up as thread loads --- R/extdata.R | 2 +- R/zzz.R | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/R/extdata.R b/R/extdata.R index 2703066..ea2dd4b 100644 --- a/R/extdata.R +++ b/R/extdata.R @@ -59,7 +59,7 @@ download_data <- function(tag_name, package_location, date){ utils::unzip(file.path(tempdir(),"UK2GTFS_load/all.zip"), exdir = file.path(package_location, "extdata")) unlink(file.path(tempdir(),"UK2GTFS_load"), recursive = TRUE) - writeLines(date, file.path(package_location, "extdata/date.txt")) + writeLines( as.character(date), file.path(package_location, "extdata/date.txt")) } diff --git a/R/zzz.R b/R/zzz.R index 020fe99..11b5d3f 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,6 +1,13 @@ # Run when package loads +# bear in mind this runs in every worker process as we start them and load this package into the worker's namespace .onLoad <- function(libname, pkgname){ - update_data() + + tryCatch({ + update_data() + }, error = function(err) { + warning(Sys.time(), " Process id=", Sys.getpid(), " threw errors during package load while calling 
update_date() :", err) + }) + } From 8cc04fafde72c810125437b7da34f40d8c96a5c3 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 00:18:14 +0100 Subject: [PATCH 44/81] remove older style function and merge functionality to reduce number of code paths and hence improve test coverage --- R/atoc_main.R | 1 + R/atoc_overlay.R | 98 ++++++------------------- tests/testthat/test_aa_unit.R | 134 +++++++++++++++++----------------- 3 files changed, 92 insertions(+), 141 deletions(-) diff --git a/R/atoc_main.R b/R/atoc_main.R index f2d7f27..15def02 100644 --- a/R/atoc_main.R +++ b/R/atoc_main.R @@ -137,6 +137,7 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = routes <- routes[, c("route_id", "route_type", "ATOC Code", "route_long_name", "Train Category" )] names(routes) <- c("route_id", "route_type", "agency_id", "route_long_name", "train_category" ) + # IDs are not meaningful, just leave out routes$route_short_name <- "" # was: routes$route_id diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index a9e97a8..0224216 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -634,15 +634,6 @@ splitDates <- function(cal) { #' makeCalendarInner <- function(calendarSub) { - if ( !is.null(STOP_PROCESSING_UID) ) - { - if ( any( STOP_PROCESSING_UID==calendarSub$UID) ) - { - message(paste0(Sys.time(), " Reached STOP_PROCESSING_UID value [", unique(calendarSub$UID), "] length=", length(calendarSub$UID))) - stop("Option:UK2GTFS_option_stopProcessingAtUid has been set: Stopped processing at UID=", STOP_PROCESSING_UID) - } - } - if ( 1 == nrow(calendarSub) ) { # make into an single entry @@ -670,8 +661,8 @@ makeCalendarInner <- function(calendarSub) { #assume the input data is good and the base timetables don't break any of the overlaying /operating day rules res = list( appendLetterSuffix(calendarSub), NA) } - #if every overlay is a one day cancellation #TODO remove this condition on only one base - code works, removing just breaks some tests that would need fixing - else if (all(overlayDurations == 1) && all(overlayTypes == "C") && sum(allTypes == baseType) == 1 ) + #if every overlay is a one day cancellation + else if ( all(overlayDurations == 1) && all(overlayTypes == "C") ) { warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") @@ -697,69 +688,25 @@ makeCalendarInner <- function(calendarSub) { } else # split by day pattern { - #this works if the day patterns don't overlap any operating days. - if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1) ) - { - #this scenario DOES exist in the downloaded ATOC test data - #stop(paste("Scenario with overlay pattern not matching base pattern is not currently handled. 
service=", unique(calendarSub$UID))) - res = makeCalendarForDifferentDayPatterns( calendarSub ) - } - else - { - res = makeCalendarForDayPatterns( uniqueDayPatterns, calendarSub ) - } + res = makeCalendarForDifferentDayPatterns( calendarSub, uniqueDayPatterns ) } } } - #stopifnot( is.list(res) ) - return (res) -} - - -#TODO see if makeCalendarForDifferentDayPatterns() covers this case too -#- if so, remove this so there are fewer code paths to test -makeCalendarForDayPatterns <- function( dayPatterns, calendar ) -{ - splits <- list() - - #performance pre-sort all the entries by the priority - #this speeds things up when we look up the required priority overlay **SEE_NOTE** - #calendar = calendar[ order(STP, duration), ] - setkey( calendar, STP, duration ) - setindex( calendar, start_date, end_date) - - for (k in seq(1, length(dayPatterns))) { - # select for each pattern but include cancellations with a - # different day pattern - calendarDay <- calendar[calendar$Days == dayPatterns[k] | calendar$STP == "C", ] - # TODO cancellations now handled elsewhere - remove this once code stable - - if (all(calendarDay$STP == "C")) { - # ignore cases of everything is cancelled - splits[[k]] <- NULL - warning("unexpected item in the makeCalendarForDayPatterns-ing area, cancellations should now be handled at a higher level") - } - else { - calendarNewDay <- splitDates(calendarDay) - # rejects NAs - if (inherits(calendarNewDay, "data.frame")) { - splits[[k]] <- appendNumberSuffix( calendarNewDay, k ) - } + if ( !is.null(STOP_PROCESSING_UID) ) + { + if ( any( STOP_PROCESSING_UID==calendarSub$UID) ) + { + message(paste0(Sys.time(), " Reached STOP_PROCESSING_UID value [", unique(calendarSub$UID), "] length=", length(calendarSub$UID))) + stop("Option:UK2GTFS_option_stopProcessingAtUid has been set: Stopped processing at UID=", STOP_PROCESSING_UID) } } - splits <- data.table::rbindlist(splits, use.names=FALSE) - - splits <- makeCalendarsUnique( splits ) + return (res) +} - # after all this faffing about and splitting and joining, it's quite likely we've created some - # small fragments of base timetable that aren't valid (e.g mon-fri service but start and end date on weekend) - splits <- splits[ checkOperatingDayActive( splits ) ] - return(list(splits, NA)) -} # this is a complex case where the overlays don't have the same day pattern as the base timetable @@ -772,20 +719,24 @@ makeCalendarForDayPatterns <- function( dayPatterns, calendar ) # # when we get to this latter complexity we just split the overlay into individual days and apply it that way. # -makeCalendarForDifferentDayPatterns <- function( calendar ) +makeCalendarForDifferentDayPatterns <- function( calendar, uniqueDayPatterns ) { baseType = max(calendar$STP) baseTimetables = calendar[calendar$STP == baseType] overlayTimetables = calendar[calendar$STP != baseType] - gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ] - continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ] - - gappyOverlays = makeAllOneDay( gappyOverlays ) - continiousOverlays = expandAllWeeks( continiousOverlays ) + #do the day patterns overlap each other in any way ? + #e.g. a mon-sat pattern with a wed-fri overlap. 
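+  #illustrative example: a Tue+Thu overlay "0101000" has a gap in its
+  # operating days, so makeAllOneDay() below splits it into one-day calendars,
+  # while a continuous Tue-Thu overlay "0111000" is instead duplicated for
+  # every week it is in effect by expandAllWeeks()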
+ if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1) ) + { + gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ] + continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ] - overlays = data.table::rbindlist( list(continiousOverlays,gappyOverlays), use.names=FALSE) + gappyOverlays = makeAllOneDay( gappyOverlays ) + continiousOverlays = expandAllWeeks( continiousOverlays ) + overlayTimetables = data.table::rbindlist( list(continiousOverlays,gappyOverlays), use.names=FALSE) + } splits <- list() @@ -795,7 +746,7 @@ makeCalendarForDifferentDayPatterns <- function( calendar ) theseBases = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] - theseOverlays = overlays[ intersectingDayPatterns( distinctBasePatterns[k], overlays$Days ) ] + theseOverlays = overlayTimetables[ intersectingDayPatterns( distinctBasePatterns[k], overlayTimetables$Days ) ] if (nrow(theseOverlays) <= 0) { @@ -813,9 +764,8 @@ makeCalendarForDifferentDayPatterns <- function( calendar ) thisSplit <- splitDates( timetablesForThisPattern ) - # rejects NAs + # reject NAs if (inherits(thisSplit, "data.frame")) { - splits[[k]] <- appendNumberSuffix( thisSplit, k ) } } diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index a0c287a..fed5eb8 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -765,103 +765,103 @@ test_that("3:test makeCalendarInner:one base: one day cancellations", { -test_that("4:test makeCalendarInner:one day cancellations(old)", { +test_that("4:test makeCalendarInner:one day cancellations(current)", { - #there are multiple valid ways to process this - because of cancellations being handled at a higher level this - #test case no longer applies - but quite a bit of work to create the test case, so keep it for now. 
- expect_true(TRUE) + #all overlays 1 day cancellations - if(FALSE) - { - #all overlays 1 day cancellations + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0011100", "0010000", "0001000", "1000000" ), + STP=c( "P", "P", "P", "C", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5, 6)) - testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), - start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), - end_date=c( "04-02-2023", "05-02-2023", "31-03-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), - Days=c( "1111110", "0000001", "0011100", "0010000", "0001000", "1000000" ), - STP=c( "P", "P", "P", "C", "C", "C" ), - rowID=c( 1, 2, 3, 4, 5, 6)) + testData <- fixCalendarDates( testData ) - testData <- fixCalendarDates( testData ) + res <- makeCalendarInner( testData ) - res <- makeCalendarInner( testData ) + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] - res.calendar <- res[[1]] - res.calendar_dates <- res[[2]] + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c"), + start_date=c("02-01-2023", "08-01-2023", "01-03-2023"), + end_date=c( "04-02-2023", "05-02-2023", "31-03-2023"), + Days=c( "1111110", "0000001", "0011100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + expectedResult <- fixCalendarDates( expectedResult ) - expectedResult = data.table(UID=c( "uid1", "uid1", "uid1"), - start_date=c("02-01-2023", "08-01-2023", "01-03-2023"), - end_date=c( "04-02-2023", "05-02-2023", "31-03-2023"), - Days=c( "1111110", "0000001", "0011100"), - STP=c( "P", "P", "P"), - rowID=c( 1, 2, 3)) - expectedResult <- fixCalendarDates( expectedResult ) + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) - res.calendar = removeOriginalUidField( res.calendar ) - expectedResult = removeOriginalUidField( expectedResult ) + printDifferencesDf(expectedResult,res.calendar) - printDifferencesDf(expectedResult,res.calendar) + expectedResultDates = data.table(UID=c("uid1", "uid1", "uid1"), + start_date=c("11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "0010000", "0001000", "1000000" ), + STP=c( "C", "C", "C" ), + rowID=c( 4, 5, 6)) + expectedResultDates <- fixCalendarDates( expectedResultDates ) - expectedResultDates = data.table(UID=c("uid1", "uid1", "uid1"), - start_date=c("11-01-2023", "09-03-2023", "23-01-2023" ), - end_date=c( "11-01-2023", "09-03-2023", "23-01-2023" ), - Days=c( "0010000", "0001000", "1000000" ), - STP=c( "C", "C", "C" ), - rowID=c( 4, 5, 6)) - expectedResultDates <- fixCalendarDates( expectedResultDates ) + res.calendar_dates = removeOriginalUidField( res.calendar_dates ) + expectedResultDates = removeOriginalUidField( expectedResultDates ) - res.calendar_dates = removeOriginalUidField( res.calendar_dates ) - expectedResultDates = removeOriginalUidField( expectedResultDates ) - - expect_true(identical(expectedResult,res.calendar) - & identical(expectedResultDates,res.calendar_dates)) - } + printDifferencesDf( expectedResultDates,res.calendar_dates ) + expect_true(identical(expectedResult,res.calendar) + & identical(expectedResultDates,res.calendar_dates)) }) -test_that("5:test makeCalendarInner:one day cancellations(current)", { 
+test_that("5:test makeCalendarInner:one day cancellations(old)", { - # all overlays 1 day cancellations - # this method splits up the base timetable, leaving gaps where there are cancellation days + #there are multiple valid ways to process this - because of cancellations being handled at a higher level this + #test case no longer applies - but quite a bit of work to create the test case, so keep it for now. + expect_true(TRUE) - # this can create schedule entries which are by the CIF rules incorrect, because we don't - # validate that the new start/end dates align with the day pattern bitmask + if(FALSE) + { + # all overlays 1 day cancellations + # this method splits up the base timetable, leaving gaps where there are cancellation days - # while the cancellation part is no longer current, this is still a good test for all the date setting logic + # this can create schedule entries which are by the CIF rules incorrect, because we don't + # validate that the new start/end dates align with the day pattern bitmask - testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1"), - start_date=c("02-01-2023", "08-01-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), - end_date=c( "04-02-2023", "05-02-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), - Days=c( "1111110", "0000001", "0010000", "0001000", "1000000" ), - STP=c( "P", "P", "C", "C", "C" ), - rowID=c( 1, 2, 4, 5, 6)) - testData <- fixCalendarDates( testData ) + # while the cancellation part is no longer current, this is still a good test for all the date setting logic - res <- makeCalendarInner( testData ) + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0010000", "0001000", "1000000" ), + STP=c( "P", "P", "C", "C", "C" ), + rowID=c( 1, 2, 4, 5, 6)) + testData <- fixCalendarDates( testData ) - res.calendar <- res[[1]] - res.calendar_dates <- res[[2]] + res <- makeCalendarInner( testData ) - expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 a2", "uid1 b2", "uid1 c2"), - start_date=c("02-01-2023", "12-01-2023", "24-01-2023", "08-01-2023", "12-01-2023", "24-01-2023"), - end_date=c( "10-01-2023", "22-01-2023", "04-02-2023", "10-01-2023", "22-01-2023", "05-02-2023"), - Days=c( "1111110", "1111110", "1111110", "0000001", "0000001", "0000001"), - STP=c( "P", "P", "P", "P", "P", "P"), - rowID=c( 1, 1, 1, 2, 2, 2)) - expectedResult <- fixCalendarDates( expectedResult ) + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] - res.calendar = removeOriginalUidField( res.calendar ) - expectedResult = removeOriginalUidField( expectedResult ) + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid12"), + start_date=c("02-01-2023", "12-01-2023", "24-01-2023", "08-01-2023"), + end_date=c( "10-01-2023", "22-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "1111110", "1111110", "1111110", "0000001"), + STP=c( "P", "P", "P", "P"), + rowID=c( 1, 1, 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) - printDifferencesDf(expectedResult,res.calendar) + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + printDifferencesDf(expectedResult,res.calendar) - expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) 
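# Aside: an illustrative sketch (not from the patch) of the " a"/" b"/" c"
# suffix scheme in the expected UIDs above. appendLetterSuffix() keeps
# service_ids unique after a base timetable is split; the single-letter case
# amounts to:
suffixSketch <- function(uid, n) paste0(uid, " ", letters[seq_len(n)])
stopifnot(identical(suffixSketch("uid1", 3), c("uid1 a", "uid1 b", "uid1 c")))
# (the package falls back to two-letter suffixes beyond 26 splits)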
+ } }) From e6111fd0c746d7ce76c5c4263289501ba3fb366d Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 03:14:54 +0100 Subject: [PATCH 45/81] fix broken tests after merge of externalisation of data - remove common code to function to reduce maintenance --- R/atoc.R | 84 ++++++++++---------------------------- R/atoc_export.R | 2 +- R/atoc_nr.R | 105 +++++++++++++++++++++++++++++------------------- 3 files changed, 86 insertions(+), 105 deletions(-) diff --git a/R/atoc.R b/R/atoc.R index 6c39286..b2fff6c 100644 --- a/R/atoc.R +++ b/R/atoc.R @@ -10,7 +10,7 @@ #' @param agency where to get agency.txt (see details) #' @param shapes Logical, should shapes.txt be generated (default FALSE) #' @param transfers Logical, should transfers.txt be generated (default TRUE) -#' @param missing_tiplocs Logical, if locations = tiplocs, then will check for +#' @param missing_tiplocs Logical, if true will check for #' any missing tiplocs against the main file and add them.(default TRUE) #' @family main #' @@ -45,21 +45,6 @@ atoc2gtfs <- function(path_in, shapes = FALSE, transfers = TRUE, missing_tiplocs = TRUE) { - - if(inherits(locations,"character")){ - if(locations == "tiplocs"){ - load_data("tiplocs") - locations = tiplocs - } - } - - if(inherits(agency,"character")){ - if(agency == "atoc_agency"){ - load_data("atoc_agency") - agency = atoc_agency - } - } - # Checkmates checkmate::assert_character(path_in, len = 1) checkmate::assert_file_exists(path_in) @@ -73,6 +58,14 @@ atoc2gtfs <- function(path_in, " This will take some time, make sure you use 'ncores' to enable multi-core processing" )) } + + agency = getCachedAgencyData( agency ) + + if ( !inherits(locations, "character") || "file"!=locations ) + { + stops_sf = getCachedLocationData( locations ) + } + # Is input a zip or a folder if (grepl(".zip", path_in)) { # Unzip @@ -114,57 +107,23 @@ atoc2gtfs <- function(path_in, ) - # Get the Station Locations - # Are locations provided? 
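# Aside: an illustrative sketch (not from the patch) of the sf handling that
# the block being removed here used to do inline, and that
# getCachedLocationData() now centralises - flattening an sf POINT layer with
# stop_id/stop_code/stop_name attributes into the lat/lon columns GTFS expects:
sfToStopsSketch <- function(locations) {
  stops <- as.data.frame(cbind(locations, sf::st_coordinates(locations)))
  stops <- stops[, c("stop_id", "stop_code", "stop_name", "Y", "X")]
  names(stops) <- c("stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon")
  stops$stop_lat <- round(stops$stop_lat, 5)  # 5 dp is roughly 1 m, plenty for GTFS
  stops$stop_lon <- round(stops$stop_lon, 5)
  stops
}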
- if ("sf" %in% class(locations)) { - stops_sf <- cbind(locations, sf::st_coordinates(locations)) - stops_sf <- as.data.frame(stops_sf) - stops_sf <- stops_sf[, c( - "stop_id", "stop_code", "stop_name", - "Y", "X" - )] - names(stops_sf) <- c( - "stop_id", "stop_code", "stop_name", - "stop_lat", "stop_lon" - ) - stops_sf$stop_lat <- round(stops_sf$stop_lat, 5) - stops_sf$stop_lon <- round(stops_sf$stop_lon, 5) - } - # Should the file be checked - check_file <- FALSE - if("sf" %in% class(locations) & missing_tiplocs){ - check_file <- TRUE - } - - if ("character" %in% class(locations)) { - if(locations == "file"){ - check_file <- TRUE - } - } - - if (check_file) { + if ( TRUE==missing_tiplocs || + ( inherits(locations, "character") && "file"==locations ) ) + { msn <- importMSN(files[grepl(".msn", files)], silent = silent) station <- msn[[1]] TI <- mca[["TI"]] stops.list <- station2stops(station = station, TI = TI) stops_file <- stops.list[["stops"]] rm(msn,TI,stops.list) - } - # Was a csv provided - if ("character" %in% class(locations)) { - if(locations != "file"){ - checkmate::check_file_exists(locations) - stops_csv <- utils::read.csv(locations, stringsAsFactors = FALSE) + if( FALSE==missing_tiplocs || !exists("stops_sf") ) + { + stops <- stops_file } - } - - # Chose Correct stops - if(exists("stops_csv")){ - stops <- stops_csv - } else if(exists("stops_sf")){ - if(missing_tiplocs == TRUE){ + else + { # Combine stops_missing <- stops_file[!stops_file$stop_id %in% stops_sf$stop_id,] if(nrow(stops_missing) > 0){ @@ -173,12 +132,11 @@ atoc2gtfs <- function(path_in, } else { stops <- stops_sf } - - } else { - stops <- stops_sf } - } else if(exists("stops_file")){ - stops <- stops_file + } + else + { + stops <- stops_sf } diff --git a/R/atoc_export.R b/R/atoc_export.R index 9d99243..67d83e6 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -84,7 +84,7 @@ station2stops <- function(station, TI) { #' @param flf imported flf file from importFLF #' @noRd #' -station2transfers <- function(station, flf, path_out) { +station2transfers <- function(station, flf) { ### SECTION 4: ############################################################ # make make the transfers.txt diff --git a/R/atoc_nr.R b/R/atoc_nr.R index 037f866..942da08 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -53,46 +53,8 @@ nr2gtfs <- function(path_in, message(paste0(Sys.time(), " This will take some time, make sure you use 'ncores' to enable multi-core processing")) } - if(inherits(agency,"character")){ - if(agency == "atoc_agency"){ - load_data("atoc_agency") - agency = atoc_agency - if ( !inherits(agency, "data.frame") || 0==nrow(agency) ){ stop("failed to load atoc_agency data.") } - } - } - - if(inherits(locations,"character")){ - if(locations == "tiplocs"){ - load_data("tiplocs") - locations = tiplocs - if ( !inherits(locations, "data.frame") || 0==nrow(locations) ){ stop("failed to tiploc data.") } - } - } - - # Get the Station Locations - if (inherits(locations, "data.frame")) - { - if (inherits(locations, "sf")) - { - stops <- cbind(locations, sf::st_coordinates(locations)) - stops <- as.data.frame(stops) - stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )] - names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" ) - } - else - { - stops = locations - } - } - else - { - stops <- utils::read.csv(locations, stringsAsFactors = FALSE) - } - - stops$stop_lat <- round(stops$stop_lat, 5) - stops$stop_lon <- round(stops$stop_lon, 5) - - + agency = getCachedAgencyData( agency ) + stops = 
getCachedLocationData( locations ) # Is input a zip or a folder if (!grepl(".gz", path_in)) { @@ -109,7 +71,6 @@ nr2gtfs <- function(path_in, ) - # Construct the GTFS stop_times <- mca[["stop_times"]] schedule <- mca[["schedule"]] @@ -154,3 +115,65 @@ nr2gtfs <- function(path_in, return(timetables) } + + + + +getCachedAgencyData <- function(agency = "atoc_agency") +{ + if(inherits(agency,"character")) + { + if(agency == "atoc_agency") + { + load_data("atoc_agency") + agency = atoc_agency + } + else #TODO test column names + { + checkmate::check_file_exists(agency) + agency <- utils::read.csv(agency, stringsAsFactors = FALSE) + } + + if ( !inherits(agency, "data.frame") || 0==nrow(agency) ){ stop("failed to load atoc_agency data.") } + } + + return (agency) +} + + +getCachedLocationData <- function(locations = "tiplocs") +{ + if(inherits(locations,"character")) + { + if(locations == "tiplocs") + { + load_data("tiplocs") + locations = tiplocs + } + else + { + checkmate::check_file_exists(locations) + locations <- utils::read.csv(locations, stringsAsFactors = FALSE) + } + + if ( !inherits(locations, "data.frame") || 0==nrow(locations) ){ stop("failed to tiploc data.") } + } + + # Get the Station Locations + if (inherits(locations, "sf")) + { + stops <- cbind(locations, sf::st_coordinates(locations)) + stops <- as.data.frame(stops) + stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )] + names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" ) + } + else #TODO test column names + { + stops = locations + } + + stops$stop_lat <- round(stops$stop_lat, 5) + stops$stop_lon <- round(stops$stop_lon, 5) + + return (stops) +} From ab1871a82a94e9311a7dd5b2c1e1d3c3fb47be9d Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 09:46:38 +0100 Subject: [PATCH 46/81] bit more error checking - that we have stops before starting next part of process --- R/atoc.R | 7 ++++++- R/atoc_nr.R | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/atoc.R b/R/atoc.R index b2fff6c..62869a7 100644 --- a/R/atoc.R +++ b/R/atoc.R @@ -161,6 +161,12 @@ atoc2gtfs <- function(path_in, # remove any unused stops stops <- stops[stops$stop_id %in% stop_times$stop_id, ] + if ( nrow(stops)<=0 ) + { + stop("Could not match any stops in input data to stop database.") + } + + # Main Timetable Build timetables <- schedule2routes( stop_times = stop_times, @@ -171,7 +177,6 @@ atoc2gtfs <- function(path_in, ) rm(schedule) gc() - # load("data/atoc_agency.RData") # TODO: check for stop_times that are not valid stops diff --git a/R/atoc_nr.R b/R/atoc_nr.R index 942da08..7cbe8e0 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -54,6 +54,7 @@ nr2gtfs <- function(path_in, } agency = getCachedAgencyData( agency ) + stops = getCachedLocationData( locations ) # Is input a zip or a folder @@ -76,7 +77,7 @@ nr2gtfs <- function(path_in, schedule <- mca[["schedule"]] rm(mca) gc() - # rm(alf, flf, mca, msn) + stop_times <- stop_times[, c( "Arrival Time", "Departure Time", "Location", "stop_sequence", "Activity", "rowID", "schedule")] @@ -88,7 +89,7 @@ nr2gtfs <- function(path_in, if ( nrow(stops)<=0 ) { - stop("Could not match any stops in input data to internal stop database.") + stop("Could not match any stops in input data to stop database.") } From 32b09e84ab57ad88ed1ad69d09946143012f8fc4 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 09:52:55 +0100 Subject: [PATCH 47/81] performance tuning --- R/atoc_overlay.R | 32 ++++++++++++++++++++------------ R/gtfs_write.R | 5 
+++-- tests/testthat/test_aa_unit.R | 21 +++++++++++++++++++++ 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index 0224216..55fa9a3 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -22,8 +22,9 @@ set_STOP_PROCESSING_UID <- function( value ) } - - +#performance - this is slow, must be generating on the fly each time subset happens - cache it. - +LETTERS <- letters[1:26] +TWO_LETTERS <- paste0(rep(letters, each = 26), rep(letters, times = 26)) # Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique appendLetterSuffix <- function( cal ) @@ -34,13 +35,12 @@ appendLetterSuffix <- function( cal ) { if (rows <= 26) { - cal$UID <- paste0(cal$UID, " ", letters[1:rows]) + cal$UID <- paste0(cal$UID, " ", LETTERS[1:rows]) } else { # Cases where we need extra letters, gives up to 676 ids - lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) - cal$UID <- paste0(cal$UID, " ", lett[1:rows]) + cal$UID <- paste0(cal$UID, " ", TWO_LETTERS[1:rows]) } } @@ -130,13 +130,12 @@ splitBitmask <- function( bitmask, asInteger=FALSE ) bitmask[duff] = " " - splitDays = strsplit(bitmask, "") - - splitDays = as.integer(unlist(splitDays)) + splitDays = ( "1"== unlist( strsplit(bitmask, "") ) ) + #performance, calling as.integer on string is surprisingly expensive, so do it this way instead which is twice as fast overall - if (!asInteger) + if (asInteger) { - splitDays = as.logical(splitDays) + splitDays = as.integer(splitDays) } return (splitDays) @@ -155,6 +154,12 @@ checkOperatingDayActive <- function(calendar) { opDays <- splitBitmaskMat( calendar$Days, asInteger=FALSE ) opDays <- split(opDays, row(opDays)) + #performance - precalculate all the days + veryfirstDay = min(calendar$start_date) + allDays = lubridate::wday( seq.Date(from = veryfirstDay, to = max(calendar$end_date), by = "day") + , label = FALSE, week_start=1 ) + veryfirstDay = veryfirstDay - 1 + checkValid <- function(dur, sd, ed, od ){ if (dur >= 7) @@ -162,7 +167,10 @@ checkOperatingDayActive <- function(calendar) { return (any(od)) } - dayNumbers <- lubridate::wday( seq.Date(from = sd, to = ed, by = "day"), label = FALSE, week_start=1 ) + firstDay = as.integer(sd)-as.integer(veryfirstDay) + lastDay = as.integer(ed)-as.integer(veryfirstDay) + + dayNumbers <- allDays[ firstDay:lastDay ] return ( any(od[dayNumbers]) ) } @@ -664,7 +672,7 @@ makeCalendarInner <- function(calendarSub) { #if every overlay is a one day cancellation else if ( all(overlayDurations == 1) && all(overlayTypes == "C") ) { - warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") + #warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") # Apply the cancellation via entries in calendar_dates.txt res = list( appendLetterSuffix( calendarSub[calendarSub$STP != "C", ] ), diff --git a/R/gtfs_write.R b/R/gtfs_write.R index c4bb114..afcd6bb 100644 --- a/R/gtfs_write.R +++ b/R/gtfs_write.R @@ -170,8 +170,9 @@ formatAttributesToGtfsSchema <- function(dt) if (length(dateColumnsToFormat) > 0) { dt[, (dateColumnsToFormat) := lapply(.SD, - function(d){ gsub("-", "", as.character(d) ) } ), - .SDcols = dateColumnsToFormat] + function(d){ sprintf("%04d%02d%02d", lubridate::year(d), lubridate::month(d), lubridate::mday(d)) } ), + .SDcols = dateColumnsToFormat] + #performance, sprintf runs 6x faster than as.character() (which calls format()), followed 
by gsub() } } diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index fed5eb8..7e8317d 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -65,6 +65,27 @@ test_that("test changing module level variable", { + +test_that("test splitBitmask performance", { + + execution_time <- system.time({ + for (i in 1:100000) + { + res = splitBitmask(SINGLE_DAY_PATTERN_VECTOR) + } + }) + + message(paste0("write time=", execution_time, "\n")) + + expectedResult = unlist(SINGLE_DAY_PATTERN_LIST) + + expect_true( identical( res, expectedResult ) ) +}) + + + + + test_that("test countIntersectingDayPatterns:1", { OK = TRUE From a4bf85615dedb73124d56cf6b681778006e1ddc4 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 18:09:32 +0100 Subject: [PATCH 48/81] 1) further performance, treat dates as int while building timetables 2) switch off "phone home" while loading package as a child process --- NAMESPACE | 2 + R/atoc_export.R | 179 +++++++++++------- R/atoc_overlay.R | 138 ++++++++++++-- R/globals.R | 56 +++++- R/gtfs_interpolate_times.R | 6 +- R/gtfs_write.R | 3 +- R/zzz.R | 5 +- man/UK2GTFS_option_stopProcessingAtUid.Rd | 3 +- man/UK2GTFS_option_treatDatesAsInt.Rd | 19 ++ ...TFS_option_updateCachedDataOnLibaryLoad.Rd | 17 ++ man/atoc2gtfs.Rd | 2 +- tests/testthat/test_aa_unit.R | 63 +++++- tests/testthat/test_atoc.R | 3 +- 13 files changed, 398 insertions(+), 98 deletions(-) create mode 100644 man/UK2GTFS_option_treatDatesAsInt.Rd create mode 100644 man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd diff --git a/NAMESPACE b/NAMESPACE index ea9e73c..73602ab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,8 @@ export(ATOC_shapes) export(UK2GTFS_option_stopProcessingAtUid) +export(UK2GTFS_option_treatDatesAsInt) +export(UK2GTFS_option_updateCachedDataOnLibaryLoad) export(atoc2gtfs) export(dl_example_file) export(get_bank_holidays) diff --git a/R/atoc_export.R b/R/atoc_export.R index 67d83e6..9310feb 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -200,94 +200,137 @@ longnames <- function(routes, stop_times, stops) { #' @noRd #' makeCalendar <- function(schedule, ncores = 1) { - # prep the inputs - calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", "Days Run", "STP indicator", "rowID" )] - names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", "rowID" ) - calendar$STP <- as.character(calendar$STP) - calendar$duration <- calendar$end_date - calendar$start_date + 1 + treatDatesAsInt = getOption("UK2GTFS_opt_treatDatesAsInt", default=FALSE) + treatDatesAsInt = TRUE + set_TREAT_DATES_AS_INT( treatDatesAsInt ) - if ( !all(validateCalendarDates( calendar ) ) ) - { - warning(paste0(Sys.time(), " Some calendar dates had incorrect start or end dates that did not align with operating day bitmask")) - #TODO be more verbose about which ones - } + tryCatch({ + # prep the inputs + calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", "Days Run", "STP indicator", "rowID" )] + names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", "rowID" ) - #we're going to be splitting and replicating calendar entries - stash the original UID so we can join back on it later - calendar$originalUID <- calendar$UID + calendar$STP <- as.character(calendar$STP) - #brutal, but makes code later on simpler, make all cancellations one day cancellations by splitting - #TODO don't split up into one day cancellations if all the operating day patterns on a service are identical - cancellations <- makeAllOneDay( 
calendar[calendar$STP == "C", ] ) - calendar <- calendar[calendar$STP != "C", ] + if ( !all(validateCalendarDates( calendar ) ) ) + { + warning(paste0(Sys.time(), " Some calendar dates had incorrect start or end dates that did not align with operating day bitmask")) + #TODO be more verbose about which ones + } - #calendar$start_date = as.integer( calendar$start_date ) - #calendar$end_date = as.integer( calendar$end_date ) -#test treating date as int: seem to be about twice as fast on the critical line when selecting base timetable -#TODO add package option to switch between processing as date/int otherwise debugging is too hard + #we're going to be splitting and replicating calendar entries - stash the original UID so we can join back on it later + calendar$originalUID <- calendar$UID - #debugging option - set_STOP_PROCESSING_UID( getOption("UK2GTFS_opt_stopProcessingAtUid") ) + if( treatDatesAsInt ) + { + setupDatesCache( calendar ) + #treating date as int: seem to be about twice as fast on the critical line when selecting base timetable + calendar$start_date = as.integer( calendar$start_date ) + calendar$end_date = as.integer( calendar$end_date ) + } - message(paste0(Sys.time(), " Constructing calendar and calendar_dates")) - calendar$`__TEMP__` <- calendar$UID - calendar_split <- calendar[, .(list(.SD)), by = `__TEMP__`][,V1] + calendar$duration <- calendar$end_date - calendar$start_date + 1 - if (ncores > 1) { - cl <- parallel::makeCluster(ncores) - parallel::clusterEvalQ(cl, { - loadNamespace("UK2GTFS") - }) - pbapply::pboptions(use_lb = TRUE) - res <- pbapply::pblapply(calendar_split, - makeCalendarInner, - cl = cl - ) - parallel::stopCluster(cl) - rm(cl) - } else { - res <- pbapply::pblapply( - calendar_split, - makeCalendarInner - ) - } - res.calendar <- lapply(res, `[[`, 1) - res.calendar <- data.table::rbindlist(res.calendar, use.names=FALSE) #performance, takes 10 minutes to execute bind_rows on full GB daily timetable + #brutal, but makes code later on simpler, make all cancellations one day cancellations by splitting + #TODO don't split up into one day cancellations if all the operating day patterns on a service are identical + cancellations <- makeAllOneDay( calendar[calendar$STP == "C", ] ) + calendar <- calendar[calendar$STP != "C", ] - res.cancellation_dates <- lapply(res, `[[`, 2) - res.cancellation_dates <- res.cancellation_dates[!is.na(res.cancellation_dates)] - res.cancellation_dates <- data.table::rbindlist(res.cancellation_dates, use.names=FALSE) - stopifnot( 0==nrow(res.cancellation_dates) ) - rm(res.cancellation_dates) - #since we didn't pass in any cancellations we should no longer get any back - res.calendar = splitAndRebindBitmask( res.calendar ) - cancellations = splitAndRebindBitmask( cancellations ) + #debugging option + set_STOP_PROCESSING_UID( getOption("UK2GTFS_opt_stopProcessingAtUid") ) - #associate the split up cancellations with the (new) calendar they are associated with - #(only works because cancellations are all one day duration) - cancellations = allocateCancellationsAcrossCalendars( res.calendar, cancellations ) + message(paste0(Sys.time(), " Constructing calendar and calendar_dates")) + calendar$`__TEMP__` <- calendar$UID + calendar_split <- calendar[, .(list(.SD)), by = `__TEMP__`][,V1] - #no longer need the field that was used to associate the original and replicated calendars together - cancellations$originalUID <- NULL - res.calendar$originalUID <- NULL + if (ncores > 1) { + cl <- parallel::makeCluster(ncores) - #error checking - dups = 
duplicated( res.calendar$UID )
+    if( any(TRUE==dups) )
+    {
+      dups = unique( res.calendar$UID[ dups ] )
+
+      warning(paste(Sys.time(), "Duplicate UIDs were created by the makeCalendar() process, this is likely to cause downstream processing errors. ",
+                    "Please capture the data and raise a bug / create a test case. 
", dups)) + } + + }, finally = { + set_TREAT_DATES_AS_INT( FALSE ) + + #revert treating date as int + if( TRUE==treatDatesAsInt && exists("res.calendar") ) + { + res.calendar = makeDateFieldsDateType( res.calendar ) + cancellations = makeDateFieldsDateType( cancellations ) + } + }) return(list(res.calendar, cancellations)) } +makeDateFieldsDateType<- function( cal ) +{ + cal$start_date = as.Date( cal$start_date, origin = DATE_EPOC ) + cal$end_date = as.Date( cal$end_date, origin = DATE_EPOC ) + cal$duration = cal$end_date - cal$start_date + 1 + + return (cal) +} #' duplicateItem @@ -351,6 +394,8 @@ duplicateItems <- function( dt, split_attribute, ncores=1, indexStart=1 ) } else { cl <- parallel::makeCluster(ncores) parallel::clusterEvalQ(cl, { + #put any setup required for all worker processes in here + options( UK2GTFS_opt_updateCachedDataOnLibaryLoad = FALSE ) loadNamespace("UK2GTFS") }) diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index 55fa9a3..1ed8c50 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -2,27 +2,110 @@ +setValueInThisEnvironment <- function( name, value ) +{ + env <- asNamespace("UK2GTFS") + unlockBinding(name, env) + assign(name, value, envir = env) + lockBinding(name, env) +} + + assign("STOP_PROCESSING_UID", NULL ) set_STOP_PROCESSING_UID <- function( value ) { - env <- asNamespace("UK2GTFS") + setValueInThisEnvironment("STOP_PROCESSING_UID", value) + + if(!is.null(value)) + { + message(paste0(Sys.time(), " Set STOP_PROCESSING_UID to [", get("STOP_PROCESSING_UID"), "]")) + } +} - unlockBinding("STOP_PROCESSING_UID", env) - assign("STOP_PROCESSING_UID", value, envir = env) - lockBinding("STOP_PROCESSING_UID", env) +#need to ensure these get set consistently into any worker processes as well as main thread - if(!is.null(value)) +assign("TREAT_DATES_AS_INT", FALSE ) + +set_TREAT_DATES_AS_INT <- function( value ) +{ + setValueInThisEnvironment("TREAT_DATES_AS_INT", as.logical(value)) +} + + +assign("WDAY_LOOKUP_MIN_VALUE", NULL ) + +set_WDAY_LOOKUP_MIN_VALUE <- function( value ) +{ + setValueInThisEnvironment("WDAY_LOOKUP_MIN_VALUE", as.integer(as.integer(value)-1) ) +} + +assign("WDAY_LOOKUP_MAX_VALUE", NULL ) + +set_WDAY_LOOKUP_MAX_VALUE <- function( value ) +{ + setValueInThisEnvironment("WDAY_LOOKUP_MAX_VALUE", as.integer(value)) +} + +assign("WDAY_LOOKUP_MAP", NULL ) + +set_WDAY_LOOKUP_MAP <- function( value ) +{ + setValueInThisEnvironment("WDAY_LOOKUP_MAP", value) +} + + + +local_lubridate_wday <- function( date, label = FALSE, week_start=1 ) +{ + if (TRUE==TREAT_DATES_AS_INT) { - message(paste0(Sys.time(), " Set STOP_PROCESSING_UID to [", get("STOP_PROCESSING_UID"), "]")) + if( any(date<=WDAY_LOOKUP_MIN_VALUE) || any(date>WDAY_LOOKUP_MAX_VALUE) ) + { + stop("requested value index [", date, "] is outside lookup table") + } + + return ( WDAY_LOOKUP_MAP[ date-WDAY_LOOKUP_MIN_VALUE ] ) + } + else + { + return ( lubridate::wday( date, label = FALSE, week_start=1 ) ) } } -#performance - this is slow, must be generating on the fly each time subset happens - cache it. 
- +local_seq_date<-function( from, to, by ) +{ + if (TRUE==TREAT_DATES_AS_INT) + { + return ( seq.int(from = from, to = to) ) + } + else + { + return ( seq.Date(from = from, to = to, by = by) ) + } +} + + +setupDatesCache<-function( calendar ) +{ + minDt = min(calendar$start_date) + maxDt = max(calendar$end_date) + + set_WDAY_LOOKUP_MIN_VALUE( minDt ) + set_WDAY_LOOKUP_MAX_VALUE( maxDt ) + + firstWeek = as.integer(lubridate::wday( seq.Date(from = minDt, to = minDt+6, by = "day"), label = FALSE, week_start=1 )) + allWeeks = rep( firstWeek, length.out=( as.integer(maxDt) - as.integer(minDt) +1 ) ) + set_WDAY_LOOKUP_MAP( allWeeks ) +} + + + +#performance - this is slow, might be generating on the fly each time subset happens - cache it. - LETTERS <- letters[1:26] TWO_LETTERS <- paste0(rep(letters, each = 26), rep(letters, times = 26)) @@ -89,8 +172,8 @@ END_PATTERN_VECTOR = c("1000000","100000","10000","1000","100","10","1") #i.e. if the first day in the day bitmask is Tuesday - then the start date should be Tuesday, not some other day. validateCalendarDates <- function( calendar ) { - start_day_number = lubridate::wday( calendar$start_date, label = FALSE, week_start=1 ) - end_day_number = lubridate::wday( calendar$end_date, label = FALSE, week_start=1 ) + start_day_number = local_lubridate_wday( calendar$start_date, label = FALSE, week_start=1 ) + end_day_number = local_lubridate_wday( calendar$end_date, label = FALSE, week_start=1 ) startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1, start_day_number) endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7) @@ -128,9 +211,9 @@ splitBitmask <- function( bitmask, asInteger=FALSE ) { duff = which( nchar(bitmask) != 7 ) - bitmask[duff] = " " + bitmask[duff] = "0000000" - splitDays = ( "1"== unlist( strsplit(bitmask, "") ) ) + splitDays = ( "0"!= unlist( strsplit(bitmask, "") ) ) #performance, calling as.integer on string is surprisingly expensive, so do it this way instead which is twice as fast overall if (asInteger) @@ -156,8 +239,16 @@ checkOperatingDayActive <- function(calendar) { #performance - precalculate all the days veryfirstDay = min(calendar$start_date) - allDays = lubridate::wday( seq.Date(from = veryfirstDay, to = max(calendar$end_date), by = "day") + + if( TRUE==TREAT_DATES_AS_INT) + { + allDays = WDAY_LOOKUP_MAP[ (veryfirstDay - WDAY_LOOKUP_MIN_VALUE) : (max(calendar$end_date) - WDAY_LOOKUP_MIN_VALUE) ] + } + else + { + allDays = local_lubridate_wday( local_seq_date(from = veryfirstDay, to = max(calendar$end_date), by = "day") , label = FALSE, week_start=1 ) + } veryfirstDay = veryfirstDay - 1 checkValid <- function(dur, sd, ed, od ){ @@ -252,14 +343,23 @@ makeReplicationDates <- function(cal, startDayNum, endDayNum){ # and the end date so it's always sunday firstDate = min(cal$start_date) - 7 lastDate = max(cal$end_date) + 7 - allDates = seq.Date(from = firstDate, to = lastDate, by = "day") + allDates = local_seq_date(from = firstDate, to = lastDate, by = "day") offset = as.integer(cal$start_date)-startDayNum+2-as.integer(firstDate) end = as.integer(cal$end_date)+8-endDayNum-as.integer(firstDate) dates <- Map(function(o, e) allDates[o:e], offset, end) - return ( as.Date( unlist(dates), origin = DATE_EPOC ) ) + if (TRUE==TREAT_DATES_AS_INT) + { + res = unlist(dates) + } + else + { + res = as.Date( unlist(dates), origin = DATE_EPOC ) + } + + return (res) } @@ -282,8 +382,8 @@ makeAllOneDay <- function( cal ) } #make a list of dates for each object 
being replicated - startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) - endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 ) + startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1 ) + endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1 ) dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) #work out how many time we need to replicate each item: number of operating days in week * num weeks @@ -304,7 +404,7 @@ makeAllOneDay <- function( cal ) #tidy up the values so they are correct for the spilt items replicatedcal$duration <- 1 - replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ lubridate::wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] + replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ local_lubridate_wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] return (replicatedcal) } @@ -328,8 +428,8 @@ expandAllWeeks <- function( cal ) #duration <- cal$end_date - cal$start_date + 1 #make a list of dates for each object being replicated - startDayNum = lubridate::wday( cal$start_date, label = FALSE, week_start=1 ) - endDayNum = lubridate::wday( cal$end_date, label = FALSE, week_start=1 ) + startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1 ) + endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1 ) dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) numWeeks <- ceiling(as.integer(cal$duration) / 7) diff --git a/R/globals.R b/R/globals.R index ee70a7d..866508e 100644 --- a/R/globals.R +++ b/R/globals.R @@ -20,24 +20,72 @@ utils::globalVariables(c( #' UK2GTFS option stopProcessingAtUid #' @description sets/gets a UID value at which processing will stop - used for debugging -#' @param value option value to be set +#' @param value option value to be set (char) #' @details If no value passed in will return the current setting of the option. (Usually NULL) #' If value passed in, timetable build processing will stop in atoc_overlay.makeCalendarInner() #' when an exact match for that value is encountered. #' #' THIS ONLY WORKS WITH ncores==1 -#' (probably some environment nonsense I don't understand) #' #' @export UK2GTFS_option_stopProcessingAtUid <- function(value) { if (missing(value)) { - return( getOption("UK2GTFS_opt_stopProcessingAtUid") ) + return( getOption("UK2GTFS_opt_stopProcessingAtUid", default=NULL) ) } else { - return( options(UK2GTFS_opt_stopProcessingAtUid = value) ) + if ( !is.null(value) && !inherits(value, "character") ){ value = as.character( value ) } + + if ( !is.null(value) && 0==nchar(value) ){ value=NULL } + + return( options(UK2GTFS_opt_stopProcessingAtUid = value ) ) + } +} + + + + +#' UK2GTFS option treatDatesAsInt +#' @description sets/gets a logical value which determines how dates are processed while building calendar - used for debugging +#' @param value option value to be set (logical) +#' @details In the critical part of timetable building, handling dates as dates is about half the speed of handling as int +#' so we treat them as integers. However that's a complete pain for debugging, so make it configurable. 
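# Aside (usage sketch, not from the patch): when chasing a timetable-build
# error, switch the option off so dates stay Dates in the debugger:
#   UK2GTFS_option_treatDatesAsInt(FALSE)  # slower, easier to inspect
#   UK2GTFS_option_treatDatesAsInt()       # query current setting (default TRUE)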
+#' if errors are encountered during the timetable build phase, try setting this value to FALSE +#' +#' @export +UK2GTFS_option_treatDatesAsInt <- function(value) +{ + if (missing(value)) + { + return( getOption("UK2GTFS_opt_treatDatesAsInt", default=TRUE) ) + } + else + { + return( options(UK2GTFS_opt_treatDatesAsInt = as.logical(value) ) ) } } + + +#' UK2GTFS option updateCachedDataOnLibaryLoad +#' @description sets/gets a logical value which determines if the data cached in the library is checked for update when loaded +#' @param value option value to be set (logical) +#' @details when child processes are initialised we want to suppress this check, so it is also used for that purpose +#' +#' @export +UK2GTFS_option_updateCachedDataOnLibaryLoad <- function(value) +{ + if (missing(value)) + { + return( getOption("UK2GTFS_opt_updateCachedDataOnLibaryLoad", default=TRUE) ) + } + else + { + return( options(UK2GTFS_opt_updateCachedDataOnLibaryLoad = as.logical(value) ) ) + } +} + + + diff --git a/R/gtfs_interpolate_times.R b/R/gtfs_interpolate_times.R index 56bc9d4..e5fd107 100644 --- a/R/gtfs_interpolate_times.R +++ b/R/gtfs_interpolate_times.R @@ -39,7 +39,11 @@ gtfs_interpolate_times <- function(gtfs, ncores = 1){ stop_times <- pbapply::pblapply(stop_times, stops_interpolate) } else { cl <- parallel::makeCluster(ncores) - parallel::clusterEvalQ(cl, {loadNamespace("UK2GTFS")}) + parallel::clusterEvalQ(cl, { + #put any setup required for all worker processes in here + options( UK2GTFS_opt_updateCachedDataOnLibaryLoad = FALSE ) + loadNamespace("UK2GTFS") + }) stop_times <- pbapply::pblapply(stop_times, stops_interpolate, cl = cl diff --git a/R/gtfs_write.R b/R/gtfs_write.R index afcd6bb..f8b20e3 100644 --- a/R/gtfs_write.R +++ b/R/gtfs_write.R @@ -170,9 +170,10 @@ formatAttributesToGtfsSchema <- function(dt) if (length(dateColumnsToFormat) > 0) { dt[, (dateColumnsToFormat) := lapply(.SD, - function(d){ sprintf("%04d%02d%02d", lubridate::year(d), lubridate::month(d), lubridate::mday(d)) } ), + function(d){ as.integer(sprintf("%04d%02d%02d", lubridate::year(d), lubridate::month(d), lubridate::mday(d))) } ), .SDcols = dateColumnsToFormat] #performance, sprintf runs 6x faster than as.character() (which calls format()), followed by gsub() + #coercing back to int prevents value being quoted in output file } } diff --git a/R/zzz.R b/R/zzz.R index 11b5d3f..51807e8 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -3,7 +3,10 @@ .onLoad <- function(libname, pkgname){ tryCatch({ - update_data() + if( TRUE == UK2GTFS_option_updateCachedDataOnLibaryLoad() ) + { + update_data() + } }, error = function(err) { warning(Sys.time(), " Process id=", Sys.getpid(), " threw errors during package load while calling update_date() :", err) }) diff --git a/man/UK2GTFS_option_stopProcessingAtUid.Rd b/man/UK2GTFS_option_stopProcessingAtUid.Rd index 29c41b9..7fbd343 100644 --- a/man/UK2GTFS_option_stopProcessingAtUid.Rd +++ b/man/UK2GTFS_option_stopProcessingAtUid.Rd @@ -7,7 +7,7 @@ UK2GTFS_option_stopProcessingAtUid(value) } \arguments{ -\item{value}{option value to be set} +\item{value}{option value to be set (char)} } \description{ sets/gets a UID value at which processing will stop - used for debugging @@ -18,5 +18,4 @@ If no value passed in will return the current setting of the option. (Usually NU when an exact match for that value is encountered. 
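% Aside (usage sketch, not from the patch; "Y63320" is a made-up Train UID):
%   UK2GTFS_option_stopProcessingAtUid("Y63320")  # break when this UID is hit
%   gtfs <- atoc2gtfs(path_in, ncores = 1)        # option only works single-core
%   UK2GTFS_option_stopProcessingAtUid(NULL)      # clear it afterwards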
THIS ONLY WORKS WITH ncores==1 - (probably some environment nonsense I don't understand) } diff --git a/man/UK2GTFS_option_treatDatesAsInt.Rd b/man/UK2GTFS_option_treatDatesAsInt.Rd new file mode 100644 index 0000000..7a3c5d3 --- /dev/null +++ b/man/UK2GTFS_option_treatDatesAsInt.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/globals.R +\name{UK2GTFS_option_treatDatesAsInt} +\alias{UK2GTFS_option_treatDatesAsInt} +\title{UK2GTFS option treatDatesAsInt} +\usage{ +UK2GTFS_option_treatDatesAsInt(value) +} +\arguments{ +\item{value}{option value to be set (logical)} +} +\description{ +sets/gets a logical value which determines how dates are processed while building calendar - used for debugging +} +\details{ +In the critical part of timetable building, handling dates as dates is about half the speed of handling as int + so we treat them as integers. However that's a complete pain for debugging, so make it configurable. + if errors are encountered during the timetable build phase, try setting this value to FALSE +} diff --git a/man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd b/man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd new file mode 100644 index 0000000..0313b9f --- /dev/null +++ b/man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/globals.R +\name{UK2GTFS_option_updateCachedDataOnLibaryLoad} +\alias{UK2GTFS_option_updateCachedDataOnLibaryLoad} +\title{UK2GTFS option updateCachedDataOnLibaryLoad} +\usage{ +UK2GTFS_option_updateCachedDataOnLibaryLoad(value) +} +\arguments{ +\item{value}{option value to be set (logical)} +} +\description{ +sets/gets a logical value which determines if the data cached in the library is checked for update when loaded +} +\details{ +when child processes are initialised we want to suppress this check, so it is also used for that purpose +} diff --git a/man/atoc2gtfs.Rd b/man/atoc2gtfs.Rd index 1f17e45..64855be 100644 --- a/man/atoc2gtfs.Rd +++ b/man/atoc2gtfs.Rd @@ -31,7 +31,7 @@ atoc2gtfs( \item{transfers}{Logical, should transfers.txt be generated (default TRUE)} -\item{missing_tiplocs}{Logical, if locations = tiplocs, then will check for +\item{missing_tiplocs}{Logical, if true will check for any missing tiplocs against the main file and add them.(default TRUE)} } \description{ diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index 7e8317d..d6bde92 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -85,6 +85,46 @@ test_that("test splitBitmask performance", { +test_that("test setupDatesCache", { + + testData = data.table( + start_date=c("03-01-2023", "05-01-2023", "14-02-2023", "22-01-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "05-02-2023", "24-02-2023", "23-01-2023", "26-01-2023" )) + + testData <- fixCalendarDates( testData ) + + setupDatesCache( testData ) + + start = c(2,3,4,5,6,7,1) + expected = rep( start, length.out = 53) + + env = asNamespace("UK2GTFS") + + OK = TRUE + + OK = OK & identical( get("WDAY_LOOKUP_MAP", envir=env), as.integer(expected) ) + OK = OK & identical( get("WDAY_LOOKUP_MIN_VALUE", envir=env), as.integer(19359) ) + OK = OK & identical( get("WDAY_LOOKUP_MAX_VALUE", envir=env), as.integer(19412) ) + + set_TREAT_DATES_AS_INT(TRUE) + + OK = OK & identical( local_lubridate_wday(19360), as.integer(2) ) + OK = OK & identical( local_lubridate_wday(19412), as.integer(5) ) + + res = try(local_lubridate_wday(19359), silent = 
TRUE) + OK = OK & inherits(res, "try-error") + + res = try(local_lubridate_wday(19413), silent = TRUE) + OK = OK & inherits(res, "try-error") + + set_TREAT_DATES_AS_INT(FALSE) + + expect_true( OK ) +}) + + + + test_that("test countIntersectingDayPatterns:1", { @@ -109,8 +149,29 @@ test_that("test countIntersectingDayPatterns:1", { OK = OK & identical(counts, expectedCounts) } { + #this is an invalid input - 4 gets coerced to true and then back to 1 patterns = c("4000001", "1001100", "0000001") - expectedCounts = c(5,0,0,1,1,0,2) + expectedCounts = c(2,0,0,1,1,0,2) + + counts <- countIntersectingDayPatterns( patterns ) + + printDifferences( counts, expectedCounts) + OK = OK & identical(counts, expectedCounts) + } + { + #this is an invalid input anything that isn't '0' coerced to true and then back to 1 + patterns = c("a 00,01", "1001100", "0000001") + expectedCounts = c(2,1,0,1,2,0,2) + + counts <- countIntersectingDayPatterns( patterns ) + + printDifferences( counts, expectedCounts) + OK = OK & identical(counts, expectedCounts) + } + { + #this is an invalid input anything that isn't of length 7 gets coerced to zero + patterns = c("1111111", "111111") + expectedCounts = c(1,1,1,1,1,1,1) counts <- countIntersectingDayPatterns( patterns ) diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R index 4b7ff21..84519d0 100644 --- a/tests/testthat/test_atoc.R +++ b/tests/testthat/test_atoc.R @@ -14,7 +14,7 @@ test_that("test atoc data is there", { context("Test the main atoc function") test_that("test atoc2gtfs singlecore", { - +browser() gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1) @@ -27,6 +27,7 @@ test_that("test atoc2gtfs singlecore", { context("Test the main atoc function, with different settings") test_that("test atoc2gtfs singlecore", { + browser() gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1, locations = "file") From 0f5daf9232333b8a166ac27877c89221a4c241cf Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 21:15:26 +0100 Subject: [PATCH 49/81] make process dates as int work in child processes --- R/atoc_export.R | 40 +++++++++++++++++++++++--------------- tests/testthat/test_atoc.R | 4 ++-- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index 9310feb..514f2d3 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -211,7 +211,14 @@ makeCalendar <- function(schedule, ncores = 1) { calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", "Days Run", "STP indicator", "rowID" )] names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", "rowID" ) - calendar$STP <- as.character(calendar$STP) + if( treatDatesAsInt ) + { + setupDatesCache( calendar ) + #treating date as int: seem to be about twice as fast on the critical line when selecting base timetable + calendar$start_date = as.integer( calendar$start_date ) + calendar$end_date = as.integer( calendar$end_date ) + } + if ( !all(validateCalendarDates( calendar ) ) ) { @@ -221,15 +228,7 @@ makeCalendar <- function(schedule, ncores = 1) { #we're going to be splitting and replicating calendar entries - stash the original UID so we can join back on it later calendar$originalUID <- calendar$UID - - if( treatDatesAsInt ) - { - setupDatesCache( calendar ) - #treating date as int: seem to be about twice as fast on the critical line when selecting base timetable - calendar$start_date = as.integer( calendar$start_date ) - calendar$end_date = as.integer( calendar$end_date ) - } - + calendar$STP <- 
as.character(calendar$STP) calendar$duration <- calendar$end_date - calendar$start_date + 1 @@ -249,17 +248,26 @@ makeCalendar <- function(schedule, ncores = 1) { if (ncores > 1) { cl <- parallel::makeCluster(ncores) - workerEnvs = parallel::clusterEvalQ(cl, { + parallel::clusterEvalQ(cl, { #put any setup required for all worker processes in here options( UK2GTFS_opt_updateCachedDataOnLibaryLoad = FALSE ) #stop the child workers from calling update_data() workerEnv=loadNamespace("UK2GTFS") }) - parallel::clusterExport(cl, list("TREAT_DATES_AS_INT", "WDAY_LOOKUP_MIN_VALUE", - "WDAY_LOOKUP_MAX_VALUE", "WDAY_LOOKUP_MAP"), envir=asNamespace("UK2GTFS")) - - #set module level global in all workers TODO find out why this takes forever to run - #parallel::clusterCall(cl, function(val){ set_TREAT_DATES_AS_INT(val) }, val=treatDatesAsInt ) + #copy variables from this context into global context of worker processes + varList = list("TREAT_DATES_AS_INT", "WDAY_LOOKUP_MIN_VALUE", "WDAY_LOOKUP_MAX_VALUE", "WDAY_LOOKUP_MAP") + parallel::clusterExport(cl=cl, varlist=varList, envir=asNamespace("UK2GTFS")) + + #set module level global in all workers + parallel::clusterEvalQ(cl, { + copyFromGlobalEnvToPackageEnv<- function(varName){ + UK2GTFS:::setValueInThisEnvironment(varName, get(varName, envir=.GlobalEnv)) + } + copyFromGlobalEnvToPackageEnv("TREAT_DATES_AS_INT") + copyFromGlobalEnvToPackageEnv("WDAY_LOOKUP_MIN_VALUE") + copyFromGlobalEnvToPackageEnv("WDAY_LOOKUP_MAX_VALUE") + copyFromGlobalEnvToPackageEnv("WDAY_LOOKUP_MAP") + }) pbapply::pboptions(use_lb = TRUE) res <- pbapply::pblapply(calendar_split, diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R index 84519d0..4b577f2 100644 --- a/tests/testthat/test_atoc.R +++ b/tests/testthat/test_atoc.R @@ -14,7 +14,7 @@ test_that("test atoc data is there", { context("Test the main atoc function") test_that("test atoc2gtfs singlecore", { -browser() + gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1) @@ -27,7 +27,7 @@ browser() context("Test the main atoc function, with different settings") test_that("test atoc2gtfs singlecore", { - browser() + gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1, locations = "file") From 7d3191b3612494154d82ca71d50af0cc8de61567 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 7 Sep 2023 21:43:52 +0100 Subject: [PATCH 50/81] comment update after investigating hand when >1 thread used --- R/atoc_main.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/atoc_main.R b/R/atoc_main.R index 15def02..d183fd8 100644 --- a/R/atoc_main.R +++ b/R/atoc_main.R @@ -92,7 +92,7 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = message(paste0(Sys.time(), " Duplicating necessary stop times")) } - #TODO find out why this hangs if ncores > 1 + #TODO if ncores > 1 this takes forever - the data being joined must somehow trigger massive memory copying ? 
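# Aside (not from the patch): a plausible mechanism for the TODO above is that
# parallel workers receive each task's inputs by serialisation, so every task
# ships its own copy of the joined tables. A quick way to gauge that cost:
#   length(serialize(calendar, NULL))  # bytes that would be copied per export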
#also sets the trip_id on stop_times stop_times <- duplicate_stop_times(calendar = calendar, stop_times = stop_times, ncores = 1) From 63549664d19bd36167e4eb771410429f6af01765 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 09:03:09 +0100 Subject: [PATCH 51/81] remove test code --- R/atoc_export.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index 514f2d3..0c0071e 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -201,8 +201,7 @@ longnames <- function(routes, stop_times, stops) { #' makeCalendar <- function(schedule, ncores = 1) { - treatDatesAsInt = getOption("UK2GTFS_opt_treatDatesAsInt", default=FALSE) - treatDatesAsInt = TRUE + treatDatesAsInt = getOption("UK2GTFS_opt_treatDatesAsInt", default=TRUE) set_TREAT_DATES_AS_INT( treatDatesAsInt ) tryCatch({ From 69622fd63715db390662451acb27e55d94c45a23 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 09:16:52 +0100 Subject: [PATCH 52/81] fix warning message typo --- R/zzz.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/zzz.R b/R/zzz.R index 51807e8..8a36620 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,7 +8,7 @@ update_data() } }, error = function(err) { - warning(Sys.time(), " Process id=", Sys.getpid(), " threw errors during package load while calling update_date() :", err) + warning(Sys.time(), " Process id=", Sys.getpid(), " threw errors during package load while calling update_data() :", err) }) } From 8c60dd39c0b5c6dfc8fcf3e09c170b285dde20f2 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 09:45:31 +0100 Subject: [PATCH 53/81] more informative warning message about bad data --- R/atoc_export.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index 0c0071e..dd71dbe 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -219,10 +219,11 @@ makeCalendar <- function(schedule, ncores = 1) { } - if ( !all(validateCalendarDates( calendar ) ) ) + okCalendarDates = validateCalendarDates( calendar ) + if ( !all( okCalendarDates ) ) { - warning(paste0(Sys.time(), " Some calendar dates had incorrect start or end dates that did not align with operating day bitmask")) - #TODO be more verbose about which ones + warning(Sys.time(), " Some calendar dates had incorrect start or end dates that did not align with operating day bitmask.\n Services=", + paste( unique( calendar$UID[ !okCalendarDates ] ), collapse = "," ) ) } #we're going to be splitting and replicating calendar entries - stash the original UID so we can join back on it later From 0c47eb7cb9669225a488166712183fc524c972b5 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 09:49:59 +0100 Subject: [PATCH 54/81] tidy up exception handling --- R/atoc_export.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index dd71dbe..7e551ad 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -320,10 +320,10 @@ makeCalendar <- function(schedule, ncores = 1) { set_TREAT_DATES_AS_INT( FALSE ) #revert treating date as int - if( TRUE==treatDatesAsInt && exists("res.calendar") ) + if( TRUE==treatDatesAsInt ) { - res.calendar = makeDateFieldsDateType( res.calendar ) - cancellations = makeDateFieldsDateType( cancellations ) + if (exists("res.calendar")){ res.calendar = makeDateFieldsDateType( res.calendar ) } + if (exists("cancellations")){ cancellations = makeDateFieldsDateType( cancellations ) } } }) From 87210d668704f5f99c567cf2238e36ff1c6bd275 Mon Sep 17 00:00:00 
2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 11:15:48 +0100 Subject: [PATCH 55/81] 1) decrease sensitivity of update check to only look at date instead of date+time (this means if github can't be reached should only nag once per day to update instead of on every single package load) 2) add timeout to update check on package load and make it short (but make default when explicitly called long) --- R/extdata.R | 19 +++++++++++++------ R/zzz.R | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/R/extdata.R b/R/extdata.R index ea2dd4b..f16e88b 100644 --- a/R/extdata.R +++ b/R/extdata.R @@ -9,9 +9,9 @@ #' #' @export #' -update_data <- function(){ +update_data <- function( timeout=60 ){ - check <- check_data() + check <- check_data( timeout=timeout ) if(check$date_package != check$date){ @@ -72,12 +72,12 @@ download_data <- function(tag_name, package_location, date){ #' @return TRUE if data is up-to-date or if unable to check #' @noRd -check_data <- function(default_tag = "v0.1.2"){ +check_data <- function( timeout = 60, default_tag = "v0.1.2"){ # Try not to hammer the API Sys.sleep(5) # Check date on data repo - res = try(httr::GET("https://api.github.com/repos/ITSleeds/UK2GTFS-data/releases"), - silent = TRUE) + res = try(httr::GET("https://api.github.com/repos/ITSleeds/UK2GTFS-data/releases", httr::timeout(get("timeout")), + silent = TRUE )) if(inherits(res, "try-error")){ message("Unable to check for latest data") date = Sys.time() @@ -94,13 +94,20 @@ check_data <- function(default_tag = "v0.1.2"){ } } + date = as.Date(date) #make it less sensitive by only comparing date rather than date+time + #Check if date.txt in package package_location <- system.file(package = "UK2GTFS") if(!file.exists(file.path(package_location, "extdata/date.txt"))){ writeLines("nodata", file.path(package_location, "extdata/date.txt")) } - date_package <- readLines(file.path(package_location, "extdata/date.txt")) + tryCatch({ + date_package <- as.Date( readLines(file.path(package_location, "extdata/date.txt")) ) + }, error = function(err) { + date_package = "nodata" + }) + return(list(date_package = date_package, date = date, tag_name = tag_name, package_location = package_location)) diff --git a/R/zzz.R b/R/zzz.R index 8a36620..1681d77 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -5,7 +5,7 @@ tryCatch({ if( TRUE == UK2GTFS_option_updateCachedDataOnLibaryLoad() ) { - update_data() + update_data( timeout=10 ) } }, error = function(err) { warning(Sys.time(), " Process id=", Sys.getpid(), " threw errors during package load while calling update_data() :", err) From f8ef3d447cd4dda968485c3a0c1527eda186a56f Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 11:44:09 +0100 Subject: [PATCH 56/81] documentation update --- R/extdata.R | 2 ++ man/update_data.Rd | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/R/extdata.R b/R/extdata.R index f16e88b..1025c77 100644 --- a/R/extdata.R +++ b/R/extdata.R @@ -7,6 +7,8 @@ #' Raw data can be viewed and contributed to at #' https://github.com/ITSLeeds/UK2GTFS-data #' +#' @param timeout maximum duration (in seconds) to wait for a response from the server (github.com) +#' #' @export #' update_data <- function( timeout=60 ){ diff --git a/man/update_data.Rd b/man/update_data.Rd index 5e7e4c3..de69b19 100644 --- a/man/update_data.Rd +++ b/man/update_data.Rd @@ -4,7 +4,10 @@ \alias{update_data} \title{Update the data inside the UK2GTFS package} \usage{ -update_data() +update_data(timeout = 60) +} +\arguments{ +\item{timeout}{maximum duration (in 
seconds) to wait for a response from the server (github.com)} } \description{ As UK2GTFS has large datasets that update separately to the R package they From d02c8f4ddb48c745c15da44109d6347c97cb28f3 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 14:26:55 +0100 Subject: [PATCH 57/81] added public_only and WTT flags to atco import consistent with NR import (atoc data doesn't seem to have many non-public moves in it to start with) --- R/atoc.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/R/atoc.R b/R/atoc.R index 62869a7..c79d93a 100644 --- a/R/atoc.R +++ b/R/atoc.R @@ -12,6 +12,8 @@ #' @param transfers Logical, should transfers.txt be generated (default TRUE) #' @param missing_tiplocs Logical, if true will check for #' any missing tiplocs against the main file and add them.(default TRUE) +#' @param working_timetable Logical, should WTT times be used instead of public times (default FALSE) +#' @param public_only Logical, only return calls/services that are for public passenger pickup/set down (default TRUE) #' @family main #' #' @details Locations @@ -44,7 +46,9 @@ atoc2gtfs <- function(path_in, agency = "atoc_agency", shapes = FALSE, transfers = TRUE, - missing_tiplocs = TRUE) { + missing_tiplocs = TRUE, + working_timetable = FALSE, + public_only = TRUE) { # Checkmates checkmate::assert_character(path_in, len = 1) checkmate::assert_file_exists(path_in) @@ -103,7 +107,9 @@ atoc2gtfs <- function(path_in, file = files[grepl(".mca", files)], silent = silent, ncores = 1, - full_import = TRUE + full_import = TRUE, + working_timetable = working_timetable, + public_only = public_only ) @@ -173,7 +179,8 @@ atoc2gtfs <- function(path_in, stops = stops, schedule = schedule, silent = silent, - ncores = ncores + ncores = ncores, + public_only = public_only ) rm(schedule) gc() From 344ed3ee74ae68d3c715478ab1e2053bc0bf9df9 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 17:18:43 +0100 Subject: [PATCH 58/81] rebuilt documentation after adding 2 parameters --- man/atoc2gtfs.Rd | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/man/atoc2gtfs.Rd b/man/atoc2gtfs.Rd index 64855be..2f9ee33 100644 --- a/man/atoc2gtfs.Rd +++ b/man/atoc2gtfs.Rd @@ -12,7 +12,9 @@ atoc2gtfs( agency = "atoc_agency", shapes = FALSE, transfers = TRUE, - missing_tiplocs = TRUE + missing_tiplocs = TRUE, + working_timetable = FALSE, + public_only = TRUE ) } \arguments{ @@ -33,6 +35,10 @@ atoc2gtfs( \item{missing_tiplocs}{Logical, if true will check for any missing tiplocs against the main file and add them.(default TRUE)} + +\item{working_timetable}{Logical, should WTT times be used instead of public times (default FALSE)} + +\item{public_only}{Logical, only return calls/services that are for public passenger pickup/set down (default TRUE)} } \description{ Convert ATOC CIF files to GTFS From db3781b4c6e541cc134c5e4cc15284b84b33f8e4 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 17:20:55 +0100 Subject: [PATCH 59/81] added missing dependency --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index f189383..df4252d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,6 +31,7 @@ Imports: geodist, httr, iotools, + stringi, stringr, sf, parallel, From c39bccc944ca2f759717a20180e4ceff4c135292 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 17:31:29 +0100 Subject: [PATCH 60/81] reduce build warning about undeclared variables in data.table joins --- R/globals.R | 13 ++++++++----- 1 file 
changed, 8 insertions(+), 5 deletions(-) diff --git a/R/globals.R b/R/globals.R index 866508e..453725b 100644 --- a/R/globals.R +++ b/R/globals.R @@ -8,12 +8,15 @@ utils::globalVariables(c( "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "pattern", "schedule", "ATOC Code", "route_long_name", "Train Status", "i", "DaysOfWeek", - 'speed','agency_id','agency_name', - 'agency_url','agency_timezone', - 'agency_lang','agency_id', 'Freq', + 'speed','agency_id','agency_name', 'agency_url','agency_timezone', + 'agency_lang','agency_id', 'Freq', 'operator_code','route_id', 'UID','hash','vehicle_type','running_board','service_number', - 'operator_code','route_id', - 'speed_after','distance','school_terms','distance_after','historic_bank_holidays' + 'speed_after','distance','school_terms','distance_after','historic_bank_holidays', + '%>%', '.', 'Activity', 'Arrival Time', 'Departure Time', 'N', 'Public Arrival Time', + 'Public Departure Time','STP', 'Scheduled Arrival Time', 'Scheduled Departure Time', + 'Train Category', 'V1', '_TEMP_', '__TEMP__', 'duration', + 'i.friday', 'i.monday', 'i.saturday', 'i.sunday', 'i.thursday', 'i.tuesday', 'i.wednesday', 'originalUID', + 'route_id_new', 'route_type', 'service_id', 'service_id_new', 'stop_name', 'trip_id_new' )) From 3d57ad835bc5af8cecae1c18c561e262ad4c224f Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 21:37:01 +0100 Subject: [PATCH 61/81] performance- create data.table in place instead of by copy, more heavily vectorise process_activity --- R/atoc_import.R | 103 ++++++++++++++++------------------ tests/testthat/test_aa_unit.R | 29 ++++++++++ 2 files changed, 77 insertions(+), 55 deletions(-) diff --git a/R/atoc_import.R b/R/atoc_import.R index a566415..81410ac 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -146,7 +146,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 17 - 1), widths = c(1, 4, 26 + 4, 1, 7, 3, 3, 3, 5, 1, 5, 2, 1, 1, 11, 3) ) - station <- data.table(station) + setDT(station) names(station) <- c( "Record Type", "Reserved1", "Station Name", "CATE Interchange status", "TIPLOC Code", "CRS Reference Code", @@ -194,7 +194,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 5 - 1), widths = c(1, 4, 26 + 4, 45) ) - timetable <- data.table(timetable) + setDT(timetable) names(timetable) <- c( "Record Type", "Reserved1", "Station Name", "GBTT numbers" @@ -217,7 +217,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 2), widths = c(1, 79) ) - comment <- data.table(comment) + setDT(comment) names(comment) <- c("Record Type", "Comment") comment$`Record Type` <- NULL @@ -233,7 +233,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 6 - 1), widths = c(1, 4, 26 + 5, 26, 20) ) - alias <- data.table(alias) + setDT(alias) names(alias) <- c( "Record Type", "Reserved1", "Station Name", "Station Alias", "Reserved3" @@ -322,53 +322,46 @@ process_times <- function(dt, working_timetable) { # Process Activity Codes process_activity <- function(dt, public_only) { - #performance, runs about twice as fast if we do processing outside data.table then insert it later - splitActivity = stringi::stri_extract_all_regex(dt$Activity, ".{2}") - - if (public_only) { - # Filter to stops for passengers - #see https://wiki.openraildata.com/index.php?title=Activity_codes for definitions - acts <- c( - "TB", # Train Starts - "T ", # Stops to take up and set down passengers - "D ", # Stops to set down passengers - "U 
", # Stops to take up passengers - "R ", # Request stop - "TF" # Train Finishes - ) - - clean_activity3 <- function(x) { - x <- x[x %in% acts] - if (length(x) > 0) { - x <- paste(x, collapse = ",") - return(x) - } else { - return("Other") - } - } - } else { - +# if ( any( 12 != nchar(dt$Activity) ) ) +# { +# stop("bad input data in process_activity(), all Activity fields should be 12 chars long") +# } +# don't really need this test since we're reading in from fixed width files - clean_activity3 <- function(x) { - - #remove empty elements - x <- x[x != " "] + #performance, runs about twice as fast if we do processing outside data.table then insert it later + splitActivity = unlist( stringi::stri_extract_all_regex(dt$Activity, ".{2}") ) + + splitActivityMat = matrix(splitActivity, ncol=6, byrow=TRUE) + + # Filter to stops for passengers + #see https://wiki.openraildata.com/index.php?title=Activity_codes for definitions + acts <- c( + "TB", # Train Starts + "T " , # Stops to take up and set down passengers + "D ", # Stops to set down passengers + "U ", # Stops to take up passengers + "R ", # Request stop + "TF" # Train Finishes + ) - if (length(x) > 0) { - x <- paste(x, collapse = ",") - return(x) - } else { - return("Other") - } - } + if(public_only) + { + allowed = (" "!=splitActivityMat) & (splitActivityMat %in% acts) + } + else + { + allowed = (" "!=splitActivityMat) } + splitActivityMat[!allowed] <- "" - dt$Activity = lapply(splitActivity, clean_activity3) + activity = sprintf("%s,%s,%s,%s,%s,%s", splitActivityMat[,1], splitActivityMat[,2], splitActivityMat[,3], splitActivityMat[,4], splitActivityMat[,5], splitActivityMat[,6] ) - dt <- dt[Activity != "Other"] + #remove whitespace, replace multiple comma with single comma, remove leading comma, remove trailing comma. 
+ dt$Activity <- gsub(",$", "", gsub("^,", "", gsub(",+", ",", gsub("\\s+", "", activity)))) - dt[, Activity := gsub("\\s+", "", Activity)] + #remove rows with no activity we're interested in + dt <- dt[ ""!=dt$Activity ] return(dt) } @@ -423,7 +416,7 @@ importMCA <- function(file, 6, 1, 1, 1, 1, 4, 4, 1, 1 ) ) - BS <- data.table(BS) + setDT(BS) names(BS) <- c( "Record Identity", "Transaction Type", "Train UID", "Date Runs From", "Date Runs To", "Days Run", "Bank Holiday Running", "Train Status", @@ -463,7 +456,7 @@ importMCA <- function(file, col_types = rep("character", 8), widths = c(2, 4, 5, 2, 1, 8, 1, 57) ) - BX <- data.table(BX) + setDT(BX) names(BX) <- c( "Record Identity", "Traction Class", "UIC Code", "ATOC Code", "Applicable Timetable Code", "Retail Train ID", "Source", "Spare" @@ -488,7 +481,7 @@ importMCA <- function(file, col_types = rep("character", 12), widths = c(2, 7, 1, 5, 4, 3, 3, 2, 2, 12, 2, 37) ) - LO <- data.table(LO) + setDT(LO) names(LO) <- c( "Record Identity", "Location", "Suffix", "Scheduled Departure Time", "Public Departure Time", "Platform", "Line", "Engineering Allowance", @@ -519,7 +512,7 @@ importMCA <- function(file, col_types = rep("character", 16), widths = c(2, 7, 1, 5, 5, 5, 4, 4, 3, 3, 3, 12, 2, 2, 2, 20) ) - LI <- data.table(LI) + setDT(LI) names(LI) <- c( "Record Identity", "Location", "Suffix", "Scheduled Arrival Time", "Scheduled Departure Time", "Scheduled Pass", "Public Arrival Time", @@ -552,7 +545,7 @@ importMCA <- function(file, col_types = rep("character", 9), widths = c(2, 7, 1, 5, 4, 3, 3, 12, 43) ) - LT <- data.table(LT) + setDT(LT) names(LT) <- c( "Record Identity", "Location", "Suffix", "Scheduled Arrival Time", "Public Arrival Time", "Platform", "Path", "Activity", "Spare" @@ -586,7 +579,7 @@ importMCA <- function(file, 4, 4, 5, 8, 5 ) ) - CR <- data.table(CR) + setDT(CR) names(CR) <- c( "Record Identity", "Location", "Train Category", "Train Identity", "Headcode", "Course Indicator", @@ -612,7 +605,7 @@ importMCA <- function(file, col_types = rep("character", 11), widths = c(2, 7, 2, 6, 1, 26, 5, 4, 3, 16, 8) ) - TI <- data.table(TI) + setDT(TI) names(TI) <- c( "Record Identity", "TIPLOC code", "Capitals", "NALCO", "NLC Check Character", "TPS Description", @@ -635,7 +628,7 @@ importMCA <- function(file, col_types = rep("character", 12), widths = c(2, 7, 2, 6, 1, 26, 5, 4, 3, 16, 7, 1) ) - TA <- data.table(TA) + setDT(TA) names(TA) <- c( "Record Identity", "TIPLOC code", "Capitals", "NALCO", "NLC Check Character", "TPS Description", "STANOX", "PO MCP Code", @@ -658,7 +651,7 @@ importMCA <- function(file, col_types = rep("character", 3), widths = c(2, 7, 71) ) - TD <- data.table(TD) + setDT(TD) names(TD) <- c("Record Identity", "TIPLOC code", "Spare") TD$Spare <- NULL TD$`Record Identity` <- NULL @@ -680,7 +673,7 @@ importMCA <- function(file, col_types = rep("character", 16), widths = c(2, 1, 6, 6, 6, 6, 7, 2, 1, 7, 1, 1, 1, 1, 31, 1) ) - AA <- data.table(AA) + setDT(AA) names(AA) <- c( "Record Identity", "Transaction Type", "Base UID", "Assoc UID", "Assoc Start date", "Assoc End date", "Assoc Days", "Assoc Cat", @@ -719,7 +712,7 @@ importMCA <- function(file, col_types = rep("character", 2), widths = c(2, 78) ) - ZZ <- data.table(ZZ) + setDT(ZZ) names(ZZ) <- c("Record Identity", "Spare") ZZ$Spare <- NULL ZZ <- strip_whitespace(ZZ) diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index d6bde92..5773a6b 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -602,6 +602,35 @@ 
test_that("test hasGapInOperatingDays:1", { + +test_that("test process_activity:1", { + + OK = TRUE + + { + testData = data.table( Activity=c(" ", "TBT D U R TF", "abcdefghijkl", "abcdefghij ", "TBT EEU XXTF", " D U ") ) + + res = process_activity( testData, TRUE ) + + expectedResult = data.table( Activity=c("TB,T,D,U,R,TF", "TB,T,U,TF", "D,U" ) ) + + OK = OK & identical(expectedResult,res) + } + { + testData = data.table( Activity=c(" ", "TBT D U R TF", "abcdefghijkl", "a def ijkl", " cdefghij ") ) + + res = process_activity( testData, FALSE ) + + expectedResult = data.table( Activity=c("TB,T,D,U,R,TF", "ab,cd,ef,gh,ij,kl", "a,d,ef,ij,kl","cd,ef,gh,ij" )) + + OK = OK & identical(expectedResult,res) + } + + expect_true( OK ) +}) + + + #when running for real, this hangs if ncores>1 having trouble reproducing test_that("test duplicateItems:1", { From 8b47e7a73450ffb4cf4ba6f3a52a685ce75cf8b1 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 22:08:39 +0100 Subject: [PATCH 62/81] final performance optimisation --- R/atoc_import.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/atoc_import.R b/R/atoc_import.R index 81410ac..b78d782 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -357,8 +357,9 @@ process_activity <- function(dt, public_only) { activity = sprintf("%s,%s,%s,%s,%s,%s", splitActivityMat[,1], splitActivityMat[,2], splitActivityMat[,3], splitActivityMat[,4], splitActivityMat[,5], splitActivityMat[,6] ) - #remove whitespace, replace multiple comma with single comma, remove leading comma, remove trailing comma. - dt$Activity <- gsub(",$", "", gsub("^,", "", gsub(",+", ",", gsub("\\s+", "", activity)))) + #replace multiple comma with single comma, remove whitespace, remove leading comma, remove trailing comma. + activity = gsub(",+", ",", activity) + dt[, Activity := gsub("\\s+|^,|,$", "", activity)] #remove rows with no activity we're interested in dt <- dt[ ""!=dt$Activity ] From 8d7424955508b8a4af62561d20477e746d8f079a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 8 Sep 2023 22:27:58 +0100 Subject: [PATCH 63/81] typo --- R/atoc_overlay.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index 1ed8c50..a7676e7 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -565,7 +565,7 @@ selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex) # triggered by test case "10:test makeCalendarInner" -# when we have a 1 day overlay sitting on the start/end data of a base timetable +# when we have a 1 day overlay sitting on the start/end date of a base timetable # the dates overlap - fix it fixOverlappingDates <- function( cal ) { From b7a37954f03615a639581e2d1571ff62d93328da Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sat, 9 Sep 2023 01:26:06 +0100 Subject: [PATCH 64/81] bug fix for strange data.table / Period memory allocation problem --- R/gtfs_cleaning.R | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/R/gtfs_cleaning.R b/R/gtfs_cleaning.R index 144cedd..4becc0a 100644 --- a/R/gtfs_cleaning.R +++ b/R/gtfs_cleaning.R @@ -212,7 +212,8 @@ PUBLIC_SERVICE_CATEGORY = c("OL", "OU", "OO", "OW", "XC", "XD", "XI", #' 4. Replace missing agency names with "MISSINGAGENCY" #' 5. 
If service is not public and public_only=TRUE then remove it (freight, 'trips' aka charters) #' (these have a null route_type, so loading into OpenTripPlanner fails if these are present) -#' 6' If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock) +#' 6. If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock) +#' 7. Remove shapes that no longer have any trips #' #' @export gtfs_clean <- function(gtfs, public_only = FALSE) { @@ -254,6 +255,20 @@ gtfs_clean <- function(gtfs, public_only = FALSE) { gtfs$stop_times <- filteredCalls[, names( gtfs$stop_times ), with=FALSE] + + #what is this batty code I hear you cry ?! + gtfs$stop_times$arrival_time = gtfs$stop_times$arrival_time[ 1: nrow(gtfs$stop_times) ] + gtfs$stop_times$departure_time = gtfs$stop_times$departure_time[ 1: nrow(gtfs$stop_times) ] + #well, it's a bug workaround. Not entirely sure of the trigger, but when we have 30M stop times, and filter down to 1M + #the hour and minute component of the Period 'object' report a length of 30M, when there is only supposed to be 1M of them. + #The nrow() in the data.table says 1M, and the number of seconds in the period 'object' says 1M. + #Clearly 'object' is in big air quotes...... + #as a result it blows up in gtfs_write() when writing because the sprintf moans about the input vectors being different lengths. + #This fixes it. + #stamping on the gc() button at the end of this fn for good measure. + #- remember kids, R is not suitable for production use..... + + #after merging GTFS files we may have compressed the calendar and calendar_dates so a service pattern is used by #multiple trips - so don't remove calendar and calendar_dates that link to routes with NA route_type in case #it's in use by multiple trips/routes @@ -279,7 +294,20 @@ gtfs_clean <- function(gtfs, public_only = FALSE) { { gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ), ] } + + rm(joinedCalls) + rm(joinedTrips) + rm(filteredCalls) + rm(filteredTrips) + gc() } + # 7 remove shapes that no longer have any trips + if ("shapes" %in% names(gtfs)) + { + gtfs$shapes <- gtfs$shapes[gtfs$shapes$shape_id %in% gtfs$trips$shape_id, ] + } + + return(gtfs) } From db1a439a70cec7b6f1645620b8b117449ac5f009 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sat, 9 Sep 2023 03:04:58 +0100 Subject: [PATCH 65/81] bring in seconds from WTT instead of throwing them away --- R/atoc_export.R | 10 ++++---- R/atoc_import.R | 17 ++++++++------ tests/testthat/test_aa_unit.R | 44 +++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index 7e551ad..ed76ff0 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -569,8 +569,8 @@ afterMidnight <- function(stop_times, safe = TRUE) { ) stop_times <- dplyr::left_join(stop_times, stop_times.summary, by = "trip_id") - stop_times$arvfinal <- ifelse(stop_times$arv < stop_times$dept_first, stop_times$arv + 2400, stop_times$arv) - stop_times$depfinal <- ifelse(stop_times$dept < stop_times$dept_first, stop_times$dept + 2400, stop_times$dept) + stop_times$arvfinal <- ifelse(stop_times$arv < stop_times$dept_first, stop_times$arv + 240000, stop_times$arv) + stop_times$depfinal <- ifelse(stop_times$dept < stop_times$dept_first, stop_times$dept + 240000, stop_times$dept) if (safe) { @@ -589,7 +589,7 @@ afterMidnight <- function(stop_times, safe = TRUE) { numb2time2 <- function(numb){ #performance, substr is 
relatively expensive - numb <- sprintf("%02d:%02d:00", numb %/% 100, numb %% 100) + numb <- sprintf("%02d:%02d:%02d", numb %/% 10000, (numb %/% 100) %% 100, numb %% 100) } stop_times$arrival_time <- numb2time2(stop_times$arvfinal) @@ -607,10 +607,10 @@ afterMidnight <- function(stop_times, safe = TRUE) { fixStopTimeData <- function(stop_times) { # Fix arrival_time / departure_time being 0000 for pick up only or drop off only trains - stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "0000" & stop_times$Activity == "D", + stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "000000" & stop_times$Activity == "D", stop_times$arrival_time, stop_times$departure_time) - stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "0000" & stop_times$Activity == "U", + stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "000000" & stop_times$Activity == "U", stop_times$departure_time, stop_times$arrival_time) diff --git a/R/atoc_import.R b/R/atoc_import.R index b78d782..13b40bc 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -295,23 +295,26 @@ strip_whitespace <- function(dt) { -#TODO update this to handle seconds instead of just truncating them (public TT is to nearest minute, WTT more accurate) process_times <- function(dt, working_timetable) { - if (working_timetable) { + #fill in the missing seconds - substituting H for 30 seconds. + if (working_timetable) + { if ("Scheduled Arrival Time" %in% colnames(dt)) { - dt[, `Arrival Time` := gsub("H", "", `Scheduled Arrival Time`)] + dt[, `Arrival Time` := gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", `Scheduled Arrival Time`))] } if ("Scheduled Departure Time" %in% colnames(dt)) { - dt[, `Departure Time` := gsub("H", "", `Scheduled Departure Time`)] + dt[, `Departure Time` := gsub("^(\\d{4}) $","\\100", gsub("^(\\d{4})H$", "\\130", `Scheduled Departure Time`))] } - } else { + } + else + { if ("Public Arrival Time" %in% colnames(dt)) { - dt[, `Arrival Time` := gsub("H", "", `Public Arrival Time`)] + dt[, `Arrival Time` := gsub("^(\\d{4})$", "\\100", `Public Arrival Time`)] } if ("Public Departure Time" %in% colnames(dt)) { - dt[, `Departure Time` := gsub("H", "", `Public Departure Time`)] + dt[, `Departure Time` := gsub("^(\\d{4})$", "\\100", `Public Departure Time`)] } } diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index 5773a6b..e002e6b 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -85,6 +85,50 @@ test_that("test splitBitmask performance", { + + + +test_that("test process_times", { + + testData = data.table( + `Scheduled Arrival Time` =c("", " ", "0000 ", "1234H"), + `Scheduled Departure Time`=c("", " ", "0106 ", "2156H"), + `Public Arrival Time` =c("", " ", "0135", "tjkl" ), + `Public Departure Time` =c("", " ", "1234", "tgbi" )) + + OK = TRUE + + { + res = process_times( testData, FALSE ) + + res = res[,c("Arrival Time","Departure Time")] + + expectedResult = data.table( + `Arrival Time` =c("", " ", "013500", "tjkl" ), + `Departure Time` =c("", " ", "123400", "tgbi" )) + + printDifferencesDf(expectedResult, res) + OK = OK & identical(expectedResult, res) + } + { + res = process_times( testData, TRUE ) + + res = res[,c("Arrival Time","Departure Time")] + + expectedResult = data.table( + `Arrival Time` =c("", " ", "000000", "123430" ), + `Departure Time` =c("", " ", "010600", "215630" )) + + printDifferencesDf(expectedResult, res) + OK = OK & identical(expectedResult, res) + } + + expect_true( OK ) 
+}) + + + + test_that("test setupDatesCache", { testData = data.table( From 5270ebf87b3002867b0d670fa78bd7050fd51f00 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 10 Sep 2023 15:38:49 +0100 Subject: [PATCH 66/81] further information on bug workaround --- R/gtfs_cleaning.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/gtfs_cleaning.R b/R/gtfs_cleaning.R index 4becc0a..dc1fde4 100644 --- a/R/gtfs_cleaning.R +++ b/R/gtfs_cleaning.R @@ -265,6 +265,7 @@ gtfs_clean <- function(gtfs, public_only = FALSE) { #Clearly 'object' is in big air quotes...... #as a result it blows up in gtfs_write() when writing because the sprintf moans about the input vectors being different lengths. #This fixes it. + # see also stops_interpolate() "Needed because rbindlist doesn't work with periods for some reason" which smells like a similar workaround. #stamping on the gc() button at the end of this fn for good measure. #- remember kids, R is not suitable for production use..... From b074b8f4f476eafea78b1aacd86f1589b4843578 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 10 Sep 2023 15:39:36 +0100 Subject: [PATCH 67/81] performance optimisation when copying row data --- R/atoc_overlay.R | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index a7676e7..93d1d66 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -508,6 +508,7 @@ allocateCancellationsAcrossCalendars <- function( calendar, cancellations ) +CALENDAR_COLS_TO_COPY <- c("UID", "Days", "STP", "rowID" ) NOT_NEEDED <- c("__NOT_NEEDED_MARKER__~@$$%&*((") @@ -548,16 +549,8 @@ selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex) #this speeds things up when we look up the required priority overlay **SEE_NOTE** #so we don't need to sort again here, just pick the top filtered result - #stash the generated start & end dates - #performance - copying to separate variables seems to be fastest - start_date = calNew$start_date[rowIndex] - end_date = calNew$end_date[rowIndex] - - calNew[rowIndex,] <- cal[baseTimetableIndexes[1],] - #this is the most time consuming line in this fn. takes about 10x longer than the single variable copy below - - calNew$start_date[rowIndex] = start_date - calNew$end_date[rowIndex] = end_date + set( calNew, i = rowIndex, j = CALENDAR_COLS_TO_COPY, value = cal[ baseTimetableIndexes[1], CALENDAR_COLS_TO_COPY, with=FALSE ] ) + #set() runs 3x faster than this style of copy calNew[rowIndex,] <- cal[baseTimetableIndexes[1],] return (calNew) } From 951da04d9820380c9a845bcf2c45eebf5ed0d4cd Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 10 Sep 2023 15:40:45 +0100 Subject: [PATCH 68/81] performance optimisations while reading input file --- R/atoc_import.R | 102 +++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/R/atoc_import.R b/R/atoc_import.R index 13b40bc..3715cd3 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -274,6 +274,8 @@ strip_whitespace_df <- function(df) { #' Strip White Space #' #' @details +#' Input data.table is modified in-place and returned to the caller. 
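+#' A sketch of the effect (illustrative, not taken from the package tests): given +#' dt <- data.table::data.table(x = c("a ", "", "b")), strip_whitespace(dt) leaves +#' dt$x == c("a", NA, "b"), with dt altered by reference rather than copied.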
+#' #' Strips trailing whitespace from all char columns in a data.table #' empty values are converted to NA #' returns the data.table @@ -286,35 +288,36 @@ strip_whitespace <- function(dt) { char_cols <- sapply(dt, is.character) char_col_names <- names(char_cols[char_cols]) - return ( dt[, (char_col_names) := lapply(.SD, function(val) { - val <- trimws(val, which = "right") - val[val == ""] <- NA - return(val) - }), .SDcols = char_col_names] ) -} + for (col_name in char_col_names) { + set(dt, j = col_name, value = trimws(dt[[col_name]], which = "right")) + dt[dt[[col_name]] == "", (col_name) := NA_character_] + } + return (dt) +} +#does in place-modification of input data.table process_times <- function(dt, working_timetable) { #fill in the missing seconds - substituting H for 30 seconds. if (working_timetable) { if ("Scheduled Arrival Time" %in% colnames(dt)) { - dt[, `Arrival Time` := gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", `Scheduled Arrival Time`))] + set(dt, j = "Arrival Time", value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[["Scheduled Arrival Time"]]))) } if ("Scheduled Departure Time" %in% colnames(dt)) { - dt[, `Departure Time` := gsub("^(\\d{4}) $","\\100", gsub("^(\\d{4})H$", "\\130", `Scheduled Departure Time`))] + set(dt, j = "Departure Time", value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[["Scheduled Departure Time"]]))) } } else { if ("Public Arrival Time" %in% colnames(dt)) { - dt[, `Arrival Time` := gsub("^(\\d{4})$", "\\100", `Public Arrival Time`)] + set(dt, j = "Arrival Time", value = gsub("^(\\d{4})$", "\\100", dt[["Public Arrival Time"]])) } if ("Public Departure Time" %in% colnames(dt)) { - dt[, `Departure Time` := gsub("^(\\d{4})$", "\\100", `Public Departure Time`)] + set(dt, j = "Departure Time", value = gsub("^(\\d{4})$", "\\100", dt[["Public Departure Time"]])) } } @@ -362,7 +365,7 @@ process_activity <- function(dt, public_only) { #replace multiple comma with single comma, remove whitespace, remove leading comma, remove trailing comma. 
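 #(aside, illustrative) data.table::set(), used in the + lines here, assigns columns by reference; it is the low-overhead equivalent of the dt[, col := ...] form it replaces, which matters when called in a tight loop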
activity = gsub(",+", ",", activity) - dt[, Activity := gsub("\\s+|^,|,$", "", activity)] + set(dt, j="Activity", value = gsub("\\s+|^,|,$", "", activity)) #remove rows with no activity we're interested in dt <- dt[ ""!=dt$Activity ] @@ -402,6 +405,7 @@ importMCA <- function(file, n = -1 ) types <- substr(raw, 1, 2) + rowIds <- seq(from = 1, to = length(types)) # break out each part of the file # Header Record @@ -448,7 +452,7 @@ importMCA <- function(file, BS$Speed <- as.integer(BS$Speed) # Add the rowid - BS$rowID <- seq(from = 1, to = length(types))[types == "BS"] + BS$rowID <- rowIds[types == "BS"] # Basic Schedule Extra Details if (!silent) { @@ -471,7 +475,7 @@ importMCA <- function(file, # clean data # Add the rowid - BX$rowID <- seq(from = 1, to = length(types))[types == "BX"] + BX$rowID <- rowIds[types == "BX"] @@ -492,18 +496,13 @@ importMCA <- function(file, "Pathing Allowance", "Activity", "Performance Allowance", "Spare" ) - LO$Spare <- NULL - LO$`Record Identity` <- NULL - # Add the rowid - LO$rowID <- seq(from = 1, to = length(types))[types == "LO"] - - LO <- process_activity(LO, public_only) - - LO <- process_times( LO, working_timetable ) - LO <- LO[, c("rowID", "Location", "Activity", "Departure Time" )] + # Add the rowid + LO$rowID <- rowIds[types == "LO"] - LO <- strip_whitespace(LO) + LO[, c("Scheduled Arrival Time","Public Arrival Time") := ""] + LO <- LO[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", + "Public Arrival Time", "Public Departure Time" )] # Intermediate Station @@ -524,19 +523,12 @@ importMCA <- function(file, "Engineering Allowance", "Pathing Allowance", "Performance Allowance", "Spare" ) - LI$Spare <- NULL - LI$`Record Identity` <- NULL - # Add the rowid - LI$rowID <- seq(from = 1, to = length(types))[types == "LI"] - - LI <- process_activity(LI, public_only) - LI <- process_times( LI, working_timetable ) - - LI <- LI[, c("rowID", "Location", "Activity", "Arrival Time", "Departure Time" )] - - LI <- strip_whitespace(LI) + # Add the rowid + LI$rowID <- rowIds[types == "LI"] + LI <- LI[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", + "Public Arrival Time", "Public Departure Time" )] # Terminating Station @@ -554,18 +546,13 @@ importMCA <- function(file, "Record Identity", "Location", "Suffix", "Scheduled Arrival Time", "Public Arrival Time", "Platform", "Path", "Activity", "Spare" ) - LT$Spare <- NULL - LT$`Record Identity` <- NULL - # Add the rowid - LT$rowID <- seq(from = 1, to = length(types))[types == "LT"] - - LT <- process_activity(LT, public_only) - - LT <- process_times( LT, working_timetable ) - LT <- LT[, c("rowID", "Location", "Activity", "Arrival Time" )] + # Add the rowid + LT$rowID <- rowIds[types == "LT"] - LT <- strip_whitespace(LT) + LT[, c("Scheduled Departure Time","Public Departure Time") := ""] + LT <- LT[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", + "Public Arrival Time", "Public Departure Time" )] # TIPLOC Insert @@ -598,7 +585,7 @@ importMCA <- function(file, CR <- strip_whitespace(CR) # Add the rowid - CR$rowID <- seq(from = 1, to = length(types))[types == "CR"] + CR$rowID <- rowIds[types == "CR"] if (!silent) { message(paste0(Sys.time(), " importing TIPLOC Insert")) @@ -620,7 +607,7 @@ importMCA <- function(file, TI <- strip_whitespace(TI) # Add the rowid - TI$rowID <- seq(from = 1, to = length(types))[types == "TI"] + TI$rowID <- rowIds[types == "TI"] # TIPLOC Amend if (!silent) { @@ -643,7 
+630,7 @@ importMCA <- function(file, TA <- strip_whitespace(TA) # Add the rowid - TA$rowID <- seq(from = 1, to = length(types))[types == "TA"] + TA$rowID <- rowIds[types == "TA"] # TIPLOC Delete if (!silent) { @@ -662,7 +649,7 @@ importMCA <- function(file, TD <- strip_whitespace(TD) # Add the rowid - TD$rowID <- seq(from = 1, to = length(types))[types == "TD"] + TD$rowID <- rowIds[types == "TD"] } @@ -703,7 +690,7 @@ importMCA <- function(file, AA$`Assoc Location Suffix` <- as.integer(AA$`Assoc Location Suffix`) # Add the rowid - AA$rowID <- seq(from = 1, to = length(types))[types == "AA"] + AA$rowID <- rowIds[types == "AA"] } # Trailer Record @@ -722,14 +709,23 @@ importMCA <- function(file, ZZ <- strip_whitespace(ZZ) # Add the rowid - ZZ$rowID <- seq(from = 1, to = length(types))[types == "ZZ"] + ZZ$rowID <- rowIds[types == "ZZ"] # Prep the main files if (!silent) { message(paste0(Sys.time(), " Preparing Imported Data")) } - stop_times <- dplyr::bind_rows(list(LO, LI, LT)) + stop_times <- data.table::rbindlist(list(LO, LI, LT), use.names=FALSE) + + stop_times <- process_activity(stop_times, public_only) + + stop_times <- process_times( stop_times, working_timetable ) + + stop_times <- stop_times[, c("rowID", "Location", "Activity", "Arrival Time", "Departure Time")] + + stop_times <- strip_whitespace(stop_times) + stop_times <- stop_times[order(stop_times$rowID), ] #the BS record is followed by the LO, LI, LT records relating to it @@ -743,8 +739,8 @@ importMCA <- function(file, # the BX record appears the row after the BS record, so it's rowId is one more than it's corresponding BS record. # use this to join the two records together. - BX$rowIDm1 <- BX$rowID - 1 - BX$rowID <- NULL + set(BX, j = "rowID", value = BX$rowID - 1) + setnames(BX, "rowID", "rowIDm1") schedule <- dplyr::left_join(BS, BX, by = c("rowID" = "rowIDm1")) if (full_import) { From 5d28d760a8ed49c88196feb0158374664764396b Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 10 Sep 2023 15:44:09 +0100 Subject: [PATCH 69/81] small performance improvement --- R/atoc_export.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/atoc_export.R b/R/atoc_export.R index ed76ff0..0f0a136 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -587,13 +587,14 @@ afterMidnight <- function(stop_times, safe = TRUE) { } } - numb2time2 <- function(numb){ + numb2time2 <- function(dt, colNameDest, colNameSource){ #performance, substr is relatively expensive - numb <- sprintf("%02d:%02d:%02d", numb %/% 10000, (numb %/% 100) %% 100, numb %% 100) + set(dt, j=colNameDest, value= sprintf("%02d:%02d:%02d", + dt[[colNameSource]] %/% 10000, (dt[[colNameSource]] %/% 100) %% 100, dt[[colNameSource]] %% 100) ) } - stop_times$arrival_time <- numb2time2(stop_times$arvfinal) - stop_times$departure_time <- numb2time2(stop_times$depfinal) + numb2time2(stop_times, "arrival_time", "arvfinal") + numb2time2(stop_times, "departure_time", "depfinal") stop_times <- stop_times[, c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", From 745653b87078e5d05cc64834c2b9dec5b6113ea1 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 10 Sep 2023 15:57:52 +0100 Subject: [PATCH 70/81] documentation update --- man/gtfs_clean.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/gtfs_clean.Rd b/man/gtfs_clean.Rd index 5f51ce6..19d42d7 100644 --- a/man/gtfs_clean.Rd +++ b/man/gtfs_clean.Rd @@ -24,5 +24,6 @@ Task done: 4. Replace missing agency names with "MISSINGAGENCY" 5. 
If service is not public and public_only=TRUE then remove it (freight, 'trips' aka charters) (these have a null route_type, so loading into OpenTripPlanner fails if these are present) -6' If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock) +6. If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock) +7. Remove shapes that no longer have any trips } From e7dcbbeaa3f528cd646e484b521f4695d9a20a1c Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 10 Sep 2023 16:32:37 +0100 Subject: [PATCH 71/81] fix build warning about missing import --- DESCRIPTION | 11 ++++++----- R/atoc_export.R | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index df4252d..4640a71 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,15 +31,16 @@ Imports: geodist, httr, iotools, - stringi, - stringr, - sf, - parallel, lubridate, + parallel, purrr (>= 1.0), pbapply, - readr (>= 2.0), RcppSimdJson, + readr (>= 2.0), + sf, + stats, + stringi, + stringr, xml2, zip, Suggests: diff --git a/R/atoc_export.R b/R/atoc_export.R index 0f0a136..5701967 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -493,7 +493,7 @@ duplicate_related_items <- function(calendar, related_items, original_join_field #join the count of number of duplicates required to the stop times (so we can retrieve it later when doing the duplication) related_items <- dplyr::left_join(related_items, rowID.unique, - by = setNames("Var1",original_join_field) ) + by = stats::setNames("Var1",original_join_field) ) #set the number of duplications required related_items$`_reps` <- related_items$Freq From 07995f83ebfc079cda905cbebcdb4801590f4212 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 11 Sep 2023 16:16:25 +0100 Subject: [PATCH 72/81] add 'frequencies' to cleaning - otherwise the resulting GTFS file can't be loaded into OTP --- R/gtfs_cleaning.R | 6 ++++++ man/gtfs_clean.Rd | 1 + 2 files changed, 7 insertions(+) diff --git a/R/gtfs_cleaning.R b/R/gtfs_cleaning.R index dc1fde4..3b561aa 100644 --- a/R/gtfs_cleaning.R +++ b/R/gtfs_cleaning.R @@ -214,6 +214,7 @@ PUBLIC_SERVICE_CATEGORY = c("OL", "OU", "OO", "OW", "XC", "XD", "XI", #' (these have a null route_type, so loading into OpenTripPlanner fails if these are present) #' 6. If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock) #' 7. Remove shapes that no longer have any trips +#' 8. Remove frequencies that no longer have any trips #' #' @export gtfs_clean <- function(gtfs, public_only = FALSE) { @@ -309,6 +310,11 @@ gtfs_clean <- function(gtfs, public_only = FALSE) { gtfs$shapes <- gtfs$shapes[gtfs$shapes$shape_id %in% gtfs$trips$shape_id, ] } + # 8 remove frequencies that no longer have any trips + if ("frequencies" %in% names(gtfs)) + { + gtfs$frequencies <- gtfs$frequencies[gtfs$frequencies$trip_id %in% gtfs$trips$trip_id, ] + } return(gtfs) } diff --git a/man/gtfs_clean.Rd b/man/gtfs_clean.Rd index 19d42d7..06c9cee 100644 --- a/man/gtfs_clean.Rd +++ b/man/gtfs_clean.Rd @@ -26,4 +26,5 @@ Task done: (these have a null route_type, so loading into OpenTripPlanner fails if these are present) 6. If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock) 7.
Remove shapes that no longer have any trips +8 Remove frequencies that no longer have any trips } From 7f45b96931dae36d717a70f9becec414ff533814 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Mon, 18 Sep 2023 20:38:30 +0100 Subject: [PATCH 73/81] code tidy - mark everything as integer --- R/atoc_overlay.R | 156 +++++++++++++++++++++++------------------------ 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index 93d1d66..f086b90 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -40,7 +40,7 @@ assign("WDAY_LOOKUP_MIN_VALUE", NULL ) set_WDAY_LOOKUP_MIN_VALUE <- function( value ) { - setValueInThisEnvironment("WDAY_LOOKUP_MIN_VALUE", as.integer(as.integer(value)-1) ) + setValueInThisEnvironment("WDAY_LOOKUP_MIN_VALUE", as.integer(as.integer(value)-1L) ) } assign("WDAY_LOOKUP_MAX_VALUE", NULL ) @@ -59,7 +59,7 @@ set_WDAY_LOOKUP_MAP <- function( value ) -local_lubridate_wday <- function( date, label = FALSE, week_start=1 ) +local_lubridate_wday <- function( date, label = FALSE, week_start=1L ) { if (TRUE==TREAT_DATES_AS_INT) { @@ -72,7 +72,7 @@ local_lubridate_wday <- function( date, label = FALSE, week_start=1 ) } else { - return ( lubridate::wday( date, label = FALSE, week_start=1 ) ) + return ( lubridate::wday( date, label = FALSE, week_start=1L ) ) } } @@ -98,32 +98,32 @@ setupDatesCache<-function( calendar ) set_WDAY_LOOKUP_MIN_VALUE( minDt ) set_WDAY_LOOKUP_MAX_VALUE( maxDt ) - firstWeek = as.integer(lubridate::wday( seq.Date(from = minDt, to = minDt+6, by = "day"), label = FALSE, week_start=1 )) - allWeeks = rep( firstWeek, length.out=( as.integer(maxDt) - as.integer(minDt) +1 ) ) + firstWeek = as.integer(lubridate::wday( seq.Date(from = minDt, to = minDt+6L, by = "day"), label = FALSE, week_start=1L )) + allWeeks = rep( firstWeek, length.out=( as.integer(maxDt) - as.integer(minDt) +1L ) ) set_WDAY_LOOKUP_MAP( allWeeks ) } #performance - this is slow, might be generating on the fly each time subset happens - cache it. - -LETTERS <- letters[1:26] -TWO_LETTERS <- paste0(rep(letters, each = 26), rep(letters, times = 26)) +LETTERS <- letters[1L:26L] +TWO_LETTERS <- paste0(rep(letters, each = 26L), rep(letters, times = 26L)) # Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique appendLetterSuffix <- function( cal ) { rows = nrow(cal) - if (rows > 1) + if (rows > 1L) { - if (rows <= 26) + if (rows <= 26L) { cal$UID <- paste0(cal$UID, " ", LETTERS[1:rows]) } else { # Cases where we need extra letters, gives up to 676 ids - cal$UID <- paste0(cal$UID, " ", TWO_LETTERS[1:rows]) + cal$UID <- paste0(cal$UID, " ", TWO_LETTERS[1L:rows]) } } @@ -134,7 +134,7 @@ appendLetterSuffix <- function( cal ) # Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique appendNumberSuffix<-function( cal, numToAppend ) { - if( numToAppend>1 ) #don't need to append a new number if we only have one pattern + if( numToAppend>1L ) #don't need to append a new number if we only have one pattern { # further differentiate the UID by appending a number to the end for each different days pattern cal$UID <- paste0(cal$UID, numToAppend) @@ -149,8 +149,8 @@ appendNumberSuffix<-function( cal, numToAppend ) # e.g. 
0010000 = FALSE 0011100 = FALSE 0101000 = TRUE hasGapInOperatingDays <- function( daysBitmask ) { - firstDay = stringi::stri_locate_first( daysBitmask, fixed = "1" )[,1] - lastDay = stringi::stri_locate_last( daysBitmask, fixed = "1" )[,1] + firstDay = stringi::stri_locate_first( daysBitmask, fixed = "1" )[,1L] + lastDay = stringi::stri_locate_last( daysBitmask, fixed = "1" )[,1L] operatingDayCount = stringi::stri_count( daysBitmask, fixed = "1" ) @@ -172,11 +172,11 @@ END_PATTERN_VECTOR = c("1000000","100000","10000","1000","100","10","1") #i.e. if the first day in the day bitmask is Tuesday - then the start date should be Tuesday, not some other day. validateCalendarDates <- function( calendar ) { - start_day_number = local_lubridate_wday( calendar$start_date, label = FALSE, week_start=1 ) - end_day_number = local_lubridate_wday( calendar$end_date, label = FALSE, week_start=1 ) + start_day_number = local_lubridate_wday( calendar$start_date, label = FALSE, week_start=1L ) + end_day_number = local_lubridate_wday( calendar$end_date, label = FALSE, week_start=1L ) - startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1, start_day_number) - endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7) + startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1L, start_day_number) + endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7L) return (startOk & endOk) } @@ -209,7 +209,7 @@ splitBitmaskMat <- function( bitmaskVector, asInteger=FALSE ) splitBitmask <- function( bitmask, asInteger=FALSE ) { - duff = which( nchar(bitmask) != 7 ) + duff = which( nchar(bitmask) != 7L ) bitmask[duff] = "0000000" @@ -228,7 +228,7 @@ splitBitmask <- function( bitmask, asInteger=FALSE ) checkOperatingDayActive <- function(calendar) { - if (all(calendar$duration >= 7)) + if (all(calendar$duration >= 7L)) { return (calendar$Days!="0000000") } @@ -249,11 +249,11 @@ checkOperatingDayActive <- function(calendar) { allDays = local_lubridate_wday( local_seq_date(from = veryfirstDay, to = max(calendar$end_date), by = "day") , label = FALSE, week_start=1 ) } - veryfirstDay = veryfirstDay - 1 + veryfirstDay = veryfirstDay - 1L checkValid <- function(dur, sd, ed, od ){ - if (dur >= 7) + if (dur >= 7L) { return (any(od)) } @@ -302,7 +302,7 @@ countIntersectingDayPatterns <- function( dayPatterns ) intersectingDayPattern <- function( dayPattern1, dayPattern2 ) { - return (any( countIntersectingDayPatterns( c(dayPattern1,dayPattern2) ) > 1) ) + return (any( countIntersectingDayPatterns( c(dayPattern1,dayPattern2) ) > 1L) ) } @@ -319,7 +319,7 @@ intersectingDayPatterns <- function( dayPatternBase, dayPatternOverlay ) intersects = unpackedBaseRepmat & unpackedOverlay - res <- apply(intersects, 1, any) + res <- apply(intersects, 1L, any) return ( res ) } @@ -341,12 +341,12 @@ makeReplicationDates <- function(cal, startDayNum, endDayNum){ #make a sequences of dates, offsetting the start date so it's always monday (aligning with bitmask start day) # and the end date so it's always sunday - firstDate = min(cal$start_date) - 7 - lastDate = max(cal$end_date) + 7 + firstDate = min(cal$start_date) - 7L + lastDate = max(cal$end_date) + 7L allDates = local_seq_date(from = firstDate, to = lastDate, by = "day") - offset = as.integer(cal$start_date)-startDayNum+2-as.integer(firstDate) - end = as.integer(cal$end_date)+8-endDayNum-as.integer(firstDate) + offset = 
as.integer(cal$start_date)-startDayNum+2L-as.integer(firstDate) + end = as.integer(cal$end_date)+8L-endDayNum-as.integer(firstDate) dates <- Map(function(o, e) allDates[o:e], offset, end) @@ -373,23 +373,23 @@ makeReplicationDates <- function(cal, startDayNum, endDayNum){ #' makeAllOneDay <- function( cal ) { - duration <- cal$end_date - cal$start_date + 1 + duration <- cal$end_date - cal$start_date + 1L - if ( 0==nrow(cal) || all(1 == duration)) + if ( 0L==nrow(cal) || all(1L == duration)) { #nothing to do return (cal) } #make a list of dates for each object being replicated - startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1 ) - endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1 ) + startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1L ) + endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1L ) dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) #work out how many time we need to replicate each item: number of operating days in week * num weeks bitmaskMat = splitBitmaskMat( cal$Days, asInteger=FALSE ) dayCount = rowSums(bitmaskMat) - numWeeks <- ceiling(as.integer(cal$duration) / 7) + numWeeks <- ceiling(as.integer(cal$duration) / 7L) repetitions = dayCount * numWeeks #replicate the calendar rows the appropriate number of times @@ -404,7 +404,7 @@ makeAllOneDay <- function( cal ) #tidy up the values so they are correct for the spilt items replicatedcal$duration <- 1 - replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ local_lubridate_wday( replicatedcal$start_date, label = FALSE, week_start=1 ) ] + replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ local_lubridate_wday( replicatedcal$start_date, label = FALSE, week_start=1L ) ] return (replicatedcal) } @@ -419,7 +419,7 @@ makeAllOneDay <- function( cal ) #' expandAllWeeks <- function( cal ) { - if ( 0==nrow(cal) ) + if ( 0L==nrow(cal) ) { #nothing to do return (cal) @@ -428,11 +428,11 @@ expandAllWeeks <- function( cal ) #duration <- cal$end_date - cal$start_date + 1 #make a list of dates for each object being replicated - startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1 ) - endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1 ) + startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1L ) + endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1L ) dateSequence = makeReplicationDates( cal, startDayNum, endDayNum ) - numWeeks <- ceiling(as.integer(cal$duration) / 7) + numWeeks <- ceiling(as.integer(cal$duration) / 7L) #replicate a logical vector for the start date and use that to select the relevant dates from the date sequence startDayLogical <- SINGLE_DAY_PATTERN_LIST[startDayNum] @@ -452,7 +452,7 @@ expandAllWeeks <- function( cal ) replicatedcal$end_date <- endDates #tidy up the values so they are correct for the spilt items - replicatedcal$duration <- replicatedcal$end_date - replicatedcal$start_date + 1 + replicatedcal$duration <- replicatedcal$end_date - replicatedcal$start_date + 1L return (replicatedcal) } @@ -488,7 +488,7 @@ allocateCancellationsAcrossCalendars <- function( calendar, cancellations ) #and the day of the cancellation is an operating day of the calendar item joined = cancellations[calendar, on = .(originalUID==originalUID, start_date>=start_date, - end_date<=end_date), nomatch = 0][ + end_date<=end_date), nomatch = 0L][ ((i.monday&monday) | (i.tuesday&tuesday) | (i.wednesday&wednesday) | 
(i.thursday&thursday) | (i.friday&friday) | (i.saturday&saturday) | (i.sunday&sunday)), ] #revert the stashed (join) fields @@ -565,37 +565,37 @@ fixOverlappingDates <- function( cal ) rowCount = nrow(cal) #forwards - for (j in seq(1, rowCount)) { + for (j in seq(1L, rowCount)) { #adjust our end date if next item a higher priority overlay - if (j1 && !is.na(cal$UID[j-1]) && cal$STP[j-1] < cal$STP[j] ) + if(j>1 && !is.na(cal$UID[j-1L]) && cal$STP[j-1L] < cal$STP[j] ) { - cal$start_date[j] <- cal$end_date[j-1] +1 + cal$start_date[j] <- cal$end_date[j-1L] +1L } } } #backwards - for (j in seq(rowCount, 1)) { + for (j in seq(rowCount, 1L)) { #adjust our end date if previous item a higher priority overlay - if (j>1 && !is.na(cal$UID[j]) && !is.na(cal$UID[j-1]) ) + if (j>1L && !is.na(cal$UID[j]) && !is.na(cal$UID[j-1L]) ) { - if ( cal$STP[j-1] < cal$STP[j] ) + if ( cal$STP[j-1L] < cal$STP[j] ) { - cal$start_date[j] <- cal$end_date[j-1] +1 + cal$start_date[j] <- cal$end_date[j-1L] +1L } - if(j1 && !is.na(calNew$UID[j-1]) && NOT_NEEDED != calNew$UID[j-1] ) + if(j>1L && !is.na(calNew$UID[j-1L]) && NOT_NEEDED != calNew$UID[j-1L] ) { - calNew$start_date[j] <- calNew$end_date[j-1] +1 + calNew$start_date[j] <- calNew$end_date[j-1L] +1L } } } #backwards - for (j in seq(rowCount, 1)) { + for (j in seq(rowCount, 1L)) { #if we are not valid & the previous item is already valid, fill in our details and adjust our start date - if (j>1 && is.na(calNew$UID[j]) && !is.na(calNew$UID[j-1]) ) + if (j>1L && is.na(calNew$UID[j]) && !is.na(calNew$UID[j-1L]) ) { calNew <- selectOverlayTimeableAndCopyAttributes(cal, calNew, j) - if ( NOT_NEEDED != calNew$UID[j-1]) + if ( NOT_NEEDED != calNew$UID[j-1L]) { - calNew$start_date[j] <- calNew$end_date[j-1] +1 + calNew$start_date[j] <- calNew$end_date[j-1L] +1L } #if next item valid adjust our start date - if(j 0, ] + # calNew <- calNew[calNew$duration > 0L, ] #performance, do all subsets in one go - calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0), ] + calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0L), ] # Append UID to note the changes - if (nrow(calNew) > 0) + if (nrow(calNew) > 0L) { calNew <- appendLetterSuffix( calNew ) } @@ -735,14 +735,14 @@ splitDates <- function(cal) { #' makeCalendarInner <- function(calendarSub) { - if ( 1 == nrow(calendarSub) ) + if ( 1L == nrow(calendarSub) ) { # make into an single entry res = list(calendarSub, NA) } else { - if (length(unique(calendarSub$UID)) > 1) + if (length(unique(calendarSub$UID)) > 1L) { stop(paste("Error: makeCalendarInner was passed more than one service to work on. 
service=", unique(calendarSub$UID))) } @@ -757,13 +757,13 @@ makeCalendarInner <- function(calendarSub) { overlayDurations <- as.numeric(calendarSub$duration[calendarSub$STP != baseType]) overlayTypes <- calendarSub$STP[calendarSub$STP != baseType] - if( length(overlayDurations) <= 0 ) + if( length(overlayDurations) <= 0L ) { #assume the input data is good and the base timetables don't break any of the overlaying /operating day rules res = list( appendLetterSuffix(calendarSub), NA) } #if every overlay is a one day cancellation - else if ( all(overlayDurations == 1) && all(overlayTypes == "C") ) + else if ( all(overlayDurations == 1L) && all(overlayTypes == "C") ) { #warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)") @@ -776,7 +776,7 @@ makeCalendarInner <- function(calendarSub) { uniqueDayPatterns <- unique(calendarSub$Days[calendarSub$STP != "C"]) # if the day patterns are all identical - if (length(uniqueDayPatterns) <= 1 ) + if (length(uniqueDayPatterns) <= 1L ) { #performance pre-sort all the entries by the priority #this speeds things up when we look up the required priority overlay **SEE_NOTE** @@ -828,7 +828,7 @@ makeCalendarForDifferentDayPatterns <- function( calendar, uniqueDayPatterns ) #do the day patterns overlap each other in any way ? #e.g. a mon-sat pattern with a wed-fri overlap. - if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1) ) + if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1L) ) { gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ] continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ] @@ -843,13 +843,13 @@ makeCalendarForDifferentDayPatterns <- function( calendar, uniqueDayPatterns ) distinctBasePatterns = unique( baseTimetables$Days ) - for (k in seq(1, length(distinctBasePatterns))) { + for (k in seq(1L, length(distinctBasePatterns))) { theseBases = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ] theseOverlays = overlayTimetables[ intersectingDayPatterns( distinctBasePatterns[k], overlayTimetables$Days ) ] - if (nrow(theseOverlays) <= 0) + if (nrow(theseOverlays) <= 0L) { splits[[k]] <- appendNumberSuffix( appendLetterSuffix( theseBases ), k ) } From 5df39a94dbe84162a5d578058dc84ef2be288f38 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 19 Sep 2023 14:38:08 +0100 Subject: [PATCH 74/81] explanatory comment why data is mangled --- R/atoc_import.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/atoc_import.R b/R/atoc_import.R index 3715cd3..2045759 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -167,6 +167,9 @@ importMSN <- function(file, silent = TRUE) { # convert to SF object # for some reason the coordinates are mangled + #https://data.atoc.org/sites/all/themes/atoc/files/RSPS5046.pdf + #east = Values are in 0.1 km units. Format is ‘1nnnn’ where nnnn is the distance in 0.1 km units. + #north = Values are in 0.1 km units. 
Format is ‘6nnnn’ where nnnn is the distance in 0.1 km units station$`Ordnance Survey Grid Ref East` <- as.numeric(station$`Ordnance Survey Grid Ref East`) station$`Ordnance Survey Grid Ref North` <- as.numeric(station$`Ordnance Survey Grid Ref North`) station$`Ordnance Survey Grid Ref East` <- station$`Ordnance Survey Grid Ref East` * 100 - 1e6 From f32d620b629293df679e1e90f2a5620892267650 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 19 Sep 2023 14:38:53 +0100 Subject: [PATCH 75/81] better mapping of additional field onto field defined in GTFS standard --- R/atoc_main.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/atoc_main.R b/R/atoc_main.R index d183fd8..2aace00 100644 --- a/R/atoc_main.R +++ b/R/atoc_main.R @@ -153,10 +153,16 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = #gtfs systems tend to be tolerant of additional fields, so expose the train_category and power_type so that the consumer can do analysis on them. #e.g. filter out ECS moves + + #https://developers.google.com/transit/gtfs/reference#tripstxt 'trip_short_name' + #"Public facing text used to identify the trip to riders, e.g.to identify train numbers for commuter rail trips. + #If provided, should uniquely identify a trip within a service day; it should not be used for destination names or limited/express designations." + #best mapping from alpha headcode onto standard GTFS field is 'trip_short_name' since it's unique within a single day. + # Ditch unnecessary columns routes <- routes[, c("route_id", "agency_id", "route_short_name", "route_long_name", "route_type", "train_category")] trips <- trips[, c("trip_id", "route_id", "service_id", "Train Identity", "Power Type")] - names(trips) <- c("trip_id", "route_id", "service_id", "train_identity", "power_type") + names(trips) <- c("trip_id", "route_id", "service_id", "trip_short_name", "power_type") stop_times <- stop_times[, c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type")] calendar <- calendar[, c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] From 0ee68ea9b899e0d28f7616972d99830f746b42fa Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 19 Sep 2023 14:44:16 +0100 Subject: [PATCH 76/81] add support for getting rail data from NAPTAN xml file since that has more permissive licence than ATCO data. 
--- NAMESPACE | 3 + R/get_naptan.R | 233 ++++++++++++++++++++++++- man/as_data_table_naptan_stop_area.Rd | 17 ++ man/as_data_table_naptan_stop_point.Rd | 22 +++ man/get_naptan.Rd | 4 +- man/get_naptan_xml_doc.Rd | 88 ++++++++++ 6 files changed, 359 insertions(+), 8 deletions(-) create mode 100644 man/as_data_table_naptan_stop_area.Rd create mode 100644 man/as_data_table_naptan_stop_point.Rd create mode 100644 man/get_naptan_xml_doc.Rd diff --git a/NAMESPACE b/NAMESPACE index 73602ab..16ac317 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,10 +4,13 @@ export(ATOC_shapes) export(UK2GTFS_option_stopProcessingAtUid) export(UK2GTFS_option_treatDatesAsInt) export(UK2GTFS_option_updateCachedDataOnLibaryLoad) +export(as_data_table_naptan_stop_area) +export(as_data_table_naptan_stop_point) export(atoc2gtfs) export(dl_example_file) export(get_bank_holidays) export(get_naptan) +export(get_naptan_xml_doc) export(gtfs_clean) export(gtfs_clip) export(gtfs_compress) diff --git a/R/get_naptan.R b/R/get_naptan.R index 4a34ba8..c5a39d8 100644 --- a/R/get_naptan.R +++ b/R/get_naptan.R @@ -1,6 +1,6 @@ #' Get naptan #' -#' Download the NaPTAN stop locations for more information on NaPTAN see +#' Download the NaPTAN stop locations in CSV format. For more information on NaPTAN see #' https://data.gov.uk/dataset/ff93ffc1-6656-47d8-9155-85ea0b8f2251/national-public-transport-access-nodes-naptan #' @param url character, url to the csv format NaPTAN #' @param naptan_extra data frame of missing stops default uses `naptan_missing` @@ -9,7 +9,7 @@ #' functions downloads them from the offical DfT source. NaPTAN has some #' missing bus stops which are added by UK2GTFS. See `naptan_missing` #' -#' +#' Do not use this function for heavy rail; download the XML format instead, which includes TIPLOC identifiers #' #' @export @@ -19,10 +19,13 @@ get_naptan <- function(url = "https://naptan.api.dft.gov.uk/v1/access-nodes?data load_data("naptan_missing") dir.create("temp_naptan") - utils::download.file(url = url, destfile = "temp_naptan/Stops.csv", mode = "wb", quiet = TRUE) - naptan <- readr::read_csv("temp_naptan/Stops.csv", progress = FALSE, show_col_types = FALSE) - unlink("temp_naptan", recursive = TRUE) - #file.remove("naptan.zip") + + tryCatch({ + utils::download.file(url = url, destfile = "temp_naptan/Stops.csv", mode = "wb", quiet = TRUE) + naptan <- readr::read_csv("temp_naptan/Stops.csv", progress = FALSE, show_col_types = FALSE) + }, finally = { + unlink("temp_naptan", recursive = TRUE) + }) # clean file naptan <- naptan[, c("ATCOCode", "NaptanCode", "CommonName", "Easting", "Northing")] @@ -36,7 +39,7 @@ get_naptan <- function(url = "https://naptan.api.dft.gov.uk/v1/access-nodes?data naptan$stop_lon <- format(round(naptan$stop_lon, 6), scientific = FALSE) naptan$stop_lat <- format(round(naptan$stop_lat, 6), scientific = FALSE) - # Append alterative tags + # Append alternative tags naptan_extra <- naptan_extra[!naptan_extra$stop_id %in% naptan$stop_id,] naptan <- rbind(naptan, naptan_extra) @@ -66,3 +69,219 @@ get_naptan <- function(url = "https://naptan.api.dft.gov.uk/v1/access-nodes?data # # return(naptan) # } + + + +#' Get naptan xml doc +#' +#' Download the NaPTAN stop locations in XML format.
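+#' A sketch of intended use (illustrative only; it assumes the companion +#' as_data_table_naptan_stop_point() exported alongside this function accepts the returned document): +#' doc <- get_naptan_xml_doc(); rail_stops <- as_data_table_naptan_stop_point(doc)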
+#' For more information on NaPTAN see https://beta-naptan.dft.gov.uk/ +#' @param url character, url to the xml format NaPTAN +#' @param timeout int, timeout in seconds to wait for download to complete +#' @return xml document node +#' @details TransXchange does not store the location of bus stops, so this +#' function downloads them from the official DfT source. +#' +#' NR stations with multiple tiplocs seem to be represented as multiple naptan nodes, +#' with one AnnotatedRailRef per naptan node, despite the schema supporting a multiplicity relationship, +#' e.g. Stourbridge, Clapham Junction. +#' These can be joined together on the CRS code. +#' +#' As of 2023 NaPTAN is published under a more permissive (OGL3) licence than ATOC data (creative commons licence). +#' +#' http://naptan.dft.gov.uk/naptan/schema/2.5.1/napt/NaPT_stop-v2-5-1.xsd +#' +#' (XSD extracts elided: the schema fragments quoted here described StopPoint and StopArea as a sequence of optional elements) [...] From: oweno-tfwm Date: Tue, 19 Sep 2023 15:06:17 +0100 Subject: [PATCH 77/81] further little code tidy, adding L decoration to numbers --- R/atoc_overlay.R | 20 ++++++++++---------- 1 file
From: oweno-tfwm Date: Tue, 19 Sep 2023 15:06:17 +0100 Subject: [PATCH 77/81] further little code tidy, adding L decoration to numbers --- R/atoc_overlay.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R index f086b90..9998ee0 100644 --- a/R/atoc_overlay.R +++ b/R/atoc_overlay.R @@ -118,7 +118,7 @@ appendLetterSuffix <- function( cal ) { if (rows <= 26L) { - cal$UID <- paste0(cal$UID, " ", LETTERS[1:rows]) + cal$UID <- paste0(cal$UID, " ", LETTERS[1L:rows]) } else { @@ -154,7 +154,7 @@ hasGapInOperatingDays <- function( daysBitmask ) operatingDayCount = stringi::stri_count( daysBitmask, fixed = "1" ) - res = ( lastDay-firstDay+1 != operatingDayCount ) + res = ( lastDay-firstDay+1L != operatingDayCount ) res[is.na(res)] <- FALSE #shouldn't really get this, probably operating days are '0000000' @@ -247,7 +247,7 @@ checkOperatingDayActive <- function(calendar) { else { allDays = local_lubridate_wday( local_seq_date(from = veryfirstDay, to = max(calendar$end_date), by = "day") - , label = FALSE, week_start=1 ) + , label = FALSE, week_start=1L ) } veryfirstDay = veryfirstDay - 1L @@ -403,7 +403,7 @@ makeAllOneDay <- function( cal ) replicatedcal$end_date <- replicatedcal$start_date <- selectedDates #tidy up the values so they are correct for the split items - replicatedcal$duration <- 1 + replicatedcal$duration <- 1L replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ local_lubridate_wday( replicatedcal$start_date, label = FALSE, week_start=1L ) ] return (replicatedcal) @@ -518,9 +518,9 @@ selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex) { #if we have two adjacent complete items e.g. ....end 13th Jan start 14th jan..... #then it's not a real gap and just an artefact of the algorithm used to generate the dates - if( rowIndex>1 && rowIndex<nrow(calNew) ) + if( rowIndex>1L && rowIndex<nrow(calNew) ) baseTimetableIndexes = cal[ start_date <= calNew$start_date[rowIndex] & end_date >= calNew$end_date[rowIndex],,which=TRUE] #are we in a gap between two base timetables with no overlays - if ( length(baseTimetableIndexes)<=0 ) + if ( length(baseTimetableIndexes)<=0L ) { calNew$UID[rowIndex] <- NOT_NEEDED return (calNew) } @@ -549,7 +549,7 @@ selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex) #this speeds things up when we look up the required priority overlay **SEE_NOTE** #so we don't need to sort again here, just pick the top filtered result - set( calNew, i = rowIndex, j = CALENDAR_COLS_TO_COPY, value = cal[ baseTimetableIndexes[1], CALENDAR_COLS_TO_COPY, with=FALSE ] ) + set( calNew, i = rowIndex, j = CALENDAR_COLS_TO_COPY, value = cal[ baseTimetableIndexes[1L], CALENDAR_COLS_TO_COPY, with=FALSE ] ) #set() runs 3x faster than this style of copy calNew[rowIndex,] <- cal[baseTimetableIndexes[1],] return (calNew) @@ -575,7 +575,7 @@ fixOverlappingDates <- function( cal ) cal$end_date[j] <- cal$start_date[j+1L] -1L } - if(j>1 && !is.na(cal$UID[j-1L]) && cal$STP[j-1L] < cal$STP[j] ) + if(j>1L && !is.na(cal$UID[j-1L]) && cal$STP[j-1L] < cal$STP[j] ) { cal$start_date[j] <- cal$end_date[j-1L] +1L }
From 83a4441c0bb4cb5122b55a4e5d000457579a9855 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Wed, 20 Sep 2023 20:28:58 +0100 Subject: [PATCH 78/81] stop null easting/northing leaking out as invalid lat/long and ensure they are NA instead --- R/atoc_import.R | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/R/atoc_import.R b/R/atoc_import.R index 2045759..e7df250 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -169,18 +169,21 @@ importMSN <- function(file, silent = TRUE) { # for some reason the coordinates are mangled #https://data.atoc.org/sites/all/themes/atoc/files/RSPS5046.pdf #east = Values are in 0.1 km units. Format is ‘1nnnn’ where nnnn is the distance in 0.1 km units. - #north = Values are in 0.1 km units. Format is ‘6nnnn’ where nnnn is the distance in 0.1 km units station$`Ordnance Survey Grid Ref East` <- as.numeric(station$`Ordnance Survey Grid Ref East`) + station$`Ordnance Survey Grid Ref East` <- ifelse( + (0==station$`Ordnance Survey Grid Ref East`), NA, station$`Ordnance Survey Grid Ref East` * 100 - 1e6) + + #north = Values are in 0.1 km units. Format is ‘6nnnn’ where nnnn is the distance in 0.1 km units station$`Ordnance Survey Grid Ref North` <- as.numeric(station$`Ordnance Survey Grid Ref North`) - station$`Ordnance Survey Grid Ref East` <- station$`Ordnance Survey Grid Ref East` * 100 - 1e6 - station$`Ordnance Survey Grid Ref North` <- station$`Ordnance Survey Grid Ref North` * 100 - 6e6 + station$`Ordnance Survey Grid Ref North` <- ifelse( + (0==station$`Ordnance Survey Grid Ref North` | 69999==station$`Ordnance Survey Grid Ref North`), NA, + station$`Ordnance Survey Grid Ref North` * 100 - 6e6) station <- sf::st_as_sf(station, - coords = c( - "Ordnance Survey Grid Ref East", - "Ordnance Survey Grid Ref North" - ), - crs = 27700 + coords = c( "Ordnance Survey Grid Ref East", + "Ordnance Survey Grid Ref North"), + crs = 27700, + na.fail = FALSE ) station <- sf::st_transform(station, 4326)
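A toy reproduction, not part of the patch, of the failure mode patch 78 guards against (the station rows are made up). A null grid reference comes through as zero, so the old arithmetic produced an easting of -1e6 - a wildly out-of-range coordinate instead of a missing one; converting zeros to NA and passing na.fail = FALSE keeps such stations as empty geometries.

library(sf)

station <- data.frame(name  = c("GOOD", "NULLED"),
                      east  = c(14091, 0),    # ‘1nnnn’ format, 0.1 km units
                      north = c(62735, 0))    # ‘6nnnn’ format, 0.1 km units

# decode to metres, mapping the null sentinel to NA instead of -1e6/-6e6
station$east  <- ifelse(station$east  == 0, NA, station$east  * 100 - 1e6)
station$north <- ifelse(station$north == 0, NA, station$north * 100 - 6e6)

# na.fail = FALSE turns the NA row into an empty point rather than an error
station <- sf::st_as_sf(station, coords = c("east", "north"),
                        crs = 27700, na.fail = FALSE)
station <- sf::st_transform(station, 4326)   # empty points survive reprojection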
From 1b6e515405d9485842bc789eaeff9edd4962dd53 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Wed, 20 Sep 2023 20:30:29 +0100 Subject: [PATCH 79/81] alter detection of ships so it actually works - real world data doesn't completely follow the schema. Also add 'Metro' to long name for underground & Tyne & Wear metro services --- R/atoc_export.R | 18 ++++++++++++------ R/atoc_main.R | 9 +++++---- 2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/R/atoc_export.R b/R/atoc_export.R index 5701967..8a70d91 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -179,12 +179,18 @@ longnames <- function(routes, stop_times, stops) { routes <- dplyr::left_join(routes, stop_times_sub, by = c("rowID" = "schedule")) - - routes[`Train Category` == "SS", route_long_name := paste("Ship",route_long_name)] - routes[`Train Category` %in% c("BS", "BR"), route_long_name := paste("Bus",route_long_name)] - routes[!(`Train Category` %in% c("SS", "BS", "BR")), route_long_name := paste("Train",route_long_name)] - #TODO reflect the London Transport services being set to metro/underground in this naming code - + #you'd expect to only have to look at category to tell if it's a ship, but in practice the category for + #ships is NA, so we have to look at 'Train Status' too. + routes["SS" ==`Train Category` | "S"==`Train Status` | "4"==`Train Status`, + route_long_name := paste("Ship",route_long_name)] + routes[`Train Category` %in% c("BS", "BR"), + route_long_name := paste("Bus",route_long_name)] + + #Tyne & Wear metro is "OL" in the data; OL = "London Underground/Metro Service" + routes[`Train Category` %in% c("EL", "OL"), + route_long_name := paste("Metro",route_long_name)] + routes[!(`Train Category` %in% c("SS", "BS", "BR", "EL", "OL") | "S"==`Train Status` | "4"==`Train Status`), + route_long_name := paste("Train",route_long_name)] return(routes) }
diff --git a/R/atoc_main.R b/R/atoc_main.R index 2aace00..518222f 100644 --- a/R/atoc_main.R +++ b/R/atoc_main.R @@ -127,12 +127,17 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = trips <- dplyr::left_join(trips, train_status, by = c("Train Status" = "train_status")) rm(train_status) + trips$route_type[trips$`Train Category` %in% c("EL", "OL") & trips$route_type == 2 ] <- 1 + # London Underground is Metro (unless already identified as a bus/ship etc) + # "OL" is also used for Tyne & Wear metro + routes <- trips routes <- dplyr::group_by(routes, `ATOC Code`, route_long_name, `Train Category`, route_type ) routes <- dplyr::summarise(routes) routes$route_id <- 1:nrow(routes) + #join route_id back into trip table trips <- dplyr::left_join(trips, routes, by = c("ATOC Code", "route_long_name", "Train Category", "route_type")) routes <- routes[, c("route_id", "route_type", "ATOC Code", "route_long_name", "Train Category" )] @@ -141,10 +146,6 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = # IDs are not meaningful, just leave out routes$route_short_name <- "" # was: routes$route_id - routes$route_type[routes$agency_id == "LT" & routes$route_type == 2 ] <- 1 - # London Underground is Metro (unless already identified as a bus/ship etc) - #TODO look at what this causes LizPurpCrossRailElizabethLine to be categorised as. - #TODO move to longnames() ### Section 6: ####################################################### # Final Checks
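An illustrative check of the rule patch 79 introduces, using fabricated rows rather than real CIF data: ships are spotted via 'Train Status' S/4 because their 'Train Category' is often NA in real feeds, and category "OL" covers both London Underground and Tyne & Wear Metro (the real code also handles the "SS", "BS" and "BR" categories).

library(data.table)

routes <- data.table(`Train Category` = c(NA, "OL", "XX"),
                     `Train Status`   = c("S", "P", "P"),
                     route_long_name  = c("from Lymington Pier to Yarmouth",
                                          "from South Gosforth to St James",
                                          "from Birmingham to London"))

# status wins for ships because the category is unreliable
routes[`Train Status` %in% c("S", "4"), route_long_name := paste("Ship", route_long_name)]
routes[`Train Category` %in% c("EL", "OL"), route_long_name := paste("Metro", route_long_name)]
routes[!(`Train Category` %in% c("EL", "OL") | `Train Status` %in% c("S", "4")),
       route_long_name := paste("Train", route_long_name)]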
From d889ee568d9485842bc789eaeff9edd4962dd53 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 22 Sep 2023 21:09:18 +0100 Subject: [PATCH 80/81] if stop doesn't exist in TIPLOC file - put tiploc code name into route name instead of NA --- R/atoc_export.R | 5 +++-- R/atoc_nr.R | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/R/atoc_export.R b/R/atoc_export.R index 8a70d91..d8391b8 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -161,15 +161,16 @@ longnames <- function(routes, stop_times, stops) { stop_times_sub, dplyr::rename(stops[, c("stop_id", "stop_name")], stop_name_a = stop_name), by = c("stop_id_a" = "stop_id")) + stop_times_sub <- dplyr::left_join( stop_times_sub, dplyr::rename(stops[, c("stop_id", "stop_name")], stop_name_b = stop_name), by = c("stop_id_b" = "stop_id")) stop_times_sub$route_long_name <- paste0("from ", - stop_times_sub$stop_name_a, + ifelse( is.na(stop_times_sub$stop_name_a), stop_times_sub$stop_id_a, stop_times_sub$stop_name_a), " to ", - stop_times_sub$stop_name_b) + ifelse( is.na(stop_times_sub$stop_name_b), stop_times_sub$stop_id_b, stop_times_sub$stop_name_b) ) stop_times_sub$route_long_name <- gsub(" Rail Station", "" , stop_times_sub$route_long_name)
diff --git a/R/atoc_nr.R b/R/atoc_nr.R index 7cbe8e0..f1ea53a 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -164,9 +164,9 @@ getCachedLocationData <- function(locations = "tiplocs") if (inherits(locations, "sf")) { stops <- cbind(locations, sf::st_coordinates(locations)) - stops <- as.data.frame(stops) - stops <- stops[, c( "stop_id", "stop_code", "stop_name", "Y", "X" )] - names(stops) <- c( "stop_id", "stop_code", "stop_name", "stop_lat", "stop_lon" ) + stops <- sf::st_drop_geometry(stops) + stops <- as.data.table(stops) + setnames(stops, old = c("Y", "X"), new = c("stop_lat", "stop_lon")) } else #TODO test column names {
From a148e7025d2e3492b5f2a4c3ca2a354b3911eaec Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Sun, 24 Sep 2023 12:25:00 +0100 Subject: [PATCH 81/81] 1) when no public arrival time is available, fall back to WTT time, then pass time 2) when looking at non-public services, include passes marked as 'no pickup/dropoff available' from a GTFS perspective --- R/atoc_export.R | 9 ++-- R/atoc_import.R | 87 +++++++++++++++++++++++++---------- tests/testthat/test_aa_unit.R | 55 +++++++++++++++++----- 3 files changed, 110 insertions(+), 41 deletions(-)
diff --git a/R/atoc_export.R b/R/atoc_export.R index d8391b8..1ca9195 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -652,14 +652,15 @@ clean_activities2 <- function(x, public_only = TRUE) { x$pickup_type[is.na(x$pickup_type)] <- 0 x$drop_off_type[is.na(x$drop_off_type)] <- 0 } - x <- x[, c("pickup_type", "drop_off_type")] } else #set all of the stops on a route to be valid for passenger boarding / alighting from a GTFS perspective + # (unless they are 'pass', which have no 'activity' as they whoosh past - just like deadlines.)
{ - x$pickup_type <- 0 - x$drop_off_type <- 0 - x <- x[, c("pickup_type", "drop_off_type", "activity")] + x$pickup_type <- ifelse( is.na(x$activity), 1, 0 ) + x$drop_off_type <- ifelse( is.na(x$activity), 1, 0 ) } + x <- x[, c("pickup_type", "drop_off_type")] + return(x) } diff --git a/R/atoc_import.R b/R/atoc_import.R index e7df250..38fd6dd 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -303,34 +303,68 @@ strip_whitespace <- function(dt) { } -#does in place-modification of input data.table + process_times <- function(dt, working_timetable) { - #fill in the missing seconds - substituting H for 30 seconds. - if (working_timetable) - { - if ("Scheduled Arrival Time" %in% colnames(dt)) { - set(dt, j = "Arrival Time", value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[["Scheduled Arrival Time"]]))) - } - if ("Scheduled Departure Time" %in% colnames(dt)) { - set(dt, j = "Departure Time", value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[["Scheduled Departure Time"]]))) - } - } - else + dt = processOneTime(dt, working_timetable, "Arrival Time", "Scheduled Arrival Time", "Public Arrival Time") + dt = processOneTime(dt, working_timetable, "Departure Time", "Scheduled Departure Time", "Public Departure Time") + + return(dt) +} + + +#does in place-modification of input data.table +#select the public arrive/depart times if they exist, otherwise select the wtt arrive/depart times if they exist, otherwise select the pass time +#and at the same time fill in the missing seconds values (and 30 seconds if 'H' is indicated) +processOneTime <- function(dt, working_timetable, targetField, sourceFieldWtt, sourceField) +{ + hasPass = "Scheduled Pass" %in% colnames(dt) + + if (sourceFieldWtt %in% colnames(dt)) { - if ("Public Arrival Time" %in% colnames(dt)) { - set(dt, j = "Arrival Time", value = gsub("^(\\d{4})$", "\\100", dt[["Public Arrival Time"]])) + if (working_timetable) + { + if(hasPass) + { + set(dt, j = targetField, value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", + data.table::fifelse( " "==dt[[sourceFieldWtt]], + dt[["Scheduled Pass"]], + dt[[sourceFieldWtt]]))) + ) + } + else + { + set(dt, j = targetField, value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[[sourceFieldWtt]]))) + } } - - if ("Public Departure Time" %in% colnames(dt)) { - set(dt, j = "Departure Time", value = gsub("^(\\d{4})$", "\\100", dt[["Public Departure Time"]])) + else + { + if(hasPass) + { + set(dt, j = targetField, value = data.table::fifelse( "0000"==dt[[sourceField]], + gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", + data.table::fifelse( " "==dt[[sourceFieldWtt]], + dt[["Scheduled Pass"]], + dt[[sourceFieldWtt]]))), + gsub("^(\\d{4})$", "\\100", dt[[sourceField]])) + ) + } + else + { + #If there is no Public Arrival time this field will default to 0000. 
(we will use WTT instead) + set(dt, j = targetField, value = data.table::fifelse( "0000"==dt[[sourceField]], + gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[[sourceFieldWtt]])), + gsub("^(\\d{4})$", "\\100", dt[[sourceField]])) + ) + } } } - return(dt) + return (dt) } + # Process Activity Codes process_activity <- function(dt, public_only) { @@ -373,8 +407,11 @@ process_activity <- function(dt, public_only) { activity = gsub(",+", ",", activity) set(dt, j="Activity", value = gsub("\\s+|^,|,$", "", activity)) - #remove rows with no activity we're interested in - dt <- dt[ ""!=dt$Activity ] + #remove rows with no activity we're interested in (there is no activity at 'pass' locations) + if(public_only) + { + dt <- dt[ ""!=dt$Activity ] + } return(dt) } @@ -506,9 +543,9 @@ importMCA <- function(file, # Add the rowid LO$rowID <- rowIds[types == "LO"] - LO[, c("Scheduled Arrival Time","Public Arrival Time") := ""] + LO[, c("Scheduled Arrival Time","Public Arrival Time", "Scheduled Pass") := ""] LO <- LO[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", - "Public Arrival Time", "Public Departure Time" )] + "Public Arrival Time", "Public Departure Time", "Scheduled Pass" )] # Intermediate Station @@ -534,7 +571,7 @@ importMCA <- function(file, LI$rowID <- rowIds[types == "LI"] LI <- LI[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", - "Public Arrival Time", "Public Departure Time" )] + "Public Arrival Time", "Public Departure Time", "Scheduled Pass" )] # Terminating Station @@ -556,9 +593,9 @@ importMCA <- function(file, # Add the rowid LT$rowID <- rowIds[types == "LT"] - LT[, c("Scheduled Departure Time","Public Departure Time") := ""] + LT[, c("Scheduled Departure Time","Public Departure Time", "Scheduled Pass") := ""] LT <- LT[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", - "Public Arrival Time", "Public Departure Time" )] + "Public Arrival Time", "Public Departure Time", "Scheduled Pass" )] # TIPLOC Insert diff --git a/tests/testthat/test_aa_unit.R b/tests/testthat/test_aa_unit.R index e002e6b..8bd114b 100644 --- a/tests/testthat/test_aa_unit.R +++ b/tests/testthat/test_aa_unit.R @@ -84,19 +84,49 @@ test_that("test splitBitmask performance", { +test_that("test process_times", { + testData = data.table( + `Scheduled Arrival Time` =c("", " ", "0000 ", "1234H", "5678 "), + `Scheduled Departure Time`=c("", " ", "0106 ", "2156H", "8765H"), + `Public Arrival Time` =c("", " ", "0135", "tjkl", "0000"), + `Public Departure Time` =c("", " ", "1234", "tgbi", "0000")) + OK = TRUE + { + res = process_times( testData, FALSE ) -test_that("test process_times", { + res = res[,c("Arrival Time","Departure Time")] - testData = data.table( - `Scheduled Arrival Time` =c("", " ", "0000 ", "1234H"), - `Scheduled Departure Time`=c("", " ", "0106 ", "2156H"), - `Public Arrival Time` =c("", " ", "0135", "tjkl" ), - `Public Departure Time` =c("", " ", "1234", "tgbi" )) + expectedResult = data.table( + `Arrival Time` =c("", " ", "013500", "tjkl", "567800"), + `Departure Time` =c("", " ", "123400", "tgbi", "876530")) - OK = TRUE + printDifferencesDf(expectedResult, res) + OK = OK & identical(expectedResult, res) + } + { + res = process_times( testData, TRUE ) + + res = res[,c("Arrival Time","Departure Time")] + + expectedResult = data.table( + `Arrival Time` =c("", " ", "000000", "123430", "567800" ), + `Departure Time` =c("", " ", "010600", "215630", "876530" )) + + 
printDifferencesDf(expectedResult, res) + OK = OK & identical(expectedResult, res) + } + + + testData = data.table( + `Scheduled Arrival Time` =c("", " ", " ", "1234H", " "), + `Scheduled Departure Time`=c("", " ", "0106 ", "2156H", " "), + `Public Arrival Time` =c("", " ", "0135", "tjkl", "0000"), + `Public Departure Time` =c("", " ", "1234", "tgbi", "0000"), + `Scheduled Pass` =c("", "1234 ", "0001 ", "1234H", "5678 ") + ) { res = process_times( testData, FALSE ) @@ -104,8 +134,8 @@ test_that("test process_times", { res = res[,c("Arrival Time","Departure Time")] expectedResult = data.table( - `Arrival Time` =c("", " ", "013500", "tjkl" ), - `Departure Time` =c("", " ", "123400", "tgbi" )) + `Arrival Time` =c("", " ", "013500", "tjkl", "567800"), + `Departure Time` =c("", " ", "123400", "tgbi", "567800")) printDifferencesDf(expectedResult, res) OK = OK & identical(expectedResult, res) @@ -116,13 +146,14 @@ test_that("test process_times", { res = res[,c("Arrival Time","Departure Time")] expectedResult = data.table( - `Arrival Time` =c("", " ", "000000", "123430" ), - `Departure Time` =c("", " ", "010600", "215630" )) + `Arrival Time` =c("", "123400", "000100", "123430", "567800" ), + `Departure Time` =c("", "123400", "010600", "215630", "567800" )) printDifferencesDf(expectedResult, res) OK = OK & identical(expectedResult, res) } + expect_true( OK ) }) @@ -665,7 +696,7 @@ test_that("test process_activity:1", { res = process_activity( testData, FALSE ) - expectedResult = data.table( Activity=c("TB,T,D,U,R,TF", "ab,cd,ef,gh,ij,kl", "a,d,ef,ij,kl","cd,ef,gh,ij" )) + expectedResult = data.table( Activity=c("", "TB,T,D,U,R,TF", "ab,cd,ef,gh,ij,kl", "a,d,ef,ij,kl","cd,ef,gh,ij" )) OK = OK & identical(expectedResult,res) }
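To close, a distilled version of the fallback rule patch 81 implements in processOneTime, illustrative only (the real function also pads the missing seconds and handles the 'H' half-minute marker): prefer the public time, fall back to the WTT time, and finally to the pass time.

library(data.table)

pick_time <- function(public, wtt, pass) {
  # "0000" is the sentinel for "no public time"; a blank WTT field
  # (five spaces) means the train only passes, so use the pass time
  fifelse("0000" == public,
          fifelse("     " == wtt, pass, wtt),
          public)
}

pick_time(public = "0000", wtt = "     ", pass = "1234H")   # -> "1234H" (pass time)
pick_time(public = "0000", wtt = "2156H", pass = "     ")   # -> "2156H" (WTT time)
pick_time(public = "0135", wtt = "0106 ", pass = "     ")   # -> "0135"  (public time)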