diff --git a/DESCRIPTION b/DESCRIPTION index bae4db6..4640a71 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,13 @@ Package: UK2GTFS Type: Package Title: Converts UK transport timetable datasets to GTFS format -Version: 0.2.1 +Version: 1.0 Authors@R: c( person("Malcolm", "Morgan", email = "m.morgan1@leeds.ac.uk", role = c("aut","cre"), comment = c(ORCID = "0000-0002-9488-9183")), person("Adrian", "Schönig", role = c("ctb")), - person("Owen", "O'Neill", role = c("ctb")) + person("Owen", "O'Neill", email = "owen@wvr.org.uk", role = c("aut"), + comment = c(ORCID = "0009-0008-0595-3042")) ) Maintainer: Malcolm Morgan Description: The UK uses a range of odd formats to store timetable data, this package converts them to the nice GTFS format. @@ -30,14 +31,16 @@ Imports: geodist, httr, iotools, - stringr, - sf, - parallel, lubridate, + parallel, purrr (>= 1.0), pbapply, - readr (>= 2.0), RcppSimdJson, + readr (>= 2.0), + sf, + stats, + stringi, + stringr, xml2, zip, Suggests: diff --git a/NAMESPACE b/NAMESPACE index 50480a7..16ac317 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,16 @@ # Generated by roxygen2: do not edit by hand export(ATOC_shapes) +export(UK2GTFS_option_stopProcessingAtUid) +export(UK2GTFS_option_treatDatesAsInt) +export(UK2GTFS_option_updateCachedDataOnLibaryLoad) +export(as_data_table_naptan_stop_area) +export(as_data_table_naptan_stop_point) export(atoc2gtfs) export(dl_example_file) export(get_bank_holidays) export(get_naptan) +export(get_naptan_xml_doc) export(gtfs_clean) export(gtfs_clip) export(gtfs_compress) @@ -33,3 +39,5 @@ export(station2stops) export(transxchange2gtfs) export(transxchange_import) export(update_data) +import(data.table) +importFrom(data.table,":=") diff --git a/R/atoc.R b/R/atoc.R index 6c39286..c79d93a 100644 --- a/R/atoc.R +++ b/R/atoc.R @@ -10,8 +10,10 @@ #' @param agency where to get agency.txt (see details) #' @param shapes Logical, should shapes.txt be generated (default FALSE) #' @param transfers Logical, should transfers.txt be generated (default TRUE) -#' @param missing_tiplocs Logical, if locations = tiplocs, then will check for +#' @param missing_tiplocs Logical, if true will check for #' any missing tiplocs against the main file and add them.(default TRUE) +#' @param working_timetable Logical, should WTT times be used instead of public times (default FALSE) +#' @param public_only Logical, only return calls/services that are for public passenger pickup/set down (default TRUE) #' @family main #' #' @details Locations @@ -44,22 +46,9 @@ atoc2gtfs <- function(path_in, agency = "atoc_agency", shapes = FALSE, transfers = TRUE, - missing_tiplocs = TRUE) { - - if(inherits(locations,"character")){ - if(locations == "tiplocs"){ - load_data("tiplocs") - locations = tiplocs - } - } - - if(inherits(agency,"character")){ - if(agency == "atoc_agency"){ - load_data("atoc_agency") - agency = atoc_agency - } - } - + missing_tiplocs = TRUE, + working_timetable = FALSE, + public_only = TRUE) { # Checkmates checkmate::assert_character(path_in, len = 1) checkmate::assert_file_exists(path_in) @@ -73,6 +62,14 @@ atoc2gtfs <- function(path_in, " This will take some time, make sure you use 'ncores' to enable multi-core processing" )) } + + agency = getCachedAgencyData( agency ) + + if ( !inherits(locations, "character") || "file"!=locations ) + { + stops_sf = getCachedLocationData( locations ) + } + # Is input a zip or a folder if (grepl(".zip", path_in)) { # Unzip @@ -110,61 +107,29 @@ atoc2gtfs <- function(path_in, file = files[grepl(".mca", files)], 
silent = silent, ncores = 1, - full_import = TRUE + full_import = TRUE, + working_timetable = working_timetable, + public_only = public_only ) - # Get the Station Locations - # Are locations provided? - if ("sf" %in% class(locations)) { - stops_sf <- cbind(locations, sf::st_coordinates(locations)) - stops_sf <- as.data.frame(stops_sf) - stops_sf <- stops_sf[, c( - "stop_id", "stop_code", "stop_name", - "Y", "X" - )] - names(stops_sf) <- c( - "stop_id", "stop_code", "stop_name", - "stop_lat", "stop_lon" - ) - stops_sf$stop_lat <- round(stops_sf$stop_lat, 5) - stops_sf$stop_lon <- round(stops_sf$stop_lon, 5) - } - # Should the file be checked - check_file <- FALSE - if("sf" %in% class(locations) & missing_tiplocs){ - check_file <- TRUE - } - - if ("character" %in% class(locations)) { - if(locations == "file"){ - check_file <- TRUE - } - } - - if (check_file) { + if ( TRUE==missing_tiplocs || + ( inherits(locations, "character") && "file"==locations ) ) + { msn <- importMSN(files[grepl(".msn", files)], silent = silent) station <- msn[[1]] TI <- mca[["TI"]] stops.list <- station2stops(station = station, TI = TI) stops_file <- stops.list[["stops"]] rm(msn,TI,stops.list) - } - # Was a csv provided - if ("character" %in% class(locations)) { - if(locations != "file"){ - checkmate::check_file_exists(locations) - stops_csv <- utils::read.csv(locations, stringsAsFactors = FALSE) + if( FALSE==missing_tiplocs || !exists("stops_sf") ) + { + stops <- stops_file } - } - - # Chose Correct stops - if(exists("stops_csv")){ - stops <- stops_csv - } else if(exists("stops_sf")){ - if(missing_tiplocs == TRUE){ + else + { # Combine stops_missing <- stops_file[!stops_file$stop_id %in% stops_sf$stop_id,] if(nrow(stops_missing) > 0){ @@ -173,12 +138,11 @@ atoc2gtfs <- function(path_in, } else { stops <- stops_sf } - - } else { - stops <- stops_sf } - } else if(exists("stops_file")){ - stops <- stops_file + } + else + { + stops <- stops_sf } @@ -203,17 +167,23 @@ atoc2gtfs <- function(path_in, # remove any unused stops stops <- stops[stops$stop_id %in% stop_times$stop_id, ] + if ( nrow(stops)<=0 ) + { + stop("Could not match any stops in input data to stop database.") + } + + # Main Timetable Build timetables <- schedule2routes( stop_times = stop_times, stops = stops, schedule = schedule, silent = silent, - ncores = ncores + ncores = ncores, + public_only = public_only ) rm(schedule) gc() - # load("data/atoc_agency.RData") # TODO: check for stop_times that are not valid stops diff --git a/R/atoc_export.R b/R/atoc_export.R index 85ca81b..1ca9195 100644 --- a/R/atoc_export.R +++ b/R/atoc_export.R @@ -84,7 +84,7 @@ station2stops <- function(station, TI) { #' @param flf imported flf file from importFLF #' @noRd #' -station2transfers <- function(station, flf, path_out) { +station2transfers <- function(station, flf) { ### SECTION 4: ############################################################ # make make the transfers.txt @@ -134,175 +134,8 @@ station2transfers <- function(station, flf, path_out) { return(transfers) } -#' split overlapping start and end dates# -#' -#' @param cal cal object -#' @details split overlapping start and end dates -#' @noRd - -splitDates <- function(cal) { - - # get all the dates that - dates <- c(cal$start_date, cal$end_date) - dates <- dates[order(dates)] - # create all unique pairs - dates.df <- data.frame( - start_date = dates[seq(1, length(dates) - 1)], - end_date = dates[seq(2, length(dates))] - ) - - cal.new <- dplyr::right_join(cal, dates.df, - by = c( "start_date", "end_date" ) - ) - - if 
("P" %in% cal$STP) { - match <- "P" - } else { - match <- cal$STP[cal$STP != "C"] - match <- match[1] - } - - # fill in the original missing schedule - for (j in seq(1, nrow(cal.new))) { - if (is.na(cal.new$UID[j])) { - st_tmp <- cal.new$start_date[j] - ed_tmp <- cal.new$end_date[j] - new.UID <- cal$UID[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.Days <- cal$Days[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.roWID <- cal$rowID[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.ATOC <- cal$`ATOC Code`[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.Retail <- cal$`Retail Train ID`[cal$STP == match & - cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.head <- cal$Headcode[cal$STP == match & cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - new.Status <- cal$`Train Status`[cal$STP == match & - cal$start_date <= st_tmp & - cal$end_date >= ed_tmp] - if (length(new.UID) == 1) { - cal.new$UID[j] <- new.UID - cal.new$Days[j] <- new.Days - cal.new$rowID[j] <- new.roWID - cal.new$`ATOC Code`[j] <- new.ATOC - cal.new$`Retail Train ID`[j] <- new.Retail - cal.new$`Train Status`[j] <- new.Status - cal.new$Headcode[j] <- new.head - cal.new$STP[j] <- match - } else if (length(new.UID) > 1) { - message("Going From") - print(cal) - message("To") - print(cal.new) - stop() - # readline(prompt="Press [enter] to continue")print() - } - } - } - - # remove any gaps - cal.new <- cal.new[!is.na(cal.new$UID), ] - - # remove duplicated rows - cal.new <- cal.new[!duplicated(cal.new), ] - - # modify end and start dates - for (j in seq(1, nrow(cal.new))) { - if (cal.new$STP[j] == "P") { - # check if end date need changing - if (j < nrow(cal.new)) { - if (cal.new$end_date[j] == cal.new$start_date[j + 1]) { - cal.new$end_date[j] <- (cal.new$end_date[j] - 1) - } - } - # check if start date needs changing - if (j > 1) { - if (cal.new$start_date[j] == cal.new$end_date[j - 1]) { - cal.new$start_date[j] <- (cal.new$start_date[j] + 1) - } - } - } - } - - # remove cancelled trips - cal.new <- cal.new[cal.new$STP != "C", ] - - # fix duration - cal.new$duration <- cal.new$end_date - cal.new$start_date + 1 - - # remove any zero or negative day schedules - cal.new <- cal.new[cal.new$duration > 0, ] - - # Append UID to note the changes - if (nrow(cal.new) > 0) { - if (nrow(cal.new) < 27) { - cal.new$UID <- paste0(cal.new$UID, " ", letters[1:nrow(cal.new)]) - } else { - # Cases where we need extra letters, gives upto 676 ids - lett <- paste0(rep(letters, each = 26), rep(letters, times = 26)) - cal.new$UID <- paste0(cal.new$UID, " ", lett[1:nrow(cal.new)]) - } - } else { - cal.new <- NA - } - - - return(cal.new) -} - - - -DATE_EPOC <- as.Date("01/01/1970", format = "%d/%m/%Y") -WEEKDAY_NAME_VECTOR <- c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday") -CHECKROWS_NAME_VECTOR <- c(WEEKDAY_NAME_VECTOR, "duration", "start_date", "end_date") - -DURATION_INDEX <- match("duration", CHECKROWS_NAME_VECTOR) -START_DATE_INDEX <- match("start_date", CHECKROWS_NAME_VECTOR) -END_DATE_INDEX <- match("end_date", CHECKROWS_NAME_VECTOR) -MONDAY_INDEX <- match("monday", CHECKROWS_NAME_VECTOR) -SUNDAY_INDEX <- match("sunday", CHECKROWS_NAME_VECTOR) - -# TODO: Does not work within functions, rejig to work in package. -# -#' internal function for cleaning calendar -#' -#' @details -#' check for schedules that don't overlay with the days they run i.e. 
-#' Mon - Sat schedules for a sunday only service -#' return a logical vector of if the calendar is valid -#' -#' @param tmp 1 row dataframe -#' @noRd -#' -checkrows <- function(tmp) { - #tmp = res[i,] - # message(paste0("done ",i)) - - if (tmp[DURATION_INDEX] < 7) { - days.valid <- weekdays(seq.POSIXt( - from = as.POSIXct.Date( as.Date(tmp[START_DATE_INDEX], DATE_EPOC) ), - to = as.POSIXct.Date( as.Date(tmp[END_DATE_INDEX], DATE_EPOC) ), - by = "DSTday" - )) - days.valid <- tolower(days.valid) - #get a vector of names of days of week that the timetable is valid on - days.match <- tmp[MONDAY_INDEX:SUNDAY_INDEX] - days.match <- WEEKDAY_NAME_VECTOR[ 1==days.match ] - - if (any(days.valid %in% days.match)) { - return(TRUE) - } else { - return(FALSE) - } - } else { - return(TRUE) - } -} #' internal function for constructing longnames of routes #' @@ -315,28 +148,29 @@ checkrows <- function(tmp) { #' @noRd #' longnames <- function(routes, stop_times, stops) { + stop_times_sub <- dplyr::group_by(stop_times, trip_id) stop_times_sub <- dplyr::summarise(stop_times_sub, schedule = unique(schedule), stop_id_a = stop_id[stop_sequence == 1], # seq = min(stop_sequence), - stop_id_b = stop_id[stop_sequence == max(stop_sequence)] - ) + stop_id_b = stop_id[stop_sequence == max(stop_sequence)] ) # Add names for `stop_id_[a|b]` as `stop_name_[a|b]` stop_times_sub <- dplyr::left_join( stop_times_sub, dplyr::rename(stops[, c("stop_id", "stop_name")], stop_name_a = stop_name), by = c("stop_id_a" = "stop_id")) + stop_times_sub <- dplyr::left_join( stop_times_sub, dplyr::rename(stops[, c("stop_id", "stop_name")], stop_name_b = stop_name), by = c("stop_id_b" = "stop_id")) - stop_times_sub$route_long_name <- paste0("From ", - stop_times_sub$stop_name_a, + stop_times_sub$route_long_name <- paste0("from ", + ifelse( is.na(stop_times_sub$stop_name_a), stop_times_sub$stop_id_a, stop_times_sub$stop_name_a), " to ", - stop_times_sub$stop_name_b) + ifelse( is.na(stop_times_sub$stop_name_b), stop_times_sub$stop_id_b, stop_times_sub$stop_name_b) ) stop_times_sub$route_long_name <- gsub(" Rail Station", "" , stop_times_sub$route_long_name) @@ -346,9 +180,23 @@ longnames <- function(routes, stop_times, stops) { routes <- dplyr::left_join(routes, stop_times_sub, by = c("rowID" = "schedule")) + #you'd expect to only have to look at category to tell if it's a ship, but in practice the category for + #ships is NA, so we have to look at 'Train Status' too. 
+ routes["SS" ==`Train Category` | "S"==`Train Status` | "4"==`Train Status`, + route_long_name := paste("Ship",route_long_name)] + routes[`Train Category` %in% c("BS", "BR"), + route_long_name := paste("Bus",route_long_name)] + + #Tyne & Wear metro is "OL" in data OL="London Underground/Metro Service" + routes[`Train Category` %in% c("EL", "OL"), + route_long_name := paste("Metro",route_long_name)] + routes[!(`Train Category` %in% c("SS", "BS", "BR", "EL", "OL") | "S"==`Train Status` | "4"==`Train Status`), + route_long_name := paste("Train",route_long_name)] return(routes) } + + #' make calendar #' #' @details @@ -359,254 +207,362 @@ longnames <- function(routes, stop_times, stops) { #' @noRd #' makeCalendar <- function(schedule, ncores = 1) { - # prep the inputs - calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", - "Days Run", "STP indicator", "rowID", "Headcode", - "ATOC Code", "Retail Train ID", "Train Status")] - calendar$`STP indicator` <- as.character(calendar$`STP indicator`) - # calendar = calendar[order(-calendar$`STP indicator`),] - names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", - "rowID", "Headcode", "ATOC Code", - "Retail Train ID", "Train Status") - calendar$duration <- calendar$end_date - calendar$start_date + 1 - - # UIDs = unique(calendar$UID) - # length_todo = length(UIDs) - message(paste0(Sys.time(), " Constructing calendar and calendar_dates")) - calendar_split <- split(calendar, calendar$UID) - - - if (ncores > 1) { - cl <- parallel::makeCluster(ncores) - # parallel::clusterExport( - # cl = cl, - # varlist = c("calendar", "UIDs"), - # envir = environment() - # ) - parallel::clusterEvalQ(cl, { - loadNamespace("UK2GTFS") - }) - pbapply::pboptions(use_lb = TRUE) - res <- pbapply::pblapply(calendar_split, - makeCalendar.inner, - cl = cl - ) - parallel::stopCluster(cl) - rm(cl) - } else { - res <- pbapply::pblapply( - calendar_split, - makeCalendar.inner) - } - res.calendar <- lapply(res, `[[`, 1) - res.calendar <- data.table::rbindlist(res.calendar, use.names=FALSE) #performance, was taking 10 minutes to execute bind_rows - res.calendar_dates <- lapply(res, `[[`, 2) - res.calendar_dates <- res.calendar_dates[!is.na(res.calendar_dates)] - res.calendar_dates <- data.table::rbindlist(res.calendar_dates, use.names=FALSE) + treatDatesAsInt = getOption("UK2GTFS_opt_treatDatesAsInt", default=TRUE) + set_TREAT_DATES_AS_INT( treatDatesAsInt ) + + tryCatch({ + + # prep the inputs + calendar <- schedule[, c("Train UID", "Date Runs From", "Date Runs To", "Days Run", "STP indicator", "rowID" )] + names(calendar) <- c("UID", "start_date", "end_date", "Days", "STP", "rowID" ) + + if( treatDatesAsInt ) + { + setupDatesCache( calendar ) + #treating date as int: seem to be about twice as fast on the critical line when selecting base timetable + calendar$start_date = as.integer( calendar$start_date ) + calendar$end_date = as.integer( calendar$end_date ) + } + + + okCalendarDates = validateCalendarDates( calendar ) + if ( !all( okCalendarDates ) ) + { + warning(Sys.time(), " Some calendar dates had incorrect start or end dates that did not align with operating day bitmask.\n Services=", + paste( unique( calendar$UID[ !okCalendarDates ] ), collapse = "," ) ) + } + + #we're going to be splitting and replicating calendar entries - stash the original UID so we can join back on it later + calendar$originalUID <- calendar$UID + calendar$STP <- as.character(calendar$STP) + calendar$duration <- calendar$end_date - calendar$start_date + 1 + + + #brutal, but 
makes code later on simpler: make all cancellations one-day cancellations by splitting + #TODO don't split up into one-day cancellations if all the operating day patterns on a service are identical + cancellations <- makeAllOneDay( calendar[calendar$STP == "C", ] ) + calendar <- calendar[calendar$STP != "C", ] - days <- lapply(res.calendar$Days, function(x) { - as.integer(substring(x, 1:7, 1:7)) + + #debugging option + set_STOP_PROCESSING_UID( getOption("UK2GTFS_opt_stopProcessingAtUid") ) + + message(paste0(Sys.time(), " Constructing calendar and calendar_dates")) + calendar$`__TEMP__` <- calendar$UID + calendar_split <- calendar[, .(list(.SD)), by = `__TEMP__`][,V1] + + if (ncores > 1) { + cl <- parallel::makeCluster(ncores) + + parallel::clusterEvalQ(cl, { + #put any setup required for all worker processes in here + options( UK2GTFS_opt_updateCachedDataOnLibaryLoad = FALSE ) #stop the child workers from calling update_data() + workerEnv=loadNamespace("UK2GTFS") + }) + + #copy variables from this context into global context of worker processes + varList = list("TREAT_DATES_AS_INT", "WDAY_LOOKUP_MIN_VALUE", "WDAY_LOOKUP_MAX_VALUE", "WDAY_LOOKUP_MAP") + parallel::clusterExport(cl=cl, varlist=varList, envir=asNamespace("UK2GTFS")) + + #set module level global in all workers + parallel::clusterEvalQ(cl, { + copyFromGlobalEnvToPackageEnv<- function(varName){ + UK2GTFS:::setValueInThisEnvironment(varName, get(varName, envir=.GlobalEnv)) + } + copyFromGlobalEnvToPackageEnv("TREAT_DATES_AS_INT") + copyFromGlobalEnvToPackageEnv("WDAY_LOOKUP_MIN_VALUE") + copyFromGlobalEnvToPackageEnv("WDAY_LOOKUP_MAX_VALUE") + copyFromGlobalEnvToPackageEnv("WDAY_LOOKUP_MAP") + }) + + pbapply::pboptions(use_lb = TRUE) + res <- pbapply::pblapply(calendar_split, + makeCalendarInner, + cl = cl + ) + + parallel::stopCluster(cl) + rm(cl) + } else { + res <- pbapply::pblapply( + calendar_split, + makeCalendarInner + ) + } + + + res.calendar <- lapply(res, `[[`, 1) + res.calendar <- data.table::rbindlist(res.calendar, use.names=FALSE) #performance, takes 10 minutes to execute bind_rows on full GB daily timetable + + res.cancellation_dates <- lapply(res, `[[`, 2) + res.cancellation_dates <- res.cancellation_dates[!is.na(res.cancellation_dates)] + res.cancellation_dates <- data.table::rbindlist(res.cancellation_dates, use.names=FALSE) + stopifnot( 0==nrow(res.cancellation_dates) ) + rm(res.cancellation_dates) + #since we didn't pass in any cancellations we should no longer get any back + + res.calendar = splitAndRebindBitmask( res.calendar ) + cancellations = splitAndRebindBitmask( cancellations ) + + #associate the split up cancellations with the (new) calendar they are associated with + #(only works because cancellations are all one day duration) + cancellations = allocateCancellationsAcrossCalendars( res.calendar, cancellations ) + + #no longer need the field that was used to associate the original and replicated calendars together + cancellations$originalUID <- NULL + res.calendar$originalUID <- NULL + + #error checking + dups = duplicated( res.calendar$UID ) + if( any(TRUE==dups) ) + { + dups = unique( res.calendar$UID[ dups ] ) + + warning(paste(Sys.time(), "Duplicate UIDs were created by the makeCalendar() process, this is likely to cause downstream processing errors. ", + "Please capture the data and raise a bug / create a test case. 
", dups)) + } + + }, finally = { + set_TREAT_DATES_AS_INT( FALSE ) + + #revert treating date as int + if( TRUE==treatDatesAsInt ) + { + if (exists("res.calendar")){ res.calendar = makeDateFieldsDateType( res.calendar ) } + if (exists("cancellations")){ cancellations = makeDateFieldsDateType( cancellations ) } + } }) - days <- matrix(unlist(days), ncol = 7, byrow = TRUE) - days <- as.data.frame(days) - names(days) <- WEEKDAY_NAME_VECTOR - res.calendar <- cbind(res.calendar, days) - res.calendar$Days <- NULL + return(list(res.calendar, cancellations)) +} + + +makeDateFieldsDateType<- function( cal ) +{ + cal$start_date = as.Date( cal$start_date, origin = DATE_EPOC ) + cal$end_date = as.Date( cal$end_date, origin = DATE_EPOC ) + cal$duration = cal$end_date - cal$start_date + 1 - message(paste0( - Sys.time(), - " Removing trips that only occur on days of the week that are outside the timetable validity period" - )) + return (cal) +} + + +#' duplicateItem +#' +#' @details +#' Function that duplicates a data.table, adding an "index" column to all rows in the output indicating which +#' instance of the duplication the row is associated with +#' +#' @param dt data.table +#' @param reps number of duplicates to be created +#' @param indexStart starting number for the "index" value added to the item +#' @noRd +#' +duplicateItem <- function( dt, reps, indexStart=1 ) +{ + if ( is.na(reps) | reps<1 ) return (NULL) + + #replicate all the rows in dt times=reps + duplicates <- dt[rep(seq(1, nrow(dt)), reps), ] + + #create and apply indexes to the created rows- each group of replicated rows gets the same index number. + index <- rep(seq( indexStart, indexStart-1+reps ), nrow(dt)) + + duplicates$index <- index[order(index)] + + return(duplicates) +} + + + + +#' duplicateItems +#' +#' @details +#' Function that duplicates a large data.table, adding an "index" column to all rows in the output indicating which +#' instance of the duplication the row is associated with +#' +#' requires a column called "_reps" on the object to determine how many times it is to be duplicated +#' +#' @param dt data.table +#' @param split_attribute name of attribute to split the items between worker tasks +#' @param ncores number of processes for parallel processing (default = 1) +#' @param indexStart starting number for the "index" value added to the item +#' @noRd +#' +duplicateItems <- function( dt, split_attribute, ncores=1, indexStart=1 ) +{ + #add an additional column before splitting on it - so that the value we're really splitting on still appears in the output. + dt[, `_TEMP_` := get(split_attribute) ] + dt_split <- dt[, .(list(.SD)), by = `_TEMP_`][,V1] + dt$`_TEMP_` <- NULL - #res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar))) - #performance - doing this split on 500k rows takes 60s - longer than the parallel execution below and consumes 3gb memory. - res.calendar.days <- res.calendar[,CHECKROWS_NAME_VECTOR] - res.calendar.days <- data.table::transpose(res.calendar.days) - #transpose on the same size runs in around 3s, but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer. 
+ duplicate_int <- function(dta) { + rep <- dta$`_reps`[1] + return ( duplicateItem( dta, rep, indexStart ) ) + } - if (ncores > 1) { + if (ncores == 1) { + duplicates <- pbapply::pblapply(dt_split, duplicate_int) + } else { cl <- parallel::makeCluster(ncores) parallel::clusterEvalQ(cl, { + #put any setup required for all worker processes in here + options( UK2GTFS_opt_updateCachedDataOnLibaryLoad = FALSE ) loadNamespace("UK2GTFS") }) - keep <- pbapply::pbsapply(res.calendar.days, checkrows, - cl = cl - ) + + duplicates <- pbapply::pblapply(dt_split, + duplicate_int, + cl = cl) parallel::stopCluster(cl) rm(cl) - } else { - keep <- pbapply::pbsapply(res.calendar.days, checkrows) } - res.calendar <- res.calendar[keep, ] + duplicates <- data.table::rbindlist(duplicates, use.names=FALSE) - return(list(res.calendar, res.calendar_dates)) -} + duplicates$`_reps` <- NULL #performance, putting this inside duplicate_int roughly doubles the execution time -#' make calendar helper function -#' @param i row number to do -#' @noRd -#' -makeCalendar.inner <- function(calendar.sub) { # i, UIDs, calendar){ - # UIDs.sub = UIDs[i] - # calendar.sub = calendar[calendar$UID == UIDs.sub,] - # calendar.sub = schedule[schedule$`Train UID` == UIDs.sub,] - if (nrow(calendar.sub) == 1) { - # make into an single entry - return(list(calendar.sub, NA)) - } else { - # check duration and types - dur <- as.numeric(calendar.sub$duration[calendar.sub$STP != "P"]) - typ <- calendar.sub$STP[calendar.sub$STP != "P"] - typ.all <- calendar.sub$STP - if (all(dur == 1) & all(typ == "C") & length(typ) > 0 & - length(typ.all) == 2) { - # One Day cancellations - # Modify in the calendar_dates.txt - return(list( - calendar.sub[calendar.sub$STP == "P", ], - calendar.sub[calendar.sub$STP != "P", ] - )) - } else { - # check for identical day pattern - if (length(unique(calendar.sub$Days)) == 1 & - sum(typ.all == "P") == 1) { - calendar.new <- splitDates(calendar.sub) - #calendar.new <- UK2GTFS:::splitDates(calendar.sub) - return(list(calendar.new, NA)) - } else { - # split by day pattern - splits <- list() - daypatterns <- unique(calendar.sub$Days) - for (k in seq(1, length(daypatterns))) { - # select for each pattern but include cancellations with a - # different day pattern - calendar.sub.day <- calendar.sub[calendar.sub$Days == daypatterns[k] | - calendar.sub$STP == "C", ] - - if (all(calendar.sub.day$STP == "C")) { - # ignore cases of only cancelled - splits[[k]] <- NULL - } else { - calendar.new.day <- splitDates(calendar.sub.day) - # rejects nas - if (inherits(calendar.new.day, "data.frame")) { - calendar.new.day$UID <- paste0(calendar.new.day$UID, k) - splits[[k]] <- calendar.new.day - } - } - } - splits <- data.table::rbindlist(splits, use.names=FALSE) # dplyr::bind_rows(splits) - return(list(splits, NA)) - } - } - } + return (duplicates) } + + + + #' Duplicate stop_times #' #' @details -#' Function that duplicates top times for trips that have been split into -#' multiple trips +#' Function that duplicates stop times for trips that have been split into +#' multiple trips and sets the new trip id on the duplicated stop_times #' #' @param calendar calendar data.frame #' @param stop_times stop_times data.frame #' @param ncores number of processes for parallel processing (default = 1) #' @noRd #' -duplicate.stop_times_alt <- function(calendar, stop_times, ncores = 1) { - calendar.nodup <- calendar[!duplicated(calendar$rowID), ] - calendar.dup <- calendar[duplicated(calendar$rowID), ] - rowID.unique <- 
as.data.frame(table(calendar.dup$rowID)) - rowID.unique$Var1 <- as.integer(as.character(rowID.unique$Var1)) - stop_times <- dplyr::left_join(stop_times, rowID.unique, - by = c("schedule" = "Var1") +duplicate_stop_times <- function(calendar, stop_times, ncores = 1) { + + outputColumnNames = c( + "trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", + "pickup_type", "drop_off_type", "schedule" ) - stop_times_split <- split(stop_times, stop_times$schedule) - - # TODO: The could handle cases of non duplicated stoptimes within duplicate.stop_times.int - # rather than splitting and rejoining, would bring code tidyness and speed improvements - duplicate.stop_times.int <- function(stop_times.tmp) { - # message(i) - # stop_times.tmp = stop_times[stop_times$schedule == rowID.unique$Var1[i],] - # reps = rowID.unique$Freq[i] - reps <- stop_times.tmp$Freq[1] - if (is.na(reps)) { - return(NULL) - } else { - index <- rep(seq(1, reps), nrow(stop_times.tmp)) - index <- index[order(index)] - stop_times.tmp <- stop_times.tmp[rep(seq(1, nrow(stop_times.tmp)), reps), ] - stop_times.tmp$index <- index - return(stop_times.tmp) - } + + #it's pretty marginal doing this on multiple threads. With a typically sized all-GB day file, + #doing the split takes 2.4s and the duplication 7.8s (on one thread) + #TODO look at avoiding the split if threads=1 + + return ( duplicate_related_items( calendar, stop_times, + original_join_field = "schedule", + new_join_field = "trip_id", + outputColumnNames = outputColumnNames, + ncores=ncores ) ) +} + + + +#' Duplicate related items +#' +#' @details +#' Function that duplicates items that are related to calendar entries +#' the expected input is calendar items that have been duplicated but retain the same (now duplicated) 'rowID' +#' this tells us which objects to duplicate and how many are required +#' the related_items have an attribute, named by original_join_field, which joins back to 'rowID' on the calendar items +#' +#' After duplication the duplicated items are joined back onto the input calendar items +#' to create an additional attribute on the output objects +#' +#' The calendar item attribute forms the new relation between the calendar items and +#' related items, so must be unique. +#' +#' @param calendar calendar data.frame +#' @param related_items data.frame of items to be replicated +#' @param original_join_field name of the attribute on related_items that joins back to 'rowID' on calendar +#' @param new_join_field name of the calendar attribute that forms the new relation to the output items +#' @param outputColumnNames names of the columns to retain in the returned items +#' @param ncores number of processes for parallel processing (default = 1) (currently hangs/crashes if >1) +#' @noRd +#' +duplicate_related_items <- function(calendar, related_items, original_join_field, new_join_field, outputColumnNames, ncores = 1) { + + calendar.dup <- calendar[duplicated(calendar$rowID), ] + + if( nrow(calendar.dup) <= 0 ) + { + #no duplicating to do + warning("duplicate_related_items: there were no duplicates detected. 
In real data this may indicate there has been an error earlier in the processing.") + related_items_dup = data.table() } + else + { + #create a count of the number of each duplicate of rowID + rowID.unique <- as.data.frame(table(calendar.dup$rowID)) + rowID.unique$Var1 <- as.integer(as.character(rowID.unique$Var1)) - if (ncores == 1) { - stop_times.dup <- pbapply::pblapply(stop_times_split, duplicate.stop_times.int) - } else { - cl <- parallel::makeCluster(ncores) - stop_times.dup <- pbapply::pblapply(stop_times_split, - duplicate.stop_times.int, - cl = cl - ) - parallel::stopCluster(cl) - rm(cl) + #join the count of number of duplicates required to the stop times (so we can retrieve it later when doing the duplication) + related_items <- dplyr::left_join(related_items, rowID.unique, + by = stats::setNames("Var1",original_join_field) ) + + #set the number of duplications required + related_items$`_reps` <- related_items$Freq + + # TODO: This could handle cases of non-duplicated stop times within duplicateItems() + # rather than splitting and rejoining, which would bring code tidiness and speed improvements + related_items_dup <- duplicateItems( related_items, original_join_field, ncores=ncores, indexStart=1 ) + + + # join via rowID+index to get new de-duplicated trip_id + + #create index on the table we want to join to - group by the rowID, index runs from 0..count()-1 of group size + #we start at zero so we don't affect the original stop_times rows and just join in the duplicated rows + new_join_ids <- dplyr::group_by(calendar, rowID) + new_join_ids <- dplyr::mutate(new_join_ids, Index = seq(0, dplyr::n()-1)) + new_join_ids <- as.data.frame( new_join_ids[, c("rowID", new_join_field, "Index")] ) + + related_items_dup <- dplyr::left_join(related_items_dup, new_join_ids, by = setNames(c("rowID","Index"),c(original_join_field,"index")) ) + + #select columns required + related_items_dup <- related_items_dup[, outputColumnNames, with=FALSE] } - stop_times.dup <- dplyr::bind_rows(stop_times.dup) - # stop_times.dup$index <- NULL - - # Join on the nonduplicated trip_ids - trip.ids.nodup <- calendar.nodup[, c("rowID", "trip_id")] - stop_times <- dplyr::left_join(stop_times, trip.ids.nodup, by = c("schedule" = "rowID")) - stop_times <- stop_times[!is.na(stop_times$trip_id), ] # when routes are cancled their stop times are left without valid trip_ids - - # join on the duplicated trip_ids - calendar2 <- dplyr::group_by(calendar, rowID) - calendar2 <- dplyr::mutate(calendar2, Index = seq(1, dplyr::n())) - - stop_times.dup$index2 <- as.integer(stop_times.dup$index + 1) - trip.ids.dup <- calendar2[, c("rowID", "trip_id", "Index")] - trip.ids.dup <- as.data.frame(trip.ids.dup) - stop_times.dup <- dplyr::left_join(stop_times.dup, trip.ids.dup, by = c("schedule" = "rowID", "index2" = "Index")) - stop_times.dup <- stop_times.dup[, c( - "arrival_time", "departure_time", "stop_id", "stop_sequence", - "pickup_type", "drop_off_type", "rowID", "schedule", "trip_id" - )] - stop_times <- stop_times[, c( - "arrival_time", "departure_time", "stop_id", "stop_sequence", - "pickup_type", "drop_off_type", "rowID", "schedule", "trip_id" - )] + calendar.nodup <- calendar[!duplicated(calendar$rowID), ] + + # when routes are cancelled their stop times are left without valid trip_ids - remove those rows + # this only applies to the non-duplicated rows + # Join via rowID to determine the trip_id + related_ids_nodup <- calendar.nodup[, c("rowID", new_join_field), with=FALSE] + related_items_no_dup <- 
dplyr::left_join(related_items, related_ids_nodup, by = setNames("rowID",original_join_field)) + related_items_no_dup <- related_items_no_dup[!is.na(related_items_no_dup[[new_join_field]]), ] + - # stop_times.dup = stop_times.dup[order(stop_times.dup$rowID),] + #select columns required, join output results together + related_items_no_dup <- related_items_no_dup[, outputColumnNames, with=FALSE] - stop_times.comb <- data.table::rbindlist(list(stop_times, stop_times.dup), use.names=FALSE) + related_items_comb <- data.table::rbindlist(list(related_items_no_dup, related_items_dup), use.names=FALSE) - return(stop_times.comb) + return(related_items_comb) } -#' fix times for jounrneys that run past midnight + + + + +#' fix times for journeys that run past midnight #' #' @details -#' When train runs over midnight GTFS requries the stop times to be in +#' When train runs over midnight GTFS requires the stop times to be in #' 24h+ e.g. 26:30:00 #' #' @param stop_times stop_times data.frame #' @param safe logical (default = TRUE) should the check for trains -#' running more than 24h be perfomed? +#' running more than 24h be performed? #' #' @details #' Not running the 24 check is faster, if the check is run a warning #' is returned, but the error is not fixed. As the longest train -#' jounrey in the UK is 13 hours (Aberdeen to Penzance) this is -#' unlikley to be a problem. +#' journey in the UK is 13 hours (Aberdeen to Penzance) this is +#' unlikely to be a problem. #' @noRd #' afterMidnight <- function(stop_times, safe = TRUE) { @@ -620,8 +576,8 @@ afterMidnight <- function(stop_times, safe = TRUE) { ) stop_times <- dplyr::left_join(stop_times, stop_times.summary, by = "trip_id") - stop_times$arvfinal <- ifelse(stop_times$arv < stop_times$dept_first, stop_times$arv + 2400, stop_times$arv) - stop_times$depfinal <- ifelse(stop_times$dept < stop_times$dept_first, stop_times$dept + 2400, stop_times$dept) + stop_times$arvfinal <- ifelse(stop_times$arv < stop_times$dept_first, stop_times$arv + 240000, stop_times$arv) + stop_times$depfinal <- ifelse(stop_times$dept < stop_times$dept_first, stop_times$dept + 240000, stop_times$dept) if (safe) { @@ -638,14 +594,14 @@ afterMidnight <- function(stop_times, safe = TRUE) { } } - numb2time2 <- function(numb){ - numb <- stringr::str_pad(as.character(numb), 4, pad = "0") - numb <- paste0(substr(numb,1,2),":",substr(numb,3,4),":00") - numb + numb2time2 <- function(dt, colNameDest, colNameSource){ + #performance, substr is relatively expensive + set(dt, j=colNameDest, value= sprintf("%02d:%02d:%02d", + dt[[colNameSource]] %/% 10000, (dt[[colNameSource]] %/% 100) %% 100, dt[[colNameSource]] %% 100) ) } - stop_times$arrival_time <- numb2time2(stop_times$arvfinal) - stop_times$departure_time <- numb2time2(stop_times$depfinal) + numb2time2(stop_times, "arrival_time", "arvfinal") + numb2time2(stop_times, "departure_time", "depfinal") stop_times <- stop_times[, c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", @@ -655,6 +611,26 @@ afterMidnight <- function(stop_times, safe = TRUE) { + +fixStopTimeData <- function(stop_times) +{ + # Fix arrival_time / departure_time being 0000 for pick up only or drop off only trains + stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "000000" & stop_times$Activity == "D", + stop_times$arrival_time, + stop_times$departure_time) + stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "000000" & stop_times$Activity == "U", + stop_times$departure_time, + 
stop_times$arrival_time) + + #fix missing arrival / departure times by copying from the other time. + stop_times$arrival_time[is.na(stop_times$arrival_time)] <- stop_times$departure_time[is.na(stop_times$arrival_time)] + stop_times$departure_time[is.na(stop_times$departure_time)] <- stop_times$arrival_time[is.na(stop_times$departure_time)] + + return (stop_times) +} + + + #' Clean Activities #' @param x character activities #' @details @@ -663,17 +639,28 @@ afterMidnight <- function(stop_times, safe = TRUE) { #' #' @noRd #' -clean_activities2 <- function(x) { +clean_activities2 <- function(x, public_only = TRUE) { x <- data.frame(activity = x, stringsAsFactors = FALSE) - x <- dplyr::left_join(x, activity_codes, by = c("activity")) - if (anyNA(x$pickup_type)) { - mss <- unique(x$activity[is.na(x$pickup_type)]) - message("Unknown Activity codes '", paste(unique(mss), collapse = "' '"), "' please report these codes as a GitHub Issue") - x$pickup_type[is.na(x$pickup_type)] <- 0 - x$drop_off_type[is.na(x$drop_off_type)] <- 0 + + if (public_only) + { + x <- dplyr::left_join(x, activity_codes, by = c("activity")) + if (anyNA(x$pickup_type)) { + mss <- unique(x$activity[is.na(x$pickup_type)]) + warning("Unknown Activity codes '", paste(unique(mss), collapse = "' '"), "' please report these codes as a GitHub Issue") + x$pickup_type[is.na(x$pickup_type)] <- 0 + x$drop_off_type[is.na(x$drop_off_type)] <- 0 + } + } + else #set all of the stops on a route to be valid for passenger boarding / alighting from a GTFS perspective + # (unless they are 'pass' which have no 'activity' as they whoosh past - just like deadlines.) + { + x$pickup_type <- ifelse( is.na(x$activity), 1, 0 ) + x$drop_off_type <- ifelse( is.na(x$activity), 1, 0 ) } x <- x[, c("pickup_type", "drop_off_type")] + return(x) } diff --git a/R/atoc_import.R b/R/atoc_import.R index 8103122..38fd6dd 100644 --- a/R/atoc_import.R +++ b/R/atoc_import.R @@ -1,4 +1,8 @@ -#' Import the .alf file +#' @import data.table +#' @importFrom data.table ":=" + + +# Import the .alf file #' #' @details #' Imports the .alf file and returns data.frame @@ -142,7 +146,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 17 - 1), widths = c(1, 4, 26 + 4, 1, 7, 3, 3, 3, 5, 1, 5, 2, 1, 1, 11, 3) ) - + setDT(station) names(station) <- c( "Record Type", "Reserved1", "Station Name", "CATE Interchange status", "TIPLOC Code", "CRS Reference Code", @@ -162,18 +166,24 @@ importMSN <- function(file, silent = TRUE) { station <- strip_whitespace(station) # convert to SF object - # for some reasonf the coordinates are mangled + # for some reason the coordinates are mangled +#https://data.atoc.org/sites/all/themes/atoc/files/RSPS5046.pdf +#east = Values are in 0.1 km units. Format is ‘1nnnn’ where nnnn is the distance in 0.1 km units. station$`Ordnance Survey Grid Ref East` <- as.numeric(station$`Ordnance Survey Grid Ref East`) + station$`Ordnance Survey Grid Ref East` <- ifelse( + (0==station$`Ordnance Survey Grid Ref East`), NA, station$`Ordnance Survey Grid Ref East` * 100 - 1e6) + + #north = Values are in 0.1 km units. 
Format is ‘6nnnn’ where nnnn is the distance in 0.1 km units station$`Ordnance Survey Grid Ref North` <- as.numeric(station$`Ordnance Survey Grid Ref North`) - station$`Ordnance Survey Grid Ref East` <- station$`Ordnance Survey Grid Ref East` * 100 - 1e6 - station$`Ordnance Survey Grid Ref North` <- station$`Ordnance Survey Grid Ref North` * 100 - 6e6 + station$`Ordnance Survey Grid Ref North` <- ifelse( + (0==station$`Ordnance Survey Grid Ref North` | 69999==station$`Ordnance Survey Grid Ref North`), NA, + station$`Ordnance Survey Grid Ref North` * 100 - 6e6) station <- sf::st_as_sf(station, - coords = c( - "Ordnance Survey Grid Ref East", - "Ordnance Survey Grid Ref North" - ), - crs = 27700 + coords = c( "Ordnance Survey Grid Ref East", + "Ordnance Survey Grid Ref North"), + crs = 27700, + na.fail = FALSE ) station <- sf::st_transform(station, 4326) @@ -190,7 +200,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 5 - 1), widths = c(1, 4, 26 + 4, 45) ) - + setDT(timetable) names(timetable) <- c( "Record Type", "Reserved1", "Station Name", "GBTT numbers" @@ -213,7 +223,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 2), widths = c(1, 79) ) - + setDT(comment) names(comment) <- c("Record Type", "Comment") comment$`Record Type` <- NULL @@ -229,7 +239,7 @@ importMSN <- function(file, silent = TRUE) { col_types = rep("character", 6 - 1), widths = c(1, 4, 26 + 5, 26, 20) ) - + setDT(alias) names(alias) <- c( "Record Type", "Reserved1", "Station Name", "Station Alias", "Reserved3" @@ -250,13 +260,14 @@ importMSN <- function(file, silent = TRUE) { #' Strip White Space #' #' @details -#' Strips whitespace from a dataframe of charters vectors and returns -#' the data frame +#' Strips trailing whitespace from a dataframe of character vectors +#' empty values are converted to NA +#' returns the data frame #' #' @param df data frame #' @noRd #' -strip_whitespace <- function(df) { +strip_whitespace_df <- function(df) { sws <- function(val) { val <- trimws(val, which = "right") val[val == ""] <- NA @@ -266,6 +277,148 @@ strip_whitespace <- function(df) { return(df) } +#' Strip White Space +#' +#' @details +#' Input data.table is modified in-place and returned to the caller. 
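+#' Because the updates use data.table::set() and `:=`, the value returned is
+#' the same object that was passed in; callers that need to keep the padded
+#' values should pass a copy, e.g. strip_whitespace(data.table::copy(dt)).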
+#' +#' Strips trailing whitespace from all character columns in a data.table. +#' Empty values are converted to NA. +#' Returns the data.table. +#' +#' @param dt data table +#' @noRd +#' +strip_whitespace <- function(dt) { + + char_cols <- sapply(dt, is.character) + char_col_names <- names(char_cols[char_cols]) + + for (col_name in char_col_names) { + set(dt, j = col_name, value = trimws(dt[[col_name]], which = "right")) + dt[dt[[col_name]] == "", (col_name) := NA_character_] + } + + return (dt) +} + + + +process_times <- function(dt, working_timetable) { + + dt = processOneTime(dt, working_timetable, "Arrival Time", "Scheduled Arrival Time", "Public Arrival Time") + dt = processOneTime(dt, working_timetable, "Departure Time", "Scheduled Departure Time", "Public Departure Time") + + return(dt) +} + + +#does in-place modification of the input data.table +#select the public arrive/depart times if they exist, otherwise select the WTT arrive/depart times if they exist, otherwise select the pass time +#and at the same time fill in the missing seconds values (and 30 seconds if 'H' is indicated) +processOneTime <- function(dt, working_timetable, targetField, sourceFieldWtt, sourceField) +{ + hasPass = "Scheduled Pass" %in% colnames(dt) + + if (sourceFieldWtt %in% colnames(dt)) + { + if (working_timetable) + { + if(hasPass) + { + set(dt, j = targetField, value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", + data.table::fifelse( " "==dt[[sourceFieldWtt]], + dt[["Scheduled Pass"]], + dt[[sourceFieldWtt]]))) + ) + } + else + { + set(dt, j = targetField, value = gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[[sourceFieldWtt]]))) + } + } + else + { + if(hasPass) + { + set(dt, j = targetField, value = data.table::fifelse( "0000"==dt[[sourceField]], + gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", + data.table::fifelse( " "==dt[[sourceFieldWtt]], + dt[["Scheduled Pass"]], + dt[[sourceFieldWtt]]))), + gsub("^(\\d{4})$", "\\100", dt[[sourceField]])) + ) + } + else + { + #If there is no Public Arrival time this field will default to 0000. 
(we will use WTT instead) + set(dt, j = targetField, value = data.table::fifelse( "0000"==dt[[sourceField]], + gsub("^(\\d{4}) $","\\100",gsub("^(\\d{4})H$", "\\130", dt[[sourceFieldWtt]])), + gsub("^(\\d{4})$", "\\100", dt[[sourceField]])) + ) + } + } + } + + return (dt) +} + + + +# Process Activity Codes +process_activity <- function(dt, public_only) { + +# if ( any( 12 != nchar(dt$Activity) ) ) +# { +# stop("bad input data in process_activity(), all Activity fields should be 12 chars long") +# } +# don't really need this test since we're reading in from fixed width files + + #performance, runs about twice as fast if we do processing outside data.table then insert it later + splitActivity = unlist( stringi::stri_extract_all_regex(dt$Activity, ".{2}") ) + + splitActivityMat = matrix(splitActivity, ncol=6, byrow=TRUE) + + # Filter to stops for passengers + #see https://wiki.openraildata.com/index.php?title=Activity_codes for definitions + acts <- c( + "TB", # Train Starts + "T " , # Stops to take up and set down passengers + "D ", # Stops to set down passengers + "U ", # Stops to take up passengers + "R ", # Request stop + "TF" # Train Finishes + ) + + if(public_only) + { + allowed = (" "!=splitActivityMat) & (splitActivityMat %in% acts) + } + else + { + allowed = (" "!=splitActivityMat) + } + + splitActivityMat[!allowed] <- "" + + activity = sprintf("%s,%s,%s,%s,%s,%s", splitActivityMat[,1], splitActivityMat[,2], splitActivityMat[,3], splitActivityMat[,4], splitActivityMat[,5], splitActivityMat[,6] ) + + #replace multiple comma with single comma, remove whitespace, remove leading comma, remove trailing comma. + activity = gsub(",+", ",", activity) + set(dt, j="Activity", value = gsub("\\s+|^,|,$", "", activity)) + + #remove rows with no activity we're interested in (there is no activity at 'pass' locations) + if(public_only) + { + dt <- dt[ ""!=dt$Activity ] + } + + return(dt) +} + + + + #' Import the .mca file #' #' @details @@ -273,16 +426,18 @@ strip_whitespace <- function(df) { #' #' @param file Path to .mca file #' @param silent logical, should messages be displayed -#' @param ncores number of cores to use when paralell processing +#' @param ncores number of cores to use when parallel processing #' @param full_import import all data, default FALSE #' @param working_timetable use rail industry scheduling times instead of public times +#' @param public_only only return calls that are for public passenger pick up/set down #' @export #' importMCA <- function(file, silent = TRUE, ncores = 1, full_import = FALSE, - working_timetable = FALSE) { + working_timetable = FALSE, + public_only = TRUE) { # see https://wiki.openraildata.com/index.php/CIF_File_Format if (!silent) { @@ -293,6 +448,7 @@ importMCA <- function(file, n = -1 ) types <- substr(raw, 1, 2) + rowIds <- seq(from = 1, to = length(types)) # break out each part of the file # Header Record @@ -311,6 +467,7 @@ importMCA <- function(file, 6, 1, 1, 1, 1, 4, 4, 1, 1 ) ) + setDT(BS) names(BS) <- c( "Record Identity", "Transaction Type", "Train UID", "Date Runs From", "Date Runs To", "Days Run", "Bank Holiday Running", "Train Status", @@ -338,7 +495,7 @@ importMCA <- function(file, BS$Speed <- as.integer(BS$Speed) # Add the rowid - BS$rowID <- seq(from = 1, to = length(types))[types == "BS"] + BS$rowID <- rowIds[types == "BS"] # Basic Schedule Extra Details if (!silent) { @@ -350,6 +507,7 @@ importMCA <- function(file, col_types = rep("character", 8), widths = c(2, 4, 5, 2, 1, 8, 1, 57) ) + setDT(BX) names(BX) <- c( "Record Identity", 
"Traction Class", "UIC Code", "ATOC Code", "Applicable Timetable Code", "Retail Train ID", "Source", "Spare" @@ -360,7 +518,9 @@ importMCA <- function(file, # clean data # Add the rowid - BX$rowID <- seq(from = 1, to = length(types))[types == "BX"] + BX$rowID <- rowIds[types == "BX"] + + # Origin Station if (!silent) { @@ -372,28 +532,21 @@ importMCA <- function(file, col_types = rep("character", 12), widths = c(2, 7, 1, 5, 4, 3, 3, 2, 2, 12, 2, 37) ) + setDT(LO) names(LO) <- c( "Record Identity", "Location", "Suffix", "Scheduled Departure Time", "Public Departure Time", "Platform", "Line", "Engineering Allowance", - "Pathing Allowance", "Pathing Allowance", "Performance Allowance", + "Pathing Allowance", "Activity", "Performance Allowance", "Spare" ) - LO$Spare <- NULL - LO$`Record Identity` <- NULL - LO <- strip_whitespace(LO) - - if(working_timetable){ - LO$`Departure Time` <- gsub("H", "", - LO$`Scheduled Departure Time`) - }else{ - LO$`Departure Time` <- gsub("H", "", - LO$`Public Departure Time`) - } - - LO <- LO[, c("Location", "Departure Time")] # Add the rowid - LO$rowID <- seq(from = 1, to = length(types))[types == "LO"] + LO$rowID <- rowIds[types == "LO"] + + LO[, c("Scheduled Arrival Time","Public Arrival Time", "Scheduled Pass") := ""] + LO <- LO[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", + "Public Arrival Time", "Public Departure Time", "Scheduled Pass" )] + # Intermediate Station if (!silent) { @@ -405,6 +558,7 @@ importMCA <- function(file, col_types = rep("character", 16), widths = c(2, 7, 1, 5, 5, 5, 4, 4, 3, 3, 3, 12, 2, 2, 2, 20) ) + setDT(LI) names(LI) <- c( "Record Identity", "Location", "Suffix", "Scheduled Arrival Time", "Scheduled Departure Time", "Scheduled Pass", "Public Arrival Time", @@ -412,60 +566,12 @@ importMCA <- function(file, "Engineering Allowance", "Pathing Allowance", "Performance Allowance", "Spare" ) - LI$Spare <- NULL - LI$`Record Identity` <- NULL - - # Process Activity Codes - activity <- strsplit(LI$Activity, "(?<=.{2})", perl=TRUE) - - clean_activity3 <- function(x){ - # Filter to stops for passengers - acts <- c( - "TB", # Train Starts - "T ", # Stops to take up and set down passengers - "D ", # Stops to set down passengers - "U ", # Stops to take up passengers - "R ", # Request stop - "TF" # Train Finishes - ) - x <- x[x %in% acts] - x <- gsub(" ","",x) - if(length(x) > 0){ - x <- paste(x, collapse = ",") - return(x) - } else { - return("Other") - } - } - - LI$Activity <- unlist(lapply(activity, clean_activity3)) - - LI <- strip_whitespace(LI) # Add the rowid - LI$rowID <- seq(from = 1, to = length(types))[types == "LI"] - - LI <- LI[LI$Activity != "Other",] - # Check for errors in the times - - - if(working_timetable){ - LI$`Arrival Time` <- gsub("H", "", - LI$`Scheduled Arrival Time`) - LI$`Departure Time` <- gsub("H", "", - LI$`Scheduled Departure Time`) - }else{ - LI$`Arrival Time` <- gsub("H", "", - LI$`Public Arrival Time`) - LI$`Departure Time` <- gsub("H", "", - LI$`Public Departure Time`) - } - - LI <- LI[, c( - "Location", "Arrival Time", - "Departure Time", "Activity", "rowID" - )] + LI$rowID <- rowIds[types == "LI"] + LI <- LI[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", + "Public Arrival Time", "Public Departure Time", "Scheduled Pass" )] # Terminating Station @@ -478,30 +584,19 @@ importMCA <- function(file, col_types = rep("character", 9), widths = c(2, 7, 1, 5, 4, 3, 3, 12, 43) ) + setDT(LT) names(LT) <- c( "Record Identity", 
"Location", "Suffix", "Scheduled Arrival Time", "Public Arrival Time", "Platform", "Path", "Activity", "Spare" ) - LT$Spare <- NULL - LT$`Record Identity` <- NULL - - # Process Activity Codes - activity <- strsplit(LT$Activity, "(?<=.{2})", perl=TRUE) - LT$Activity <- unlist(lapply(activity, clean_activity3)) - - LT <- strip_whitespace(LT) - LT$`Scheduled Arrival Time` <- gsub("H", "", LT$`Scheduled Arrival Time`) - if(working_timetable){ - LT$`Arrival Time` <- gsub("H", "", LT$`Scheduled Arrival Time`) - }else{ - LT$`Arrival Time` <- gsub("H", "", LT$`Public Arrival Time`) - } + # Add the rowid + LT$rowID <- rowIds[types == "LT"] - LT <- LT[, c("Location", "Arrival Time", "Activity")] + LT[, c("Scheduled Departure Time","Public Departure Time", "Scheduled Pass") := ""] + LT <- LT[, c("rowID", "Location", "Activity", "Scheduled Arrival Time", "Scheduled Departure Time", + "Public Arrival Time", "Public Departure Time", "Scheduled Pass" )] - # Add the rowid - LT$rowID <- seq(from = 1, to = length(types))[types == "LT"] # TIPLOC Insert if (full_import) { @@ -518,6 +613,7 @@ importMCA <- function(file, 4, 4, 5, 8, 5 ) ) + setDT(CR) names(CR) <- c( "Record Identity", "Location", "Train Category", "Train Identity", "Headcode", "Course Indicator", @@ -532,7 +628,7 @@ importMCA <- function(file, CR <- strip_whitespace(CR) # Add the rowid - CR$rowID <- seq(from = 1, to = length(types))[types == "CR"] + CR$rowID <- rowIds[types == "CR"] if (!silent) { message(paste0(Sys.time(), " importing TIPLOC Insert")) @@ -543,6 +639,7 @@ importMCA <- function(file, col_types = rep("character", 11), widths = c(2, 7, 2, 6, 1, 26, 5, 4, 3, 16, 8) ) + setDT(TI) names(TI) <- c( "Record Identity", "TIPLOC code", "Capitals", "NALCO", "NLC Check Character", "TPS Description", @@ -553,7 +650,7 @@ importMCA <- function(file, TI <- strip_whitespace(TI) # Add the rowid - TI$rowID <- seq(from = 1, to = length(types))[types == "TI"] + TI$rowID <- rowIds[types == "TI"] # TIPLOC Amend if (!silent) { @@ -565,6 +662,7 @@ importMCA <- function(file, col_types = rep("character", 12), widths = c(2, 7, 2, 6, 1, 26, 5, 4, 3, 16, 7, 1) ) + setDT(TA) names(TA) <- c( "Record Identity", "TIPLOC code", "Capitals", "NALCO", "NLC Check Character", "TPS Description", "STANOX", "PO MCP Code", @@ -575,7 +673,7 @@ importMCA <- function(file, TA <- strip_whitespace(TA) # Add the rowid - TA$rowID <- seq(from = 1, to = length(types))[types == "TA"] + TA$rowID <- rowIds[types == "TA"] # TIPLOC Delete if (!silent) { @@ -587,13 +685,14 @@ importMCA <- function(file, col_types = rep("character", 3), widths = c(2, 7, 71) ) + setDT(TD) names(TD) <- c("Record Identity", "TIPLOC code", "Spare") TD$Spare <- NULL TD$`Record Identity` <- NULL TD <- strip_whitespace(TD) # Add the rowid - TD$rowID <- seq(from = 1, to = length(types))[types == "TD"] + TD$rowID <- rowIds[types == "TD"] } @@ -608,6 +707,7 @@ importMCA <- function(file, col_types = rep("character", 16), widths = c(2, 1, 6, 6, 6, 6, 7, 2, 1, 7, 1, 1, 1, 1, 31, 1) ) + setDT(AA) names(AA) <- c( "Record Identity", "Transaction Type", "Base UID", "Assoc UID", "Assoc Start date", "Assoc End date", "Assoc Days", "Assoc Cat", @@ -633,7 +733,7 @@ importMCA <- function(file, AA$`Assoc Location Suffix` <- as.integer(AA$`Assoc Location Suffix`) # Add the rowid - AA$rowID <- seq(from = 1, to = length(types))[types == "AA"] + AA$rowID <- rowIds[types == "AA"] } # Trailer Record @@ -646,19 +746,33 @@ importMCA <- function(file, col_types = rep("character", 2), widths = c(2, 78) ) + setDT(ZZ) names(ZZ) <- 
c("Record Identity", "Spare") ZZ$Spare <- NULL ZZ <- strip_whitespace(ZZ) # Add the rowid - ZZ$rowID <- seq(from = 1, to = length(types))[types == "ZZ"] + ZZ$rowID <- rowIds[types == "ZZ"] # Prep the main files if (!silent) { message(paste0(Sys.time(), " Preparing Imported Data")) } - stop_times <- dplyr::bind_rows(list(LO, LI, LT)) + + stop_times <- data.table::rbindlist(list(LO, LI, LT), use.names=FALSE) + + stop_times <- process_activity(stop_times, public_only) + + stop_times <- process_times( stop_times, working_timetable ) + + stop_times <- stop_times[, c("rowID", "Location", "Activity", "Arrival Time", "Departure Time")] + + stop_times <- strip_whitespace(stop_times) + stop_times <- stop_times[order(stop_times$rowID), ] + + #the BS record is followed by the LO, LI, LT records relating to it + #- so we effectively group by the BS record and apply the BS row ID to the 'schedule' column of the stop times relating to it. stop_times$schedule <- as.integer(as.character(cut(stop_times$rowID, c(BS$rowID, ZZ$rowID[1]), labels = BS$rowID @@ -666,8 +780,10 @@ importMCA <- function(file, stop_times$stop_sequence <- sequence(rle(stop_times$schedule)$lengths) - BX$rowIDm1 <- BX$rowID - 1 - BX$rowID <- NULL + # the BX record appears the row after the BS record, so it's rowId is one more than it's corresponding BS record. + # use this to join the two records together. + set(BX, j = "rowID", value = BX$rowID - 1) + setnames(BX, "rowID", "rowIDm1") schedule <- dplyr::left_join(BS, BX, by = c("rowID" = "rowIDm1")) if (full_import) { diff --git a/R/atoc_main.R b/R/atoc_main.R index cf8dd8f..518222f 100644 --- a/R/atoc_main.R +++ b/R/atoc_main.R @@ -8,9 +8,10 @@ #' @param schedule list of dataframes #' @param silent logical #' @param ncores number of cores to use +#' @param public_only filters to services / calls that are public pickup/set down only #' @noRd #' -schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = 1) { +schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = 1, public_only=TRUE) { ### SECTION 1: ############################################################################### @@ -19,27 +20,19 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = message(paste0(Sys.time(), " Building stop_times")) } - # Convert Activity to pickup_type and drop_off_type - stop_times$Activity[is.na(stop_times$Activity) & stop_times$stop_sequence == 1] <- "TB" # No activity specified at start - - # Fix arrival_time / departure_time being 0000 for pick up only or drop off only trains - stop_times$departure_time <- dplyr::if_else(stop_times$departure_time == "0000" & stop_times$Activity == "D", - stop_times$arrival_time, - stop_times$departure_time) - stop_times$arrival_time <- dplyr::if_else(stop_times$arrival_time == "0000" & stop_times$Activity == "U", - stop_times$departure_time, - stop_times$arrival_time) + stop_times <- fixStopTimeData( stop_times ) - upoffs <- clean_activities2(stop_times$Activity) + # Convert Activity to pickup_type and drop_off_type + upoffs <- clean_activities2(stop_times$Activity, public_only=public_only) stop_times <- cbind(stop_times, upoffs) - stop_times$arrival_time[is.na(stop_times$arrival_time)] <- stop_times$departure_time[is.na(stop_times$arrival_time)] - stop_times$departure_time[is.na(stop_times$departure_time)] <- stop_times$arrival_time[is.na(stop_times$departure_time)] stop_times <- stop_times[, c("arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type", 
"rowID", "schedule")] - stop_times <- stop_times[!(stop_times$pickup_type == 1 & stop_times$drop_off_type == 1), ] - - + if (public_only) + { + #remove calls that don't allow the public to board or alight (type==1) + stop_times <- stop_times[!(stop_times$pickup_type == 1 & stop_times$drop_off_type == 1), ] + } ### SECTION 2: ############################################################################### @@ -48,53 +41,72 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores = message(paste0(Sys.time(), " Building calendar and calendar_dates")) } + # build the calendar file + res <- makeCalendar(schedule = schedule, ncores = ncores) + calendar <- res[[1]] + cancellation_dates <- res[[2]] + rm(res) + #remove columns we don't need any more schedule <- schedule[, c( - "Train UID", "Date Runs From", "Date Runs To", "Days Run", "Bank Holiday Running", "Train Status", "Train Category", - "Headcode", "STP indicator", "rowID", "ATOC Code", "Retail Train ID" + "Train UID", "Train Status", "Train Category", + "Headcode", "rowID", "ATOC Code", "Retail Train ID", "Power Type", "Train Identity" )] - # build the calendar file - res <- makeCalendar(schedule = schedule, ncores = ncores) - calendar <- res[[1]] - calendar_dates <- res[[2]] - # rm(res) + #making the calendar will duplicate rows where the base timetable has been layered over with cancellations etc. + #join back to the original to get the extra columns not returned from makeCalendar + calendar <- dplyr::left_join(calendar, schedule, by = c("rowID")) + + gc() + + # In CIF land, one service ID can have multiple operating patterns, expressed as multiple BS records (and their associated LO,LI,LT records) having the same UID + # that's the opposite of the GTFS concept where a single calendar (operating pattern) can be used by multiple trips, but a single trip can only have one operating pattern. + # The CIF concept of a single UID maps more closely onto GTFS routes. + # + # so we maintain a 1:1 relationship between Trips and Calendars, duplicating CIF service IDs as required to represent the multiplicity of possible operating patterns. + # + # see merging code where there is functionality to de-duplicate calendar patterns down to a unique set. 
(condenseServicePatterns) - calendar$trip_id <- 1:nrow(calendar) # not sure why this was here, but used in duplicate.stop_times - # calendar$service_id = 1:nrow(calendar) # For this purpose the serive and the trip are always the same - # clean calednars - # calendar = calendar[,c("UID","monday","tuesday","wednesday","thursday","friday","saturday","sunday", - # "start_date","end_date","rowID","trip_id")] + # clean calendars names(calendar)[names(calendar) == "UID"] <- "service_id" - calendar$start_date <- as.character(calendar$start_date) - calendar$start_date <- gsub("-", "", calendar$start_date) - calendar$end_date <- as.character(calendar$end_date) - calendar$end_date <- gsub("-", "", calendar$end_date) + calendar <- formatAttributesToGtfsSchema( calendar ) - calendar_dates <- calendar_dates[, c("UID", "start_date")] - names(calendar_dates) <- c("service_id", "date") - calendar_dates$date <- as.character(calendar_dates$date) - calendar_dates$date <- gsub("-", "", calendar_dates$date) - calendar_dates$exception_type <- 2 # all events passed to calendar_dates are single day cancellations + cancellation_dates <- cancellation_dates[, c("UID", "start_date")] + names(cancellation_dates) <- c("service_id", "date") + cancellation_dates <- formatAttributesToGtfsSchema( cancellation_dates ) + cancellation_dates$exception_type <- 2 # all events passed to cancellation_dates are single day cancellations ### SECTION 3: ############################################################################### + + calendar$trip_id <- 1:nrow(calendar) + # calendar$service_id = 1:nrow(calendar) + + + # When splitting the calendar rowIDs are duplicated # so create new system of trip_ids and duplicate the relevant stop_times if (!silent) { message(paste0(Sys.time(), " Duplicating necessary stop times")) } + #TODO if ncores > 1 this takes forever - the data being joined must somehow trigger massive memory copying ? 
+  #also sets the trip_id on stop_times
+  stop_times <- duplicate_stop_times(calendar = calendar, stop_times = stop_times, ncores = 1)

-  stop_times <- duplicate.stop_times_alt(calendar = calendar, stop_times = stop_times, ncores = 1)

   ### SECTION 5: ###############################################################################
-  # make the trips.txt file by matching the calendar to the stop_times
+  # make the trips.txt file by matching the calendar to the stop_times
+
+  trips <- calendar[, c("service_id", "trip_id", "rowID", "ATOC Code", "Train Status", "Train Category", "Power Type", "Train Identity")]
+  trips <- longnames(routes = trips, stop_times = stop_times, stops=stops)
+
+
+  # Fix Times (and remove some fields)
+  stop_times <- afterMidnight(stop_times)

-  trips <- calendar[, c("service_id", "trip_id", "rowID", "ATOC Code", "Train Status")]
-  trips <- longnames(routes = trips, stop_times = stop_times, stops = stops)

   ### SECTION 4: ###############################################################################
   # make the routes.txt
@@ -104,46 +116,64 @@ schedule2routes <- function(stop_times, stops, schedule, silent = TRUE, ncores =
     message(paste0(Sys.time(), " Building routes.txt"))
   }

-  routes <- trips
-  routes <- dplyr::group_by(routes, `ATOC Code`, route_long_name, `Train Status`)
-  routes <- dplyr::summarise(routes)
-  routes$route_id <- 1:nrow(routes)
-
-  trips <- dplyr::left_join(trips, routes, by = c("ATOC Code" = "ATOC Code", "route_long_name" = "route_long_name", "Train Status" = "Train Status"))
-
-  train_status <- data.frame(
+  #do the conversion to route_type before grouping because several statuses map to the same route_type and we get 'duplicate' routes that look the same.
+  train_status <- data.table(
     train_status = c("B", "F", "P", "S", "T", "1", "2", "3", "4", "5"),
-    route_type = c(3, NA, 2, 4, NA, 2, NA, NA, 4, 3),
+    route_type = c( 3,  NA, 2,  4,  NA,  2,  NA,  NA,  4,  3),
     stringsAsFactors = FALSE
   )

-  routes$`Train Status` <- as.character(routes$`Train Status`)
-  routes <- dplyr::left_join(routes, train_status, by = c("Train Status" = "train_status"))
+  trips$`Train Status` <- as.character(trips$`Train Status`)
+  trips <- dplyr::left_join(trips, train_status, by = c("Train Status" = "train_status"))
   rm(train_status)

-  routes <- routes[, c("route_id", "route_type", "ATOC Code", "route_long_name")]
-  names(routes) <- c("route_id", "route_type", "agency_id", "route_long_name")
+  trips$route_type[trips$`Train Category` %in% c("EL", "OL") & trips$route_type == 2 ] <- 1
+  # London Underground is Metro (unless already identified as a bus/ship etc)
+  # "OL" is also used for Tyne & Wear metro
+
+  routes <- trips
+
+  routes <- dplyr::group_by(routes, `ATOC Code`, route_long_name, `Train Category`, route_type )
+  routes <- dplyr::summarise(routes)
+  routes$route_id <- 1:nrow(routes)
+
+  #join route_id back into trip table
+  trips <- dplyr::left_join(trips, routes, by = c("ATOC Code", "route_long_name", "Train Category", "route_type"))
+
+  routes <- routes[, c("route_id", "route_type", "ATOC Code", "route_long_name", "Train Category" )]
+  names(routes) <- c("route_id", "route_type", "agency_id", "route_long_name", "train_category" )

   # IDs are not meaningful, just leave out
   routes$route_short_name <- "" # was: routes$route_id

-  routes$route_type[routes$agency_id == "LT"] <- 1 # London Underground is Metro

   ### Section 6: #######################################################
   # Final Checks

-  # Fix Times
-  stop_times <- afterMidnight(stop_times)
-
   # Ditch unneeded columns
-  routes 
<- routes[, c("route_id", "agency_id", "route_short_name", "route_long_name", "route_type")] - trips <- trips[, c("trip_id", "route_id", "service_id")] + + #gtfs systems tend to be tolerant of additional fields, so expose the train_category and power_type so that the consumer can do analysis on them. + #e.g. filter out ECS moves + + #https://developers.google.com/transit/gtfs/reference#tripstxt 'trip_short_name' + #"Public facing text used to identify the trip to riders, e.g.to identify train numbers for commuter rail trips. + #If provided, should uniquely identify a trip within a service day; it should not be used for destination names or limited/express designations." + #best mapping from alpha headcode onto standard GTFS field is 'trip_short_name' since it's unique within a single day. + + # Ditch unnecessary columns + routes <- routes[, c("route_id", "agency_id", "route_short_name", "route_long_name", "route_type", "train_category")] + trips <- trips[, c("trip_id", "route_id", "service_id", "Train Identity", "Power Type")] + names(trips) <- c("trip_id", "route_id", "service_id", "trip_short_name", "power_type") stop_times <- stop_times[, c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "pickup_type", "drop_off_type")] calendar <- calendar[, c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] # end of function - timetables <- list(calendar, calendar_dates, routes, stop_times, trips) + timetables <- list(calendar, cancellation_dates, routes, stop_times, trips) names(timetables) <- c("calendar", "calendar_dates", "routes", "stop_times", "trips") + return(timetables) } + + + diff --git a/R/atoc_nr.R b/R/atoc_nr.R index 93d73b5..f1ea53a 100644 --- a/R/atoc_nr.R +++ b/R/atoc_nr.R @@ -9,6 +9,8 @@ #' @param locations where to get tiploc locations (see details) #' @param agency where to get agency.txt (see details) #' @param shapes Logical, should shapes.txt be generated (default FALSE) +#' @param working_timetable Logical, should WTT times be used instead of public times (default FALSE) +#' @param public_only Logical, only return calls/services that are for public passenger pickup/set down (default TRUE) #' @family main #' @return A gtfs list #' @@ -37,22 +39,9 @@ nr2gtfs <- function(path_in, ncores = 1, locations = "tiplocs", agency = "atoc_agency", - shapes = FALSE) { - - if(inherits(locations,"character")){ - if(locations == "tiplocs"){ - load_data("tiplocs") - locations = tiplocs - } - } - - if(inherits(agency,"character")){ - if(agency == "atoc_agency"){ - load_data("atoc_agency") - agency = atoc_agency - } - } - + shapes = FALSE, + working_timetable = FALSE, + public_only = TRUE) { # checkmate checkmate::assert_character(path_in, len = 1) checkmate::assert_file_exists(path_in) @@ -61,11 +50,13 @@ nr2gtfs <- function(path_in, checkmate::assert_logical(shapes) if (ncores == 1) { - message(paste0( - Sys.time(), - " This will take some time, make sure you use 'ncores' to enable multi-core processing" - )) + message(paste0(Sys.time(), " This will take some time, make sure you use 'ncores' to enable multi-core processing")) } + + agency = getCachedAgencyData( agency ) + + stops = getCachedLocationData( locations ) + # Is input a zip or a folder if (!grepl(".gz", path_in)) { stop("path_in is not a .gz file") @@ -74,56 +65,42 @@ nr2gtfs <- function(path_in, # Read In each File mca <- importMCA( file = path_in, - silent = silent, ncores = 1 + silent = silent, + ncores = 1, + working_timetable = 
working_timetable,
+    public_only = public_only
   )

-  # Get the Station Locations
-  if ("sf" %in% class(locations)) {
-    stops <- cbind(locations, sf::st_coordinates(locations))
-    stops <- as.data.frame(stops)
-    stops <- stops[, c(
-      "stop_id", "stop_code", "stop_name",
-      "Y", "X"
-    )]
-    names(stops) <- c(
-      "stop_id", "stop_code", "stop_name",
-      "stop_lat", "stop_lon"
-    )
-    stops$stop_lat <- round(stops$stop_lat, 5)
-    stops$stop_lon <- round(stops$stop_lon, 5)
-  } else {
-    stops <- utils::read.csv(locations, stringsAsFactors = FALSE)
-  }
-
   # Construct the GTFS
   stop_times <- mca[["stop_times"]]
   schedule <- mca[["schedule"]]
   rm(mca)
   gc()
-  # rm(alf, flf, mca, msn)
+

   stop_times <- stop_times[, c(
-    "Arrival Time",
-    "Departure Time",
-    "Location", "stop_sequence",
-    "Activity", "rowID", "schedule"
-  )]
+    "Arrival Time", "Departure Time", "Location", "stop_sequence", "Activity", "rowID", "schedule")]
   names(stop_times) <- c(
-    "arrival_time", "departure_time", "stop_id",
-    "stop_sequence", "Activity", "rowID", "schedule"
-  )
+    "arrival_time", "departure_time", "stop_id", "stop_sequence", "Activity", "rowID", "schedule")

   # remove any unused stops
   stops <- stops[stops$stop_id %in% stop_times$stop_id, ]

+  if ( nrow(stops)<=0 )
+  {
+    stop("Could not match any stops in input data to stop database.")
+  }
+
+
   # Main Timetable Build
   timetables <- schedule2routes(
     stop_times = stop_times,
     stops = stops,
     schedule = schedule,
     silent = silent,
-    ncores = ncores
+    ncores = ncores,
+    public_only = public_only
   )

   rm(schedule)
@@ -139,3 +116,65 @@ nr2gtfs <- function(path_in,

   return(timetables)
 }
+
+
+
+
+getCachedAgencyData <- function(agency = "atoc_agency")
+{
+  if(inherits(agency,"character"))
+  {
+    if(agency == "atoc_agency")
+    {
+      load_data("atoc_agency")
+      agency = atoc_agency
+    }
+    else #TODO test column names
+    {
+      checkmate::check_file_exists(agency)
+      agency <- utils::read.csv(agency, stringsAsFactors = FALSE)
+    }
+
+    if ( !inherits(agency, "data.frame") || 0==nrow(agency) ){ stop("failed to load atoc_agency data.") }
+  }
+
+  return (agency)
+}
+
+
+getCachedLocationData <- function(locations = "tiplocs")
+{
+  if(inherits(locations,"character"))
+  {
+    if(locations == "tiplocs")
+    {
+      load_data("tiplocs")
+      locations = tiplocs
+    }
+    else
+    {
+      checkmate::check_file_exists(locations)
+      locations <- utils::read.csv(locations, stringsAsFactors = FALSE)
+    }
+
+    if ( !inherits(locations, "data.frame") || 0==nrow(locations) ){ stop("failed to load tiploc data.") }
+  }
+
+  # Get the Station Locations
+  if (inherits(locations, "sf"))
+  {
+    stops <- cbind(locations, sf::st_coordinates(locations))
+    stops <- sf::st_drop_geometry(stops)
+    stops <- as.data.table(stops)
+    setnames(stops, old = c("Y", "X"), new = c("stop_lat", "stop_lon"))
+  }
+  else #TODO test column names
+  {
+    stops = locations
+  }
+
+  stops$stop_lat <- round(stops$stop_lat, 5)
+  stops$stop_lon <- round(stops$stop_lon, 5)
+
+  return (stops)
+}
diff --git a/R/atoc_overlay.R b/R/atoc_overlay.R
new file mode 100644
index 0000000..9998ee0
--- /dev/null
+++ b/R/atoc_overlay.R
@@ -0,0 +1,888 @@
+#functions relating to processing timetable overlay rules
+
+
+
+setValueInThisEnvironment <- function( name, value )
+{
+  env <- asNamespace("UK2GTFS")
+  unlockBinding(name, env)
+  assign(name, value, envir = env)
+  lockBinding(name, env)
+}
+
+
+
+assign("STOP_PROCESSING_UID", NULL )
+
+set_STOP_PROCESSING_UID <- function( value )
+{
+  setValueInThisEnvironment("STOP_PROCESSING_UID", value)
+
+  if(!is.null(value))
+  {
+    message(paste0(Sys.time(), " Set STOP_PROCESSING_UID to [", get("STOP_PROCESSING_UID"), "]"))
+  }
+}
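# Illustrative sketch (not part of the patch): the unlock/assign/re-lock
# pattern used by setValueInThisEnvironment() above, demonstrated on a plain
# locked environment rather than the real asNamespace("UK2GTFS") binding.
demo_env <- new.env()
assign("FLAG", FALSE, envir = demo_env)
lockBinding("FLAG", demo_env)

# a direct assign() would now fail with "cannot change value of locked binding",
# so unlock, write, and re-lock - exactly what the helper does for the
# STOP_PROCESSING_UID / TREAT_DATES_AS_INT style globals
unlockBinding("FLAG", demo_env)
assign("FLAG", TRUE, envir = demo_env)
lockBinding("FLAG", demo_env)
get("FLAG", envir = demo_env)   # TRUE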
+
+
+
+#need to ensure these get set consistently into any worker processes as well as main thread
+
+assign("TREAT_DATES_AS_INT", FALSE )
+
+set_TREAT_DATES_AS_INT <- function( value )
+{
+  setValueInThisEnvironment("TREAT_DATES_AS_INT", as.logical(value))
+}
+
+
+assign("WDAY_LOOKUP_MIN_VALUE", NULL )
+
+set_WDAY_LOOKUP_MIN_VALUE <- function( value )
+{
+  setValueInThisEnvironment("WDAY_LOOKUP_MIN_VALUE", as.integer(as.integer(value)-1L) )
+}
+
+assign("WDAY_LOOKUP_MAX_VALUE", NULL )
+
+set_WDAY_LOOKUP_MAX_VALUE <- function( value )
+{
+  setValueInThisEnvironment("WDAY_LOOKUP_MAX_VALUE", as.integer(value))
+}
+
+assign("WDAY_LOOKUP_MAP", NULL )
+
+set_WDAY_LOOKUP_MAP <- function( value )
+{
+  setValueInThisEnvironment("WDAY_LOOKUP_MAP", value)
+}
+
+
+
+local_lubridate_wday <- function( date, label = FALSE, week_start=1L )
+{
+  if (TRUE==TREAT_DATES_AS_INT)
+  {
+    if( any(date<=WDAY_LOOKUP_MIN_VALUE) || any(date>WDAY_LOOKUP_MAX_VALUE) )
+    {
+      stop("requested value index [", date, "] is outside lookup table")
+    }
+
+    return ( WDAY_LOOKUP_MAP[ date-WDAY_LOOKUP_MIN_VALUE ] )
+  }
+  else
+  {
+    return ( lubridate::wday( date, label = FALSE, week_start=1L ) )
+  }
+}
+
+
+local_seq_date<-function( from, to, by )
+{
+  if (TRUE==TREAT_DATES_AS_INT)
+  {
+    return ( seq.int(from = from, to = to) )
+  }
+  else
+  {
+    return ( seq.Date(from = from, to = to, by = by) )
+  }
+}
+
+
+setupDatesCache<-function( calendar )
+{
+  minDt = min(calendar$start_date)
+  maxDt = max(calendar$end_date)
+
+  set_WDAY_LOOKUP_MIN_VALUE( minDt )
+  set_WDAY_LOOKUP_MAX_VALUE( maxDt )
+
+  firstWeek = as.integer(lubridate::wday( seq.Date(from = minDt, to = minDt+6L, by = "day"), label = FALSE, week_start=1L ))
+  allWeeks = rep( firstWeek, length.out=( as.integer(maxDt) - as.integer(minDt) +1L ) )
+  set_WDAY_LOOKUP_MAP( allWeeks )
+}
+
+
+
+#performance - generating these on the fly each time a subset happens is slow, so cache them.
+LETTERS <- letters[1L:26L]
+TWO_LETTERS <- paste0(rep(letters, each = 26L), rep(letters, times = 26L))
+
+# Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique
+appendLetterSuffix <- function( cal )
+{
+  rows = nrow(cal)
+
+  if (rows > 1L)
+  {
+    if (rows <= 26L)
+    {
+      cal$UID <- paste0(cal$UID, " ", LETTERS[1L:rows])
+    }
+    else
+    {
+      # Cases where we need extra letters, gives up to 676 ids
+      cal$UID <- paste0(cal$UID, " ", TWO_LETTERS[1L:rows])
+    }
+  }
+
+  return (cal)
+}
+
+
+# Append to the UID to note the changes - and ensure that all service_id's in the output file remain unique
+appendNumberSuffix<-function( cal, numToAppend )
+{
+  if( numToAppend>1L ) #don't need to append a new number if we only have one pattern
+  {
+    # further differentiate the UID by appending a number to the end for each different days pattern
+    cal$UID <- paste0(cal$UID, numToAppend)
+  }
+
+  return (cal)
+}
+
+
+
+# in a week bitmask, if there are non-operating days between the first and last operating day of the week - will return TRUE
+# e.g. 
0010000 = FALSE 0011100 = FALSE 0101000 = TRUE +hasGapInOperatingDays <- function( daysBitmask ) +{ + firstDay = stringi::stri_locate_first( daysBitmask, fixed = "1" )[,1L] + lastDay = stringi::stri_locate_last( daysBitmask, fixed = "1" )[,1L] + + operatingDayCount = stringi::stri_count( daysBitmask, fixed = "1" ) + + res = ( lastDay-firstDay+1L != operatingDayCount ) + + res[is.na(res)] <- FALSE #shouldn't really get this, probably operating days are '0000000' + + return( res ) +} + + + +DATE_EPOC <- as.Date(lubridate::origin) # 01/01/1970 +WEEKDAY_NAME_VECTOR <- c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday") +START_PATTERN_VECTOR = c("1","01","001","0001","00001","000001","0000001") +END_PATTERN_VECTOR = c("1000000","100000","10000","1000","100","10","1") + +#calendars should start on the first day they are effective, and end on the last day. +#i.e. if the first day in the day bitmask is Tuesday - then the start date should be Tuesday, not some other day. +validateCalendarDates <- function( calendar ) +{ + start_day_number = local_lubridate_wday( calendar$start_date, label = FALSE, week_start=1L ) + end_day_number = local_lubridate_wday( calendar$end_date, label = FALSE, week_start=1L ) + + startOk <- START_PATTERN_VECTOR[ start_day_number ] == stringi::stri_sub(calendar$Days, 1L, start_day_number) + endOk <- END_PATTERN_VECTOR[ end_day_number ] == stringr::str_sub(calendar$Days, end_day_number, 7L) + + return (startOk & endOk) +} + + + + +#' split and rebind bitmask +#' +#' @details +#' splits 'Days' bitmask into individual logical fields called monday, tuesday, etc... +#' +#' @param calendar data.table of calendar items +#' @noRd +#' +splitAndRebindBitmask <- function( calendar ) +{ + calMat = splitBitmaskMat( calendar$Days, asInteger=FALSE ) + + #this function gets expensive if you call it a lot, creating data.table takes a while + return (cbind( calendar, as.data.table(calMat) ) ) +} + +splitBitmaskMat <- function( bitmaskVector, asInteger=FALSE ) +{ + splitDays = splitBitmask( bitmaskVector, asInteger=asInteger ) + + return (matrix(splitDays, ncol=7, byrow=TRUE, dimnames=list(NULL,WEEKDAY_NAME_VECTOR))) +} + +splitBitmask <- function( bitmask, asInteger=FALSE ) +{ + duff = which( nchar(bitmask) != 7L ) + + bitmask[duff] = "0000000" + + splitDays = ( "0"!= unlist( strsplit(bitmask, "") ) ) + #performance, calling as.integer on string is surprisingly expensive, so do it this way instead which is twice as fast overall + + if (asInteger) + { + splitDays = as.integer(splitDays) + } + + return (splitDays) +} + + + +checkOperatingDayActive <- function(calendar) { + + if (all(calendar$duration >= 7L)) + { + return (calendar$Days!="0000000") + } + + #get a list of days of week that the timetable is valid on + opDays <- splitBitmaskMat( calendar$Days, asInteger=FALSE ) + opDays <- split(opDays, row(opDays)) + + #performance - precalculate all the days + veryfirstDay = min(calendar$start_date) + + if( TRUE==TREAT_DATES_AS_INT) + { + allDays = WDAY_LOOKUP_MAP[ (veryfirstDay - WDAY_LOOKUP_MIN_VALUE) : (max(calendar$end_date) - WDAY_LOOKUP_MIN_VALUE) ] + } + else + { + allDays = local_lubridate_wday( local_seq_date(from = veryfirstDay, to = max(calendar$end_date), by = "day") + , label = FALSE, week_start=1L ) + } + veryfirstDay = veryfirstDay - 1L + + checkValid <- function(dur, sd, ed, od ){ + + if (dur >= 7L) + { + return (any(od)) + } + + firstDay = as.integer(sd)-as.integer(veryfirstDay) + lastDay = as.integer(ed)-as.integer(veryfirstDay) + + dayNumbers <- 
allDays[ firstDay:lastDay ]
+
+    return ( any(od[dayNumbers]) )
+  }
+
+  validCalendars <- mapply( checkValid, calendar$duration,
+                            calendar$start_date, calendar$end_date,
+                            opDays, SIMPLIFY = TRUE )
+  return (validCalendars)
+}
+
+
+
+CALENDAR_UNIQUE_CHECK_COLUMN_NAMES <- c("originalUID","start_date","end_date","Days","STP","duration" )
+
+# We have lots of complex logic, which means that when we have multiple base timetables that are separated
+# in the temporal domain e.g. march, april - we end up duplicating the overlays
+#
+# this is a bit of a gluey hack that could be fixed by looking in the temporal domain when deciding what overlaps
+# see test case no.10 ("10:test makeCalendarInner") that triggered addition of this logic
+#
+makeCalendarsUnique <- function ( calendar )
+{
+  calendar <- calendar[ !duplicated( calendar, by=CALENDAR_UNIQUE_CHECK_COLUMN_NAMES ) ]
+
+  return( calendar )
+}
+
+
+
+
+countIntersectingDayPatterns <- function( dayPatterns )
+{
+  unpacked = splitBitmaskMat( dayPatterns, asInteger = TRUE )
+  sums = colSums(unpacked) #add up number of intersections for monday etc...
+  names(sums) <- NULL #makes unit test construction easier
+  return ( sums )
+}
+
+intersectingDayPattern <- function( dayPattern1, dayPattern2 )
+{
+  return (any( countIntersectingDayPatterns( c(dayPattern1,dayPattern2) ) > 1L) )
+}
+
+
+intersectingDayPatterns <- function( dayPatternBase, dayPatternOverlay )
+{
+  if (is.null(dayPatternOverlay) || is.null(dayPatternBase) ) return (NULL)
+
+  unpackedOverlay = splitBitmaskMat( dayPatternOverlay, asInteger = FALSE )
+  unpackedBase = splitBitmaskMat( dayPatternBase, asInteger = FALSE )
+
+  #repeat the base for every Overlay
+  unpackedBaseRep = rep( unpackedBase, length(dayPatternOverlay) )
+  unpackedBaseRepmat = matrix(unpackedBaseRep, ncol=7, byrow=TRUE)
+
+  intersects = unpackedBaseRepmat & unpackedOverlay
+
+  res <- apply(intersects, 1L, any)
+
+  return ( res )
+}
+
+
+
+SINGLE_DAY_PATTERN_VECTOR = c("1000000","0100000","0010000","0001000","0000100","0000010","0000001")
+
+SINGLE_DAY_PATTERN_LIST = list(c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),
+                               c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE),
+                               c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
+                               c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE),
+                               c(FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE),
+                               c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE),
+                               c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE))
+
+
+makeReplicationDates <- function(cal, startDayNum, endDayNum){
+
+  #make a sequence of dates, offsetting the start date so it's always monday (aligning with bitmask start day)
+  # and the end date so it's always sunday
+  firstDate = min(cal$start_date) - 7L
+  lastDate = max(cal$end_date) + 7L
+  allDates = local_seq_date(from = firstDate, to = lastDate, by = "day")
+
+  offset = as.integer(cal$start_date)-startDayNum+2L-as.integer(firstDate)
+  end = as.integer(cal$end_date)+8L-endDayNum-as.integer(firstDate)
+
+  dates <- Map(function(o, e) allDates[o:e], offset, end)
+
+  if (TRUE==TREAT_DATES_AS_INT)
+  {
+    res = unlist(dates)
+  }
+  else
+  {
+    res = as.Date( unlist(dates), origin = DATE_EPOC )
+  }
+
+  return (res)
+}
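# Worked example (illustration only, not part of the patch), assuming the
# bitmask helpers above are in scope: a Mon-Fri 'Days' pattern has no internal
# gap, a Tue+Thu pattern does, and the two patterns intersect on Tue and Thu.
splitBitmask("1111100")                                # TRUE TRUE TRUE TRUE TRUE FALSE FALSE
hasGapInOperatingDays(c("1111100", "0101000"))         # FALSE TRUE
countIntersectingDayPatterns(c("1111100", "0101000"))  # 1 2 1 2 1 0 0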
+
+
+
+#' replicates the input calendar objects into single day duration calendar objects
+#' calendar objects should NOT have had the 'Days' bitmask field unpacked
+#' (will still produce an output but the unpacked monday, tuesday etc fields will no longer be consistent with the packed 'Days' bitmask)
+#'
+#' @param cal data.table containing all the calendars to be split up into individual days
+#' @noRd
+#'
+makeAllOneDay <- function( cal )
+{
+  duration <- cal$end_date - cal$start_date + 1L
+
+  if ( 0L==nrow(cal) || all(1L == duration))
+  {
+    #nothing to do
+    return (cal)
+  }
+
+  #make a list of dates for each object being replicated
+  startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1L )
+  endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1L )
+  dateSequence = makeReplicationDates( cal, startDayNum, endDayNum )
+
+  #work out how many times we need to replicate each item: number of operating days in week * num weeks
+  bitmaskMat = splitBitmaskMat( cal$Days, asInteger=FALSE )
+  dayCount = rowSums(bitmaskMat)
+  numWeeks <- ceiling(as.integer(cal$duration) / 7L)
+  repetitions = dayCount * numWeeks
+
+  #replicate the calendar rows the appropriate number of times
+  replicatedcal <- cal[rep(seq_len(.N), times = repetitions)]
+
+  #get a mask of operating days
+  operatingDayLogical <- rep( split(bitmaskMat, row(bitmaskMat)), times = numWeeks)
+
+  #set the start and end date for each calendar item to the single day identified earlier
+  selectedDates = dateSequence[unlist(operatingDayLogical)]
+  replicatedcal$end_date <- replicatedcal$start_date <- selectedDates
+
+  #tidy up the values so they are correct for the split items
+  replicatedcal$duration <- 1L
+  replicatedcal$Days = SINGLE_DAY_PATTERN_VECTOR[ local_lubridate_wday( replicatedcal$start_date, label = FALSE, week_start=1L ) ]
+
+  return (replicatedcal)
+}
+
+
+
+
+#' along a similar line to 'makeAllOneDay', duplicates input calendar objects into single WEEK duration calendar objects
+#'
+#' @param cal data.table containing all the calendars to be split up into individual weeks
+#' @noRd
+#'
+expandAllWeeks <- function( cal )
+{
+  if ( 0L==nrow(cal) )
+  {
+    #nothing to do
+    return (cal)
+  }
+
+  #duration <- cal$end_date - cal$start_date + 1
+
+  #make a list of dates for each object being replicated
+  startDayNum = local_lubridate_wday( cal$start_date, label = FALSE, week_start=1L )
+  endDayNum = local_lubridate_wday( cal$end_date, label = FALSE, week_start=1L )
+  dateSequence = makeReplicationDates( cal, startDayNum, endDayNum )
+
+  numWeeks <- ceiling(as.integer(cal$duration) / 7L)
+
+  #replicate a logical vector for the start date and use that to select the relevant dates from the date sequence
+  startDayLogical <- SINGLE_DAY_PATTERN_LIST[startDayNum]
+  startDays <- rep(startDayLogical, times = numWeeks)
+  startDates <- dateSequence[ unlist(startDays) ]
+
+  #replicate a logical vector for the end date and use that to select the relevant dates from the date sequence
+  endDayLogical <- SINGLE_DAY_PATTERN_LIST[endDayNum]
+  endDays <- rep(endDayLogical, times = numWeeks)
+  endDates <- dateSequence[ unlist(endDays) ]
+
+  #replicate the calendar rows the appropriate number of times
+  replicatedcal <- cal[rep(seq_len(.N), times = numWeeks)]
+
+  #set the start and end date for each calendar item
+  replicatedcal$start_date <- startDates
+  replicatedcal$end_date <- endDates
+
+  #tidy up the values so they are correct for the split items
+  replicatedcal$duration <- replicatedcal$end_date - replicatedcal$start_date + 1L
+
+  return (replicatedcal)
+}
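# Shape of the makeAllOneDay() transformation (sketch, not part of the patch;
# input mirrors the data.table layout used above): a two-week Mon+Wed calendar
# row becomes four single-day rows, one per operating date.
cal <- data.table::data.table(
  UID        = "C12345",                # hypothetical CIF UID
  start_date = as.Date("2023-01-02"),   # a Monday
  end_date   = as.Date("2023-01-15"),   # the second Sunday
  Days       = "1010000",               # Monday + Wednesday
  duration   = 14L
)
# makeAllOneDay(cal) would return rows dated 02, 04, 09 and 11 Jan, each with
# duration 1L and 'Days' set to the matching single-day bitmask
# ("1000000" for the Mondays, "0010000" for the Wednesdays).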
+
+
+
+#' allocate Cancellations Across Calendars
+#'
+#' @details
+#' expects input calendar items to have been separated out into non-overlapping dates
+#' and 'Days' bitmask unpacked into separate int or logical attributes
+#'
+#' "originalUID" is used to identify where the cancellations originally came from
+#' after allocating across the split calendar items the cancellations will have an updated
+#' "UID" that says which calendar they are now associated with
+#'
+#' @param calendar data.table of calendar items that are NOT cancellations (that has 'Days' bitmask unpacked )
+#' @param cancellations data.table of calendar items that ARE cancellations (that has 'Days' bitmask unpacked )
+#' @noRd
+#'
+allocateCancellationsAcrossCalendars <- function( calendar, cancellations )
+{
+  tempNames = names(calendar)
+
+  #stash some join fields away because we want to keep the data from the cancellations rather than the calendar table
+  #which otherwise get over-written by the join process
+  cancellations$start_date2 <- cancellations$start_date
+  cancellations$end_date2 <- cancellations$end_date
+  cancellations$UID <- NULL #we want the new UID from the calendar entries
+
+  #left join cancellations to calendar by the original service ID
+  #and the date of the cancellation lying inside the period of the calendar
+  #and the day of the cancellation is an operating day of the calendar item
+  joined = cancellations[calendar, on = .(originalUID==originalUID,
+                                          start_date>=start_date,
+                                          end_date<=end_date), nomatch = 0L][
+    ((i.monday&monday) | (i.tuesday&tuesday) | (i.wednesday&wednesday)
+     | (i.thursday&thursday) | (i.friday&friday) | (i.saturday&saturday) | (i.sunday&sunday)), ]
+
+  #revert the stashed (join) fields
+  joined$start_date <- joined$start_date2
+  joined$start_date2 <- NULL
+  joined$end_date <- joined$end_date2
+  joined$end_date2 <- NULL
+
+  #remove joined fields we don't need
+  joined <- joined[, .SD, .SDcols=tempNames]
+
+  #belt and braces - fix any NA fields by reverting from the original UID
+  joined[is.na(UID), UID := originalUID]
+
+  return( joined )
+}
+
+
+
+CALENDAR_COLS_TO_COPY <- c("UID", "Days", "STP", "rowID" )
+
+NOT_NEEDED <- c("__NOT_NEEDED_MARKER__~@$$%&*((")
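# Minimal illustration (not the real tables) of the non-equi join used above:
# each cancellation attaches to the split calendar whose period contains it.
library(data.table)
calendar      <- data.table(originalUID = "C10000",
                            UID         = c("C10000 a", "C10000 b"),
                            start_date  = as.Date(c("2023-01-01", "2023-02-01")),
                            end_date    = as.Date(c("2023-01-31", "2023-02-28")))
cancellations <- data.table(originalUID = "C10000",
                            start_date  = as.Date("2023-02-07"),
                            end_date    = as.Date("2023-02-07"))
# matches only the February calendar, so the cancellation inherits "C10000 b"
cancellations[calendar, on = .(originalUID==originalUID,
                               start_date>=start_date,
                               end_date<=end_date), nomatch = 0L]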
+
+
+#this function is massively performance critical - profile any changes to it
+selectOverlayTimeableAndCopyAttributes <- function(cal, calNew, rowIndex)
+{
+  #if we have two adjacent complete items e.g. ....end 13th Jan start 14th jan.....
+  #then it's not a real gap and just an artefact of the algorithm used to generate the dates
+  if( rowIndex>1L && rowIndex<nrow(calNew)
+      && !is.na(calNew$UID[rowIndex-1L]) && !is.na(calNew$UID[rowIndex+1L])
+      && calNew$end_date[rowIndex-1L]+1L == calNew$start_date[rowIndex+1L] )
+  {
+    calNew$UID[rowIndex] <- NOT_NEEDED
+    return (calNew)
+  }
+
+  #find the base timetables that completely span this date range
+  baseTimetableIndexes = cal[ start_date<=calNew$start_date[rowIndex] & end_date>=calNew$end_date[rowIndex],,which=TRUE]
+
+  #are we in a gap between two base timetables with no overlays
+  if ( length(baseTimetableIndexes)<=0L )
+  {
+    calNew$UID[rowIndex] <- NOT_NEEDED
+    return (calNew)
+  }
+
+
+  # apply timetable overlay selection logic - pick highest priority timetable type
+  # as per https://wiki.openraildata.com/index.php/SCHEDULE
+  # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'."
+
+  #pick the lowest alphabetic STP (highest priority), and just in case there is more than one, the shortest duration one.
+
+  #priorityTimetable <- baseTimetables[order(STP, duration), head(.SD, 1)]
+  #performance we pre-sort all the entries by the priority & duration
+  #this speeds things up when we look up the required priority overlay **SEE_NOTE**
+  #so we don't need to sort again here, just pick the top filtered result
+
+  set( calNew, i = rowIndex, j = CALENDAR_COLS_TO_COPY, value = cal[ baseTimetableIndexes[1L], CALENDAR_COLS_TO_COPY, with=FALSE ] )
+  #set() runs 3x faster than this style of copy:  calNew[rowIndex,] <- cal[baseTimetableIndexes[1],]
+
+  return (calNew)
+}
+
+
+
+# triggered by test case "10:test makeCalendarInner"
+# when we have a 1 day overlay sitting on the start/end date of a base timetable
+# the dates overlap - fix it
+fixOverlappingDates <- function( cal )
+{
+  rowCount = nrow(cal)
+
+  #forwards
+  for (j in seq(1L, rowCount)) {
+
+    #adjust our end date if next item a higher priority overlay
+    if (j<rowCount && !is.na(cal$UID[j]) && !is.na(cal$UID[j+1L]) )
+    {
+      if ( cal$STP[j+1L] < cal$STP[j] )
+      {
+        cal$end_date[j] <- cal$start_date[j+1L] -1L
+      }
+
+      if (j>1L && !is.na(cal$UID[j-1L]) && cal$STP[j-1L] < cal$STP[j] )
+      {
+        cal$start_date[j] <- cal$end_date[j-1L] +1L
+      }
+    }
+  }
+
+  #backwards
+  for (j in seq(rowCount, 1L)) {
+
+    #adjust our end date if previous item a higher priority overlay
+    if (j>1L && !is.na(cal$UID[j]) && !is.na(cal$UID[j-1L]) )
+    {
+      if ( cal$STP[j-1L] < cal$STP[j] )
+      {
+        cal$start_date[j] <- cal$end_date[j-1L] +1L
+      }
+
+      if(j<rowCount && !is.na(cal$UID[j+1L]) && cal$STP[j+1L] < cal$STP[j] )
+      {
+        cal$end_date[j] <- cal$start_date[j+1L] -1L
+      }
+    }
+  }
+
+  return (cal)
+}
+
+
+
+#' split overlapping start and end dates
+#'
+#' @details
+#' splits the overlapping calendars for one service into a set of non-overlapping
+#' date ranges, then fills in the attributes of each range from the highest
+#' priority timetable in effect over that range
+#'
+#' @param cal data.table of calendar items, pre-sorted by (STP, duration)
+#' @noRd
+#'
+splitDates <- function(cal) {
+
+  #get all the dates the calendars start and end on, and create all adjacent pairs
+  dates <- c(cal$start_date, cal$end_date)
+  dates <- dates[order(dates)]
+
+  dates.dt <- data.table(
+    start_date = dates[seq(1L, length(dates) - 1L)],
+    end_date = dates[seq(2L, length(dates))]
+  )
+
+  #join the original calendars onto the date pairs - pairs that exactly match an
+  #existing calendar pick up its attributes, gap rows are left with NA attributes
+  calNew <- cal[dates.dt, on = c("start_date", "end_date")]
+
+  rowCount = nrow(calNew)
+
+  #forwards
+  for (j in seq(1L, rowCount)) {
+
+    #if we are not valid, fill in our details and adjust our start date
+    if (is.na(calNew$UID[j]))
+    {
+      calNew <- selectOverlayTimeableAndCopyAttributes(cal, calNew, j)
+
+      if (j>1L && !is.na(calNew$UID[j-1L]) && NOT_NEEDED != calNew$UID[j-1L] )
+      {
+        calNew$start_date[j] <- calNew$end_date[j-1L] +1L
+      }
+    }
+  }
+
+  #backwards
+  for (j in seq(rowCount, 1L)) {
+
+    #if we are not valid & the previous item is already valid, fill in our details and adjust our start date
+    if (j>1L && is.na(calNew$UID[j]) && !is.na(calNew$UID[j-1L]) )
+    {
+      calNew <- selectOverlayTimeableAndCopyAttributes(cal, calNew, j)
+
+      if ( NOT_NEEDED != calNew$UID[j-1L])
+      {
+        calNew$start_date[j] <- calNew$end_date[j-1L] +1L
+      }
+
+      #if next item valid adjust our end date
+      if(j<rowCount && !is.na(calNew$UID[j+1L]) && NOT_NEEDED != calNew$UID[j+1L] )
+      {
+        calNew$end_date[j] <- calNew$start_date[j+1L] -1L
+      }
+    }
+  }
+
+  calNew$duration <- calNew$end_date - calNew$start_date + 1L
+
+  #calNew <- calNew[ !is.na(calNew$UID), ]
+  #calNew <- calNew[ calNew$STP != "C", ]
+  #calNew <- calNew[ calNew$duration > 0L, ]
+
+  #performance, do all subsets in one go
+  calNew <- calNew[ (!is.na(UID)) & (get("NOT_NEEDED") != UID) & (STP != "C") & (duration > 0L), ]
+
+  # Append UID to note the changes
+  if (nrow(calNew) > 0L)
+  {
+    calNew <- appendLetterSuffix( calNew )
+  }
+  else
+  {
+    calNew <- NA
+  }
+
+  return(calNew)
+}
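# The date-splitting idea in miniature (sketch, not part of the patch): the
# distinct start/end dates of a base timetable plus one overlay give the
# candidate ranges that the forwards/backwards passes above then fill and trim.
dates <- sort(c(as.Date("2023-01-01"), as.Date("2023-03-31"),   # base 'P'
                as.Date("2023-02-01"), as.Date("2023-02-14")))  # overlay 'O'
data.frame(start_date = head(dates, -1), end_date = tail(dates, -1))
# three candidate ranges: Jan01-Feb01, Feb01-Feb14, Feb14-Mar31; the
# overlapping-date fix-ups then nudge the boundaries so ranges don't touch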
+
+
+
+
+#' make calendar helper function
+#' this originally expected and dealt with cancellations too. This worked ok for single day duration cancellations
+#' but had problems with multi-day cancellations when combined with overlays.
+#' the code hasn't been changed to reject / avoid cancellations, but results for them may not match predictable / tested scenarios
+#' @param calendarSub data.table containing all the calendars (aka CIF operating patterns) for a single service
+#' @noRd
+#'
+makeCalendarInner <- function(calendarSub) {
+
+  if ( 1L == nrow(calendarSub) )
+  {
+    # make into a single entry
+    res = list(calendarSub, NA)
+  }
+  else
+  {
+    if (length(unique(calendarSub$UID)) > 1L)
+    {
+      stop(paste("Error: makeCalendarInner was passed more than one service to work on. service=", unique(calendarSub$UID)))
+    }
+
+    # check duration and types
+    allTypes <- calendarSub$STP
+
+    # as per https://wiki.openraildata.com/index.php/SCHEDULE
+    # "Conveniently, it also means that the lowest alphabetical STP indicator wins - 'C' and 'O' are both lower in the alphabet than 'P'."
+    baseType = max(allTypes) #usually we expect 'P' to be the base timetable... but it can also be an STP service in which case it will be 'N'
+
+    overlayDurations <- as.numeric(calendarSub$duration[calendarSub$STP != baseType])
+    overlayTypes <- calendarSub$STP[calendarSub$STP != baseType]
+
+    if( length(overlayDurations) <= 0L )
+    {
+      #assume the input data is good and the base timetables don't break any of the overlaying / operating day rules
+      res = list( appendLetterSuffix(calendarSub), NA)
+    }
+    #if every overlay is a one day cancellation
+    else if ( all(overlayDurations == 1L) && all(overlayTypes == "C") )
+    {
+      #warning("Unexpected item in the makeCalendarInner-ing area, cancellations should now be handled at a higher level (1)")
+
+      # Apply the cancellation via entries in calendar_dates.txt
+      res = list( appendLetterSuffix( calendarSub[calendarSub$STP != "C", ] ),
+                  calendarSub[calendarSub$STP == "C", ])
+    }
+    else
+    {
+      uniqueDayPatterns <- unique(calendarSub$Days[calendarSub$STP != "C"])
+
+      # if the day patterns are all identical
+      if (length(uniqueDayPatterns) <= 1L )
+      {
+        #performance pre-sort all the entries by the priority
+        #this speeds things up when we look up the required priority overlay **SEE_NOTE**
+        #calendarSub = calendarSub[ order(STP, duration), ]
+        setkey( calendarSub, STP, duration )
+        setindex( calendarSub, start_date, end_date)
+
+        calendar_new <- makeCalendarsUnique( splitDates(calendarSub) )
+        res = list(calendar_new, NA)
+      }
+      else # split by day pattern
+      {
+        res = makeCalendarForDifferentDayPatterns( calendarSub, uniqueDayPatterns )
+      }
+    }
+  }
+
+
+  if ( !is.null(STOP_PROCESSING_UID) )
+  {
+    if ( any( STOP_PROCESSING_UID==calendarSub$UID) )
+    {
+      message(paste0(Sys.time(), " Reached STOP_PROCESSING_UID value [", unique(calendarSub$UID), "] length=", length(calendarSub$UID)))
+      stop("Option:UK2GTFS_option_stopProcessingAtUid has been set: Stopped processing at UID=", STOP_PROCESSING_UID)
+    }
+  }
+
+  return (res)
+}
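# The STP precedence rule used above, in one line (illustration): STP
# indicators sort so the lowest letter wins - a 'C' cancellation beats an 'O'
# overlay, which beats the 'P' permanent base, and max() recovers the base type.
stp <- c("P", "O", "C")
min(stp)   # "C" - the highest priority timetable applying on a given day
max(stp)   # "P" - the base timetable type, as computed for baseType above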
+
+
+
+# this is a complex case where the overlays don't have the same day pattern as the base timetable
+#
+# e.g. base is mon-sat, and we have some engineering work for 3 weeks tue-thur
+#
+# the approach we take is to duplicate the overlay timetables for every week they are in effect, then overlay them.
+#
+# aha but the complexity isn't finished. If the overlay is tue+thur then wed is the base timetable.
+#
+# when we get to this latter complexity we just split the overlay into individual days and apply it that way.
+#
+makeCalendarForDifferentDayPatterns <- function( calendar, uniqueDayPatterns )
+{
+  baseType = max(calendar$STP)
+  baseTimetables = calendar[calendar$STP == baseType]
+  overlayTimetables = calendar[calendar$STP != baseType]
+
+  #do the day patterns overlap each other in any way ?
+  #e.g. a mon-sat pattern with a wed-fri overlay.
+  if ( any( countIntersectingDayPatterns(uniqueDayPatterns) > 1L) )
+  {
+    gappyOverlays = overlayTimetables[ hasGapInOperatingDays(overlayTimetables$Days) ]
+    continiousOverlays = overlayTimetables[ !hasGapInOperatingDays(overlayTimetables$Days) ]
+
+    gappyOverlays = makeAllOneDay( gappyOverlays )
+    continiousOverlays = expandAllWeeks( continiousOverlays )
+
+    overlayTimetables = data.table::rbindlist( list(continiousOverlays,gappyOverlays), use.names=FALSE)
+  }
+
+  splits <- list()
+
+  distinctBasePatterns = unique( baseTimetables$Days )
+
+  for (k in seq(1L, length(distinctBasePatterns))) {
+
+    theseBases = baseTimetables[baseTimetables$Days == distinctBasePatterns[k] ]
+
+    theseOverlays = overlayTimetables[ intersectingDayPatterns( distinctBasePatterns[k], overlayTimetables$Days ) ]
+
+    if (nrow(theseOverlays) <= 0L)
+    {
+      splits[[k]] <- appendNumberSuffix( appendLetterSuffix( theseBases ), k )
+    }
+    else
+    {
+      timetablesForThisPattern = data.table::rbindlist( list( theseBases, theseOverlays ), use.names=FALSE)
+
+      #performance pre-sort all the entries by the priority
+      #this speeds things up when we look up the required priority overlay **SEE_NOTE**
+      #timetablesForThisPattern = timetablesForThisPattern[ order(STP, duration), ]
+      setkey( timetablesForThisPattern, STP, duration )
+      setindex( timetablesForThisPattern, start_date, end_date)
+
+      thisSplit <- splitDates( timetablesForThisPattern )
+
+      # reject NAs
+      if (inherits(thisSplit, "data.frame")) {
+        splits[[k]] <- appendNumberSuffix( thisSplit, k )
+      }
+    }
+  }
+
+  splits <- data.table::rbindlist(splits, use.names=FALSE)
+
+  splits <- makeCalendarsUnique( splits )
+
+  # after all this faffing about and splitting and joining, it's quite likely we've created some
+  # small fragments of base timetable that aren't valid (e.g. mon-fri service but start and end date on weekend)
+  splits <- splits[ checkOperatingDayActive( splits ) ]
+
+  return(list(splits, NA))
+}
diff --git a/R/extdata.R b/R/extdata.R
index 2703066..1025c77 100644
--- a/R/extdata.R
+++ b/R/extdata.R
@@ -7,11 +7,13 @@
 #' Raw data can be viewed and contributed to at
 #' https://github.com/ITSLeeds/UK2GTFS-data
 #'
+#' @param timeout maximum duration (in seconds) to wait for a response from the server (github.com)
+#'
 #' @export
 #'
-update_data <- function(){
+update_data <- function( timeout=60 ){

-  check <- check_data()
+  check <- check_data( timeout=timeout )

   if(check$date_package != check$date){

@@ -59,7 +61,7 @@ download_data <- function(tag_name, package_location, date){
   utils::unzip(file.path(tempdir(),"UK2GTFS_load/all.zip"),
                exdir = file.path(package_location, "extdata"))
   unlink(file.path(tempdir(),"UK2GTFS_load"), recursive = TRUE)
-  writeLines(date, file.path(package_location, "extdata/date.txt"))
+  writeLines( as.character(date), file.path(package_location, "extdata/date.txt"))

 }

@@ -72,12 +74,12 @@ download_data <- function(tag_name, package_location, date){
 #' @return TRUE if data is up-to-date or if unable to check
 #' @noRd

-check_data <- function(default_tag = "v0.1.2"){
+check_data <- function( timeout = 60, default_tag = "v0.1.2"){
   # Try not to hammer the API
   Sys.sleep(5)
   # Check date on data repo
-  res = try(httr::GET("https://api.github.com/repos/ITSleeds/UK2GTFS-data/releases"),
-            silent = TRUE)
+  res = try(httr::GET("https://api.github.com/repos/ITSleeds/UK2GTFS-data/releases",
+                      httr::timeout(get("timeout"))),
+            silent = TRUE)
   if(inherits(res, "try-error")){
     message("Unable to check for latest data")
     date = Sys.time()
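# What the timeout wiring above amounts to (sketch with a hypothetical 60s
# value): httr::timeout() is passed as a config to httr::GET(), while try()
# keeps a dead network from aborting the whole data-version check.
res <- try(httr::GET("https://api.github.com/repos/ITSleeds/UK2GTFS-data/releases",
                     httr::timeout(60)),
           silent = TRUE)
if (inherits(res, "try-error")) message("Unable to check for latest data")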
@@ -94,13 +96,20 @@ check_data <- function(default_tag = "v0.1.2"){
     }
   }

+  date = as.Date(date) #make it less sensitive by only comparing date rather than date+time
+
   #Check if date.txt in package
   package_location <- system.file(package = "UK2GTFS")
   if(!file.exists(file.path(package_location, "extdata/date.txt"))){
     writeLines("nodata", file.path(package_location, "extdata/date.txt"))
   }
-  date_package <- readLines(file.path(package_location, "extdata/date.txt"))
+
+  date_package <- tryCatch({
+    as.Date( readLines(file.path(package_location, "extdata/date.txt")) )
+  }, error = function(err) {
+    "nodata"
+  })
+
   return(list(date_package = date_package,
               date = date,
               tag_name = tag_name,
               package_location = package_location))
diff --git a/R/get_cal.R b/R/get_cal.R
index a009b84..45492f9 100644
--- a/R/get_cal.R
+++ b/R/get_cal.R
@@ -7,7 +7,7 @@
 #' @return data frame
 #' @details TransXchange records bank holidays by name (e.g. Christmas Day),
 #' some UK bank holidays move around, so this function downloads the official
-#' bank holiday calendar. The offical feed only covers a short period of time
+#' bank holiday calendar. The official feed only covers a short period of time
 #' so this may not be suitable for converting files from the past / future.
 #' @export
 #'
diff --git a/R/get_naptan.R b/R/get_naptan.R
index 4a34ba8..c5a39d8 100644
--- a/R/get_naptan.R
+++ b/R/get_naptan.R
@@ -1,6 +1,6 @@
 #' Get naptan
 #'
-#' Download the NaPTAN stop locations for more information on NaPTAN see
+#' Download the NaPTAN stop locations in CSV format. For more information on NaPTAN see
 #' https://data.gov.uk/dataset/ff93ffc1-6656-47d8-9155-85ea0b8f2251/national-public-transport-access-nodes-naptan
 #' @param url character, url to the csv format NaPTAN
 #' @param naptan_extra data frame of missing stops default uses `naptan_missing`
@@ -9,7 +9,7 @@
 #' functions downloads them from the offical DfT source. NaPTAN has some
 #' missing bus stops which are added by UK2GTFS. See `naptan_missing`
 #'
-#'
+#' Do not use this function for heavy rail use - download in XML format which includes TIPLOC identifiers
 #'
 #' @export

@@ -19,10 +19,13 @@ get_naptan <- function(url = "https://naptan.api.dft.gov.uk/v1/access-nodes?data
   load_data("naptan_missing")

   dir.create("temp_naptan")
-  utils::download.file(url = url, destfile = "temp_naptan/Stops.csv", mode = "wb", quiet = TRUE)
-  naptan <- readr::read_csv("temp_naptan/Stops.csv", progress = FALSE, show_col_types = FALSE)
-  unlink("temp_naptan", recursive = TRUE)
-  #file.remove("naptan.zip")
+
+  tryCatch({
+    utils::download.file(url = url, destfile = "temp_naptan/Stops.csv", mode = "wb", quiet = TRUE)
+    naptan <- readr::read_csv("temp_naptan/Stops.csv", progress = FALSE, show_col_types = FALSE)
+  }, finally = {
+    unlink("temp_naptan", recursive = TRUE)
+  })

   # clean file
   naptan <- naptan[, c("ATCOCode", "NaptanCode", "CommonName", "Easting", "Northing")]
@@ -36,7 +39,7 @@ get_naptan <- function(url = "https://naptan.api.dft.gov.uk/v1/access-nodes?data
   naptan$stop_lon <- format(round(naptan$stop_lon, 6), scientific = FALSE)
   naptan$stop_lat <- format(round(naptan$stop_lat, 6), scientific = FALSE)

-  # Append alterative tags
+  # Append alternative tags
   naptan_extra <- naptan_extra[!naptan_extra$stop_id %in% naptan$stop_id,]
   naptan <- rbind(naptan, naptan_extra)

@@ -66,3 +69,219 @@ get_naptan <- function(url = "https://naptan.api.dft.gov.uk/v1/access-nodes?data
 #
 #   return(naptan)
 # }
+
+
+
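# The download-with-cleanup pattern used in get_naptan() above, reduced to its
# skeleton (illustrative URL and paths): 'finally' guarantees the temp
# directory is removed whether or not the download or the parse throws.
dir.create("temp_demo")
tryCatch({
  utils::download.file(url = "https://example.com/Stops.csv",
                       destfile = "temp_demo/Stops.csv", mode = "wb", quiet = TRUE)
  stops <- readr::read_csv("temp_demo/Stops.csv", show_col_types = FALSE)
}, finally = {
  unlink("temp_demo", recursive = TRUE)
})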
+#' Get naptan xml doc
+#'
+#' Download the NaPTAN stop locations in XML format.
+#' For more information on NaPTAN see https://beta-naptan.dft.gov.uk/
+#' @param url character, url to the xml format NaPTAN
+#' @param timeout int, timeout in seconds to wait for download to complete
+#' @return xml document node
+#' @details TransXchange does not store the location of bus stops, so this
+#' function downloads them from the official DfT source.
+#'
+#' NR stations with multiple tiplocs seem to be represented as multiple naptan nodes,
+#' with one AnnotatedRailRef per naptan node, despite the schema supporting a multiplicity relationship.
+#' e.g. Stourbridge, Clapham Junction.
+#' these can be joined together on the CRS
+#'
+#' As of 2023 Naptan is published under a more permissive (OGL3) licence than ATOC data (creative commons licence).
+#'
+#' relevant schema (the stop point type is a sequence of optional elements):
+#' http://naptan.dft.gov.uk/naptan/schema/2.5.1/napt/NaPT_stop-v2-5-1.xsd
+#'
+
+
+utils::globalVariables(
+  c('%>%', '.', 'Activity', 'Arrival Time', 'Departure Time', 'N', 'Public Arrival Time',
+    'Public Departure Time','STP', 'Scheduled Arrival Time', 'Scheduled Departure Time',
+    'Train Category', 'V1', '_TEMP_', '__TEMP__', 'duration',
+    'i.friday', 'i.monday', 'i.saturday', 'i.sunday', 'i.thursday', 'i.tuesday', 'i.wednesday', 'originalUID',
+    'route_id_new', 'route_type', 'service_id', 'service_id_new', 'stop_name', 'trip_id_new' ))
+
+
+#' UK2GTFS option stopProcessingAtUid
+#' @description sets/gets a UID value at which processing will stop - used for debugging
+#' @param value option value to be set (char)
+#' @details If no value is passed in, returns the current setting of the option. (Usually NULL)
+#' If a value is passed in, timetable build processing will stop in atoc_overlay.makeCalendarInner()
+#' when an exact match for that value is encountered.
+#'
+#' THIS ONLY WORKS WITH ncores==1
+#'
+#' @export
+UK2GTFS_option_stopProcessingAtUid <- function(value)
+{
+  if (missing(value))
+  {
+    return( getOption("UK2GTFS_opt_stopProcessingAtUid", default=NULL) )
+  }
+  else
+  {
+    if ( !is.null(value) && !inherits(value, "character") ){ value = as.character( value ) }
+
+    if ( !is.null(value) && 0==nchar(value) ){ value=NULL }
+
+    return( options(UK2GTFS_opt_stopProcessingAtUid = value ) )
+  }
+}
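# Usage sketch (hypothetical UID value): halt a single-core timetable build
# when a problem service is reached, then clear the option again.
UK2GTFS_option_stopProcessingAtUid("Y63320")   # stop when this UID is hit
UK2GTFS_option_stopProcessingAtUid()           # query: returns "Y63320"
UK2GTFS_option_stopProcessingAtUid(NULL)       # clear the option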
+
+
+
+
+#' UK2GTFS option treatDatesAsInt
+#' @description sets/gets a logical value which determines how dates are processed while building the calendar - used for debugging
+#' @param value option value to be set (logical)
+#' @details In the critical part of timetable building, handling dates as dates is about half the speed of handling them as int,
+#' so we treat them as integers. However that's a complete pain for debugging, so make it configurable.
+#' if errors are encountered during the timetable build phase, try setting this value to FALSE
+#'
+#' @export
+UK2GTFS_option_treatDatesAsInt <- function(value)
+{
+  if (missing(value))
+  {
+    return( getOption("UK2GTFS_opt_treatDatesAsInt", default=TRUE) )
+  }
+  else
+  {
+    return( options(UK2GTFS_opt_treatDatesAsInt = as.logical(value) ) )
+  }
+}
+
+
+
+#' UK2GTFS option updateCachedDataOnLibaryLoad
+#' @description sets/gets a logical value which determines if the data cached in the library is checked for updates when loaded
+#' @param value option value to be set (logical)
+#' @details when child processes are initialised we want to suppress this check, so it is also used for that purpose
+#'
+#' @export
+UK2GTFS_option_updateCachedDataOnLibaryLoad <- function(value)
+{
+  if (missing(value))
+  {
+    return( getOption("UK2GTFS_opt_updateCachedDataOnLibaryLoad", default=TRUE) )
+  }
+  else
+  {
+    return( options(UK2GTFS_opt_updateCachedDataOnLibaryLoad = as.logical(value) ) )
+  }
+}
+
+
+
diff --git a/R/gtfs_cleaning.R b/R/gtfs_cleaning.R
index 651a29c..3b561aa 100644
--- a/R/gtfs_cleaning.R
+++ b/R/gtfs_cleaning.R
@@ -194,57 +194,126 @@ gtfs_fast_stops <- function(gtfs, maxspeed = 83) {
 # }


+PUBLIC_SERVICE_CATEGORY = c("OL", "OU", "OO", "OW", "XC", "XD", "XI",
+                            "XR", "XU", "XX", "XZ", "BR", "BS", "SS" )
+
+
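# How the whitelist above is applied (illustration): any CIF train_category
# outside PUBLIC_SERVICE_CATEGORY - e.g. "EE", empty coaching stock - is
# treated as non-public and dropped when public_only = TRUE.
c("OO", "EE", "XX") %in% PUBLIC_SERVICE_CATEGORY   # TRUE FALSE TRUE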
 #' Clean simple errors from GTFS files
 #'
 #' @param gtfs gtfs list
-#' @param removeNonPublic logical if TRUE remove routes with route_type missing
+#' @param public_only Logical, if TRUE remove routes with route_type missing
 #' @details
 #' Tasks done:
 #'
 #' 0. Remove stops with no coordinates
 #' 1. Remove stops with no location information
-#' 2. Remove stops that are never used
-#' 3. Replace missing agency names with "MISSINGAGENCY"
-#' 4. If service is not public and removeNonPublic=TRUE then remove it (freight, 'trips' aka charters)
+#' 2. Remove trips with fewer than two stops
+#' 3. Remove stops that are never used
+#' 4. Replace missing agency names with "MISSINGAGENCY"
+#' 5. If service is not public and public_only=TRUE then remove it (freight, 'trips' aka charters)
 #'    (these have a null route_type, so loading into OpenTripPlanner fails if these are present)
+#' 6. If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock)
+#' 7. Remove shapes that no longer have any trips
+#' 8. Remove frequencies that no longer have any trips
 #'
 #' @export
-gtfs_clean <- function(gtfs, removeNonPublic = FALSE) {
+gtfs_clean <- function(gtfs, public_only = FALSE) {

   # 0 Remove stops with no coordinates
   gtfs$stops <- gtfs$stops[!is.na(gtfs$stops$stop_lon) & !is.na(gtfs$stops$stop_lat), ]

   # 1 Remove stop times with no locations
   gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% unique(gtfs$stops$stop_id), ]

-  # 2 Remove stops that are never used
+  # 2 Remove trips with fewer than two stops
+  stop_count <- gtfs$stop_times[, .N, by = "trip_id"]
+  gtfs$trips <- gtfs$trips[!(trip_id %in% stop_count[N<2]$trip_id)]
+
+  # 3 Remove stops that are never used
   gtfs$stops <- gtfs$stops[gtfs$stops$stop_id %in% unique(gtfs$stop_times$stop_id), ]

-  # 3 Replace "" agency_id with dummy name
+  # 4 Replace "" agency_id with dummy name
   gtfs$agency$agency_id[gtfs$agency$agency_id == ""] <- "MISSINGAGENCY"
   gtfs$routes$agency_id[gtfs$routes$agency_id == ""] <- "MISSINGAGENCY"
   gtfs$agency$agency_name[gtfs$agency$agency_name == ""] <- "MISSINGAGENCY"

-  # 4 remove calls, trips and routes that have an empty route_type (non public services)
-  if (removeNonPublic)
+  # 5 remove calls, trips and routes that have an empty route_type (non public services)
+  # in addition to all the previous filtering - ECS moves were still making it into the output GTFS file, this removes them
+  if (public_only)
   {
     joinedTrips <- merge(gtfs$trips, gtfs$routes, by = "route_id", all.x = TRUE)
     joinedCalls <- merge(gtfs$stop_times, joinedTrips, by = "trip_id", all.x = TRUE)

-    filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type ), ]
-    gtfs$stop_times <- filteredCalls[, names( gtfs$stop_times )]
-
-    joinedCalendar <- merge(gtfs$calendar, joinedTrips, by = "service_id", all.x = TRUE)
-    filteredCalendar <- joinedCalendar[ !is.na( joinedCalendar$route_type ), ]
-    gtfs$calendar <- filteredCalendar[, names( gtfs$calendar )]

-    joinedCalendarDates <- merge(gtfs$calendar_dates, joinedTrips, by = "service_id", all.x = TRUE)
-    filteredCalendarDates <- joinedCalendarDates[ !is.na( joinedCalendarDates$route_type ), ]
-    gtfs$calendar_dates <- filteredCalendarDates[, names( gtfs$calendar_dates )]
+    if ("train_category" %in% names(joinedCalls) )
+    {
+      filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type) &
+                                      joinedCalls$train_category %in% PUBLIC_SERVICE_CATEGORY, ]
+    }
+    else
+    {
+      filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type), ]
+    }
+
+    gtfs$stop_times <- filteredCalls[, names( gtfs$stop_times ), with=FALSE]
+
+
+    #what is this batty code I hear you cry ?!
+    gtfs$stop_times$arrival_time = gtfs$stop_times$arrival_time[ 1: nrow(gtfs$stop_times) ]
+    gtfs$stop_times$departure_time = gtfs$stop_times$departure_time[ 1: nrow(gtfs$stop_times) ]
+    #well, it's a bug workaround. Not entirely sure of the trigger, but when we have 30M stop times, and filter down to 1M
+    #the hour and minute component of the Period 'object' report a length of 30M, when there is only supposed to be 1M of them.
+    #The nrow() in the data.table says 1M, and the number of seconds in the period 'object' says 1M.
+    #Clearly 'object' is in big air quotes......
+    #as a result it blows up in gtfs_write() when writing because the sprintf moans about the input vectors being different lengths.
+    #This fixes it.
+    # see also stops_interpolate() "Needed because rbindlist doesn't work with periods for some reason" which smells like a similar workaround.
+ #stamping on the gc() button at the end of this fn for good measure. + #- remember kids, R is not suitable for production use..... + + + #after merging GTFS files we may have compressed the calendar and calendar_dates so a service pattern is used by + #multiple trips - so don't remove calendar and calendar_dates that link to routes with NA route_type in case + #it's in use by multiple trips/routes + + if ("train_category" %in% names(joinedTrips) ) + { + filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ) & + joinedTrips$train_category %in% PUBLIC_SERVICE_CATEGORY, ] + } + else + { + filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ), ] + } + + gtfs$trips <- filteredTrips[, names( gtfs$trips ), with=FALSE] + + if ("train_category" %in% names(gtfs$routes) ) + { + gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ) & + gtfs$routes$train_category %in% PUBLIC_SERVICE_CATEGORY, ] + } + else + { + gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ), ] + } + + rm(joinedCalls) + rm(joinedTrips) + rm(filteredCalls) + rm(filteredTrips) + gc() + } - filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ), ] - gtfs$trips <- filteredTrips[, names( gtfs$trips )] + # 7 remove shapes that no longer have any trips + if ("shapes" %in% names(gtfs)) + { + gtfs$shapes <- gtfs$shapes[gtfs$shapes$shape_id %in% gtfs$trips$shape_id, ] + } - gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ), ] + # 8 remove frequencies that no longer have any trips + if ("frequencies" %in% names(gtfs)) + { + gtfs$frequencies <- gtfs$frequencies[gtfs$frequencies$trip_id %in% gtfs$trips$trip_id, ] } return(gtfs) diff --git a/R/gtfs_interpolate_times.R b/R/gtfs_interpolate_times.R index 56bc9d4..e5fd107 100644 --- a/R/gtfs_interpolate_times.R +++ b/R/gtfs_interpolate_times.R @@ -39,7 +39,11 @@ gtfs_interpolate_times <- function(gtfs, ncores = 1){ stop_times <- pbapply::pblapply(stop_times, stops_interpolate) } else { cl <- parallel::makeCluster(ncores) - parallel::clusterEvalQ(cl, {loadNamespace("UK2GTFS")}) + parallel::clusterEvalQ(cl, { + #put any setup required for all worker processes in here + options( UK2GTFS_opt_updateCachedDataOnLibaryLoad = FALSE ) + loadNamespace("UK2GTFS") + }) stop_times <- pbapply::pblapply(stop_times, stops_interpolate, cl = cl diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index be07a49..5e1bbdb 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -1,47 +1,86 @@ #' merge a list of gtfs files #' +#' !WARNING! only the tables: +#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies +#' are processed, any other tables in the input timetables are passed through +#' +#' if duplicate IDs are detected then completely new IDs for all rows will be generated in the output. 
+#'
+#' @param gtfs_list a list of gtfs objects to be merged
 #' @param force logical, if TRUE duplicated values are merged taking the first
-#' @param quiet logical, if TRUE less messages
 #'   instance to be the correct instance, in most cases this is ok, but may
 #'   cause some errors
+#' @param quiet logical, if TRUE less messages
+#' @param condenseServicePatterns logical, if TRUE service patterns across all routes are condensed into a unique set of patterns
 #' @export
-gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
+gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) {

-  # remove any NULLS
+  # remove any empty input GTFS objects
   gtfs_list <- gtfs_list[lengths(gtfs_list) != 0]

+  flattened <- unlist(gtfs_list, recursive = FALSE)
+  rm(gtfs_list)

-  # Split out lists
-  agency <- sapply(gtfs_list, "[", "agency")
-  stops <- sapply(gtfs_list, "[", "stops")
-  routes <- sapply(gtfs_list, "[", "routes")
-  trips <- sapply(gtfs_list, "[", "trips")
-  stop_times <- sapply(gtfs_list, "[", "stop_times")
-  calendar <- sapply(gtfs_list, "[", "calendar")
-  calendar_dates <- sapply(gtfs_list, "[", "calendar_dates")
+  #The Atoc code has moved from data.frame to data.table for performance reasons, but the transXchange code hasn't migrated yet.
+  #this is a breaking change for some items because the behaviour for data.table isn't the same as data.frame, despite extending data.frame. nice.
+  #least painful way to fix this for now is to convert to data.table if supplied data.frame
+  flattened <- lapply( flattened, function(item)
+  {
+    if ( inherits(item, "data.table" ) ) return (item)
+    return (data.table(item))
+  } )
+
+  #get unique input table names
+  tableNames <- unique(names(flattened))
+
+  grouped_list <- list()
+
+  # Loop through table names and group data frames
+  for (tableName in tableNames) {
+
+    matched <- purrr::imap( flattened, function( item, name ) {
+      if (name == tableName) {
+        return(item)
+      }
+    })
+
+    #remove input tables not matching tableName
+    matched <- matched[lengths(matched) != 0]

-  # bind together
-  names(agency) <- seq(1, length(agency))
-  suppressWarnings(agency <- dplyr::bind_rows(agency, .id = "file_id"))
+    #assign each instance of the input table a unique number
+    names(matched) <- seq(1, length(matched))

-  names(stops) <- seq(1, length(stops))
-  suppressWarnings(stops <- dplyr::bind_rows(stops, .id = "file_id"))
+    #add a column to the data frame containing this unique number
+    suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id"))
+    matched$file_id <- as.integer(matched$file_id)

-  names(routes) <- seq(1, length(routes))
-  suppressWarnings(routes <- dplyr::bind_rows(routes, .id = "file_id"))
+    #if("calendar_dates"==tableName)
+    #{
+    #  #don't understand what this complex line is doing ? comment would be nice.
+ # calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] + # #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] + #} - names(trips) <- seq(1, length(trips)) - suppressWarnings(trips <- dplyr::bind_rows(trips, .id = "file_id")) + #add to map + grouped_list[[tableName]] <- matched + } + + rm(flattened) - names(stop_times) <- seq(1, length(stop_times)) - suppressWarnings(stop_times <- dplyr::bind_rows(stop_times, .id = "file_id")) + # Split out lists + agency <- grouped_list$agency + stops <- grouped_list$stops + routes <- grouped_list$routes + trips <- grouped_list$trips + stop_times <- grouped_list$stop_times + calendar <- grouped_list$calendar + calendar_dates <- grouped_list$calendar_dates + shapes <- grouped_list$shapes + frequencies <- grouped_list$frequencies - names(calendar) <- seq(1, length(calendar)) - suppressWarnings(calendar <- dplyr::bind_rows(calendar, .id = "file_id")) + #remove items from map. + grouped_list <- grouped_list[setdiff(names(grouped_list), + c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes", "frequencies" ))] - names(calendar_dates) <- seq(1, length(calendar_dates)) - calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] - suppressWarnings(calendar_dates <- dplyr::bind_rows(calendar_dates, .id = "file_id")) # fix typo agency$agency_name <- as.character(agency$agency_name) @@ -95,8 +134,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { agency_lang = agency_lang[1] ) } else { - stop(paste0("Duplicated Agency IDs ", - paste(unique(agency.check$agency_id[duplicated(agency.check$agency_id)]), collapse = " "))) + stop("Duplicated Agency IDs: ", + paste(unique(agency.check$agency_id[duplicated(agency.check$agency_id)]), collapse = " ")) } } else { agency <- agency[!duplicated(agency$agency_id), ] @@ -110,48 +149,64 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { if(force){ stops <- stops[!duplicated(stops$stop_id),] } else { - stop("Duplicated Stop IDS") + stop("Duplicated Stop IDS: ", paste( unique(stops$stop_id[duplicated(stops$stop_id)]), collapse = " ")) } - - } # routes if (any(duplicated(routes$route_id))) { if(!quiet){message("De-duplicating route_id")} - route_id <- routes[, c("file_id", "route_id")] - if (any(duplicated(route_id))) { + + retainedColumnNames <- colnames(routes)[!(colnames(routes) %in% c("route_id", "file_id"))] + + new_route_id <- routes[, c("file_id", "route_id")] + if (any(duplicated(new_route_id))) { if(force){ - routes <- routes[!duplicated(route_id), ] - route_id <- routes[, c("file_id", "route_id")] + routes <- routes[!duplicated(new_route_id), ] + new_route_id <- routes[, c("file_id", "route_id")] } else { - stop("Duplicated route_id within the same GTFS file, try using force = TRUE") + stop("Duplicated route_id within the same GTFS file, try using force = TRUE :", + paste( unique(new_route_id$route_id[duplicated(new_route_id)]), collapse = " ")) } } - route_id$route_id_new <- seq(1, nrow(route_id)) - routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id")) - routes <- routes[, c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")] - names(routes) <- c("route_id", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") + new_route_id$route_id_new <- seq(1, nrow(new_route_id)) + routes <- dplyr::left_join(routes, 
new_route_id, by = c("file_id", "route_id")) + + routes <- routes[, c("route_id_new", retainedColumnNames), with=FALSE] + routes <- routes %>% dplyr::rename(route_id = route_id_new) } # calendar - if (any(duplicated(calendar$service_id))) { + calendar_dates_key <- paste(calendar_dates$service_id, calendar_dates$date, calendar_dates$exception_type, sep="#") + + if (any(duplicated(calendar$service_id)) || any(duplicated(calendar_dates_key))) { if(!quiet){message("De-duplicating service_id")} - service_id <- calendar[, c("file_id", "service_id")] - if (any(duplicated(service_id))) { - stop("Duplicated service_id within the same GTFS file") + + new_service_id <- calendar[, c("file_id", "service_id")] + if (any(duplicated(new_service_id))) { + stop("Duplicated service_id within the same GTFS file: ", + paste( unique(new_service_id$service_id[duplicated(new_service_id)]), collapse = " ")) } - service_id$service_id_new <- seq(1, nrow(service_id)) - calendar <- dplyr::left_join(calendar, service_id, by = c("file_id", "service_id")) - calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] - names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date") + + # it is valid to have calendar_dates with no associated calendar (see comments further down) + # so create the distinct set of service_id in both calendar and calendar_dates + new_service_id <- dplyr::union(unique(new_service_id), unique(calendar_dates[, c("file_id", "service_id")])) + + new_service_id$service_id_new <- seq(1, nrow(new_service_id)) + + retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] + calendar <- dplyr::left_join(calendar, new_service_id, by = c("file_id", "service_id")) + calendar <- calendar[, c("service_id_new", retainedColumnNames), with=FALSE] + names(calendar) <- c("service_id", retainedColumnNames) if (nrow(calendar_dates) > 0) { - calendar_dates <- dplyr::left_join(calendar_dates, service_id, by = c("file_id", "service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")] - names(calendar_dates) <- c("service_id", "date", "exception_type") + retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] + + calendar_dates <- dplyr::left_join(calendar_dates, new_service_id, by = c("file_id", "service_id")) + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames), with=FALSE] + calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) } } @@ -159,47 +214,72 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { # Trips if (any(duplicated(trips$trip_id))) { if(!quiet){message("De-duplicating trip_id")} - trip_id <- trips[, c("file_id", "trip_id")] - if (any(duplicated(trip_id))) { + + new_trip_id <- trips[, c("file_id", "trip_id")] + if (any(duplicated(new_trip_id))) { if(force){ trips <- unique(trips) stop_times <- unique(stop_times) - trip_id <- trips[, c("file_id", "trip_id")] + new_trip_id <- trips[, c("file_id", "trip_id")] } else{ - stop("Duplicated trip_id within the same GTFS file") + stop("Duplicated trip_id within the same GTFS file :", + paste( unique( new_trip_id$trip_id[duplicated(new_trip_id)]), collapse = " ")) } } - trip_id$trip_id_new <- seq(1, nrow(trip_id)) - trips <- dplyr::left_join(trips, trip_id, by = c("file_id", "trip_id")) - trips <- trips[, 
c("route_id", "service_id", "trip_id_new", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") - - - stop_times <- dplyr::left_join(stop_times, trip_id, by = c("file_id", "trip_id")) - stop_times <- stop_times[, c("trip_id_new", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint")] - names(stop_times) <- c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint") + new_trip_id$trip_id_new <- seq(1, nrow(new_trip_id)) + + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("trip_id"))] + trips <- dplyr::left_join(trips, new_trip_id, by = c("file_id", "trip_id")) + trips <- trips[, c("trip_id_new", retainedColumnNames), with=FALSE] + trips <- trips %>% dplyr::rename(trip_id = trip_id_new) + + retainedColumnNames <- colnames(stop_times)[!(colnames(stop_times) %in% c("trip_id", "file_id"))] + stop_times <- dplyr::left_join(stop_times, new_trip_id, by = c("file_id", "trip_id")) + stop_times <- stop_times[, c("trip_id_new", retainedColumnNames), with=FALSE] + stop_times <- stop_times %>% dplyr::rename(trip_id = trip_id_new) + + if ( length(frequencies) > 0 ) + { + retainedColumnNames <- colnames(frequencies)[!(colnames(frequencies) %in% c("trip_id", "file_id"))] + frequencies <- dplyr::left_join(frequencies, new_trip_id, by = c("file_id", "trip_id")) + frequencies <- frequencies[, c("trip_id_new", retainedColumnNames), with=FALSE] + frequencies <- frequencies %>% dplyr::rename(trip_id = trip_id_new) + } } - if (exists("service_id")) { - trips <- dplyr::left_join(trips, service_id, by = c("file_id", "service_id")) - trips <- trips[, c("route_id", "service_id_new", "trip_id", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") + + if (exists("new_service_id")) { + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id"))] + trips <- dplyr::left_join(trips, new_service_id, by = c("file_id", "service_id")) + trips <- trips[, c(retainedColumnNames, "service_id_new"), with=FALSE] + trips <- trips %>% dplyr::rename(service_id = service_id_new) } - if (exists("route_id")) { - trips <- dplyr::left_join(trips, route_id, by = c("file_id", "route_id")) - trips <- trips[, c("route_id_new", "service_id", "trip_id", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") + + if (exists("new_route_id")) { + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("route_id"))] + trips <- dplyr::left_join(trips, new_route_id, by = c("file_id", "route_id")) + trips <- trips[, c("route_id_new", retainedColumnNames), with=FALSE] + trips <- trips %>% dplyr::rename(route_id = route_id_new) } - trips <- trips[, c("route_id", "service_id", "trip_id")] - names(trips) <- c("route_id", "service_id", "trip_id") + trips$file_id <- NULL # Condense Duplicate Service patterns - if (nrow(calendar_dates) > 0) { + + # in an ideal world we should not have a trip without a service pattern, and not have calendar_dates with no associated calendar, + # but the real world data isn't that tidy. + # In a typical all GB BODS extract Around 0.2% of trips have a calendar ID but no row in calendar, + # 0.2% of calendar_dates have no trips, 0.1% of calendar_dates have no corresponding calendar. 
+ # we need to guard against this to make sure we don't end up putting null values into any key fields + # This documentation https://gtfs.org/schedule/reference/#calendar_datestxt specifically mentions calendar dates without calendars + # as being a legitimate way to construct the data. + if (condenseServicePatterns && nrow(calendar_dates) > 0) { if(!quiet){message("Condensing duplicated service patterns")} + + #find every unique combination of calendar_dates and calendar values calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id) - if(inherits(calendar_dates_summary$date, "Date")){ + if( inherits(calendar_dates_summary$date, "Date") ){ calendar_dates_summary <- dplyr::summarise(calendar_dates_summary, pattern = paste(c(as.character(date), exception_type), collapse = "") ) @@ -209,36 +289,62 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { ) } - calendar_summary <- dplyr::left_join(calendar, calendar_dates_summary, by = "service_id") + #we want to keep all rows in calendar_dates even if they don't have a row in calendar + calendar_summary <- dplyr::full_join(calendar, calendar_dates_summary, by = "service_id") calendar_summary <- dplyr::group_by( calendar_summary, start_date, end_date, monday, tuesday, wednesday, thursday, friday, saturday, sunday, pattern ) + + #give every unique combination of dates / days / exceptions a new distinct service ID calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary) calendar_summary <- calendar_summary[, c("service_id_new", "service_id")] + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id", "route_id"))] trips <- dplyr::left_join(trips, calendar_summary, by = c("service_id")) - trips <- trips[, c("route_id", "service_id_new", "trip_id")] - names(trips) <- c("route_id", "service_id", "trip_id") + trips <- trips[, c("route_id", "service_id_new", retainedColumnNames), with=FALSE] + trips <- trips %>% dplyr::rename(service_id = service_id_new) + retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] calendar <- dplyr::left_join(calendar, calendar_summary, by = c("service_id")) - calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] - names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date") + calendar <- calendar[, c("service_id_new", retainedColumnNames), with=FALSE] + calendar <- calendar %>% dplyr::rename(service_id = service_id_new) calendar <- calendar[!duplicated(calendar$service_id), ] - + retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] calendar_dates <- dplyr::left_join(calendar_dates, calendar_summary, by = c("service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")] - names(calendar_dates) <- c("service_id", "date", "exception_type") + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames), with=FALSE] + calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ] } - stop_times$file_id <- NULL + + # shapes in a BODS extract are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object + composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#") + if 
(any(duplicated(composite_key))) { + if(force){ + shapes <- shapes[!duplicated(composite_key),] + } else { + stop("Duplicated Shapes IDS :", paste( unique( composite_key[duplicated(composite_key)]), collapse = " ")) + } + } + + if ("file_id" %in% colnames(stop_times) ) stop_times$file_id <- NULL + if ("file_id" %in% colnames(calendar) ) calendar$file_id <- NULL + shapes$file_id <- NULL routes$file_id <- NULL - calendar$file_id <- NULL + frequencies$file_id <- NULL + res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies) + names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies") + + #for tables we don't explicitly process - hope items are unique + for (item in grouped_list) { + item$file_id <- NULL + } + + #remove nulls (e.g. tables that are often empty like frequencies) + res_final <- Filter(Negate(is.null), res_final) - res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates) - names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates") - return(res_final) + return (c(res_final, grouped_list)) } diff --git a/R/gtfs_read.R b/R/gtfs_read.R index 5f58818..a5894e2 100644 --- a/R/gtfs_read.R +++ b/R/gtfs_read.R @@ -16,60 +16,114 @@ gtfs_read <- function(path){ files <- list.files(tmp_folder, pattern = ".txt") gtfs <- list() - message_log <- c("Unable to find optional files: ") if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){ - gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"), - col_types = readr::cols(agency_id = readr::col_character()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$agency <- fread( + file.path(tmp_folder, "agency.txt"), + colClasses = c( + agency_id = "character", + agency_noc = "character" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: agency.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){ - gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"), - col_types = readr::cols(stop_id = readr::col_character(), - stop_code = readr::col_character(), - stop_name = readr::col_character(), - stop_lat = readr::col_number(), - stop_lon = readr::col_number()), + gtfs$stops <- fread( + file.path(tmp_folder, "stops.txt"), + colClasses = c( + stop_id = "character", + stop_code = "character", + stop_name = "character", + stop_lat = "numeric", + stop_lon = "numeric", + wheelchair_boarding = "integer", #enum value 2 is valid but rarely seen outside the spec document + location_type = "integer", + parent_station = "character", + platform_code = "character" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) - lazy = FALSE, show_col_types = FALSE) } else { warning("Unable to find required file: stops.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"routes.txt"))){ - gtfs$routes <- readr::read_csv(file.path(tmp_folder,"routes.txt"), - col_types = readr::cols(route_id = readr::col_character(), - agency_id = readr::col_character(), - route_short_name = readr::col_character(), - route_long_name = readr::col_character()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$routes <- fread( + file.path(tmp_folder, "routes.txt"), + colClasses = c( + route_id = "character", + agency_id = "character", + route_short_name = "character", + route_long_name = "character", + route_type = "integer" + ), + showProgress = 
FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: routes.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"trips.txt"))){ - gtfs$trips <- readr::read_csv(file.path(tmp_folder,"trips.txt"), - col_types = readr::cols(trip_id = readr::col_character(), - route_id = readr::col_character()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$trips <- fread( + file.path(tmp_folder, "trips.txt"), + colClasses = c( + trip_id = "character", + route_id = "character", + service_id = "character", + block_id = "character", + shape_id = "character", + wheelchair_accessible = "integer" #enum value 2 is valid but rarely seen outside the spec document + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } else { warning("Unable to find required file: trips.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"stop_times.txt"))){ - gtfs$stop_times <- readr::read_csv(file.path(tmp_folder,"stop_times.txt"), - col_types = readr::cols(trip_id = readr::col_character(), - departure_time = readr::col_character(), - arrival_time = readr::col_character()), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$stop_times <- fread( + file.path(tmp_folder, "stop_times.txt"), + colClasses = c( + trip_id = "character", + stop_id = "character", + stop_sequence = "integer", + departure_time = "character", + arrival_time = "character", + shape_dist_traveled = "numeric", + timepoint = "integer", + pickup_type = "integer", + drop_off_type = "integer" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + gtfs$stop_times$arrival_time <- lubridate::hms(gtfs$stop_times$arrival_time) gtfs$stop_times$departure_time <- lubridate::hms(gtfs$stop_times$departure_time) @@ -78,64 +132,94 @@ gtfs_read <- function(path){ } if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){ - gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"), - col_types = readr::cols(start_date = readr::col_date(format = "%Y%m%d"), - end_date = readr::col_date(format = "%Y%m%d")), - show_col_types = FALSE, - lazy = FALSE) + + gtfs$calendar <- fread( + file.path(tmp_folder, "calendar.txt"), + colClasses = c( + service_id = "character", + monday = "integer", + tuesday = "integer", + wednesday = "integer", + thursday = "integer", + friday = "integer", + saturday = "integer", + sunday = "integer", + start_date = "character", + end_date = "character" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + + gtfs$calendar[, start_date := as.IDate(start_date, "%Y%m%d")] + gtfs$calendar[, end_date := as.IDate(end_date, "%Y%m%d")] } else { message("Unable to find conditionally required file: calendar.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"calendar_dates.txt"))){ - gtfs$calendar_dates <- readr::read_csv(file.path(tmp_folder,"calendar_dates.txt"), - col_types = readr::cols(date = readr::col_date(format = "%Y%m%d")), - show_col_types = FALSE, - lazy = FALSE) - } else { - message("Unable to find conditionally required file: calendar_dates.txt") - } - if(checkmate::test_file_exists(file.path(tmp_folder,"fare_attributes.txt"))){ - gtfs$fare_attributes <- readr::read_csv(file.path(tmp_folder,"fare_attributes.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "fare_attributes.txt") - } + gtfs$calendar_dates <- fread( + file.path(tmp_folder, "calendar_dates.txt"), + colClasses = c( + service_id = "character", + date = "character", + 
exception_type = "integer" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + gtfs$calendar_dates[, date := as.IDate(date, "%Y%m%d")] - if(checkmate::test_file_exists(file.path(tmp_folder,"fare_rules.txt"))){ - gtfs$fare_rules <- readr::read_csv(file.path(tmp_folder,"fare_rules.txt"), - show_col_types = FALSE, - lazy = FALSE) } else { - message_log <- c(message_log, "fare_rules.txt") + message("Unable to find conditionally required file: calendar_dates.txt") } if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){ - gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "shapes.txt") - } - if(checkmate::test_file_exists(file.path(tmp_folder,"transfers.txt"))){ - gtfs$transfers <- readr::read_csv(file.path(tmp_folder,"transfers.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "transfers.txt") + gtfs$shapes <- data.table::fread( + file.path(tmp_folder, "shapes.txt"), + colClasses = c( + shape_id = "character", + shape_pt_lat = "numeric", + shape_pt_lon = "numeric", + shape_pt_sequence = "integer", + shape_dist_traveled = "numeric" + ), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) + } - unlink(tmp_folder, recursive = TRUE) + #load any other tables in the .zip file + filenamesOnly <- tools::file_path_sans_ext(basename(files)) + notLoadedFiles = setdiff( filenamesOnly, names(gtfs) ) + + for (fileName in notLoadedFiles) + { + table <- fread( + file.path(tmp_folder, paste0(fileName, ".txt")), + showProgress = FALSE, + sep=',', + header=TRUE, + data.table = TRUE + ) - if(length(message_log) > 0){ - message(paste(message_log, collapse = " ")) + gtfs[[fileName]] <- table } + #remove temp directory + unlink(tmp_folder, recursive = TRUE) + return(gtfs) } diff --git a/R/gtfs_subset.R b/R/gtfs_subset.R index b824899..c26ff8d 100644 --- a/R/gtfs_subset.R +++ b/R/gtfs_subset.R @@ -36,7 +36,7 @@ gtfs_clip <- function(gtfs, bounds) { gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% stops_inc, ] # Check for single stop trips n_stops <- table(gtfs$stop_times$trip_id) - single_stops <- as.integer(names(n_stops[n_stops == 1])) + single_stops <- names(n_stops[n_stops == 1]) gtfs$stop_times <- gtfs$stop_times[!gtfs$stop_times$trip_id %in% single_stops, ] # Check for any unused stops diff --git a/R/gtfs_write.R b/R/gtfs_write.R new file mode 100644 index 0000000..f8b20e3 --- /dev/null +++ b/R/gtfs_write.R @@ -0,0 +1,208 @@ +#' Write GTFS +#' +#' Takes a list of data frames representing the GTFS format and saves them as GTFS +#' Zip file. 
+#'
+#' @param gtfs named list of data.frames
+#' @param folder folder to save the gtfs file to
+#' @param name the name of the zip file without the .zip extension, default "gtfs"
+#' @param stripComma logical, should commas be stripped from text, default = FALSE
+#' @param stripTab logical, should tabs be stripped from text, default = FALSE
+#' @param stripNewline logical, should newlines be stripped from text, default = FALSE
+#' @param quote logical, should strings be quoted, default = TRUE, passed to data.table::fwrite
+#' @export
+#'
+gtfs_write <- function(gtfs,
+                       folder = getwd(),
+                       name = "gtfs",
+                       stripComma = FALSE,
+                       stripTab = FALSE,
+                       stripNewline = FALSE,
+                       quote = TRUE) {
+
+  if (stripComma) {
+    for (i in seq_len(length(gtfs))) {
+      gtfs[[i]] <- stripCommas(gtfs[[i]])
+    }
+  }
+
+  if (stripTab) {
+    for (i in seq_len(length(gtfs))) {
+      gtfs[[i]] <- stripTabs(gtfs[[i]], stripNewline)
+    }
+  }
+
+
+  if (FALSE)
+  {
+    #Format times
+    if(inherits(gtfs$stop_times$arrival_time, "Period")){
+      gtfs$stop_times$arrival_time <- period2gtfs(gtfs$stop_times$arrival_time)
+    }
+
+    if(inherits(gtfs$stop_times$departure_time, "Period")){
+      gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time)
+    }
+
+    if("frequencies" %in% names(gtfs))
+    {
+      if("difftime" %in% class(gtfs$frequencies$start_time)){
+        gtfs$frequencies$start_time <- format(gtfs$frequencies$start_time, format = "%H:%M:%S")
+      }
+
+      if("difftime" %in% class(gtfs$frequencies$end_time)){
+        gtfs$frequencies$end_time <- format(gtfs$frequencies$end_time, format = "%H:%M:%S")
+      }
+    }
+  }
+
+  dir.create(paste0(tempdir(), "/gtfs_temp"))
+
+  for ( tableName in names(gtfs) )
+  {
+    table <- gtfs[[tableName]]
+
+    table <- formatAttributesToGtfsSchema( table )
+
+    if ( !is.null(table) && nrow(table) > 0 )
+    {
+      data.table::fwrite(table, file.path(tempdir(), "gtfs_temp", paste0(tableName, ".txt")), row.names = FALSE, quote = quote)
+    }
+  }
+
+  zip::zipr(paste0(folder, "/", name, ".zip"), list.files(paste0(tempdir(), "/gtfs_temp"), full.names = TRUE), recurse = FALSE)
+
+  unlink(paste0(tempdir(), "/gtfs_temp"), recursive = TRUE)
+}
+
+
+#' Strip Commas
+#'
+#' Remove commas from data frame
+#'
+#' @param df data frame
+#' @noRd
+#'
+stripCommas <- function(df) {
+  df[] <- lapply(df, function(x) {
+    if (inherits(x, "character")) {
+      if(!all(validUTF8(x))){
+        Encoding(x) <- "latin1"
+        x <- enc2utf8(x)
+      }
+      x <- gsub(",", " ", x, fixed = TRUE)
+    }
+    return(x)
+  })
+  return(df)
+}
+
+#' Strip tabs
+#'
+#' Remove tabs from data frame
+#'
+#' @param df data frame
+#' @param stripNewline logical
+#' @noRd
+#'
+stripTabs <- function(df, stripNewline) {
+  df[] <- lapply(df, function(x) {
+    if (inherits(x, "character")) {
+      if(!all(validUTF8(x))){
+        Encoding(x) <- "latin1"
+        x <- enc2utf8(x)
+      }
+      if(stripNewline){
+        x <- gsub("[\r\n]", " ", x, fixed = FALSE)
+      }
+      x <- gsub("\t", " ", x, fixed = TRUE)
+    }
+    return(x)
+  })
+  return(df)
+}
+
+
+#' Convert Period to GTFS timestamps
+#' When writing a 400mb (zipped) file, we spend nearly 4 minutes in this fn(), about 10x longer than writing the files to the filesystem.
+#' profiler reports this being mostly nchar(), so we optimise down to one sprintf() which reduces the time to 1 minute.
+#' .format() is about 7x slower than sprintf() in this situation.
Performance may be different on a data.table +#' +#' @param x periods +#' @noRd +#' +period2gtfs <- function(x) { + + # Check for days + dys <- lubridate::day(x) + if(sum(dys, na.rm = TRUE) > 0){ + stop("Days detected in period objects, incorectly formatted period object") + } + + return( sprintf("%02d:%02d:%02d", lubridate::hour(x), lubridate::minute(x), lubridate::second(x)) ) +} + + +formatAttributesToGtfsSchema <- function(dt) +{ + + { + periodColumnsToFormat <- names(dt)[ sapply(dt, function(x){ inherits(x, "Period") }) ] + + if (length(periodColumnsToFormat) > 0) + { + dt[, (periodColumnsToFormat) := lapply(.SD, period2gtfs), .SDcols = periodColumnsToFormat] + } + } + + { + diffTimeColumnsToFormat <- names(dt)[ sapply(dt, function(x){ "difftime" %in% class( x ) }) ] + + if (length(diffTimeColumnsToFormat) > 0) + { + dt[, (diffTimeColumnsToFormat) := lapply(.SD, format, format = "%H:%M:%S"), .SDcols = diffTimeColumnsToFormat] + } + } + + { + dateColumnsToFormat <- names(dt)[ sapply(dt, function(x){ inherits(x, "Date") }) ] + + if (length(dateColumnsToFormat) > 0) + { + dt[, (dateColumnsToFormat) := lapply(.SD, + function(d){ as.integer(sprintf("%04d%02d%02d", lubridate::year(d), lubridate::month(d), lubridate::mday(d))) } ), + .SDcols = dateColumnsToFormat] + #performance, sprintf runs 6x faster than as.character() (which calls format()), followed by gsub() + #coercing back to int prevents value being quoted in output file + } + } + + { + factorColumnsToFormat <- names(dt)[ sapply(dt, function(x){ is.factor( x ) }) ] + + if (length(factorColumnsToFormat) > 0) + { + dt[, (factorColumnsToFormat) := lapply(.SD, as.character), .SDcols = factorColumnsToFormat] + } + } + + + { + logicalColumnsToFormat <- names(dt)[ sapply(dt, function(x){ is.logical( x ) }) ] + + if (length(logicalColumnsToFormat) > 0) + { + dt[, (logicalColumnsToFormat) := lapply(.SD, as.integer), .SDcols = logicalColumnsToFormat] + } + } + + return (dt) +} + + + + + + + + diff --git a/R/nptdr_export.R b/R/nptdr_export.R index 3b68d00..addceed 100644 --- a/R/nptdr_export.R +++ b/R/nptdr_export.R @@ -108,7 +108,7 @@ nptdr_makeCalendar <- function(schedule, exceptions, historic_bank_holidays = hi calendar$bank_holiday <- NULL # TODO: Detect Scotland and NI - message("Unique Scotland and NI bank holidays not correctly handeled") + message("Unique Scotland and NI bank holidays are not correctly handled") bh <- historic_bank_holidays[historic_bank_holidays$date >= min(calendar$start_date, na.rm = TRUE),] bh <- bh[bh$date <= min(calendar$end_date, na.rm = TRUE), ] diff --git a/R/transxchange.R b/R/transxchange.R index 94ebafc..30d120b 100644 --- a/R/transxchange.R +++ b/R/transxchange.R @@ -8,22 +8,23 @@ #' @param cal Calendar object from get_bank_holidays() #' @param naptan Naptan stop locations from get_naptan() #' @param scotland character, should Scottish bank holidays be used? Can be -#' "auto" (defualt), "yes", "no". If "auto" and path_in ends with "S.zip" +#' "auto" (default), "yes", "no". If "auto" and path_in ends with "S.zip" #' Scottish bank holidays will be used, otherwise England and Wales bank #' holidays are used. #' @param try_mode Logical, if TRUE import and conversion are wrapped in try #' calls thus a failure on a single file will not cause the whole process to #' fail. Warning this could result in a GTFS file with missing routes. 
#' @param force_merge Logical, passed to gtfs_merge(force), default FALSE
+#' @param merge Logical, if results are merged into one GTFS object by calling gtfs_merge, default TRUE
 #' @return A GTFS named list
 #' @details
 #'
-#' This is a meta fucntion which aids TransXchange to GTFS conversion. It simple
+#' This is a meta function which aids TransXchange to GTFS conversion. It simply
 #' runs transxchange_import(), transxchange_export(), gtfs_merge(), gtfs_write()
 #'
 #' Progress Bars
 #'
-#' To minimise overall processing when using mulitple cores the fucntion works
+#' To minimise overall processing when using multiple cores the function works
 #' from largest to smallest file.This can mean the progress bar sits a 0% for
 #' quite some time, before starting to move rapidly.
 #'
@@ -38,7 +39,8 @@ transxchange2gtfs <- function(path_in,
                               naptan = get_naptan(),
                               scotland = "auto",
                               try_mode = TRUE,
-                              force_merge = FALSE) {
+                              force_merge = FALSE,
+                              merge = TRUE) {
   # Check inputs
   checkmate::assert_numeric(ncores)
   checkmate::assert_logical(silent)
@@ -206,13 +208,16 @@ transxchange2gtfs <- function(path_in,
     message("All files converted")
   }
 
-  if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))}
+  if(merge)
+  {
+    if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))}
 
-  gtfs_merged <- try(gtfs_merge(gtfs_all, force = force_merge))
+    gtfs_merged <- gtfs_merge(gtfs_all, force=force_merge, quiet=silent)
 
-  if (inherits(gtfs_merged, "try-error")) {
-    message("Merging failed, returing unmerged GFTS object for analysis")
-    return(gtfs_all)
+    return(gtfs_merged)
+  }
+  else
+  {
+    return (gtfs_all)
   }
-  return(gtfs_merged)
 }
diff --git a/R/transxchange_export.R b/R/transxchange_export.R
index 8517cc5..acae130 100644
--- a/R/transxchange_export.R
+++ b/R/transxchange_export.R
@@ -499,12 +499,12 @@ transxchange_export <- function(obj,
       calendar_dates <- data.frame(
         trip_id = character(),
         date = character(),
-        exception_type = character(),
+        exception_type = integer(),
         stringsAsFactors = FALSE
       )
       calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek)
     } else {
-      # remove calendar_dates for trips that have been competly removed
+      # remove calendar_dates for trips that have been completely removed
       calendar_dates <- calendar_dates[calendar_dates$trip_id %in% calendar$trip_id, ]
       calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek)
 
diff --git a/R/write_gtfs.R b/R/write_gtfs.R
deleted file mode 100644
index e021383..0000000
--- a/R/write_gtfs.R
+++ /dev/null
@@ -1,152 +0,0 @@
-#' Write GTFS
-#'
-#' Takes a list of data frames represneting the GTFS fromat and saves them as GTFS
-#' Zip file.
-#' -#' @param gtfs named list of data.frames -#' @param folder folder to save the gtfs file to -#' @param name the name of the zip file without the .zip extension, default "gtfs" -#' @param stripComma logical, should commas be stripped from text, default = TRUE -#' @param stripTab logical, should tab be stripped from text, default = TRUE -#' @param stripNewline logical, should newline tag be stripped from text, default = TRUE -#' @param quote logical, should strings be quoted, default = FALSE, passed to data.table::fwrite -#' @export -#' -gtfs_write <- function(gtfs, - folder = getwd(), - name = "gtfs", - stripComma = TRUE, - stripTab = TRUE, - stripNewline = TRUE, - quote = FALSE) { - if (stripComma) { - for (i in seq_len(length(gtfs))) { - gtfs[[i]] <- stripCommas(gtfs[[i]]) - } - } - - if (stripTab) { - for (i in seq_len(length(gtfs))) { - gtfs[[i]] <- stripTabs(gtfs[[i]], stripNewline) - } - } - - - #Format Dates - - if(inherits(gtfs$calendar$start_date, "Date")){ - gtfs$calendar$start_date <- format(gtfs$calendar$start_date, "%Y%m%d") - } - - if(inherits(gtfs$calendar$end_date, "Date")){ - gtfs$calendar$end_date <- format(gtfs$calendar$end_date, "%Y%m%d") - } - - if(inherits(gtfs$calendar_dates$date, "Date")){ - gtfs$calendar_dates$date <- format(gtfs$calendar_dates$date, "%Y%m%d") - } - - #Format times - if(inherits(gtfs$stop_times$arrival_time, "Period")){ - gtfs$stop_times$arrival_time <- period2gtfs(gtfs$stop_times$arrival_time) - } - - if(inherits(gtfs$stop_times$departure_time, "Period")){ - gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time) - } - - - dir.create(paste0(tempdir(), "/gtfs_temp")) - data.table::fwrite(gtfs$calendar, paste0(tempdir(), "/gtfs_temp/calendar.txt"), row.names = FALSE, quote = quote) - if (nrow(gtfs$calendar_dates) > 0) { - data.table::fwrite(gtfs$calendar_dates, paste0(tempdir(), "/gtfs_temp/calendar_dates.txt"), row.names = FALSE, quote = quote) - } - data.table::fwrite(gtfs$routes, paste0(tempdir(), "/gtfs_temp/routes.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$stop_times, paste0(tempdir(), "/gtfs_temp/stop_times.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$trips, paste0(tempdir(), "/gtfs_temp/trips.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$stops, paste0(tempdir(), "/gtfs_temp/stops.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$agency, paste0(tempdir(), "/gtfs_temp/agency.txt"), row.names = FALSE, quote = quote) - if ("transfers" %in% names(gtfs)) { - data.table::fwrite(gtfs$transfers, paste0(tempdir(), "/gtfs_temp/transfers.txt"), row.names = FALSE, quote = quote) - } - if ("shapes" %in% names(gtfs)) { - data.table::fwrite(gtfs$shapes, paste0(tempdir(), "/gtfs_temp/shapes.txt"), row.names = FALSE, quote = quote) - } - zip::zipr(paste0(folder, "/", name, ".zip"), list.files(paste0(tempdir(), "/gtfs_temp"), full.names = TRUE), recurse = FALSE) - unlink(paste0(tempdir(), "/gtfs_temp"), recursive = TRUE) - message(paste0(folder, "/", name, ".zip")) -} - - -#' Strip Commas -#' -#' Remove commas from data frame -#' -#' @param df data frame -#' @noRd -#' -stripCommas <- function(df) { - df[] <- lapply(df, function(x) { - if (inherits(x, "character")) { - if(!all(validUTF8(x))){ - Encoding(x) <- "latin1" - x <- enc2utf8(x) - } - x <- gsub(",", " ", x, fixed = TRUE) - } - return(x) - }) - return(df) -} - -#' Strip tabs -#' -#' Remove tabs from data frame -#' -#' @param df data frame -#' @param stripNewline logical -#' @noRd -#' -stripTabs <- 
function(df, stripNewline) { - df[] <- lapply(df, function(x) { - if (inherits(x, "character")) { - if(!all(validUTF8(x))){ - Encoding(x) <- "latin1" - x <- enc2utf8(x) - } - if(stripNewline){ - x <- gsub("[\r\n]", " ", x, fixed = FALSE) - } - x <- gsub("\t", " ", x, fixed = TRUE) - } - return(x) - }) - return(df) -} - - -#' Convert Period to GTFS timestamps -#' -#' -#' @param x peridos -#' @noRd -#' -period2gtfs <- function(x) { - - # Check for days - dys <- lubridate::day(x) - if(sum(dys, na.rm = TRUE) > 0){ - stop("Days detected in period objects, incorectly formatted period object") - } - - hrs <- as.character(lubridate::hour(x)) - min <- as.character(lubridate::minute(x)) - sec <- as.character(lubridate::second(x)) - - hrs <- ifelse(nchar(hrs) == 1,paste0("0",hrs), hrs) - min <- ifelse(nchar(min) == 1,paste0("0",min), min) - sec <- ifelse(nchar(sec) == 1,paste0("0",sec), sec) - - return(paste0(hrs,":",min,":",sec)) -} - diff --git a/R/zzz.R b/R/zzz.R index 020fe99..1681d77 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,6 +1,16 @@ # Run when package loads +# bear in mind this runs in every worker process as we start them and load this package into the worker's namespace .onLoad <- function(libname, pkgname){ - update_data() + + tryCatch({ + if( TRUE == UK2GTFS_option_updateCachedDataOnLibaryLoad() ) + { + update_data( timeout=10 ) + } + }, error = function(err) { + warning(Sys.time(), " Process id=", Sys.getpid(), " threw errors during package load while calling update_data() :", err) + }) + } diff --git a/UK2GTFS.Rproj b/UK2GTFS.Rproj index 497f8bf..270314b 100644 --- a/UK2GTFS.Rproj +++ b/UK2GTFS.Rproj @@ -18,3 +18,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/data/activity_codes.rda b/data/activity_codes.rda index 3dceea3..24a65f9 100644 Binary files a/data/activity_codes.rda and b/data/activity_codes.rda differ diff --git a/inst/extdata/date.txt b/inst/extdata/date.txt deleted file mode 100644 index 199df0c..0000000 --- a/inst/extdata/date.txt +++ /dev/null @@ -1 +0,0 @@ -nodata diff --git a/man/UK2GTFS_option_stopProcessingAtUid.Rd b/man/UK2GTFS_option_stopProcessingAtUid.Rd new file mode 100644 index 0000000..7fbd343 --- /dev/null +++ b/man/UK2GTFS_option_stopProcessingAtUid.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/globals.R +\name{UK2GTFS_option_stopProcessingAtUid} +\alias{UK2GTFS_option_stopProcessingAtUid} +\title{UK2GTFS option stopProcessingAtUid} +\usage{ +UK2GTFS_option_stopProcessingAtUid(value) +} +\arguments{ +\item{value}{option value to be set (char)} +} +\description{ +sets/gets a UID value at which processing will stop - used for debugging +} +\details{ +If no value passed in will return the current setting of the option. (Usually NULL) + If value passed in, timetable build processing will stop in atoc_overlay.makeCalendarInner() + when an exact match for that value is encountered. 
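+
+ A hedged sketch of the get/set pattern described above (the UID value is illustrative):
+ \preformatted{
+ UK2GTFS_option_stopProcessingAtUid("Y63320")  # set: processing stops at this UID
+ UK2GTFS_option_stopProcessingAtUid()          # get: returns the current setting
+ }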
+ + THIS ONLY WORKS WITH ncores==1 +} diff --git a/man/UK2GTFS_option_treatDatesAsInt.Rd b/man/UK2GTFS_option_treatDatesAsInt.Rd new file mode 100644 index 0000000..7a3c5d3 --- /dev/null +++ b/man/UK2GTFS_option_treatDatesAsInt.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/globals.R +\name{UK2GTFS_option_treatDatesAsInt} +\alias{UK2GTFS_option_treatDatesAsInt} +\title{UK2GTFS option treatDatesAsInt} +\usage{ +UK2GTFS_option_treatDatesAsInt(value) +} +\arguments{ +\item{value}{option value to be set (logical)} +} +\description{ +sets/gets a logical value which determines how dates are processed while building calendar - used for debugging +} +\details{ +In the critical part of timetable building, handling dates as dates is about half the speed of handling as int + so we treat them as integers. However that's a complete pain for debugging, so make it configurable. + if errors are encountered during the timetable build phase, try setting this value to FALSE +} diff --git a/man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd b/man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd new file mode 100644 index 0000000..0313b9f --- /dev/null +++ b/man/UK2GTFS_option_updateCachedDataOnLibaryLoad.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/globals.R +\name{UK2GTFS_option_updateCachedDataOnLibaryLoad} +\alias{UK2GTFS_option_updateCachedDataOnLibaryLoad} +\title{UK2GTFS option updateCachedDataOnLibaryLoad} +\usage{ +UK2GTFS_option_updateCachedDataOnLibaryLoad(value) +} +\arguments{ +\item{value}{option value to be set (logical)} +} +\description{ +sets/gets a logical value which determines if the data cached in the library is checked for update when loaded +} +\details{ +when child processes are initialised we want to suppress this check, so it is also used for that purpose +} diff --git a/man/as_data_table_naptan_stop_area.Rd b/man/as_data_table_naptan_stop_area.Rd new file mode 100644 index 0000000..73486ad --- /dev/null +++ b/man/as_data_table_naptan_stop_area.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_naptan.R +\name{as_data_table_naptan_stop_area} +\alias{as_data_table_naptan_stop_area} +\title{as data table naptan stop area} +\usage{ +as_data_table_naptan_stop_area(doc) +} +\arguments{ +\item{doc}{xml document node} +} +\value{ +data table of stop areas +} +\description{ +Unpacks selected naptan XML doc elements into data.table +} diff --git a/man/as_data_table_naptan_stop_point.Rd b/man/as_data_table_naptan_stop_point.Rd new file mode 100644 index 0000000..c5e9b11 --- /dev/null +++ b/man/as_data_table_naptan_stop_point.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_naptan.R +\name{as_data_table_naptan_stop_point} +\alias{as_data_table_naptan_stop_point} +\title{as data table naptan stop point} +\usage{ +as_data_table_naptan_stop_point(doc, stopTypes = c("RLY")) +} +\arguments{ +\item{doc}{xml document node} + +\item{stopTypes}{list of stop types to restrict processing to (defaults to railway station)} +} +\value{ +data table of stop points +} +\description{ +Unpacks selected naptan XML doc elements into data.table +} +\details{ +RLY stop types include TIPLOC & CRS fields. 
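+ (A hedged sketch of the intended pairing with \code{get_naptan_xml_doc()}:
+ \code{doc <- get_naptan_xml_doc()} followed by
+ \code{stops <- as_data_table_naptan_stop_point(doc, stopTypes = c("RLY"))}.)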
The quality of the geographic location is better than from BPLAN +} diff --git a/man/atoc2gtfs.Rd b/man/atoc2gtfs.Rd index 1f17e45..2f9ee33 100644 --- a/man/atoc2gtfs.Rd +++ b/man/atoc2gtfs.Rd @@ -12,7 +12,9 @@ atoc2gtfs( agency = "atoc_agency", shapes = FALSE, transfers = TRUE, - missing_tiplocs = TRUE + missing_tiplocs = TRUE, + working_timetable = FALSE, + public_only = TRUE ) } \arguments{ @@ -31,8 +33,12 @@ atoc2gtfs( \item{transfers}{Logical, should transfers.txt be generated (default TRUE)} -\item{missing_tiplocs}{Logical, if locations = tiplocs, then will check for +\item{missing_tiplocs}{Logical, if true will check for any missing tiplocs against the main file and add them.(default TRUE)} + +\item{working_timetable}{Logical, should WTT times be used instead of public times (default FALSE)} + +\item{public_only}{Logical, only return calls/services that are for public passenger pickup/set down (default TRUE)} } \description{ Convert ATOC CIF files to GTFS diff --git a/man/get_bank_holidays.Rd b/man/get_bank_holidays.Rd index 77d86d7..41f04f6 100644 --- a/man/get_bank_holidays.Rd +++ b/man/get_bank_holidays.Rd @@ -24,6 +24,6 @@ data. \details{ TransXchange records bank holidays by name (e.g. Christmas Day), some UK bank holidays move around, so this function downloads the official - bank holiday calendar. The offical feed only covers a short period of time + bank holiday calendar. The official feed only covers a short period of time so this may not be suitable for converting files from the past / future. } diff --git a/man/get_naptan.Rd b/man/get_naptan.Rd index 908b571..d4795c7 100644 --- a/man/get_naptan.Rd +++ b/man/get_naptan.Rd @@ -18,11 +18,13 @@ get_naptan( data frame of stop locations } \description{ -Download the NaPTAN stop locations for more information on NaPTAN see +Download the NaPTAN stop locations in CSV format. For more information on NaPTAN see https://data.gov.uk/dataset/ff93ffc1-6656-47d8-9155-85ea0b8f2251/national-public-transport-access-nodes-naptan } \details{ TransXchange does not store the location of bus stops, so this functions downloads them from the offical DfT source. NaPTAN has some missing bus stops which are added by UK2GTFS. See `naptan_missing` + + Do not use this function for heavy rail use - download in XML format which includes TIPLOC identifiers } diff --git a/man/get_naptan_xml_doc.Rd b/man/get_naptan_xml_doc.Rd new file mode 100644 index 0000000..bdbfd01 --- /dev/null +++ b/man/get_naptan_xml_doc.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_naptan.R +\name{get_naptan_xml_doc} +\alias{get_naptan_xml_doc} +\title{Get naptan xml doc} +\usage{ +get_naptan_xml_doc( + url = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=xml", + timeout = 300L, + method = getOption("url.method", "default") +) +} +\arguments{ +\item{url}{character, url to the xml format NaPTAN} + +\item{timeout}{int, timeout in seconds to wait for download to complete} +} +\value{ +xml document node +} +\description{ +Download the NaPTAN stop locations in XML format. +For more information on NaPTAN see https://beta-naptan.dft.gov.uk/ +} +\details{ +TransXchange does not store the location of bus stops, so this + functions downloads them from the offical DfT source. + + NrStations with multiple tiplocs seem to be represented as multiple natpan nodes, + with one AnnotatedRailRef per naptan node, despite the schema supporting a multiplicity relationship. + e.g. stourbridge, clapham junction. 
+ these can be joined together on the CRS + + As of 2023 Naptan is published under a more permissive (OGL3) licence than ATOC data (creative commons licence). + +http://naptan.dft.gov.uk/naptan/schema/2.5.1/napt/NaPT_stop-v2-5-1.xsd + + + + sequence of optional elements + + + + + + + + + + + % + dplyr::group_by(UID) %>% + dplyr::summarise(count = dplyr::n()) ) + + expectedResult = data.frame(UID=c( "uid1", "uid2", "uid3", "uid4", "uid5"), + count=c( 15, 10, 15, 1, 1)) + expectedResult$count <- as.integer( expectedResult$count ) + + printDifferencesDf(expectedResult,summary) + + expect_true( identical(expectedResult,summary) & ok ) +}) + + +#bizarrely had to add special glue to make it work correctly when duplicating one object +test_that("test makeAllOneDay:2", { + + testData = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "18-01-2023"), + Days=c( "1110000"), + STP=c( "P"), + rowID=c( 3)) + testData <- fixCalendarDates( testData ) + + res <- makeAllOneDay( testData ) + + ok = TRUE + + ok = ok & all(res$start_date == res$end_date) + + + expectedResult = data.table( + UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "03-01-2023", "04-01-2023", "09-01-2023", "10-01-2023", "11-01-2023"), + end_date=c( "02-01-2023", "03-01-2023", "04-01-2023", "09-01-2023", "10-01-2023", "11-01-2023"), + Days=c( "1000000", "0100000", "0010000", "1000000", "0100000", "0010000"), + STP=c( "P", "P", "P", "P", "P", "P"), + rowID=c( 3, 3, 3, 3, 3, 3)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "uid1", "uid1", "uid1"), + start_date=c("16-01-2023", "17-01-2023", "18-01-2023"), + end_date=c( "16-01-2023", "17-01-2023", "18-01-2023"), + Days=c( "1000000", "0100000", "0010000"), + STP=c( "P", "P", "P"), + rowID=c( 3, 3, 3))) + + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res) + + ok = ok & identical(expectedResult,res) + + + summary <- as.data.frame( res %>% + dplyr::group_by(UID) %>% + dplyr::summarise(count = dplyr::n()) ) + + expectedCount = data.frame(UID=c( "uid1"), + count=c( 9)) + expectedCount$count <- as.integer( expectedCount$count ) + + printDifferencesDf(expectedCount,summary) + + expect_true( identical(expectedCount,summary) & identical(expectedResult,res) & ok ) +}) + + + + +test_that("test expandAllWeeks:1", { + + testData = data.table(UID=c( "uid1", "uid2", "uid3", "uid4"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "23-01-2023"), + end_date=c( "18-01-2023", "29-01-2023", "03-03-2023", "23-01-2023"), + Days=c( "1110000", "0001001", "0011100", "1000000"), + STP=c( "P", "C", "P", "C"), + rowID=c( 1, 2, 3, 6)) + testData <- fixCalendarDates( testData ) + + res <- expandAllWeeks( testData ) + + expectedResult = data.table(UID=c( "uid1", "uid1", "uid1", "uid2", "uid2", "uid2", "uid2", "uid3", "uid4"), + start_date=c("02-01-2023", "09-01-2023", "16-01-2023", "05-01-2023", "12-01-2023", "19-01-2023", "26-01-2023", "01-03-2023", "23-01-2023"), + end_date=c( "04-01-2023", "11-01-2023", "18-01-2023", "08-01-2023", "15-01-2023", "22-01-2023", "29-01-2023", "03-03-2023", "23-01-2023"), + Days=c( "1110000", "1110000", "1110000", "0001001", "0001001", "0001001", "0001001", "0011100", "1000000"), + STP=c( "P", "P", "P", "C", "C", "C", "C", "P", "C"), + rowID=c( 1, 1, 1, 2, 2, 2, 2, 3, 6)) + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res) + + expect_true( identical(expectedResult,res) ) +}) + + + +test_that("test expandAllWeeks:2", 
{ + + testData = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "18-01-2023"), + Days=c( "1110000"), + STP=c( "P"), + rowID=c( 1)) + testData <- fixCalendarDates( testData ) + + res <- expandAllWeeks( testData ) + + expectedResult = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "09-01-2023", "16-01-2023"), + end_date=c( "04-01-2023", "11-01-2023", "18-01-2023"), + Days=c( "1110000", "1110000", "1110000"), + STP=c( "P", "P", "P"), + rowID=c( 1, 1, 1)) + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res) + + expect_true( identical(expectedResult,res) ) +}) + + + +test_that("test hasGapInOperatingDays:1", { + + testData = c("0000000", "1000000", "0000001", "0100000", "1100000", "0000011", "0011100", "0101000", "1000001", "0001001") + expectedResult = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE) + + res = hasGapInOperatingDays( testData ) + + expect_true( identical(expectedResult,res) ) +}) + + + + +test_that("test process_activity:1", { + + OK = TRUE + + { + testData = data.table( Activity=c(" ", "TBT D U R TF", "abcdefghijkl", "abcdefghij ", "TBT EEU XXTF", " D U ") ) + + res = process_activity( testData, TRUE ) + + expectedResult = data.table( Activity=c("TB,T,D,U,R,TF", "TB,T,U,TF", "D,U" ) ) + + OK = OK & identical(expectedResult,res) + } + { + testData = data.table( Activity=c(" ", "TBT D U R TF", "abcdefghijkl", "a def ijkl", " cdefghij ") ) + + res = process_activity( testData, FALSE ) + + expectedResult = data.table( Activity=c("", "TB,T,D,U,R,TF", "ab,cd,ef,gh,ij,kl", "a,d,ef,ij,kl","cd,ef,gh,ij" )) + + OK = OK & identical(expectedResult,res) + } + + expect_true( OK ) +}) + + + +#when running for real, this hangs if ncores>1 having trouble reproducing +test_that("test duplicateItems:1", { + + sourceDuplication = 99 + repetitions = 110 + expectedCount=sourceDuplication * repetitions + + testData = data.table(UID=c( "uid1", "uid2", "uid3"), + start_date=c("02-01-2023", "09-01-2023", "16-01-2023"), + end_date=c( "04-01-2023", "11-01-2023", "18-01-2023"), + Days=c( "1110000", "1110000", "1110000"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + testData <- testData[rep(seq_len(.N), times = sourceDuplication)] + + testData$`_reps` = repetitions + + res = duplicateItems( testData, "UID", ncores=4 ) + + summary <- as.data.frame( res %>% + dplyr::group_by(UID) %>% + dplyr::summarise(count = dplyr::n()) ) + + expectedResult = data.frame(UID=c( "uid1", "uid2", "uid3"), + count=c( expectedCount, expectedCount, expectedCount)) + expectedResult$count <- as.integer( expectedResult$count ) + + printDifferencesDf(expectedResult,summary) + + + expect_true( identical(expectedResult,summary) ) +}) + + + + + + + + +context("Running calendar overlay unit tests") + + +test_that("0:fixOverlappingDates -based on priority", { + #TODO add more test cases + + testData = data.table( + UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", NA, "uid1"), + STP=c( "P", "C", "O", "P", "P", "C", "N"), + start_date=c("02-01-2023", "11-01-2023", "11-01-2023", "20-01-2023", "25-01-2023", "27-01-2023", "20-02-2023" ), + end_date=c( "11-01-2023", "11-01-2023", "20-01-2023", "25-02-2023", "27-01-2023", "20-03-2023", "31-03-2023")) + testData <- fixCalendarDates( testData ) + + res = fixOverlappingDates(testData) + + expectedResult = data.table( #equal priority but overlapping + #current behaviour is to leave unchanged + UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", 
NA, "uid1"), + STP=c( "P", "C", "O", "P", "P", "C", "N"), + start_date=c("02-01-2023", "11-01-2023", "12-01-2023", "21-01-2023", "25-01-2023", "27-01-2023", "20-02-2023" ), + end_date=c( "10-01-2023", "11-01-2023", "20-01-2023", "25-02-2023", "27-01-2023", "20-03-2023", "31-03-2023" )) + expectedResult <- fixCalendarDates( expectedResult ) + expectedResult$duration = testData$duration + + printDifferencesDf(expectedResult,res) + + expect_true( identical(expectedResult,res) ) +}) + + + +test_that("1:test allocateCancellationsAcrossCalendars", { + + calendar = data.table(UID=c( "uid1 a", "uid1 b", "uid2 c", "uid2 d", "uid3 e"), + originalUID=c("uid1", "uid1", "uid2", "uid2", "uid3"), + start_date=c("02-01-2023", "05-01-2023", "01-03-2023", "07-03-2023", "26-01-2023" ), + end_date=c( "01-02-2023", "05-02-2023", "31-03-2023", "26-03-2023", "26-01-2023" ), + Days=c( "1110000", "0001001", "0011100", "0100001", "0001000" ), + STP=c( "P", "C", "P", "C", "C" ), + rowID=c( 1, 2, 3, 4, 5)) + calendar <- fixCalendarDates( calendar ) + calendar <- splitAndRebindBitmask( calendar ) + + #TODO - discuss. the GTFS spec allows cancellations/ additions with no associated calendar - we're currently + # filtering these out, which is probably the right thing to do ? + cancellations = data.table( #these columns get removed + UID=c( "aaaaa", "bbbbbb", "ccccccc", "ddddddd", "eeeeee", "fffffff"), + originalUID=c("uid1", "uid1", "uid1", "uid2", "uid2", "uid4"), + start_date=c("02-01-2023", "03-01-2023", "06-01-2023", "14-03-2023", "15-01-2023", "26-01-2023" ), + end_date=c( "02-01-2023", "03-01-2023", "06-01-2023", "14-03-2023", "15-01-2023", "26-01-2023" ), + Days=c( "1000000", "0100000", "0000100", "0100000", "0010000", "0001000" ), + STP=c( "C", "C", "C", "C", "C", "C" ), + rowID=c( 6, 7, 8, 9, 10, 11)) + cancellations <- fixCalendarDates( cancellations ) + cancellations <- splitAndRebindBitmask( cancellations ) + + res <- allocateCancellationsAcrossCalendars( calendar, cancellations ) + + expectedResult = data.table( + UID=c( "uid1 a", "uid1 a", "uid2 d"), + originalUID=c("uid1", "uid1", "uid2"), + start_date=c("02-01-2023", "03-01-2023", "14-03-2023"), + end_date=c( "02-01-2023", "03-01-2023", "14-03-2023"), + Days=c( "1000000", "0100000", "0100000"), + STP=c( "C", "C", "C"), + rowID=c( 6, 7, 9), + duration=c( 1, 1, 1), + monday=c( TRUE, FALSE, FALSE), + tuesday=c( FALSE, TRUE, TRUE), + wednesday=c( FALSE, FALSE, FALSE), + thursday=c( FALSE, FALSE, FALSE), + friday=c( FALSE, FALSE, FALSE), + saturday=c( FALSE, FALSE, FALSE), + sunday=c( FALSE, FALSE, FALSE) + ) + expectedResult <- fixCalendarDates( expectedResult, createOriginalUID=FALSE ) + + printDifferencesDf(expectedResult,res) + + expect_true( identical(expectedResult,res) ) +}) + + +test_that("2:test makeCalendarInner:one calendar entry for service", { + + testData = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "04-02-2023"), + Days=c( "1111110"), + STP=c( "P"), + rowID=c( 1)) + + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1"), + start_date=c("02-01-2023"), + end_date=c( "04-02-2023"), + Days=c( "1111110"), + STP=c( "P"), + rowID=c( 1)) + + expectedResult <- fixCalendarDates( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + + +test_that("1.1:test makeCalendarInner:all identical 
+
+
+
+test_that("2:test makeCalendarInner:one calendar entry for service", {
+
+  testData = data.table(UID=c( "uid1"),
+                        start_date=c("02-01-2023"),
+                        end_date=c(  "04-02-2023"),
+                        Days=c( "1111110"),
+                        STP=c( "P"),
+                        rowID=c( 1))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(UID=c( "uid1"),
+                        start_date=c("02-01-2023"),
+                        end_date=c(  "04-02-2023"),
+                        Days=c( "1111110"),
+                        STP=c( "P"),
+                        rowID=c( 1))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+
+test_that("1.1:test makeCalendarInner:all identical patterns - more than single day overlay", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "09-01-2023", "09-01-2023"),
+                        end_date=c(  "04-02-2023", "28-01-2023", "14-01-2023"),
+                        Days=c( "1111110", "1111110", "1111110"),
+                        STP=c( "P", "O", "C"),
+                        rowID=c( 1, 2, 3))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(
+    UID=c( "uid1 a", "uid1 b", "uid1 c"),
+    start_date=c("02-01-2023", "15-01-2023", "29-01-2023"),
+    end_date=c(  "08-01-2023", "28-01-2023", "04-02-2023"),
+    Days=c( "1111110", "1111110", "1111110"),
+    STP=c( "P", "O", "P"),
+    rowID=c( 1, 2, 1))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+
+test_that("3:test makeCalendarInner:one base: one day cancellations", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "11-01-2023", "09-03-2023", "23-01-2023" ),
+                        end_date=c(  "04-02-2023", "11-01-2023", "09-03-2023", "23-01-2023" ),
+                        Days=c( "1111110", "0010000", "0001000", "1000000" ),
+                        STP=c( "P", "C", "C", "C" ),
+                        rowID=c( 1, 4, 5, 6))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(UID=c( "uid1"),
+                        start_date=c("02-01-2023"),
+                        end_date=c(  "04-02-2023"),
+                        Days=c( "1111110"),
+                        STP=c( "P"),
+                        rowID=c( 1))
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+
+  expectedResultDates = data.table(UID=c( "uid1", "uid1", "uid1"),
+                        start_date=c("11-01-2023", "09-03-2023", "23-01-2023" ),
+                        end_date=c(  "11-01-2023", "09-03-2023", "23-01-2023" ),
+                        Days=c( "0010000", "0001000", "1000000" ),
+                        STP=c( "C", "C", "C" ),
+                        rowID=c( 4, 5, 6))
+  expectedResultDates <- fixCalendarDates( expectedResultDates )
+
+  res.calendar_dates = removeOriginalUidField( res.calendar_dates )
+  expectedResultDates = removeOriginalUidField( expectedResultDates )
+
+  printDifferencesDf(expectedResultDates,res.calendar_dates)
+
+
+  expect_true(identical(expectedResult,res.calendar)
+              & identical(expectedResultDates,res.calendar_dates))
+})
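
# In GTFS terms, the calendar / calendar_dates split exercised by test 3 is:
# the base "P" schedule stays in calendar.txt, and each one-day "C"
# (cancellation) row becomes a calendar_dates.txt entry with exception_type = 2
# (service removed on that date). A sketch under that assumption - treating the
# UID as the service_id is an illustration, not the package's actual mapping:
library(data.table)

toGtfsCalendarDates <- function(cancellations) {
  data.table(
    service_id     = cancellations$UID,
    date           = cancellations$start_date,  # one-day rows: start == end
    exception_type = 2L                         # 2 = service removed this date
  )
}
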
"0000001", "0011100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + + expectedResultDates = data.table(UID=c("uid1", "uid1", "uid1"), + start_date=c("11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "0010000", "0001000", "1000000" ), + STP=c( "C", "C", "C" ), + rowID=c( 4, 5, 6)) + expectedResultDates <- fixCalendarDates( expectedResultDates ) + + res.calendar_dates = removeOriginalUidField( res.calendar_dates ) + expectedResultDates = removeOriginalUidField( expectedResultDates ) + + printDifferencesDf( expectedResultDates,res.calendar_dates ) + + expect_true(identical(expectedResult,res.calendar) + & identical(expectedResultDates,res.calendar_dates)) +}) + + + + + +test_that("5:test makeCalendarInner:one day cancellations(old)", { + + #there are multiple valid ways to process this - because of cancellations being handled at a higher level this + #test case no longer applies - but quite a bit of work to create the test case, so keep it for now. + expect_true(TRUE) + + if(FALSE) + { + # all overlays 1 day cancellations + # this method splits up the base timetable, leaving gaps where there are cancellation days + + # this can create schedule entries which are by the CIF rules incorrect, because we don't + # validate that the new start/end dates align with the day pattern bitmask + + # while the cancellation part is no longer current, this is still a good test for all the date setting logic + + testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + end_date=c( "04-02-2023", "05-02-2023", "11-01-2023", "09-03-2023", "23-01-2023" ), + Days=c( "1111110", "0000001", "0010000", "0001000", "1000000" ), + STP=c( "P", "P", "C", "C", "C" ), + rowID=c( 1, 2, 4, 5, 6)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid12"), + start_date=c("02-01-2023", "12-01-2023", "24-01-2023", "08-01-2023"), + end_date=c( "10-01-2023", "22-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "1111110", "1111110", "1111110", "0000001"), + STP=c( "P", "P", "P", "P"), + rowID=c( 1, 1, 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) + } +}) + + + + +test_that("6:test makeCalendarInner:overlay -matching base pattern", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "09-01-2023"), + end_date=c( "04-02-2023", "05-02-2023", "21-01-2023"), + Days=c( "1111110", "0000001", "1111110"), + STP=c( "P", "P", "O"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid12"), + start_date=c("02-01-2023", "09-01-2023", "22-01-2023", "08-01-2023"), + end_date=c( "08-01-2023", 
"21-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "1111110", "1111110", "1111110", "0000001"), + STP=c( "P", "O", "P", "P"), + rowID=c( 1, 3, 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) + +}) + + +test_that("6.1:test makeCalendarInner:bases with different patterns, no overlay", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("22-05-2023", "25-09-2023", "02-10-2023"), + end_date=c( "22-09-2023", "26-09-2023", "13-10-2023"), + Days=c( "1111100", "1100000", "1111100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c"), + start_date=c("22-05-2023", "25-09-2023", "02-10-2023"), + end_date=c( "22-09-2023", "26-09-2023", "13-10-2023"), + Days=c( "1111100", "1100000", "1111100"), + STP=c( "P", "P", "P"), + rowID=c( 1, 2, 3)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + +test_that("6.2:test makeCalendarInner:base is N (STP) with different patterns, no overlay", { + + testData = data.table(UID=c( "uid1", "uid1"), + start_date=c("26-06-2023", "31-07-2023"), + end_date=c( "29-07-2023", "03-08-2023"), + Days=c( "1111110", "1111000"), + STP=c( "N", "N"), + rowID=c( 1, 2)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a", "uid1 b"), + start_date=c("26-06-2023", "31-07-2023"), + end_date=c( "29-07-2023", "03-08-2023"), + Days=c( "1111110", "1111000"), + STP=c( "N", "N"), + rowID=c( 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + + +test_that("7:test makeCalendarInner:overlay -different to base pattern", { + + testData = data.table(UID=c( "uid1", "uid1", "uid1"), + start_date=c("02-01-2023", "08-01-2023", "10-01-2023"), + end_date=c( "04-02-2023", "05-02-2023", "21-01-2023"), + Days=c( "1111110", "0000001", "0111110"), + STP=c( "P", "P", "O"), + rowID=c( 1, 2, 3)) + testData <- fixCalendarDates( testData ) + + res <- makeCalendarInner( testData ) + + res.calendar <- res[[1]] + res.calendar_dates <- res[[2]] + + expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid12"), + start_date=c("02-01-2023", "10-01-2023", "15-01-2023", "17-01-2023", "22-01-2023", "08-01-2023"), + end_date=c( "09-01-2023", "14-01-2023", "16-01-2023", "21-01-2023", "04-02-2023", "05-02-2023"), + Days=c( "1111110", "0111110", "1111110", "0111110", "1111110", "0000001"), + STP=c( "P", "O", "P", "O", "P", "P"), + rowID=c( 1, 3, 1, 3, 1, 2)) + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = 
+
+
+
+test_that("7:test makeCalendarInner:overlay -different to base pattern", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "08-01-2023", "10-01-2023"),
+                        end_date=c(  "04-02-2023", "05-02-2023", "21-01-2023"),
+                        Days=c( "1111110", "0000001", "0111110"),
+                        STP=c( "P", "P", "O"),
+                        rowID=c( 1, 2, 3))
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid12"),
+                        start_date=c("02-01-2023", "10-01-2023", "15-01-2023", "17-01-2023", "22-01-2023", "08-01-2023"),
+                        end_date=c(  "09-01-2023", "14-01-2023", "16-01-2023", "21-01-2023", "04-02-2023", "05-02-2023"),
+                        Days=c( "1111110", "0111110", "1111110", "0111110", "1111110", "0000001"),
+                        STP=c( "P", "O", "P", "O", "P", "P"),
+                        rowID=c( 1, 3, 1, 3, 1, 2))
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+test_that("8:test makeCalendarInner:overlay -different to base pattern-gap in pattern", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "08-01-2023", "10-01-2023"),
+                        end_date=c(  "04-02-2023", "05-02-2023", "20-01-2023"),
+                        Days=c( "1111110", "0000001", "0110100"),
+                        STP=c( "P", "P", "O"),
+                        rowID=c( 1, 2, 3))
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(
+    UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid1 f"),
+    start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023", "14-01-2023"),
+    end_date=c(  "09-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023", "16-01-2023"),
+    Days=c( "1111110", "0100000", "0010000", "1111110", "0000100", "1111110"),
+    STP=c( "P", "O", "O", "P", "O", "P"),
+    rowID=c( 1, 3, 3, 1, 3, 1))
+
+  expectedResult = rbind(expectedResult, data.table(
+    UID=c( "uid1 g", "uid1 h", "uid1 i", "uid1 j", "uid1 k", "uid12"),
+    start_date=c("17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "21-01-2023", "08-01-2023"),
+    end_date=c(  "17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "04-02-2023", "05-02-2023"),
+    Days=c( "0100000", "0010000", "1111110", "0000100", "1111110", "0000001"),
+    STP=c( "O", "O", "P", "O", "P", "P"),
+    rowID=c( 3, 3, 1, 3, 1, 2)))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
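
# Tests like 8 above (and 9 below) expand a sparse overlay pattern day by day.
# The underlying question is simply "which concrete dates does this schedule
# run on?" - a sketch of enumerating them from the period plus the bitmask:
operatingDates <- function(start_date, end_date, days) {
  allDates <- seq(start_date, end_date, by = "day")
  # keep dates whose Monday-first weekday index is set in the mask
  mask <- substring(days, 1:7, 1:7) == "1"
  allDates[mask[lubridate::wday(allDates, week_start = 1)]]
}

# operatingDates(as.Date("2023-01-10"), as.Date("2023-01-20"), "0110100")
# -> 10, 11, 13, 17, 18, 20 Jan: the Tue/Wed/Fri dates the test-8 overlay covers
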
+
+
+
+test_that("9:test makeCalendarInner:overlay -different to base pattern-gap in pattern -creating base fragments to be skipped", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "08-01-2023", "10-01-2023"),
+                        end_date=c(  "03-02-2023", "05-02-2023", "20-01-2023"),
+                        Days=c( "0111100", "0000001", "0110100"),
+                        STP=c( "P", "P", "O"),
+                        rowID=c( 1, 2, 3))
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(
+    UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e"), #the 'f' calendar gets thrown away
+    start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023"),
+    end_date=c(  "09-01-2023", "10-01-2023", "11-01-2023", "12-01-2023", "13-01-2023"),
+    Days=c( "0111100", "0100000", "0010000", "0111100", "0000100"),
+    STP=c( "P", "O", "O", "P", "O"),
+    rowID=c( 1, 3, 3, 1, 3))
+
+  expectedResult = rbind(expectedResult, data.table(
+    UID=c( "uid1 g", "uid1 h", "uid1 i", "uid1 j", "uid1 k", "uid12"),
+    start_date=c("17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "21-01-2023", "08-01-2023"),
+    end_date=c(  "17-01-2023", "18-01-2023", "19-01-2023", "20-01-2023", "03-02-2023", "05-02-2023"),
+    Days=c( "0100000", "0010000", "0111100", "0000100", "0111100", "0000001"),
+    STP=c( "O", "O", "P", "O", "P", "P"),
+    rowID=c( 3, 3, 1, 3, 1, 2)))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+
+
+test_that("10:test makeCalendarInner", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "12-01-2023", "08-03-2023", "23-01-2023" ),
+                        end_date=c(  "03-02-2023", "05-02-2023", "31-03-2023", "19-01-2023", "12-01-2023", "09-03-2023", "23-01-2023" ),
+                        Days=c( "1111100", "0000001", "0011100", "0011000", "0001000", "0011000", "1000000" ),
+                        STP=c( "P", "P", "P", "O", "C", "C", "C" ),
+                        rowID=c( 1, 2, 3, 4, 5, 6, 7))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  #this is a more complex expansion than strictly necessary - we could add more logic to
+  #check whether the base / overlay operating periods actually overlap in time.
+  #currently we only test whether the patterns collide from an operating-day-mask
+  #perspective (see the sketch after this test), not whether the operating periods overlap.
+  #e.g. if we have base timetables for March and April with different operating patterns it
+  #will expand on a week-by-week basis instead of going 'oh that's fine, March and April
+  #don't overlap'.
+
+  expectedResult = data.table(
+    UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid1 f"),
+    start_date=c("02-01-2023", "11-01-2023", "13-01-2023", "18-01-2023", "20-01-2023", "24-01-2023"),
+    end_date=c(  "10-01-2023", "11-01-2023", "17-01-2023", "19-01-2023", "22-01-2023", "03-02-2023"),
+    Days=c( "1111100", "0011000", "1111100", "0011000", "1111100", "1111100"),
+    STP=c( "P", "O", "P", "O", "P", "P"),
+    rowID=c( 1, 4, 1, 4, 1, 1))
+
+  expectedResult = rbind(expectedResult, data.table(
+    UID=c( "uid12", "uid1 c3", "uid1 d3" ),
+    start_date=c("08-01-2023", "01-03-2023", "10-03-2023" ),
+    end_date=c(  "05-02-2023", "07-03-2023", "31-03-2023" ),
+    Days=c( "0000001", "0011100", "0011100" ),
+    STP=c( "P", "P", "P" ),
+    rowID=c( 2, 3, 3 )))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
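
# The "collide from an operating-day-mask perspective" check described in the
# comment above, in miniature: two patterns clash if any weekday position is
# set in both masks. Sketch only - the package may implement this differently:
masksCollide <- function(a, b) {
  any(substring(a, 1:7, 1:7) == "1" & substring(b, 1:7, 1:7) == "1")
}

# masksCollide("1111100", "0011000")  # TRUE  - Wed/Thu shared, so expand
# masksCollide("1111100", "0000001")  # FALSE - a Sunday-only service never clashes
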
+
+
+
+
+test_that("11:test makeCalendarInner: overlay matching pattern of a base that is offset temporally", {
+
+  testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1"),
+                        start_date=c("04-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "08-03-2023"),
+                        end_date=c(  "02-02-2023", "05-02-2023", "30-03-2023", "19-01-2023", "16-03-2023"),
+                        Days=c( "0011000", "0000001", "0011000", "0011000", "0011000"),
+                        STP=c( "P", "P", "P", "O", "C"),
+                        rowID=c( 1, 2, 3, 4, 5))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid12"),
+                        start_date=c("04-01-2023", "11-01-2023", "20-01-2023", "01-03-2023", "17-03-2023", "08-01-2023"),
+                        end_date=c(  "10-01-2023", "19-01-2023", "02-02-2023", "07-03-2023", "30-03-2023", "05-02-2023"),
+                        Days=c( "0011000", "0011000", "0011000", "0011000", "0011000", "0000001"),
+                        STP=c( "P", "O", "P", "P", "P", "P"),
+                        rowID=c( 1, 4, 1, 3, 3, 2))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
+
+
+test_that("12: test makeCalendarInner", {
+
+  #by convention, Sunday service timetables are Sunday only
+  #the 'from' date should be the first day the timetable has effect (i.e. it should have
+  #a 1 in the relevant day column - see the sketch after this test)
+  #(and I assume the same is true of the last)
+
+  #mon-sat timetable
+  #sun: different operating hours on sunday
+  #engineering works mean having to berth in a different platform for a couple of weeks wed-fri
+  #cancel mondays for 2 weeks
+  #cancel sundays for 2 weeks
+  #mon-sat   sun   -march-   wed-fri platform   cancel mon   cancel sun
+  testData = data.table(UID=c( "uid1", "uid1", "uid1", "uid1", "uid1", "uid1"),
+                        start_date=c("02-01-2023", "08-01-2023", "01-03-2023", "11-01-2023", "09-01-2023", "15-01-2023" ),
+                        end_date=c(  "04-02-2023", "05-02-2023", "31-03-2023", "27-01-2023", "16-01-2023", "22-01-2023" ),
+                        Days=c( "1111110", "0000001", "0011100", "0011100", "1000000", "0000001" ),
+                        STP=c( "P", "P", "P", "O", "C", "C" ),
+                        rowID=c( 1, 2, 3, 4, 5, 6))
+
+  testData <- fixCalendarDates( testData )
+
+  res <- makeCalendarInner( testData )
+
+  res.calendar <- res[[1]]
+  res.calendar_dates <- res[[2]]
+
+  expectedResult = data.table(
+    UID=c( "uid1 a", "uid1 b", "uid1 c", "uid1 d", "uid1 e", "uid1 f"),
+    start_date=c("02-01-2023", "10-01-2023", "11-01-2023", "14-01-2023", "17-01-2023", "18-01-2023"),
+    end_date=c(  "08-01-2023", "10-01-2023", "13-01-2023", "15-01-2023", "17-01-2023", "20-01-2023"),
+    Days=c( "1111110", "1111110", "0011100", "1111110", "1111110", "0011100"),
+    STP=c( "P", "P", "O", "P", "P", "O"),
+    rowID=c( 1, 1, 4, 1, 1, 4))
+
+  expectedResult = rbind(expectedResult, data.table(
+    UID=c( "uid1 g", "uid1 h", "uid1 i", "uid1 a2", "uid1 c2", "uid1 d3"),
+    start_date=c("21-01-2023", "25-01-2023", "28-01-2023", "08-01-2023", "23-01-2023", "01-03-2023"),
+    end_date=c(  "24-01-2023", "27-01-2023", "04-02-2023", "14-01-2023", "05-02-2023", "31-03-2023"),
+    Days=c( "1111110", "0011100", "1111110", "0000001", "0000001", "0011100"),
+    STP=c( "P", "O", "P", "P", "P", "P"),
+    rowID=c( 1, 4, 1, 2, 2, 3)))
+
+  expectedResult <- fixCalendarDates( expectedResult )
+
+  res.calendar = removeOriginalUidField( res.calendar )
+  expectedResult = removeOriginalUidField( expectedResult )
+
+  printDifferencesDf(expectedResult,res.calendar)
+
+  expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates))
+})
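
# The convention test 12 leans on: the 'from' date should be the first date on
# which the timetable actually has effect, i.e. a weekday with a 1 in the Days
# bitmask. A sketch of snapping a start date forward to satisfy that (the
# helper is hypothetical, not part of the package):
snapStartToMask <- function(start_date, days) {
  mask <- substring(days, 1:7, 1:7) == "1"
  stopifnot(any(mask))  # guard against an all-zero mask
  while (!mask[lubridate::wday(start_date, week_start = 1)]) {
    start_date <- start_date + 1
  }
  start_date
}

# snapStartToMask(as.Date("2023-01-07"), "0000001")
# -> Sat 07 Jan snaps forward to Sun 08 Jan for a Sunday-only timetable
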
start_date=c("22-05-2023", "10-08-2023", "11-08-2023", "07-09-2023", "08-09-2023", "21-09-2023", "22-09-2023"), + end_date=c( "09-08-2023", "10-08-2023", "06-09-2023", "07-09-2023", "20-09-2023", "21-09-2023", "04-10-2023"), + Days=c( "1111100", "0001000", "1111100", "0001000", "1111100" , "0001000", "1111100"), + STP=c( "P", "O", "P", "O", "P", "O", "P"), + rowID=c( 440737, 2856390, 440737, 4529108, 440737, 5368156, 440737)) + + expectedResult = rbind(expectedResult, data.table( + UID=c( "C09094 h", "C09094 i", "C09094 a2", "C09094 b2"), + start_date=c("05-10-2023", "06-10-2023", "27-05-2023", "28-10-2023"), + end_date=c( "05-10-2023", "08-12-2023", "09-09-2023", "09-12-2023"), + Days=c( "0001000", "1111100", "0000010", "0000010"), + STP=c( "O", "P", "P", "P"), + rowID=c( 5835925, 440737, 1205259, 6097391))) + + expectedResult <- fixCalendarDates( expectedResult ) + + res.calendar = removeOriginalUidField( res.calendar ) + expectedResult = removeOriginalUidField( expectedResult ) + + printDifferencesDf(expectedResult,res.calendar) + + expect_true(identical(expectedResult,res.calendar) & is.na(res.calendar_dates)) +}) + + diff --git a/tests/testthat/test_atoc.R b/tests/testthat/test_atoc.R index fdbc787..4b577f2 100644 --- a/tests/testthat/test_atoc.R +++ b/tests/testthat/test_atoc.R @@ -1,10 +1,11 @@ - context("Get the example atoc files") file_path <- file.path(tempdir(),"uk2gtfs_tests") dir.create(file_path) data_path <- file.path(tempdir(),"uk2gtfs_data") dir.create(data_path) + + test_that("test atoc data is there", { expect_true(dl_example_file(data_path, "atoc")) expect_true(file.exists(file.path(data_path, "atoc.zip"))) @@ -13,6 +14,7 @@ test_that("test atoc data is there", { context("Test the main atoc function") test_that("test atoc2gtfs singlecore", { + gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1) @@ -20,9 +22,12 @@ test_that("test atoc2gtfs singlecore", { }) + + context("Test the main atoc function, with different settings") test_that("test atoc2gtfs singlecore", { + gtfs <- atoc2gtfs(path_in = file.path(data_path,"atoc.zip"), ncores = 1, locations = "file") diff --git a/tests/testthat/test_transxchange.R b/tests/testthat/test_transxchange.R index 112cc7b..92f3578 100644 --- a/tests/testthat/test_transxchange.R +++ b/tests/testthat/test_transxchange.R @@ -26,7 +26,8 @@ test_that("test transxchange2gtfs singlecore", { naptan = naptan, ncores = 1, try_mode = FALSE, - force_merge = TRUE) + force_merge = TRUE, + silent = FALSE) gtfs_write(gtfs,folder = file_path, name = "txc_gtfs2") expect_true(file.exists(file.path(file_path,"txc_gtfs2.zip"))) @@ -40,7 +41,8 @@ if(.Platform$OS.type == "unix") { naptan = naptan, ncores = 2, try_mode = FALSE, - force_merge = TRUE) + force_merge = TRUE, + silent = FALSE) gtfs_write(gtfs,folder = file_path, name = "txc_gtfs") expect_true(file.exists(file.path(file_path,"txc_gtfs.zip")))