Skip to content

Commit

Permalink
read encoding from first lines: grep() might fail when parsed_export …
Browse files Browse the repository at this point in the history
…is assumed with wrong encoding and special characters appear later
  • Loading branch information
DrEspresso committed May 15, 2024
1 parent 18a17ea commit 1511ba6
Showing 1 changed file with 39 additions and 23 deletions.
62 changes: 39 additions & 23 deletions R/read_export_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,45 +72,61 @@
read_export_table <- function(data_dir, file_name, export_options,
add_pat_id = TRUE, add_centre = TRUE, add_visitname = TRUE,
casenodes_table, centre_table, visitplan_table,
is_meta_table = FALSE, sep = export_options$sep, ...) {
is_meta_table = FALSE,
sep = export_options$sep, quote = export_options$quote,
...) {
ops <- options()
on.exit(ops)
options(stringsAsFactors = FALSE)

# ISO encoding must be names "latin1"
curr_encoding <- export_options$encoding
if (curr_encoding == "ISO-8859-1" | curr_encoding == "ISO-8859-15") {
curr_encoding <- "latin1"
warning("ISO-8859-1 encoding detected. Strings may not be interpreted correctly")
}
## if (curr_encoding == "ISO-8859-1" | curr_encoding == "ISO-8859-15") {
## curr_encoding <- "latin1"
## warning("ISO-8859-1 encoding detected. Strings may not be interpreted correctly")
## }

if (export_options$is_zip) {
archive_con <- unz(data_dir, file_name)
loaded_table <- read.table(file = archive_con,
header = TRUE,
na.strings = export_options$na.strings,
sep = sep,
fill = TRUE,
encoding = curr_encoding,
...)
table_lines <- readr::read_lines(archive_con, locale = readr::locale(encoding = curr_encoding))
} else if (export_options$is_zip == FALSE) {
loaded_table <- read.table(file = paste0(data_dir, "/", file_name),
header = TRUE,
na.strings = export_options$na.strings,
sep = sep,
fill = TRUE,
encoding = curr_encoding,
...)
table_lines <- readr::read_lines(paste0(data_dir, "/", file_name), locale = readr::locale(encoding = curr_encoding))
} else {
stop(paste0("Could not load table ", file_name))
}

# in earlier secuTrial exports there was
# a last/empty column "X" which can be removed
if ("X" %in% names(loaded_table)) {
loaded_table <- loaded_table[, -ncol(loaded_table)]
# Some tables may be non-rectangular due to additional
# empty cells at the end of some rows
# In such a case, fill rows to make table rectangular
# (analogous to fill = TRUE in read.table())

nsep <- sapply(table_lines, function(x) stringr::str_count(x, paste0(quote,sep,quote)), USE.NAMES = FALSE)
if (any(nsep) != max(nsep)) {
table_lines[nsep != max(nsep)] <- paste0(table_lines[nsep != max(nsep)], sep, quote, quote)
}

#Encoding(table_lines) <- curr_encoding

loaded_table <- readr::read_delim(file = I(table_lines),
na = export_options$na.strings,
delim = sep,
quote = quote,
#locale = readr::locale(encoding = curr_encoding),
# do not attempt to change the names of any last/empty column
name_repair = "minimal",
# do not convert columns
col_types = cols(.default = "c"),
# escape special characters with backslash
escape_backslash = TRUE,
escape_double = FALSE,
...)

# Remove any last empty column (columns without name)
loaded_table <- loaded_table[, names(loaded_table) != ""]

# Convert table to data.frame, convert column types with read.table() default type.convert()
loaded_table <- as.data.frame(lapply(loaded_table, function(x) type.convert(x, as.is = TRUE)))

# do not manipulate meta tables
if (is_meta_table) {
return(loaded_table)
Expand Down

0 comments on commit 1511ba6

Please sign in to comment.