Skip short raw files per #67 (#82)
* Skip short raw files per #67
* Add a too-short raw file to our test data
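In short, `synoptic/L0.qmd` now checks the raw row count before doing any processing and bails out early when a file has fewer than 4 data rows. A minimal standalone sketch of that guard follows; the filename is a placeholder and the read step is simplified to `readr::read_csv()`, whereas the real code uses `read_datalogger_file()`, as shown in the diff below.

```r
library(readr)

# Sketch of the short-file guard added in this commit (not the project's
# actual read path). "example.dat" is a placeholder filename.
dat <- read_csv("example.dat", col_types = cols(.default = col_character()))

if (nrow(dat) < 4) {
  # Files with fewer than 4 data rows are treated as likely garbage
  message("\tRaw data file is very short and likely garbage; skipping")
  note <- "Too short; skipped"
} else {
  # ...pivot to long form, hash rows, and write the CSV, as in the diff below
  note <- ""
}
```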
bpbond authored Dec 13, 2023
1 parent 943a74b commit af25bae
Showing 2 changed files with 61 additions and 52 deletions.
107 changes: 55 additions & 52 deletions synoptic/L0.qmd
@@ -80,57 +80,61 @@ f <- function(fn, new_dir) {
dat <- read_datalogger_file(fn, col_types = cols(.default = col_character()))
message("\tOriginal data is ", nrow(dat), " x ", ncol(dat))
# Pivot to long form. We need to do this now to calculate unique observation IDs
dat_long <- pivot_longer(dat, c(-Logger, -Table, -TIMESTAMP),
names_to = "loggernet_variable",
values_to = "value")
message("\tPivoted data is ", nrow(dat_long), " x ", ncol(dat_long))
# Remove duplicates
dupes <- duplicated(dat_long)
if(any(dupes)) {
message("\rRemoving ", sum(dupes), " duplicate rows")
dat_long <- dat_long[!dupes,]
}
# Add a unique ID column
# We use digest::digest() and the "md5" algorithm (first 16 characters);
# originally tried the "crc32" algorithm but this generated duplicate IDs
# This is not *guaranteed* to be unique, but collisions are highly unlikely
dat_long$ID <- sapply(apply(dat_long, 1, paste, collapse = ""),
FUN = function(x) {
substr(digest::digest(x, algo = "md5"), 1, 16)
})
# Since we removed duplicate rows above, hashes should be unique
if(any(duplicated(dat_long$ID))) {
stop("Duplicate ID hashes!")
}
# Construct the new filename: old filename + nrow + ncol + short hash
# This is so we can distinguish datasets with identical raw filename but
# differing data contents (can happen sometimes with dataloggers)
stripped_fn <- gsub("\\.dat$", "", basefn)
short_hash <- substr(digest::digest(dat_long, algo = "md5"), 1, 4)
new_fn <- paste0(stripped_fn, "_", nrow(dat_long), "x", ncol(dat_long), "_", short_hash, ".csv")
note <- ""
if(file.exists(file.path(new_dir, new_fn))) {
note <- "Overwriting existing file"
message("\t", note)
overwrites <<- overwrites + 1
}
# Write the new file, checking to make sure successful
new_fqfn <- file.path(new_dir, new_fn)
message("\tWriting ", new_fqfn)
try(write.csv(dat_long, new_fqfn, row.names = FALSE))
if(!file.exists(new_fqfn)) {
note <- "Write error"
message("\t", note)
errors <<- errors + 1
long_rows <- long_cols <- short_hash <- NA
# Roy: "I generally have a filter that checks and throws out data with very few lines
# as they are usually garbage, I have my input check for <4 lines after header"
if(nrow(dat) < 4) {
message("\tRaw data file is very short and likely garbage; skipping")
note <- "Too short; skipped"
} else {
# Move to 'Raw_done' folder
# Pivot to long form. We need to do this now to calculate unique observation IDs
dat_long <- pivot_longer(dat, c(-Logger, -Table, -TIMESTAMP),
names_to = "loggernet_variable",
values_to = "value")
long_rows <- nrow(dat_long)
long_cols <- ncol(dat_long)
message("\tPivoted data is ", long_rows, " x ", long_cols)
# Remove duplicates
dupes <- duplicated(dat_long)
if(any(dupes)) {
message("\rRemoving ", sum(dupes), " duplicate rows")
dat_long <- dat_long[!dupes,]
}
# Add a unique ID column
# We use digest::digest() and the "md5" algorithm (first 16 characters);
# originally tried the "crc32" algorithm but this generated duplicate IDs
# This is not *guaranteed* to be unique, but collisions are highly unlikely
dat_long$ID <- sapply(apply(dat_long, 1, paste, collapse = ""),
FUN = function(x) {
substr(digest::digest(x, algo = "md5"), 1, 16)
})
# Since we removed duplicate rows above, hashes should be unique
if(any(duplicated(dat_long$ID))) {
stop("Duplicate ID hashes!")
}
# Construct the new filename: old filename + nrow + ncol + short hash
# This is so we can distinguish datasets with identical raw filename but
# differing data contents (can happen sometimes with dataloggers)
stripped_fn <- gsub("\\.dat$", "", basefn)
short_hash <- substr(digest::digest(dat_long, algo = "md5"), 1, 4)
new_fn <- paste0(stripped_fn, "_", long_rows, "x", long_cols, "_", short_hash, ".csv")
if(file.exists(file.path(new_dir, new_fn))) {
note <- "Overwriting existing file"
message("\t", note)
overwrites <<- overwrites + 1
}
# Write the new file...
new_fqfn <- file.path(new_dir, new_fn)
message("\tWriting ", new_fqfn)
write.csv(dat_long, new_fqfn, row.names = FALSE)
# ...and move to 'Raw_done' folder
if(params$remove_input_files) {
message("\tArchiving raw input files")
file.copy(fn, file.path(params$raw_done, basefn), overwrite = FALSE)
@@ -142,8 +146,8 @@
data.frame(File = basefn,
Orig_rows = nrow(dat),
Orig_columns = ncol(dat),
Rows = nrow(dat_long),
Columns = ncol(dat_long),
Rows = long_rows,
Columns = long_cols,
Hash = short_hash,
Note = note)
}
@@ -188,4 +192,3 @@ Git commit `r GIT_COMMIT`.
```{r reproducibility}
sessionInfo()
```

@@ -0,0 +1,6 @@
"TOA5","Compass_CRC_TR_302","CR1000X","28733","CR1000X.Std.06.00","CPU:COMPASS_v3.3.CR1X","12050","CheckTable"
"TIMESTAMP","RECORD","BattV","SolarV","Batt_CHECK","Solar_CHECK","Flag(1)","Flag(2)","Flag(3)","Flag(4)","Flag(5)","Flag(6)","Flag(7)","Flag(8)","Flag(9)","Flag(10)","Flag(11)","Flag(12)","Flag(13)","Flag(14)","Statname","PB"
"TS","RN","Volts","Volts","","","","","","","","","","","","","","","","","",""
"","","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp","Smp"
"2023-04-21 08:30:00",0,11.93,28.3,"LOW","OKAY",0,0,-1,0,-1,0,0,-1,0,0,0,0,0,0,"COMPASS_CRC_TR",302
"2023-04-21 08:45:00",1,11.93,28.05,"LOW","OKAY",0,0,-1,0,-1,0,0,-1,0,0,0,0,0,0,"COMPASS_CRC_TR",302
