From d553af9fe81f3636c235ac78dfa3369b72ccc638 Mon Sep 17 00:00:00 2001 From: Dustin Zastera Date: Thu, 28 Mar 2024 22:14:53 -0500 Subject: [PATCH 1/3] Fix for Nested Attachments (Issue #13) Updated attachment processing to handle nested attachments and updated the regex on msg_fields to be more robust with grep. --- R/process.r | 13 +++++---- R/utils.r | 76 ++++++++++++++++++++++++++--------------------------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/R/process.r b/R/process.r index cafdbb0..abb6bb1 100644 --- a/R/process.r +++ b/R/process.r @@ -21,19 +21,18 @@ process_recipients <- function(x) { } process_attachments <- function(x) { - y <- grep("/__attach_version1.0_", names(x), value=TRUE) - z <- sapply(y, strsplit, split = "/", fixed=TRUE, USE.NAMES = FALSE) - z <- sprintf("/%s", unique(sapply(z, `[`, 2))) + y <- grep(paste0("^/__attach_version1.*", msg_fields$AttachFilename), names(x), value=TRUE) + z <- regmatches(y, regexpr("^/__.*/", y)) lapply(z, function(r) { - attachmnt <- x[grep(sprintf("^%s", r), names(x), value=TRUE)] + attachmnt <- x[grep(paste0("^", r, "__substg1.0_[0-9A-F]{8}$"), names(x), value = TRUE)] list( filename = unlist(unname(x[grep(msg_fields$AttachFilename, names(attachmnt), value=TRUE)])), long_filename = unlist(unname(x[grep(msg_fields$AttachLongFilename, names(attachmnt), value=TRUE)])), mime = unlist(unname(x[grep(msg_fields$AttachMIME, names(attachmnt), value=TRUE)])), - content = unlist(unname(x[grep(msg_fields$AttachContent, names(attachmnt), value=TRUE)])) + content = unlist(unname(x[grep(msg_fields$AttachContent, names(attachmnt), value=TRUE)])), + extension = unlist(unname(x[grep(msg_fields$AttachExtension, names(attachmnt), value=TRUE)])) ) -> res - extension <- unlist(unname(x[grep(msg_fields$AttachExtension, names(attachmnt), value=TRUE)])) - if (!is.null(extension)) res$extension <- extension + res[sapply(res, is.null)] <- NA res }) } diff --git a/R/utils.r b/R/utils.r index 7584594..4545fe0 100644 --- a/R/utils.r +++ b/R/utils.r @@ -1,40 +1,40 @@ msg_fields <- list( - CreationTime = "_3007", - LastModificationTime = "_3008", - LastModifierName = "_3FFA", - OriginalSenderEmailAddress = "_0067", - MessageBodyHtml = "_1013", - MessageClass = "_001A", - Subject = "_0037", - ReceivedByName = "_0040", - ReceivedRepresentingName = "_0044", - SentRepresentingName = "_0071", - SentRepresentingEmailAddress = "_0065", - SentRepresentingAddressType = "_0064", - SenderAddressType = "_0C1E", - SenderEmailAddress = "_0C1F", - SenderName = "_0C1A", - ConversationTopic = "_0070", - ReceivedRepresentingEmailAddress = "_0078", - ReceivedByEmailAddress = "_0076", - ReceivedByAddressType = "_0075", - ReceivedRepresentingAddressType = "_0077", - TransportMessageHeaders = "_007D", - DisplayName = "_0E01", - DisplayBcc = "_0E02", - DisplayCc = "_0E03", - DisplayTo = "_0E04", - NormalizedSubject = "_0E1D", - OriginalMessageId = "_1046", - InternetMessageId = "_1035", - MessageBody = "_1000", - AttachExtension = "_3703", - AttachFilename = "_3704", - AttachLongFilename = "_3707", - AttachMIME = "_370E", - AttachContentId = "_3712", - AttachContent = "_3701", - DisplayName = "_3001", - AddressType = "_3002", - EmailAddress = "_3003" + CreationTime = "_3007[0-9A-F]{4}$", + LastModificationTime = "_3008[0-9A-F]{4}$", + LastModifierName = "_3FFA[0-9A-F]{4}$", + OriginalSenderEmailAddress = "_0067[0-9A-F]{4}$", + MessageBodyHtml = "_1013[0-9A-F]{4}$", + MessageClass = "_001A[0-9A-F]{4}$", + Subject = "_0037[0-9A-F]{4}$", + ReceivedByName = "_0040[0-9A-F]{4}$", + ReceivedRepresentingName = "_0044[0-9A-F]{4}$", + SentRepresentingName = "_0071[0-9A-F]{4}$", + SentRepresentingEmailAddress = "_0065[0-9A-F]{4}$", + SentRepresentingAddressType = "_0064[0-9A-F]{4}$", + SenderAddressType = "_0C1E[0-9A-F]{4}$", + SenderEmailAddress = "_0C1F[0-9A-F]{4}$", + SenderName = "_0C1A[0-9A-F]{4}$", + ConversationTopic = "_0070[0-9A-F]{4}$", + ReceivedRepresentingEmailAddress = "_0078[0-9A-F]{4}$", + ReceivedByEmailAddress = "_0076[0-9A-F]{4}$", + ReceivedByAddressType = "_0075[0-9A-F]{4}$", + ReceivedRepresentingAddressType = "_0077[0-9A-F]{4}$", + TransportMessageHeaders = "_007D[0-9A-F]{4}$", + DisplayName = "_0E01[0-9A-F]{4}$", + DisplayBcc = "_0E02[0-9A-F]{4}$", + DisplayCc = "_0E03[0-9A-F]{4}$", + DisplayTo = "_0E04[0-9A-F]{4}$", + NormalizedSubject = "_0E1D[0-9A-F]{4}$", + OriginalMessageId = "_1046[0-9A-F]{4}$", + InternetMessageId = "_1035[0-9A-F]{4}$", + MessageBody = "_1000[0-9A-F]{4}$", + AttachExtension = "_3703[0-9A-F]{4}$", + AttachFilename = "_3704[0-9A-F]{4}$", + AttachLongFilename = "_3707[0-9A-F]{4}$", + AttachMIME = "_370E[0-9A-F]{4}$", + AttachContentId = "_3712[0-9A-F]{4}$", + AttachContent = "_3701[0-9A-F]{4}$", + DisplayName = "_3001[0-9A-F]{4}$", + AddressType = "_3002[0-9A-F]{4}$", + EmailAddress = "_3003[0-9A-F]{4}$" ) From cf01583c2ddb1984436d9b54c98e2a7987f64f45 Mon Sep 17 00:00:00 2001 From: Zastera Date: Mon, 4 Nov 2024 06:54:24 -0600 Subject: [PATCH 2/3] Patched to ensure unique filenames during attachment save. --- DESCRIPTION | 2 +- R/attach.r | 17 ++++++++++++++++- src/RcppExports.cpp | 5 +++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 330b999..5824f8f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: msgxtractr Type: Package Title: Read Outlook '.msg' Files -Version: 0.3.0 +Version: 0.3.0.9001 Date: 2020-05-06 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")), diff --git a/R/attach.r b/R/attach.r index d845bb0..564280c 100644 --- a/R/attach.r +++ b/R/attach.r @@ -33,12 +33,27 @@ save_attachments <- function(msg_obj, path=getwd(), use_short=TRUE, quiet=FALSE) out_path <- path.expand(path) out_path <- file.path(out_path, fn) + ## works for now, clean up later + if(file.exists(out_path)) { + i <- 1 + ext <- tools::file_ext(out_path) + file <- tools::file_path_sans_ext(out_path) + new_path <- paste0(file, "_", i, ".", ext) + + while(file.exists(new_path)) { + i <- i + 1 + new_path <- paste0(file, "_", i, ".", ext) + } + out_path <- new_path + } + ## + if (!quiet) message(sprintf("Saving %s (%s bytes)", out_path, scales::comma(length(a$content)))) writeBin(a$content, con = out_path) - out <- c(out, out_path) + #out <- c(out, out_path) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a3603e6..4a7d3aa 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // is_rtf bool is_rtf(RawVector v); RcppExport SEXP _msgxtractr_is_rtf(SEXP vSEXP) { From 3d0f8efa9b92387554f68602f70ecc2d4090dabc Mon Sep 17 00:00:00 2001 From: Zastera Date: Mon, 4 Nov 2024 06:54:24 -0600 Subject: [PATCH 3/3] Patched to ensure unique filenames during attachment save. --- DESCRIPTION | 2 +- R/attach.r | 15 +++++++++++++++ src/RcppExports.cpp | 5 +++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 330b999..5824f8f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: msgxtractr Type: Package Title: Read Outlook '.msg' Files -Version: 0.3.0 +Version: 0.3.0.9001 Date: 2020-05-06 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")), diff --git a/R/attach.r b/R/attach.r index d845bb0..3bae9b3 100644 --- a/R/attach.r +++ b/R/attach.r @@ -33,6 +33,21 @@ save_attachments <- function(msg_obj, path=getwd(), use_short=TRUE, quiet=FALSE) out_path <- path.expand(path) out_path <- file.path(out_path, fn) + ## works for now, clean up later + if(file.exists(out_path)) { + i <- 1 + ext <- tools::file_ext(out_path) + file <- tools::file_path_sans_ext(out_path) + new_path <- paste0(file, "_", i, ".", ext) + + while(file.exists(new_path)) { + i <- i + 1 + new_path <- paste0(file, "_", i, ".", ext) + } + out_path <- new_path + } + ## + if (!quiet) message(sprintf("Saving %s (%s bytes)", out_path, scales::comma(length(a$content)))) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a3603e6..4a7d3aa 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // is_rtf bool is_rtf(RawVector v); RcppExport SEXP _msgxtractr_is_rtf(SEXP vSEXP) {