From d553af9fe81f3636c235ac78dfa3369b72ccc638 Mon Sep 17 00:00:00 2001 From: Dustin Zastera Date: Thu, 28 Mar 2024 22:14:53 -0500 Subject: [PATCH] Fix for Nested Attachments (Issue #13) Updated attachment processing to handle nested attachments and updated the regex on msg_fields to be more robust with grep. --- R/process.r | 13 +++++---- R/utils.r | 76 ++++++++++++++++++++++++++--------------------------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/R/process.r b/R/process.r index cafdbb0..abb6bb1 100644 --- a/R/process.r +++ b/R/process.r @@ -21,19 +21,18 @@ process_recipients <- function(x) { } process_attachments <- function(x) { - y <- grep("/__attach_version1.0_", names(x), value=TRUE) - z <- sapply(y, strsplit, split = "/", fixed=TRUE, USE.NAMES = FALSE) - z <- sprintf("/%s", unique(sapply(z, `[`, 2))) + y <- grep(paste0("^/__attach_version1.*", msg_fields$AttachFilename), names(x), value=TRUE) + z <- regmatches(y, regexpr("^/__.*/", y)) lapply(z, function(r) { - attachmnt <- x[grep(sprintf("^%s", r), names(x), value=TRUE)] + attachmnt <- x[grep(paste0("^", r, "__substg1.0_[0-9A-F]{8}$"), names(x), value = TRUE)] list( filename = unlist(unname(x[grep(msg_fields$AttachFilename, names(attachmnt), value=TRUE)])), long_filename = unlist(unname(x[grep(msg_fields$AttachLongFilename, names(attachmnt), value=TRUE)])), mime = unlist(unname(x[grep(msg_fields$AttachMIME, names(attachmnt), value=TRUE)])), - content = unlist(unname(x[grep(msg_fields$AttachContent, names(attachmnt), value=TRUE)])) + content = unlist(unname(x[grep(msg_fields$AttachContent, names(attachmnt), value=TRUE)])), + extension = unlist(unname(x[grep(msg_fields$AttachExtension, names(attachmnt), value=TRUE)])) ) -> res - extension <- unlist(unname(x[grep(msg_fields$AttachExtension, names(attachmnt), value=TRUE)])) - if (!is.null(extension)) res$extension <- extension + res[sapply(res, is.null)] <- NA res }) } diff --git a/R/utils.r b/R/utils.r index 7584594..4545fe0 100644 --- a/R/utils.r +++ b/R/utils.r @@ -1,40 +1,40 @@ msg_fields <- list( - CreationTime = "_3007", - LastModificationTime = "_3008", - LastModifierName = "_3FFA", - OriginalSenderEmailAddress = "_0067", - MessageBodyHtml = "_1013", - MessageClass = "_001A", - Subject = "_0037", - ReceivedByName = "_0040", - ReceivedRepresentingName = "_0044", - SentRepresentingName = "_0071", - SentRepresentingEmailAddress = "_0065", - SentRepresentingAddressType = "_0064", - SenderAddressType = "_0C1E", - SenderEmailAddress = "_0C1F", - SenderName = "_0C1A", - ConversationTopic = "_0070", - ReceivedRepresentingEmailAddress = "_0078", - ReceivedByEmailAddress = "_0076", - ReceivedByAddressType = "_0075", - ReceivedRepresentingAddressType = "_0077", - TransportMessageHeaders = "_007D", - DisplayName = "_0E01", - DisplayBcc = "_0E02", - DisplayCc = "_0E03", - DisplayTo = "_0E04", - NormalizedSubject = "_0E1D", - OriginalMessageId = "_1046", - InternetMessageId = "_1035", - MessageBody = "_1000", - AttachExtension = "_3703", - AttachFilename = "_3704", - AttachLongFilename = "_3707", - AttachMIME = "_370E", - AttachContentId = "_3712", - AttachContent = "_3701", - DisplayName = "_3001", - AddressType = "_3002", - EmailAddress = "_3003" + CreationTime = "_3007[0-9A-F]{4}$", + LastModificationTime = "_3008[0-9A-F]{4}$", + LastModifierName = "_3FFA[0-9A-F]{4}$", + OriginalSenderEmailAddress = "_0067[0-9A-F]{4}$", + MessageBodyHtml = "_1013[0-9A-F]{4}$", + MessageClass = "_001A[0-9A-F]{4}$", + Subject = "_0037[0-9A-F]{4}$", + ReceivedByName = "_0040[0-9A-F]{4}$", + ReceivedRepresentingName = "_0044[0-9A-F]{4}$", + SentRepresentingName = "_0071[0-9A-F]{4}$", + SentRepresentingEmailAddress = "_0065[0-9A-F]{4}$", + SentRepresentingAddressType = "_0064[0-9A-F]{4}$", + SenderAddressType = "_0C1E[0-9A-F]{4}$", + SenderEmailAddress = "_0C1F[0-9A-F]{4}$", + SenderName = "_0C1A[0-9A-F]{4}$", + ConversationTopic = "_0070[0-9A-F]{4}$", + ReceivedRepresentingEmailAddress = "_0078[0-9A-F]{4}$", + ReceivedByEmailAddress = "_0076[0-9A-F]{4}$", + ReceivedByAddressType = "_0075[0-9A-F]{4}$", + ReceivedRepresentingAddressType = "_0077[0-9A-F]{4}$", + TransportMessageHeaders = "_007D[0-9A-F]{4}$", + DisplayName = "_0E01[0-9A-F]{4}$", + DisplayBcc = "_0E02[0-9A-F]{4}$", + DisplayCc = "_0E03[0-9A-F]{4}$", + DisplayTo = "_0E04[0-9A-F]{4}$", + NormalizedSubject = "_0E1D[0-9A-F]{4}$", + OriginalMessageId = "_1046[0-9A-F]{4}$", + InternetMessageId = "_1035[0-9A-F]{4}$", + MessageBody = "_1000[0-9A-F]{4}$", + AttachExtension = "_3703[0-9A-F]{4}$", + AttachFilename = "_3704[0-9A-F]{4}$", + AttachLongFilename = "_3707[0-9A-F]{4}$", + AttachMIME = "_370E[0-9A-F]{4}$", + AttachContentId = "_3712[0-9A-F]{4}$", + AttachContent = "_3701[0-9A-F]{4}$", + DisplayName = "_3001[0-9A-F]{4}$", + AddressType = "_3002[0-9A-F]{4}$", + EmailAddress = "_3003[0-9A-F]{4}$" )