-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIPTrr_PreDev.R
140 lines (98 loc) · 5.54 KB
/
IPTrr_PreDev.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Prep Related Resource extension datasets
# 1. Retrieve Catalog records with related resources
# - i.e. where RelRelationship_tab is NOT NULL
# 2. Report with "IPT Related Resource"
# Generated CSVs include:
# - ecatalog.csv
# - Group1.csv
#
# 3. Save report to [this repo]/data01raw/relationships
library(readr)
library(tidyr)
# Load the relationship report export. Every column is first read as plain
# text; type_convert() then re-guesses each column's type from those strings.
relat_raw <- type_convert(
  read_csv(
    file = "data01raw/relationships/Group1.csv",
    col_types = cols(.default = col_character())
  )
)

# # To run against the sample data instead:
# relat_raw <- read_csv("sampleData/relationships/Group1.csv")
# Map the report columns onto resource-relationship extension terms:
#   DarGlobalUniqueIdentifier -> resourceID (= occurrenceID)
#   relatedResourceID         -> relatedResourceID
#   RelRelationship           -> relationshipOfResource
#   DarScientificName         -> scientificName (RelNotes may also carry one)
#   RelNotes                  -> kept verbatim here; it is the source for
#                                relationshipRemarks and relationshipAccordingTo
#   [to be parsed]            -> relationshipEstablishedDate

# Fail early with a readable message when the export lacks a column the rest
# of the script depends on; otherwise the problem only surfaces further down
# as a much less obvious error.
required_cols <- c("DarGlobalUniqueIdentifier", "relatedResourceID",
                   "RelRelationship", "DarScientificName", "RelNotes")
missing_cols <- setdiff(required_cols, names(relat_raw))
if (length(missing_cols) > 0) {
  stop("Relationship export is missing required column(s): ",
       paste(missing_cols, collapse = ", "),
       call. = FALSE)
}

relat <- data.frame(
  resourceID = relat_raw[["DarGlobalUniqueIdentifier"]],
  relatedResourceID = relat_raw[["relatedResourceID"]],
  relationshipOfResource = relat_raw[["RelRelationship"]],
  scientificName = relat_raw[["DarScientificName"]],
  RelNotes = relat_raw[["RelNotes"]],
  stringsAsFactors = FALSE
)
# Parse RelNotes
#
# RelNotes values from EMu need to be formatted like this:
# "Count: [value] | ObjectURI: [OBJECT GUID or URI if not FMNH catalogue record] | RecordedByIRN: [Recorder-1 irn, Recorder-2 irn] | RecordedBySummary: [Recorder-1 name, Recorder-2 name] | TaxonIRN: [irn] | TaxonSummary: [Summary] | Notes: [Text from notes]"
#
# The URI label is accepted as either "ObjectURI:" (the documented form) or
# "ObjURI:" (the form the earlier patterns matched).

# Pull the value that follows "<label>:" out of each note.
#
# notes: vector of RelNotes strings.
# label: regular expression for the label name, without the trailing colon.
#
# Returns a character vector as long as `notes`. An element is NA when the
# note is NA or does not contain the label, so a note that does not follow the
# format is never copied wholesale into a structured field. The value ends at
# the next "| <known label>:" if there is one, which keeps later fields out of
# the value even when an intermediate label is absent.
extract_note_field <- function(notes, label) {
  known_labels <- paste(
    c("Count", "Obj(ect)?URI", "RecordedByIRN", "RecordedBySummary",
      "TaxonIRN", "TaxonSummary", "Notes"),
    collapse = "|"
  )
  notes <- as.character(notes)
  has_label <- grepl(paste0("(", label, "):"), notes)
  # Drop everything up to and including "<label>:" ...
  value <- sub(paste0("^.*(", label, "):\\s*"), "", notes)
  # ... then everything from the following "| <known label>:" onwards.
  value <- sub(paste0("\\s*\\|\\s*(", known_labels, "):.*$"), "", value)
  value[!has_label] <- NA_character_
  value
}

relat$Count <- extract_note_field(relat$RelNotes, "Count")
relat$relatedResourceID_2 <- extract_note_field(relat$RelNotes, "Obj(ect)?URI")
relat$RecordedByIRN <- extract_note_field(relat$RelNotes, "RecordedByIRN")
relat$relationshipAccordingTo <- extract_note_field(relat$RelNotes,
                                                    "RecordedBySummary")
relat$TaxonIRN <- extract_note_field(relat$RelNotes, "TaxonIRN")
relat$scientificName_2 <- extract_note_field(relat$RelNotes, "TaxonSummary")

# Remarks deliberately keep the earlier fallback: a note without a
# "| Notes:" section is carried over unchanged, so free-text notes that do not
# follow the format are not lost.
relat$relationshipRemarks <- gsub(".*\\|\\s*Notes:\\s*", "", relat$RelNotes)
# # Separate seems simpler but more fragile if RelNotes value doesn't strictly follow format
# relat <- separate(relat, col = "relationshipRemarks",
# into = c("Count", "relatedResourceID_2",
# "RecordedByIRN", "relationshipAccordingTo",
# "TaxonIRN", "scientificName_2", "relationshipRemarks"),
# sep = "\\|", remove = TRUE, convert = FALSE,
# extra = "warn", fill = "warn")
#
# # Cleanup parsed values
# relat$Count <- gsub("Count:\\s*", "", relat$Count)
# relat$relatedResourceID_2 <- gsub("ObjURI:\\s*", "", relat$relatedResourceID_2)
# relat$relationshipAccordingTo <- gsub("RecordedBy:\\s*", "", relat$relationshipAccordingTo)
# relat$TaxonIRN <- gsub("TaxonIRN:\\s*", "", relat$TaxonIRN)
# relat$scientificName_2 <- gsub("TaxonSummary:\\s*", "", relat$scientificName_2)
# relat$relationshipRemarks <- gsub("Notes:\\s*", "", relat$relationshipRemarks)
# Normalise every column in one pass: strip surrounding whitespace first, then
# delete each literal "NULL" so it reads as empty text.
relat <- as.data.frame(
  lapply(relat, function(column) {
    gsub(pattern = "NULL", replacement = "", x = trimws(column))
  }),
  stringsAsFactors = FALSE
)
# Merge fields mapped to multiple pre-dev fields

# TRUE where a value is missing, i.e. NA or an empty string. Empty strings
# count as missing because the "NULL" cleanup above replaces "NULL" with ""
# rather than NA.
is_blank <- function(x) {
  is.na(x) | !nzchar(x)
}

# Where the dedicated column is missing, fall back to the value parsed out of
# RelNotes.
needs_id <- is_blank(relat$relatedResourceID)
relat$relatedResourceID[needs_id] <- relat$relatedResourceID_2[needs_id]

needs_name <- is_blank(relat$scientificName)
relat$scientificName[needs_name] <- relat$scientificName_2[needs_name]

# Add scientificName to relationshipRemarks until IPT can map sciName.
# The " | " separator is only used when there are remarks to append to, so no
# remark starts with a stray separator or a pasted "NA".
remarks <- relat$relationshipRemarks
name_note <- paste0("scientificName: ", relat$scientificName)
has_name <- !is_blank(relat$scientificName)
append_to <- has_name & !is_blank(remarks)
replace_in <- has_name & is_blank(remarks)
remarks[append_to] <- paste0(remarks[append_to], " | ", name_note[append_to])
remarks[replace_in] <- name_note[replace_in]
relat$relationshipRemarks <- remarks

# cleanup NA values
relat[is.na(relat)] <- ""
# Terms that have no source data yet are carried as empty-string columns.
relat[["resourceRelationshipID"]] <- ""
relat[["relationshipEstablishedDate"]] <- ""

# Select the export columns, in the order they are written to the CSV.
relat_out <- relat[
  ,
  c(
    "resourceRelationshipID",
    "resourceID",
    "relatedResourceID",
    "relationshipOfResource",
    "relationshipAccordingTo",
    "relationshipEstablishedDate",
    "relationshipRemarks",
    "scientificName"
  ),
  drop = FALSE
]
# Write the resource relationship extension to
# data02output/relation/relation_<YYYYMMDDHHMMSS>.csv, creating the output
# directories first when they do not exist yet.
if (dir.exists("data02output/relation")) {
  print("relation output directory exists")
} else {
  if (dir.exists("data02output")) {
    print("output directory exists")
  } else {
    dir.create("data02output")
    print("created 'output' directory")
  }
  dir.create("data02output/relation")
  print("created 'relation' output subdirectory")
}

# Build the timestamp with an explicit format. Passing Sys.time() straight to
# gsub() converts it with as.character(), which in R >= 4.3 keeps fractional
# seconds and so puts an extra "." and a run of digits into the file name.
write.csv(
  relat_out,
  file = paste0(
    "data02output/relation/relation_",
    format(Sys.time(), "%Y%m%d%H%M%S"),
    ".csv"
  ),
  row.names = FALSE,
  quote = TRUE,
  na = ""
)