dlizcano · crcruzr · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -47,4 +47,4 @@ jobs:
       - uses: r-lib/actions/check-r-package@v2
         with:
           upload-snapshots: true
-          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
+          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf", "--resave-data")'
diff --git a/Coordinate_Validator/.gitignore b/Coordinate_Validator/.gitignore
@@ -0,0 +1,12 @@
+
+# data to test:
+test_data_coordiantes.csv
+# code test
+coordiante.r
+draft.R
+# Ignore macOS system files
+.DS_Store
+
+# Ignore temporary files or directories
+temp/
+cache/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
     ggplot2,
     magrittr,
     sf,
+    terra,
 Suggests:
     dplyr,
     finch,
@@ -48,6 +49,7 @@ Suggests:
 Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
+LazyDataCompression: xz
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(mamm_coords_validator)
 export(mammalmap)
 export(search_mammalcol)
 export(sp_by_depto)

diff --git a/R/colmap_igac.R b/R/colmap_igac.R
@@ -0,0 +1,42 @@
+#' colmap_igac Dataset
+#'
+#' The colmap dataset is a simple feature collection with 33 features and 6 fields. This version 
+#' was obtained from: colombia en mapas portal "https://www.colombiaenmapas.gov.co"
+#' 
+#' @format Simple feature collection with 33 features and 3 fields:
+#'   \describe{
+#'     \item{DEPARTAMEN}{class name GDAM.}
+#'      \item{DEPARTAMEN}{class name NAME_1}
+#'     \item{geometry}{order name of GDAM.} 
+#'   }
+#'
+#'
+#' @details This dataset is designed to provide users of mammalcol package with a companion map to
+#' plot the mammal distribution per departamento.
+#'
+#' @examples
+#' 
+#' # Load the mammalcol package
+#' library(mammalcol)
+#' library (sf)
+#' 
+#' # Access the mammalcol_tab dataset
+#' # data("colmap_igac")
+#'
+#' # Display the first few rows
+#' head(colmap)
+#'
+#' plot (colmap["NAME_1"])
+#' 
+#' @seealso
+#' For more information about the "mammalcol" package and the data sources, visit
+#' the package's GitHub repository: \url{https://github.com/dlizcano/mammalcol}
+#'
+#' @references
+#' The dataset is based on the "List of the Mammals of Colombia" by Ramírez-Chaves 2021.
+#'
+#' @author
+#' Data compilation: Ramírez-Chaves 2021, Package implementation: Cristian A. Cruz-R.
+#'
+#' @keywords internal
+"colmap_igac"
diff --git a/R/colombian_sea.R b/R/colombian_sea.R
@@ -0,0 +1,45 @@
+#' Colombian_sea Dataset
+#'
+#' The colmap dataset is a simple feature collection with 33 features and 6 fields. This version 
+#' was obtained from marine regions page: "https://www.marineregions.org/stats_downloads.php"
+#' 
+#' @format Simple feature collection with 33 features and 11 fields:
+#'   \describe{
+#'    \item{MRGID}{id from original GDAM}
+#'     \item{GEONAME}{id from original GDAM}
+#'     \item{SOVEREIGN1}{Colombia.}
+#'     \item{NAME_1}{Departamentos.}
+#'     \item{DEPARTAMEN}{class name GDAM.}
+#'     \item{geometry}{order name of GDAM.}
+#'   }
+#'
+#'
+#' @details This dataset is designed to provide users of mammalcol package with a companion map to
+#' plot the mammal distribution per departamento.
+#'
+#' @examples
+#' 
+#' # Load the mammalcol package
+#' library(mammalcol)
+#' library (sf)
+#' 
+#' # Access the mammalcol_tab dataset
+#' # data("colmap")
+#'
+#' # Display the first few rows
+#' head(colmap)
+#'
+#' plot (colmap["NAME_1"])
+#' 
+#' @seealso
+#' For more information about the "mammalcol" package and the data sources, visit
+#' the package's GitHub repository: \url{https://github.com/dlizcano/mammalcol}
+#'
+#' @references
+#' The dataset is based on the "List of the Mammals of Colombia" by Ramírez-Chaves 2021.
+#'
+#' @author
+#' Data compilation: Ramírez-Chaves 2021, Package implementation: Cristian A. Cruz-R.
+#'
+#' @keywords internal
+"Colombian_sea"
diff --git a/R/mamm_coords_validator.r b/R/mamm_coords_validator.r
@@ -0,0 +1,234 @@
+#' Validate mammal species distribution data based on geographic coordinates.
+#' 
+#' This function validates species distribution data provided in a data frame
+#' against known mammal species lists and geographic coordinates. It outputs
+#' a data frame with validation results and additional information.
+#'
+#' @param df A data frame containing species distribution data with columns 'species',
+#'        'decimalLongitude', and 'decimalLatitude'.
+#' @param sp_names Name of the column containing species names (Genus + Specific Epithet).
+#' @param taxon A data frame with distribution information, including 'scientificName' and 'distribution'.
+#'              The scientificName must be in binomial form, and the distribution should contain names separated by |.
+#'              By default, the function uses the checklist available at https://www.gbif.org/dataset/e8b9ed9b-f715-4eac-ae24-772fbf40d7ae.
+#' @param colmap A spatial object in vector format representing the geographic area to validate against.
+#'               By default, the function uses the Colombia Administrative Boundaries.
+#' @param lon Name of the column containing longitude values in df. Default is 'decimalLongitude'.
+#' @param lat Name of the column containing latitude values in df. Default is 'decimalLatitude'.
+#' @param adm_names Name of the column in colmap representing administrative boundaries. Default is 'NAME_1'.
+#' @param oceanmap A spatial object representing the ocean area to validate against.
+#' @param oce_adm_names Name of the column in oceanmap representing administrative boundaries for ocean areas. Default is 'ocean'.
+#' @return A data frame with validated species records and validation results.
+
+#' @details
+#' This function validates species distribution data by checking species names against a
+#' known list and verifying geographic coordinates against specified maps ('colmap' and 'oceanmap').
+#' It assigns a validation result ('validation_result') where 1 means coincidence and 0 means
+#' no match. Additional details are provided in the returned data frame.
+#'
+#' @examples
+#' validated_data <- mamm_coords_validator(test_data_coordiantes, sp_names = "species")
+#'
+#' @export
+mamm_coords_validator <- function(df, sp_names, taxon = NULL, colmap = NULL, lon = NULL, lat = NULL, adm_names = NULL, oceanmap = NULL, oce_adm_names = NULL) {
+  # Initialize function
+
+  # Validate input and set defaults if necessary
+  df <- as.data.frame(df)
+  oriNames <- names(df)
+
+  # Check if species column name is provided and follows binomial structure
+  if (missing(sp_names)) {
+    stop("You must specify the name of the column containing species names (sp_names) using binomial structure (Genus + specifiepithet).")
+  }
+
+  # Validation: Check if taxon, colmap, and column names are provided
+  if (is.null(taxon))  {
+    taxon <- mammalcol::taxon
+  }
+
+  if (is.null(adm_names)) {
+    adm_names = 'NAME_1'
+  }
+
+  if (is.null(colmap)) {
+    #load('data/colmap_igac.rda')
+    colmap <- mammalcol::colmap_igac
+    colmap[[adm_names]] <- tolower(colmap[[adm_names]])
+  } else {
+    colmap <- sf::st_as_sf(colmap)
+    colmap[[adm_names]] <- tolower(colmap[[adm_names]])
+  }
+
+  # Set default column names for longitude and latitude if not provided
+  if (is.null(lon) & is.null(lat))  {
+    lon = 'decimalLongitude'
+    lat = 'decimalLatitude'
+  } 
+
+  # Set default ocean map and administrative boundary name for ocean if not provided
+  if (is.null(oceanmap)) {
+    #load('data/colombian_sea.rda')
+    oceanmap <- mammalcol::Colombian_sea
+  }  else {
+    oceanmap <- sf::st_as_sf(oceanmap)
+  }
+
+  if (is.null(oce_adm_names)) {
+    oceanmap[[adm_names]] <- 'ocean'
+  } else {
+    oceanmap[[adm_names]] <- oce_adm_names
+  }
+
+  ## Start the data process
+
+  df$IDVal <- paste0('M', 1:nrow(df))
+
+  # Extract unique species names from the data frame
+  sppnms <- unique(df[[sp_names]])
+
+  # Validate species names against known species list
+  vlid_spp <- search_mammalcol(sppnms, max_distance = 0.0)
+
+  # Display summary of species validation
+  if (length(vlid_spp$name_submitted) == 0) {
+    cat("There aren't valid species in the dataset. Please review the species names before using this function.")
+  } else {
+    cat(length(sppnms), "species found in the matrix and ", nrow(vlid_spp), "is/are valid.\n")
+  }
+
+  # Separate species into validated and non-validated
+  notValispp <- df[!df[[sp_names]] %in% unique(vlid_spp$name_submitted), ]
+  notValispp$validation_result <- 4
+  notValispp$dup.areas.val <- NA
+
+  Valispp <- df[df[[sp_names]] %in% unique(vlid_spp$name_submitted), ]
+
+  # Initialize placeholder for final validated results
+  finalVal <- NA
+
+  # Loop through each valid species
+  for (i in 1:nrow(vlid_spp)) {
+    spp.i <- Valispp[Valispp[[sp_names]] %in% vlid_spp$name_submitted[i], ]
+
+    # Extract and intersect species distribution with administrative boundaries
+    vect.spp.i <- terra::vect(spp.i, geom = c(lon, lat), crs = "+proj=longlat +datum=WGS84")
+    vect.spp.i.t <- terra::intersect(terra::vect(colmap), vect.spp.i)
+
+    # extract coordinates to the file
+    if (nrow(vect.spp.i.t) > 0) {
+      vect.spp.i.t2 <- as.data.frame(vect.spp.i.t, geom = 'XY')
+      colnames(vect.spp.i.t2)[colnames(vect.spp.i.t2) == c("x")]<- lon
+      colnames(vect.spp.i.t2)[colnames(vect.spp.i.t2) == c("y")]  <- lat
+      spp.i.f <- vect.spp.i.t2[,names(spp.i)]
+      spp.i.f$validDpto<- vect.spp.i.t2[[adm_names]]
+    } else {
+      spp.i.f <- NULL
+    } 
+
+
+    # Handle cases where records are not fully evaluated
+    if (nrow(vect.spp.i) > nrow(vect.spp.i.t) ) {
+
+      vect.spp.i.novali <- vect.spp.i[!(vect.spp.i$IDVal %in% vect.spp.i.t$IDVal), ]
+      vect.spp.i.novali2 <- terra::intersect(terra::vect(oceanmap), vect.spp.i.novali)
+
+      if (nrow(vect.spp.i.novali2) == 0) {
+        vect.spp.i.novali <- as.data.frame(vect.spp.i.novali, geom = 'XY')
+        colnames(vect.spp.i.novali)[colnames(vect.spp.i.novali) == c("x")] <- lon
+        colnames(vect.spp.i.novali)[colnames(vect.spp.i.novali) == c("y")]  <- lat
+        vect.spp.i.novali.f <- vect.spp.i.novali[, names(spp.i)]
+        vect.spp.i.novali.f$validDpto <- 'Other'
+        spp.i.f <- rbind(spp.i.f, vect.spp.i.novali.f)
+      } else {
+        vect.spp.i.novali <- as.data.frame(vect.spp.i.novali2, geom = 'XY')
+        colnames(vect.spp.i.novali)[colnames(vect.spp.i.novali) == c("x")] <- lon
+        colnames(vect.spp.i.novali)[colnames(vect.spp.i.novali) == c("y")]  <- lat
+        vect.spp.i.novali.f <- vect.spp.i.novali[, names(spp.i)]
+        vect.spp.i.novali.f$validDpto <- 'Ocean'
+        spp.i.f <- rbind(spp.i.f, vect.spp.i.novali.f)
+      }
+    }
+
+    # Check for duplicate records and assign appropriate validation
+    if (any(duplicated(spp.i.f$IDVal))) {
+      dupspp <- spp.i.f[duplicated(spp.i.f$IDVal), ]
+      spp.i.f$dup.areas.val <- ifelse(spp.i.f$IDVal %in% dupspp$IDVal, 1, 0)
+    } else {
+      spp.i.f$dup.areas.val <- 0
+    }
+
+    # Append validated species subset to final results
+    finalVal <- rbind(finalVal, spp.i.f)
+  }
+
+  finalVal <- finalVal[-1, ]
+
+  # Additional validation based on species distribution
+  distribution_list <- strsplit(taxon$distribution, "\\|")
+  finaleva <- NA
+
+  for (j in 1:nrow(vlid_spp)) {
+    sp_id.j <- which(taxon$scientificName == vlid_spp$name_submitted[j])
+    unos <- tolower(trimws(distribution_list[[sp_id.j]]))
+    validdepto.i <- finalVal[finalVal[[sp_names]] %in% vlid_spp$name_submitted[j], ]
+
+    evaluate_text <- function(text) {
+      if (text %in% unos) {
+        return(1)
+      } else if (text == 'Other') {
+        return(3)
+      } else if (text == 'Ocean') {
+        return(2)
+      } else {
+        return(0)
+      }
+    }
+
+    # Apply the evaluation function to assign 'validation_result'
+    validdepto.i$validation_result <- sapply(validdepto.i$validDpto, evaluate_text)
+    validdepto.i <- subset(validdepto.i, select = -c(validDpto))
+    finaleva <- rbind(finaleva, validdepto.i)
+  }
+
+  finaleva <- finaleva[-1, ]
+
+  # Additional checks for duplicated records
+  t.dupl <- NA
+  tn.dupl <- NA
+
+  if (any(finaleva$dup.areas.val %in% 1)) {
+    t.dupl <- finaleva[finaleva$dup.areas.val == 1, ]
+    tn.dupl <- finaleva[finaleva$dup.areas.val == 0, ]
+    c.dup <- unique(t.dupl$IDVal)
+
+    for (h in 1:length(c.dup)) {
+      t.dupl.h <- t.dupl[t.dupl$IDVal %in% c.dup[h], ]
+      if (all(t.dupl.h$validation_result == 1)) {
+        tn.dupl <- rbind(tn.dupl, t.dupl.h[1, ])
+      } else if (all(t.dupl.h$validation_result == 0)) {
+        tn.dupl <- rbind(tn.dupl, t.dupl.h[1, ])
+      } else {
+        t.dupl.h$validation_result <- 3
+        tn.dupl <- rbind(tn.dupl, t.dupl.h[1, ])
+      }
+    }
+  } else {
+    tn.dupl <- finaleva
+  }
+
+  # Combine final validated and non-validated species data
+  finalVal <- rbind(tn.dupl, notValispp)
+
+  # Return final validated data frame
+  finalValT <- finalVal[, c(oriNames, 'validation_result')]
+
+  cat('Validation Finished.\n')
+  cat('A total of', nrow(df), 'records were evaluated. The evaluation results are recorded in the "validation_result" column as follows:\n')
+  cat('- 0 = Valid species but records not registered within the analyzed boundaries.\n')
+  cat('- 1 = Valid species and coordinates according to official publications.\n')
+  cat('- 2 = Valid species and coordinates are registered in the ocean.\n')
+  cat('- 3 = Valid species and coordinates are within the limits of the ocean administrative boundaries. We recommend reviewing the location manually.\n')
+  cat('- 4 = Not valid species and not validated.\n')
+
+  return(finalValT)
+}
diff --git a/R/mamm_dep_col_conf.R b/R/mamm_dep_col_conf.R
@@ -0,0 +1,12 @@
+#' mamm_dep_col_conf Dataset
+#'
+#' In progress
+#'
+#' @format A tibble with 548 rows and 19 columns:
+#'   \describe{
+#'     \item{id}{id from original taxon table.}
+#'   }
+#'
+#' @source
+#' For more information about the "mammalcol" package and the data sources, visit
+#' the package's GitHub repository: \url{https://github.com/dlizcano/mammalcol}
diff --git a/R/mamm_mun_col_conf.R b/R/mamm_mun_col_conf.R
@@ -0,0 +1,3 @@
+#' mamm_mun_col_conf Dataset
+#'
+