README.Rmd

---
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r, include = FALSE}
# Chunk defaults used when README.md is knitted from this file.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

# Load the package datasets used by the helper functions defined below.
# The paths are relative to the package root: knitr evaluates chunks with the
# working directory set to the folder that holds README.Rmd, so the file can be
# knitted on any machine that has a checkout of the repository.
load(file = file.path("data", "taxon.rda")) # provides `taxon`
load(file = file.path("data", "colmap.rda")) # provides `colmap`
load(file = file.path("data", "distribution.rda")) # provides `distribution`

# Search a list of species names in the Colombian mammal checklist.
#
# Every submitted name is standardised, de-duplicated and compared against
# `taxon$scientificName` with approximate matching: `agrep()` collects the
# candidates and `utils::adist()` keeps the closest one.
#
# Arguments:
#   splist        Character vector (or factor) of species names.
#   max_distance  Single number with the maximum edit distance allowed for a
#                 match. A value strictly between 0 and 1 is interpreted as a
#                 fraction of the number of characters of each name; any other
#                 value is used as given.
#
# Returns a data frame with one row per matched name: `name_submitted`, the
# columns of `taxon` except the first one, and `Distance`. Names without a
# match are dropped.
#
# Relies on `taxon`, `standardize_names()` and `find_duplicates()` being
# available in the calling environment.
search_mammalcol <- function(splist, max_distance = 0.2) {
  # Validate user input
  if (is.factor(splist)) {
    splist <- as.character(splist)
  }
  if (!is.character(splist)) {
    stop("'splist' must be a character vector or a factor", call. = FALSE)
  }
  if (!is.numeric(max_distance) || length(max_distance) != 1 ||
    is.na(max_distance)) {
    stop("'max_distance' must be a single number", call. = FALSE)
  }

  # Fix species names (nothing to standardise when the input is empty)
  splist_st <- if (length(splist) > 0) standardize_names(splist) else character(0)
  dupes_splist_st <- find_duplicates(splist_st)

  if (length(dupes_splist_st) != 0) {
    message(
      "The following names are repeated in the 'splist': ",
      paste(dupes_splist_st, collapse = ", ")
    )
  }
  splist_std <- unique(splist_st)

  # Output container: submitted name + every taxon column + distance.
  # The width is derived from `taxon` so it stays correct if columns change.
  n_taxon_cols <- ncol(taxon)
  output_matrix <- matrix(nrow = length(splist_std), ncol = n_taxon_cols + 2)
  colnames(output_matrix) <- c(
    "name_submitted",
    names(taxon),
    "Distance"
  )
  no_match_row <- rep("nill", n_taxon_cols)

  for (i in seq_along(splist_std)) {
    name_i <- splist_std[i]

    # Defaults for a name without any acceptable match
    row_data <- no_match_row
    dis_value_1 <- "nill"

    # agrep() rejects empty patterns, so such names are treated as unmatched
    if (!is.na(name_i) && nzchar(name_i)) {
      # Standardise max distance value
      if (max_distance > 0 && max_distance < 1) {
        max_distance_fixed <- ceiling(nchar(name_i) * max_distance)
      } else {
        max_distance_fixed <- max_distance
      }

      # fuzzy and exact match
      matches <- agrep(name_i,
        taxon$scientificName, # base data column
        max.distance = max_distance_fixed,
        value = TRUE
      )

      if (length(matches) != 0) {
        # agrep() matches substrings; adist() measures the whole-string
        # distance. Only the closest candidate is kept so that exactly one
        # taxon row is written per submitted name.
        dis_value <- as.numeric(utils::adist(name_i, matches))
        best <- which.min(dis_value)

        if (length(best) == 1 && dis_value[best] <= max_distance_fixed) {
          hit <- which(taxon$scientificName == matches[best])[1]
          row_data <- as.matrix(taxon[hit, , drop = FALSE])
          dis_value_1 <- dis_value[best]
        }
      }
    }

    output_matrix[i, ] <- c(name_i, row_data, dis_value_1)
  }

  # Output
  output <- as.data.frame(output_matrix)
  output <- output[, -2] # delete the id column (first column of taxon)
  output[output$scientificName != "nill", ]
}


# Normalise a vector of species names before searching the checklist.
#
# Steps, in order: trim and fix capitalisation (via simple_cap()), drop the
# "cf." and "aff." qualifiers, trim again, turn underscores into spaces,
# remove the "x" hybrid marker (with a warning listing the affected names)
# and finally collapse repeated, leading and trailing white space.
#
# Returns the cleaned names.
standardize_names <- function(splist) {
  cleaned <- simple_cap(trimws(splist))

  # Remove open-nomenclature qualifiers
  for (qualifier in c("cf\\.", "aff\\.")) {
    cleaned <- gsub(qualifier, "", cleaned)
  }

  # Trim, then change names separated by "_" to spaces
  cleaned <- gsub("_", " ", trimws(cleaned))

  # Hybrids: strip the "x" marker and report which names were affected
  without_x <- gsub("(^x )|( x$)|( x )", " ", cleaned)
  was_hybrid <- cleaned != without_x
  if (any(was_hybrid)) {
    quoted_names <- paste0("'", splist[was_hybrid], "'")
    warning(
      paste(
        "The 'x' sign indicating hybrids have been removed in the",
        "following names before search:",
        paste(quoted_names, collapse = ", ")
      ),
      immediate. = TRUE, call. = FALSE
    )
  }

  # Merge multiple spaces and drop leading/trailing ones
  gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", without_x, perl = TRUE)
}


# Convert each string to "sentence case": lower-case everything, collapse
# runs of white space between words into a single space, and upper-case only
# the first character of the string (e.g. "tapirus  TERRESTRIS" becomes
# "Tapirus terrestris").
#
# Arguments:
#   x  Character vector.
#
# Returns a character vector of the same length as `x`.
simple_cap <- function(x) {
  # vapply() (rather than sapply()) guarantees a character result, so
  # zero-length input yields character(0) instead of an empty list that
  # would make the second strsplit() fail.
  lowered <- vapply(
    strsplit(x, "\\s+"),
    function(words) paste(tolower(words), collapse = " "),
    character(1)
  )

  # Upper-case the first character of each (whole) string
  vapply(
    strsplit(lowered, ""),
    function(chars) {
      if (length(chars) > 0) {
        chars[1] <- toupper(chars[1])
      }
      paste(chars, collapse = "")
    },
    character(1)
  )
}


# Return the values that occur more than once in `vector`.
#
# The values are tabulated with table() and the names of the entries whose
# count exceeds one are returned.
find_duplicates <- function(vector) {
  freq <- table(vector)
  repeated <- freq[freq > 1]
  names(repeated)
}


################
# Map the departamentos of Colombia where a species is recorded.
#
# Arguments:
#   species  Single character string with a scientific name exactly as it is
#            stored in `taxon$scientificName` (use search_mammalcol() first
#            to resolve typos).
#   legend   Single TRUE/FALSE: should the departamento legend be drawn?
#
# Returns a ggplot object in which the departamentos listed in the species'
# `distribution` field are filled blue and all the others white.
#
# Relies on the `taxon` and `colmap` objects being available in the calling
# environment.
mammalmap <- function(species, legend = TRUE) {
  # NOTE(review): installing packages from inside a function is a side effect
  # that is usually discouraged; it is kept here so existing callers behave
  # as before.
  for (pkg in c("ggplot2", "sf")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }

  if (missing(species)) {
    stop("Argument species was not included")
  }

  if (!is.character(species)) {
    stop(paste0(
      "Argument species must be a character, not ",
      paste(class(species), collapse = "/")
    ))
  }

  if (length(species) != 1) {
    stop("Argument species must be a single species name")
  }

  if (!is.logical(legend)) {
    stop(paste0(
      "Argument legend must be logical, not ",
      paste(class(legend), collapse = "/")
    ))
  }

  if (length(legend) != 1 || is.na(legend)) {
    stop("Argument legend must be a single TRUE or FALSE")
  }

  sp_id <- which(taxon$scientificName == species)

  # if species is not in the table and is integer(0)
  if (length(sp_id) == 0) {
    stop(paste0("The species should be in the list. Make sure you use the function search_mammalcol first. ", species, " is not a species present in Colombia"))
  }
  sp_id <- sp_id[1] # guard against a name listed more than once

  # Departamentos where the species occurs ("|"-separated in the table)
  present <- trimws(strsplit(as.character(taxon$distribution[sp_id]), "\\|")[[1]])
  present <- present[!is.na(present)]

  # One colour per departamento. The vector is named so that
  # scale_fill_manual() pairs each colour with its departamento by name
  # instead of relying on the row order of `colmap`.
  depto_names <- unique(as.character(colmap$NAME_1))
  fill_colors <- ifelse(depto_names %in% present, "blue", "white")
  names(fill_colors) <- depto_names

  # make the map
  mapa <- ggplot2::ggplot(colmap) +
    ggplot2::geom_sf(
      ggplot2::aes(fill = NAME_1),
      show.legend = if (legend) NA else FALSE # NA is the layer default
    ) +
    ggplot2::scale_fill_manual(values = fill_colors) +
    ggplot2::labs(subtitle = taxon$scientificName[sp_id]) +
    ggplot2::theme(plot.subtitle = ggplot2::element_text(face = "italic"))

  if (legend) {
    mapa <- mapa +
      ggplot2::theme(
        legend.position = "right", # legend location
        legend.title = ggplot2::element_blank(),
        legend.text = ggplot2::element_text(size = 8) # departamento text size
      )
  }

  mapa
} # end mammalmap
##################

# List the mammal species recorded in one or more Colombian departamentos.
#
# Arguments:
#   states  Character vector with departamento names, written as they appear
#           in `distribution$locality`.
#   type    "any":  species present in at least one of `states`;
#           "only": species whose localities are exactly `states` and no
#                   other departamento;
#           "all":  species present in every one of `states` (and possibly
#                   elsewhere).
#   taxa    Optional character vector of orders used to restrict the result
#           (compared with `taxon$order`); NULL keeps every order.
#
# Returns a data frame with the columns scientificName, family, order and
# locality, or NA when no locality record satisfies the request.
#
# Departamento names are compared as whole names against the "|"-separated
# entries of `locality`, so "Santander" does not match "Norte de Santander"
# and the order of the names inside `locality` is irrelevant.
#
# Relies on the `distribution` and `taxon` objects being available in the
# calling environment.
sp_by_depto <- function(states, type = c("any", "only", "all"), taxa = NULL) {
  if (length(states) == 0) stop("Please provide at least one Colombian Departamento")
  type <- match.arg(type)

  states <- unique(trimws(as.character(states)))
  states <- states[!is.na(states)]
  # Nothing left to search once missing values are dropped
  if (length(states) == 0) stop("Please provide at least one Colombian Departamento")

  # Split every locality string into its individual departamentos
  locality_sets <- lapply(
    strsplit(as.character(distribution$locality), "|", fixed = TRUE),
    trimws
  )

  is_match <- switch(type,
    any = function(locs) any(states %in% locs),
    only = function(locs) setequal(locs, states),
    all = function(locs) all(states %in% locs)
  )
  keep <- vapply(locality_sets, is_match, logical(1))
  res <- distribution[keep, , drop = FALSE]

  if (nrow(res) == 0) {
    return(NA)
  }

  taxon_rows <- if (is.null(taxa)) {
    rep(TRUE, nrow(taxon))
  } else {
    taxon$order %in% taxa
  }

  # Join on id, then drop the id column (first column after merge)
  merge(
    taxon[taxon_rows, c("scientificName", "family", "order", "id")],
    res[, c("id", "locality")],
    by = "id"
  )[, -1]
}


```


# mammalcol <img src="man/figures/logo.png" align="right" height="139" alt="" />
<!-- badges: start -->
[![R-CMD-check](https://github.com/dlizcano/mammalcol/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/dlizcano/mammalcol/actions/workflows/R-CMD-check.yaml)
[![Codecov test coverage](https://codecov.io/gh/dlizcano/mammalcol/branch/main/graph/badge.svg)](https://app.codecov.io/gh/dlizcano/mammalcol?branch=main)
[![CRAN status](https://www.r-pkg.org/badges/version/mammalcol)](https://CRAN.R-project.org/package=mammalcol)
[![lifecycle](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable)
[![size](https://img.shields.io/github/languages/code-size/dlizcano/mammalcol.svg)](https://github.com/dlizcano/mammalcol)
<!-- badges: end -->

The goal of mammalcol is to allow easy access to the List of Mammal Species of Colombia.

Researchers can explore Colombia's amazing variety of mammals using the mammalcol R package. This package includes information on 548 mammal species, making Colombia one of the world leaders in mammal diversity. The data comes from the latest "Checklist of the mammals (Mammalia) of Colombia" by [Ramírez-Chaves et al. (2021)](https://doi.org/10.47603/mano.v7n2.253). However, mammal taxonomy changes constantly, so the information in this package has been updated with the checklist [Mamíferos de Colombia](https://doi.org/10.15472/kl1whs), which is published and updated regularly by the Colombian Mammal Society.

## Installation

You can install the development version of mammalcol from [GitHub](https://github.com/dlizcano/mammalcol) with:

``` r
# install.packages("devtools")
devtools::install_github("dlizcano/mammalcol")
```

## Load library

```{r}

library(mammalcol)
```


## Example

### Search database

This is a basic example which shows you how to search mammal names in Colombia using a vector of mammal names.

```{r example1, eval=TRUE, echo=TRUE}
# define a vector with species to search for
splist <- c(
  "Tapirus bairdii", "Tapirus pinchaque", "Tapirus terrestris",
  "Tapirus terrestris", "Pudu mephistophiles", "Tapirus bairdii"
)

# search in database
search_mammalcol(splist)

```

#### Search database with typos

`mammalcol` can identify and correct minor typos and a lower-case genus name. Correcting potential typos or variations in species names through fuzzy matching is a crucial aspect of data management. This technique ensures precise retrieval by identifying and accommodating minor differences in input names, thus enhancing the reliability of analyses conducted on diverse, inconsistent, and ensemble datasets.


```{r example2, eval=TRUE, echo=TRUE}
# vector with species names and intentional typos
splist <- c("Tapiru terrestre", "pudu mephistophiles", "tapirus bairdii")

# search in database
search_mammalcol(splist)
```

### Produce a map
  
The function `mammalmap` produces a basic distribution map at the "Departamento" level for a single species.
 

```{r example3, eval=TRUE, echo=TRUE}
# write a species name in the function to map it
mammalmap("Tapirus pinchaque")
```

#### Remove the legend

Use the parameter `legend = FALSE` to draw the map without the legend.

```{r example3b, eval=TRUE, echo=TRUE}
# map the same species, this time passing legend=FALSE to drop the legend
mammalmap("Tapirus pinchaque", legend=FALSE)
```

### See the full taxon table
  
```{r example4}

# show the first rows of the taxon table
head(taxon)

```

### Search mammals present by departamento

Use a departamento name or a vector of departamentos to find out which mammal species are present. The argument `type = "any"` retrieves the mammals present in any of those departamentos.

```{r}
# type = "any": query two departamentos at once and preview the result
occ.any <- sp_by_depto(c("Arauca", "Norte de Santander"), type = "any")
head(occ.any)

```

The argument type = "only" retrieves species present only in that departamento and in no other departamento. 

```{r}
# type = "only": query a single departamento and preview the result
occ.only <- sp_by_depto(c("Norte de Santander"), type = "only")
head(occ.only)
```

The argument `type = "all"` retrieves species present in both departamentos. The argument `taxa` limits the search to one order: `occ.bats` contains the bats present in Arauca and Norte de Santander.

```{r}
# type = "all" for the two departamentos
occ.all <- sp_by_depto(c("Arauca", "Norte de Santander"), type = "all")
# same query, restricted to the order Chiroptera through the taxa argument
occ.bats <- sp_by_depto(c("Arauca", "Norte de Santander"), type = "all", taxa = "Chiroptera")
head(occ.bats)

```


### Suggested citation
  
```{r eval=TRUE}
citation("mammalcol")

```

Lizcano, DJ. (2024). mammalcol: Access to the List of Mammal Species of Colombia. R package version 0.1.0