Skip to content

Commit

Permalink
Updates to the app with new checks and updated explanations
Browse files Browse the repository at this point in the history
  • Loading branch information
linusblomqvist committed Aug 22, 2024
1 parent 4a97b01 commit 4382bb7
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 21 deletions.
18 changes: 9 additions & 9 deletions check_ebird_checklists/check_ebird_checklists_fcn.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ create_chk <- function(txt_file, too_many_species, too_many_species_stationary,
median_count = median(observation_count_num, na.rm = T),
number_media = sum(has_media)
) %>%
mutate(no_checklists = str_count(checklist_id, "S\\d+")) %>%
ungroup() %>%
select(-c(group_id, taxonomic_order, common_name, observation_count, observation_count_num, has_media)) %>%
unique()
Expand All @@ -64,25 +65,24 @@ create_chk <- function(txt_file, too_many_species, too_many_species_stationary,
high_number_species = number_species > too_many_species,
only_one_species = all_species_reported & number_species==1 & duration_minutes > 5,
same_count_all_species = all_species_reported & median_count>0 & number_distinct_count==1 & number_species>2,
too_long_distance = effort_distance_km > too_long_distance,
multi_day = time_observations_started + minutes(duration_minutes) > hours(24),
too_many_observers = number_observers > too_many_observers,
too_short_duration = all_species_reported & number_species/duration_minutes > 10,
too_fast = effort_distance_km/duration_minutes*60 > 60,
complete_media = all_species_reported & number_media==number_species,
not_stationary = protocol_type == "Stationary" & number_species > too_many_species_stationary,
specialized_protocol = !(protocol_type %in% c("Historical", "Traveling", "Incidental", "Stationary")),
not_traveling = protocol_type=="Traveling" & effort_distance_km < 0.03
)# %>%
#mutate(across(starts_with("chk_"), ~replace_na(., FALSE)))

chk <- chk %>%
mutate(dubious = ifelse(rowSums(.[,17:30]) >= 1, TRUE, FALSE)) %>%
filter(dubious == TRUE)
) %>%
mutate(too_long_distance = ifelse(protocol_type != "eBird Pelagic Protocol" & effort_distance_km > too_long_distance,
TRUE, FALSE)) %>%
mutate(pelagic_too_long = ifelse(protocol_type == "eBird Pelagic Protocol" & duration_minutes > 75, TRUE, FALSE)) %>%
mutate(specialized_protocol = !(protocol_type %in% c("Historical", "Traveling", "Incidental", "Stationary"))) %>%
mutate(no_observer_mismatch = ifelse(no_checklists > number_observers, TRUE, FALSE))

chk <- chk %>%
select(-c(time_observations_started, all_species_reported, number_distinct_count,
median_count, number_media, checklist_link, dubious))
median_count, number_media, checklist_link, checklist_id, observer_id)) %>%
relocate(no_checklists, .after = number_observers)

return(chk)
}
6 changes: 5 additions & 1 deletion check_ebird_checklists/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ server <- function(input, output) {
if (is.null(inFile))
return(NULL)

create_chk(inFile$datapath, input$too_many_species, input$too_many_species_stationary, input$too_long_distance, input$too_many_observers)
create_chk(inFile$datapath, input$too_many_species, input$too_many_species_stationary, input$too_long_distance, input$too_many_observers) %>%
select(url:number_species, all_of(input$vars)) %>%
mutate(dubious = apply(.[, 10:ncol(.)], 1, any)) %>%
filter(dubious == TRUE) %>%
select(-dubious)
})

output$contents <- renderDataTable({
Expand Down
32 changes: 21 additions & 11 deletions check_ebird_checklists/ui.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
selected_checks <- c("ampm", "midnight", "high_number_species", "only_one_species", "same_count_all_species", "multi_day", "too_many_observers", "too_short_duration", "too_fast", "complete_media", "not_stationary", "not_traveling", "too_long_distance", "pelagic_too_long", "specialized_protocol", "no_observer_mismatch")

ui <- fluidPage(
tags$head(includeHTML("google-analytics.Rhtml")),
titlePanel("Check eBird Checklists"),
Expand All @@ -6,10 +8,14 @@ ui <- fluidPage(
fileInput("upload", "Choose .txt file",
buttonLabel = "Upload...", multiple = TRUE, accept = ".txt"),
hr(),
numericInput('too_many_species_stationary', 'not_stationary threshold', value = 50),
numericInput('too_many_species', 'high_number_species threshold', value = 70),
numericInput('too_long_distance', 'too_long_distance threshold (km)', value = 20),
numericInput('too_many_observers', 'too_many_observers threshold', value = 20),
numericInput('too_many_species_stationary', 'not_stationary threshold', value = 50),
numericInput('too_long_distance', 'too_long_distance threshold (km)', value = 20),
hr(),
checkboxGroupInput("vars", "Select checks to include:",
choices = selected_checks, # every available check is offered (and pre-selected below)
selected = selected_checks),
hr(),
downloadButton(
"downloadData",
Expand All @@ -19,21 +25,25 @@ ui <- fluidPage(
)
),
mainPanel(HTML("<p>This web app helps detect eBird checklists with potential checklist-level errors.</p>"),
HTML("<p>First upload a .txt file containing eBird data, downloaded from <a href = 'https://ebird.org/data/download'>https://ebird.org/data/download.</a> Adjust the thresholds for different checks if needed. Finally, click the download button and it will give you a .csv file with a list of potentially problematic checklists."),
HTML("<p>Columns 11 through 24 in this spreadsheet indicate the type of potential error, as listed below. Note: just because a checklist gets flagged by this algorithm does not mean that it necessarily has an error. Each flagged checklist needs to be individually assessed by a reviewer before any action is taken. <br>
<li>ampm: A frequent issue is for the time to be entered as AM instead of PM (i.e., in the middle of the night, rather than in the afternoon).</li>
HTML("<p>First upload a .txt file containing eBird data, downloaded from <a href = 'https://ebird.org/data/download'>https://ebird.org/data/download.</a> Adjust the thresholds for different checks if needed, and then select which checks to run. Finally, click the download button and it will give you a .csv file with a list of potentially problematic checklists."),
HTML("<p>Columns 10 and up in this spreadsheet indicate the type of potential error, as listed below. Note: just because a checklist gets flagged by this algorithm does not mean that it necessarily has an error. Each flagged checklist needs to be individually assessed by a reviewer before any action is taken.</p>"),
HTML("<p>For larger areas or longer time periods, a very large number of checklists might be flagged. Doing one month at a time for an individual county might be a reasonable approach. Also, in some cases, a single check might account for a large share of the flagged checklists. In our experience, this can happen with no_observer_mismatch. In this case, to reduce the burden, you can deselect the offending check.</p>"),
HTML("<p><li>ampm: A frequent issue is for the time to be entered as AM instead of PM (i.e., in the middle of the night, rather than in the afternoon).</li>
<li>midnight: Starting a checklist at midnight should be quite uncommon, but midnight is often used to enter a day list or is incorrectly added as the start time of a historical/incidental list that has no time.</li>
<li>multi_day: Checklists that span over two days are not valid.</li>
<li>not_traveling: Traveling checklists of less than 30 m should be entered as stationary.</li>
<li>not_stationary: Stationary checklists with more than <b>X</b> species could be traveling checklists.</li>
<li>high_number_species: Checklists with an exceptionally high number of species (<b>X</b>, relative to your region) are generally indicative of multi-day lists or list-building.</li>
<li>only_one_species: A complete checklist with a single species is often indicative of incorrectly checking the 'Complete' button.</li>
<li>too_long_distance: A checklist of more than <b>X</b> km could be invalid based on distance covered.</li>
<li>same_count_all_species: Most commonly occurs when the number of individuals for every species is 1, in which case it is possible that the correct selection should be X to mark presence.</li>
<li>multi_day: Checklists that span over two days are not valid.</li>
<li>too_many_observers: Checklists with more than <b>X</b> people could represent multi-party effort or some other issue.</li>
<li>too_short_duration: Checklists with a high number of species relative to the duration are suspicious.</li>
<li>too_fast: Catches protocol issues where the duration and distance are inconsistent (an implied speed above 60 km/h).</li>
<li>complete_media: Checklists marked as complete that have media for every species often turn out to be 'Incomplete' checklists.</li>
<li>too_many_observers: Checklists with more than <b>X</b> people could represent multi-party effort or some other issue.</li>
<li>specialized_protocol: You might want to check the use of non-standard protocols.</li><br><br>
<li>not_stationary: Stationary checklists with more than <b>X</b> species could be traveling checklists.</li>
<li>not_traveling: Traveling checklists of less than 30 m should be entered as stationary.</li>
<li>too_long_distance: A checklist of more than <b>X</b> km could be invalid based on distance covered. This does not apply to checklists with the pelagic protocol.</li>
<li>pelagic_too_long: Checklists with the pelagic protocol are flagged if they are longer than 75 minutes.</li>
<li>specialized_protocol: You might want to check the use of non-standard protocols.</li>
<li>no_observer_mismatch: Flagged when a checklist is shared with a larger number of people than the indicated number of observers.</li><br><br>
Code by Raphaël Nussbaumer and Linus Blomqvist (<a href = 'https://github.com/Zoziologie/Check-eBird-Checklist'>here</a>). Web app by Linus Blomqvist. <br>
This app is a work in progress. For questions or suggestions please contact linus [dot] blomqvist [at] gmail [dot] com.<p>")
)
Expand Down

0 comments on commit 4382bb7

Please sign in to comment.