diff --git a/.Rhistory b/.Rhistory index bb846a4..9b42b6c 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,341 +1,3 @@ -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Change in Standardized Indicator Scores Over Time by Community", region), -x = "Year", -y = "Score", -color = "Community") -ggsave(paste0("indicator_plots/CSVI_plots/Line_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Create Heatmaps for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Community, fill = Score)) + -geom_tile() + -facet_wrap(~ Indicator, scales = "free_y") + -scale_fill_gradient(low = "blue", high = "red") + -theme_minimal() + -theme(panel.background = element_rect(fill = "white", color = NA), -plot.background = element_rect(fill = "white", color = NA), -axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Heatmap of Standardized Indicator Scores by Community and Year", region), -x = "Year", -y = "Community", -fill = "Score") -ggsave(paste0("indicator_plots/CSVI_plots/Heatmap_", region, ".png"), plot = p, width = 12, height = 8, bg = "white") -} -# Create Box Plots for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) + -geom_boxplot() + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Distribution of Standardized Indicator Scores by Community and Year", region), -x = "Year", -y = "Score", -fill = "Year") -ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Create Box Plots for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) + -geom_boxplot() + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Distribution of Standardized Indicator Scores by Year", region), -x = "Year", -y = "Score", -fill = "Year") -ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -library(readr) -CSVI_2010 = read_csv("indicator_data/CaribCSVI_2010.csv", locale = locale(encoding = "UTF-8")) -CSVI_2020 = read_csv("indicator_data/CaribCSVI_2020.csv", locale = locale(encoding = "UTF-8")) -library(ggplot2) -library(dplyr) -library(tidyr) -CSVI_2010$Year <- 2010 -CSVI_2020$Year <- 2020 -# Combine both dataframes -df_combined <- bind_rows(CSVI_2010, CSVI_2020) -#remove columns for raw data -df_combined = df_combined[,-c(2:7)] -# Convert to long format for ggplot -df_long <- df_combined %>% -pivot_longer(cols = ends_with("Cat"), -names_to = "Indicator", -values_to = "Score") -# Remove rows with NA values -df_long <- na.omit(df_long) -df_long$Region = as.character(df_long$Region) -df_long$Year = as.factor(df_long$Year) -df_long$Indicator = as.character(df_long$Indicator) -df_long$Score = as.integer(df_long$Score) -# Rename and combine regions -df_long <- df_long %>% -mutate(Region = recode(Region, -'STT' = 'St. Thomas & St. John', -'STJ' = 'St. Thomas & St. John', -'STX' = 'St. Croix', -'W' = 'Puerto Rico West', -'E' = 'Puerto Rico East')) -# Verify the changes -unique(df_long$Region) -# Rename indicators -df_long <- df_long %>% -mutate(Indicator = recode(Indicator, -'PerDisCat' = 'Personal Disruption', -'PopComCat' = 'Pop Composition', -'PovertyCat' = 'Poverty', -'LabFrc_revCat' = 'Labor Force', -'HsChr_rev_Cat' = 'Housing Charac', -'RetMigCat' = 'Retiree Migration')) -unique(df_long$Indicator) -# Create Faceted Bar Plots for each region -regions <- unique(df_long$Region) -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = Community, y = Score, fill = as.factor(Year))) + -geom_bar(stat = "identity", position = "dodge") + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Standardized Indicator Scores by Community & Year", region), -x = "Community", -y = "Score", -fill = "Year") -ggsave(paste0("indicator_plots/CSVI_plots/Faceted_Bar_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Create Line Plots for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = Year, y = Score, color = Community, group = Community)) + -geom_line() + -geom_point() + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Change in Standardized Indicator Scores Over Time by Community", region), -x = "Year", -y = "Score", -color = "Community") -ggsave(paste0("indicator_plots/CSVI_plots/Line_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Create Heatmaps for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Community, fill = Score)) + -geom_tile() + -facet_wrap(~ Indicator, scales = "free_y") + -scale_fill_gradient(low = "blue", high = "red") + -theme_minimal() + -theme(panel.background = element_rect(fill = "white", color = NA), -plot.background = element_rect(fill = "white", color = NA), -axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Heatmap of Standardized Indicator Scores by Community and Year", region), -x = "Year", -y = "Community", -fill = "Score") -ggsave(paste0("indicator_plots/CSVI_plots/Heatmap_", region, ".png"), plot = p, width = 12, height = 8, bg = "white") -} -# Create Box Plots for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) + -geom_boxplot() + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Distribution of Standardized Indicator Scores by Year", region), -x = "Year", -y = "Score", -fill = "Year") -ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -library(readr) -CSVI_2010 = read_csv("indicator_data/CaribCSVI_2010.csv", locale = locale(encoding = "UTF-8")) -View(CSVI_2010) -CSVI_2020 = read_csv("indicator_data/CaribCSVI_2020.csv", locale = locale(encoding = "UTF-8")) -library(ggplot2) -library(dplyr) -library(tidyr) -CSVI_2010$Year <- 2010 -CSVI_2020$Year <- 2020 -# Combine both dataframes -df_combined <- bind_rows(CSVI_2010, CSVI_2020) -#remove columns for raw data -df_combined = df_combined[,-c(2:7)] -# Convert to long format for ggplot -df_long <- df_combined %>% -pivot_longer(cols = ends_with("Cat"), -names_to = "Indicator", -values_to = "Score") -# Remove rows with NA values -df_long <- na.omit(df_long) -df_long$Community = as.character(df_long$Community) -df_long$Region = as.character(df_long$Region) -df_long$Year = as.factor(df_long$Year) -df_long$Indicator = as.character(df_long$Indicator) -df_long$Score = as.integer(df_long$Score) -# Rename and combine regions -df_long <- df_long %>% -mutate(Region = recode(Region, -'STT' = 'St. Thomas & St. John', -'STJ' = 'St. Thomas & St. John', -'STX' = 'St. Croix', -'W' = 'Puerto Rico West', -'E' = 'Puerto Rico East')) -# Verify the changes -unique(df_long$Region) -# Rename indicators -df_long <- df_long %>% -mutate(Indicator = recode(Indicator, -'PerDisCat' = 'Personal Disruption', -'PopComCat' = 'Pop Composition', -'PovertyCat' = 'Poverty', -'LabFrc_revCat' = 'Labor Force', -'HsChr_rev_Cat' = 'Housing Charac', -'RetMigCat' = 'Retiree Migration')) -unique(df_long$Indicator) -# Create Faceted Bar Plots for each region -regions <- unique(df_long$Region) -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = Community, y = Score, fill = as.factor(Year))) + -geom_bar(stat = "identity", position = "dodge") + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Standardized Indicator Scores by Community & Year", region), -x = "Community", -y = "Score", -fill = "Year") -ggsave(paste0("indicator_plots/CSVI_plots/Faceted_Bar_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Create Line Plots for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = Year, y = Score, color = Community, group = Community)) + -geom_line() + -geom_point() + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Change in Standardized Indicator Scores Over Time by Community", region), -x = "Year", -y = "Score", -color = "Community") -ggsave(paste0("indicator_plots/CSVI_plots/Line_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Create Heatmaps for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Community, fill = Score)) + -geom_tile() + -facet_wrap(~ Indicator, scales = "free_y") + -scale_fill_gradient(low = "blue", high = "red") + -theme_minimal() + -theme(panel.background = element_rect(fill = "white", color = NA), -plot.background = element_rect(fill = "white", color = NA), -axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Heatmap of Standardized Indicator Scores by Community and Year", region), -x = "Year", -y = "Community", -fill = "Score") -ggsave(paste0("indicator_plots/CSVI_plots/Heatmap_", region, ".png"), plot = p, width = 12, height = 8, bg = "white") -} -# Create Box Plots for each region -for (region in regions) { -p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) + -geom_boxplot() + -facet_wrap(~ Indicator, scales = "free_y") + -theme_bw() + -theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), -axis.text.y = element_text(size = 10), -strip.text = element_text(size = 10), -legend.title = element_text(size = 10), -legend.text = element_text(size = 10)) + -labs(title = paste("Distribution of Standardized Indicator Scores by Year", region), -x = "Year", -y = "Score", -fill = "Year") -ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8) -} -# Tally number of stocks from Maria's table: -PR = 34 -STTSTJ = 20 -STX = 20 -PR_perc = 1/PR -PR_perc -USVI_perc = 1/STX -USVI_perc -USVI_perc = 3/STX -USVI_perc -PR_perc = 1/PR -USVI_perc = 1/STX -# Tally number of stocks from Maria's table: -PR_stocks = 34 -STTSTJ_stocks = 20 -STX_stocks = 20 -PR_perc = 1/PR_stocks -USVI_perc = 1/STX_stocks -PR = as.vector(0,0,0,0,0,0,0,0,0,0,0,0,PR_perc) -PR_perc -PR = as.vector(0,0,0,0,0,0,0,0,0,0,0,0,3) -PR = c(0,0,0,0,0,0,0,0,0,0,0,0,PR_perc) -PR -# Tally number of stocks from Maria's table: -PR_stocks = 34 -STTSTJ_stocks = 20 -STX_stocks = 20 -PR_perc = (1/PR_stocks)*100 -USVI_perc = (1/STX_stocks)*100 -PR = c(0,0,0,0,0,0,0,0,0,0,0,0,PR_perc) -PR -USVI = c(0,0,0,0,0,0,0,0,0,0,0,0,USVI_perc) -USVI -# save as indicator object ---------------------- -datdata <- 2011:2023 -inddata <- data.frame(cbind(PR, USVI)) -labs <- c("Stocks/complexes with Tier 3 designation" , "Percent", "Puerto Rico", -"Stocks/complexes with Tier 3 designation" , "Percent", "USVI") -indnames <- data.frame(matrix(labs, nrow = 3, byrow = F)) -ind <- list(labels = indnames, indicators = inddata, datelist = datdata) -class(inddata) <- "indicatordata" # plot and save ---------------------------------- plotIndicatorTimeSeries(ind, plotrownum = 2, coltoplot = 1:2, sublabel = TRUE, dateformat = "%Y%b", trendAnalysis = T) library(plotTimeSeries) @@ -510,3 +172,341 @@ require(tidyverse) require(ggplot2) url2 <- 'https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf' raw_text <- map(url2, pdf_text) +url1<-'https://www.bde.pr.gov/BDE/PREDDOCS/I_CRUISE.XLS' +library(readxl) +library(httr) +packageVersion("readxl") +GET(url1, write_disk(tf <- tempfile(fileext = ".xls"))) +df <- read_excel(tf, 2L) +str(df) +df2 = df[c(52,63),] +df2t = as.data.frame(t(df2)) +df2t = df2t[-c(1,2,12),] +yrs_PR = as.integer(df2t$V1) +cruise_PR = df2t$V2 +yrs_PR +cruise_PR +require(pdftools) +require(tidyverse) +require(ggplot2) +url2 <- 'https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf' +raw_text <- map(url2, pdf_text) +clean_table1 <- function(raw) { +#Split the single pages +raw <- map(raw, ~str_split(.x, "\\n") %>% unlist()) +#Concatenate the split pages +raw <- reduce(raw, c) +#Specify the start and end of the table data +table_start <- stringr::str_which(tolower(raw), 'total visitors') +table_end <- stringr:str_which(tolower(raw), 'number of cruise ships') +table_end <- table_end[min(which(table_end > table_start))] +#Build the table and remove special characters +table <- raw[(table_start):(table_end)] +table <- str_replace_all(table, "\\s{2,}","|") +text_con <- textConnection(table) +data_table <- read.csv(text_con, sep = "|") +#Create a list of column names +colnames(data_table) <- c("x","xx","1990","2000","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022") +data_table +} +results <- map_df(raw_text, clean_table1) +require(stringr) +url2 <- 'https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf' +raw_text <- map(url2, pdf_text) +clean_table1 <- function(raw) { +#Split the single pages +raw <- map(raw, ~str_split(.x, "\\n") %>% unlist()) +#Concatenate the split pages +raw <- reduce(raw, c) +#Specify the start and end of the table data +table_start <- stringr::str_which(tolower(raw), 'total visitors') +table_end <- stringr:str_which(tolower(raw), 'number of cruise ships') +table_end <- table_end[min(which(table_end > table_start))] +#Build the table and remove special characters +table <- raw[(table_start):(table_end)] +table <- str_replace_all(table, "\\s{2,}","|") +text_con <- textConnection(table) +data_table <- read.csv(text_con, sep = "|") +#Create a list of column names +colnames(data_table) <- c("x","xx","1990","2000","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022") +data_table +} +results <- map_df(raw_text, clean_table1) +raw_text +# Load necessary libraries +library(pdftools) +library(dplyr) +library(tidyr) +# Load the PDF +pdf_file <- "../../../Downloads/Tourism-indicator-2022-12-28-23-1.pdf" +# Extract text from the PDF +pdf_text <- pdf_text(pdf_file) +# Extract the relevant page (assuming the table is on the first page) +page_text <- pdf_text[1] +# Split the text into lines +lines <- strsplit(page_text, "\n")[[1]] +# Identify the lines containing the Visitor Arrivals table +# (assuming the table starts with "VISITOR ARRIVALS" and ends before "VISITOR EXPENDITURES") +start_line <- grep("VISITOR ARRIVALS", lines) +end_line <- grep("VISITOR EXPENDITURES", lines) +table_lines <- lines[(start_line + 1):(end_line - 1)] +# Combine lines into a single text block and split by spaces +table_text <- paste(table_lines, collapse = " ") +table_data <- strsplit(table_text, " +")[[1]] +table_data +# Extract the years and visitor numbers +years <- c(1990:2022) # Adjust the range based on actual column positions +years +# Extract the years and visitor numbers +years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022) # Adjust the range based on actual column positions +table_data +cruise_passengers <- as.numeric(table_data[85:101]) # Adjust the range based on actual column positions +table_data[85:101] +# Create years column +years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022) +# Extract and clean the total visitors data by removing commas +cruise_passengers <- gsub(",", "", table_data[85:101]) +cruise_passengers <- as.numeric(cruise_passengers) +cruise_passengers +# Create years column +years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022) +# Extract and clean the total visitors data by removing commas +passengers <- gsub(",", "", table_data[85:101]) +passengers <- as.numeric(passengers) +# Create a data frame +USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers) +# Print the data frame +print(USVI_cruise) +# Specify the URL of the PDF +url <- "https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf" +# Specify the destination file path +destfile <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf" +# Download the PDF +download.file(url, destfile, mode = "wb") +# Check if the file has been downloaded +file.exists(destfile) +url1<-'https://www.bde.pr.gov/BDE/PREDDOCS/I_CRUISE.XLS' +library(readxl) +library(httr) +packageVersion("readxl") +GET(url1, write_disk(tf <- tempfile(fileext = ".xls"))) +df <- read_excel(tf, 2L) +str(df) +df2 = df[c(52,63),] +df2t = as.data.frame(t(df2)) +df2t = df2t[-c(1,2,12),] +yrs_PR = as.integer(df2t$V1) +cruise_PR = df2t$V2 +yrs_PR +cruise_PR +yrs_PR = as.integer(df2t$V1) +cruise_PR = as.numeric(df2t$V2) +cruise_PR +# Load necessary libraries +library(pdftools) +library(dplyr) +library(tidyr) +# Specify the URL of the PDF +url2 <- "https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf" +# Specify the destination file path +destfile <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf" +# Download the PDF +download.file(url2, destfile, mode = "wb") +# Check if the file has been downloaded +file.exists(destfile) +# Load the PDF +pdf_file <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf" +# Extract text from the PDF +pdf_text <- pdf_text(pdf_file) +# Extract the relevant page (assuming the table is on the first page) +page_text <- pdf_text[1] +# Split the text into lines +lines <- strsplit(page_text, "\n")[[1]] +# Identify the lines containing the Visitor Arrivals table +# (assuming the table starts with "VISITOR ARRIVALS" and ends before "VISITOR EXPENDITURES") +start_line <- grep("VISITOR ARRIVALS", lines) +end_line <- grep("VISITOR EXPENDITURES", lines) +table_lines <- lines[(start_line + 1):(end_line - 1)] +# Combine lines into a single text block and split by spaces +table_text <- paste(table_lines, collapse = " ") +table_data <- strsplit(table_text, " +")[[1]] +# Create years column +years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022) +# Extract and clean the total visitors data by removing commas +passengers <- gsub(",", "", table_data[85:101]) +passengers <- as.numeric(passengers) +# Create a data frame +USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers) +# Print the data frame +print(USVI_cruise) +yrs_USVI = as.integer(USVI_cruise$Year) +cruise_USVI = as.numeric(USVI_cruise$Cruise_passengers) +yrs <- min(yrs_PR, yrs_USVI) : max(yrs_PR, yrs_USVI) +yrs +# Combine lines into a single text block and split by spaces +table_text <- paste(table_lines, collapse = " ") +table_data <- strsplit(table_text, " +")[[1]] +# Create years column +years <- c(2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022) +# Extract and clean the total visitors data by removing commas +passengers <- gsub(",", "", table_data[87:101]) +passengers <- as.numeric(passengers) +# Create a data frame +USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers) +# Print the data frame +print(USVI_cruise) +yrs_USVI = as.integer(USVI_cruise$Year) +cruise_USVI = as.numeric(USVI_cruise$Cruise_passengers) +yrs <- min(yrs_PR, yrs_USVI) : max(yrs_PR, yrs_USVI) +yrs +yrs <- min(yrs_PR, yrs_USVI) : max(yrs_PR, yrs_USVI) +mat <- data.frame(matrix(data = NA, nrow = length(yrs), ncol = 2)) +rownames(mat) <- yrs +mat[which(yrs %in% cruise_PR), 1] <- prtab[, 2] +yrs_PR +# Create a data frame +PR_cruise <- data.frame(Year = yrs_PR, Cruise_passengers = cruise_PR) +all_years <- min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers) +# Merge the data frames with the complete data frame to fill in missing years with NA +combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE) +combined_df <- merge(combined_df, USVI_cruise, by = "Year", all.x = TRUE) +all_years <- min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers) +# Merge the data frames with the complete data frame to fill in missing years with NA +combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE) +PR_cruise +all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers)) +all_years +all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Year, USVI_cruise$Year)) +all_years +# Merge the data frames with the complete data frame to fill in missing years with NA +combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE) +combined_df <- merge(combined_df, USVI_cruise, by = "Year", all.x = TRUE) +combined_df +# save as indicator object ---------------------- +datdata <- all_years +inddata <- data.frame(cbind(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers)) +combined_df +# save as indicator object ---------------------- +datdata <- all_years +inddata <- data.frame(cbind(combined_df$Cruise_passengers.x, combined_df$Cruise_passengers.y)) +labs <- c("Cruise passengers" , "thousands of people", "Puerto Rico", +"Cruise passengers" , "thousands of people", "USVI") +indnames <- data.frame(matrix(labs, nrow = 3, byrow = F)) +inddata <- list(labels = indnames, indicators = inddata, datelist = datdata) +class(inddata) <- "indicatordata" +ind <- inddata +plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2, sublabel = TRUE) +library(plotTimeSeries) +plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2, sublabel = TRUE) +all_years +# specification file and libraries ----------------------------- +rm(list = ls()) +dev.off() +library(maps) +library(plotTimeSeries) +library(xml2) +library(rvest) +load("indicator_processing/spec_file.RData") +url <- paste0("https://ds.iris.edu/ieb/evtable.phtml?caller=IEB&st=1970-01-01&et=2025-01-01&minmag=3.5&maxmag=10&mindepth=0&xde=900&orderby=time-desc&src=usgs&limit=5000&", +"maxlat=", max_lat, "&minlat=", min_lat, "&maxlon=", max_lon, "&minlon=", min_lon, +"&sbl=1&zm=8&mt=ter&title=IEB%20export%3A%201033%20earthquakes%20as%20a%20sortable%20table.&stitle=from%", +"201970-01-01%20to%202025-01-01%2C%20with%20magnitudes%20from%203.5%20to%2010%2C%20depths%20from%200%20to", +"%20900%20km%2C%20with%20priority%20for%20most%20recent%2C%20limited%20to%205000%2C%20%20showing%20data%20from%20USGS%2C%20") +page <- read_html(url) #Creates an html document from URL +table <- html_table(page, fill = TRUE) #Parses tables into data frames +table +dat <- data.frame(table[[1]]) +head(dat) +dim(dat) +min(dat$Year) +min(dat$Year) +max(dat$Year) +hist(dat$Month) +hist(dat$Depth) +hist(dat$Mag) +dat <- dat[which(dat$Year >= 2000), ] +dat <- dat[which(dat$Year <= terminal_year), ] +# check data download ----------------------------------------- +map('world', fill = 1, interior=F, col = gray(0.95), add=F, xlim = c(-80, -60), ylim = c(10, 30)) +points(dat$Lon, dat$Lat, pch=19, cex=0.5, col = 1) +dev.off() +tot_num <- table(dat$Year) +tot_st <- tapply(dat$Mag, dat$Year, sum, na.rm = T) +barplot(tot_num) +barplot(tot_st) +plot(as.numeric(tot_num), as.numeric(tot_st)) +datdata <- min(dat$Year):max(dat$Year) +inddata <- data.frame(as.numeric(tot_num)) +labs <- c("Earthquake activity", "number of events per year", "") +indnames <- data.frame(matrix(labs, nrow = 3, byrow = F)) +url1<-'https://www.bde.pr.gov/BDE/PREDDOCS/I_CRUISE.XLS' +library(readxl) +library(httr) +packageVersion("readxl") +GET(url1, write_disk(tf <- tempfile(fileext = ".xls"))) +df <- read_excel(tf, 2L) +str(df) +df2 = df[c(52,63),] +df2t = as.data.frame(t(df2)) +df2t = df2t[-c(1,2,12),] +yrs_PR = as.integer(df2t$V1) +cruise_PR = as.numeric(df2t$V2) +# Create a data frame +PR_cruise <- data.frame(Year = yrs_PR, Cruise_passengers = cruise_PR) +# Load necessary libraries +library(pdftools) +library(dplyr) +library(tidyr) +# Specify the URL of the PDF +url2 <- "https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf" +# Specify the destination file path +destfile <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf" +# Download the PDF +download.file(url2, destfile, mode = "wb") +# Check if the file has been downloaded +file.exists(destfile) +# Load the PDF +pdf_file <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf" +# Extract text from the PDF +pdf_text <- pdf_text(pdf_file) +# Extract the relevant page (assuming the table is on the first page) +page_text <- pdf_text[1] +# Split the text into lines +lines <- strsplit(page_text, "\n")[[1]] +# Identify the lines containing the Visitor Arrivals table +# (assuming the table starts with "VISITOR ARRIVALS" and ends before "VISITOR EXPENDITURES") +start_line <- grep("VISITOR ARRIVALS", lines) +end_line <- grep("VISITOR EXPENDITURES", lines) +table_lines <- lines[(start_line + 1):(end_line - 1)] +# Combine lines into a single text block and split by spaces +table_text <- paste(table_lines, collapse = " ") +table_data <- strsplit(table_text, " +")[[1]] +# Create years column +years <- c(2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022) +# Extract and clean the total visitors data by removing commas +passengers <- gsub(",", "", table_data[87:101]) +passengers <- as.numeric(passengers) +# Create a data frame +USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers) +# Print the data frame +print(USVI_cruise) +all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Year, USVI_cruise$Year)) +# Extract and clean the total visitors data by removing commas +passengers <- gsub(",", "", table_data[87:101]) +all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Year, USVI_cruise$Year)) +# Merge the data frames with the complete data frame to fill in missing years with NA +combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE) +combined_df <- merge(combined_df, USVI_cruise, by = "Year", all.x = TRUE) +# save as indicator object ---------------------- +datdata <- all_years +# save as indicator object ---------------------- +datdata <- all_years$Year +inddata <- data.frame(cbind(combined_df$Cruise_passengers.x, combined_df$Cruise_passengers.y)) +labs <- c("Cruise passengers" , "thousands of people", "Puerto Rico", +"Cruise passengers" , "thousands of people", "USVI") +indnames <- data.frame(matrix(labs, nrow = 3, byrow = F)) +inddata <- list(labels = indnames, indicators = inddata, datelist = datdata) +class(inddata) <- "indicatordata" +ind <- inddata +plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2, sublabel = TRUE) +save(ind, file = "indicator_objects/cruise.RData") diff --git a/indicator_processing/non_automated/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_processing/non_automated/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf similarity index 100% rename from indicator_processing/non_automated/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf rename to indicator_data/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf diff --git a/indicator_objects/outreach.RData b/indicator_objects/outreach.RData new file mode 100644 index 0000000..c1de894 Binary files /dev/null and b/indicator_objects/outreach.RData differ diff --git a/indicator_processing/non_automated/outreach.R b/indicator_processing/non_automated/outreach.R new file mode 100644 index 0000000..3337e73 --- /dev/null +++ b/indicator_processing/non_automated/outreach.R @@ -0,0 +1,90 @@ +# Code to develop outreach indicators - including MREP workshop industry participants over time and SeaGrant workshop participants. + +# Last updated 6/20/2024 by Carissa Gervasi + +rm(list = ls()) +dev.off() + +library(pdftools) +library(stringr) +library(dplyr) + +############### +## MREP participants (not able to automate, numbers emailed from Courtney Pickett) + +years = c(2015:2024) + +participants = c(21,0,52,0,33,0,0,32,22,33) + +MREP = data.frame(year = years, part_MREP = participants) + +MREP$cumu_MREP = cumsum(MREP$part_MREP) + + + +################### +## Seagrant meeting/workshop participants (pdf annual reports emailed). Want to extract the number of attendees in SeaGrant meetings/workshops for each year. + + +# Function to extract attendees from a single PDF +extract_attendees <- function(pdf_path) { + pdf_text <- pdf_text(pdf_path) + last_page <- pdf_text[length(pdf_text)] + attendees <- str_match(last_page, "Attendees in SG Meetings/Workshops\\s+(\\d+)")[2] + return(as.numeric(attendees)) +} + +# Function to extract the year from the filename +extract_year <- function(filename) { + year <- str_match(filename, "(\\d{4})")[1] + return(as.numeric(year)) +} + +# Set the directory containing the PDF files +pdf_directory <- "indicator_data/outreach/" + +# Get a list of all PDF files in the directory +pdf_files <- list.files(pdf_directory, pattern = "\\.pdf$", full.names = TRUE) + +# Initialize an empty data frame to store results +Seagrant <- data.frame(year = integer(), attendees = integer(), stringsAsFactors = FALSE) + +# Loop through each PDF file +for (pdf_file in pdf_files) { + year <- extract_year(basename(pdf_file)) + attendees <- extract_attendees(pdf_file) + Seagrant <- rbind(Seagrant, data.frame(year = year, attendees = attendees)) +} + +# Print the results +print(Seagrant) + +Seagrant$cumu_Seagrant = cumsum(Seagrant$attendees) +Seagrant$cumu_Seagrant2 = Seagrant$cumu_Seagrant/1000 + + + + +##### Combine MREP & SeaGrant participant data + +df = merge(MREP, Seagrant, by = "year", all=T) + + +# save as indicator object ---------------------- +datdata <- 2010:2024 +inddata <- data.frame(cbind(df$cumu_MREP, df$cumu_Seagrant2)) +labs <- c("MREP industry participants" , "cumulative graduates", "", + "SeaGrant workshop/meeting participants" , "cumulative attendees (thousands)", "") +indnames <- data.frame(matrix(labs, nrow = 3, byrow = F)) +inddata <- list(labels = indnames, indicators = inddata, datelist = datdata) +class(inddata) <- "indicatordata" + + +# plot and save ---------------------------------- + +ind <- inddata +plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2) + +save(ind, file = "indicator_objects/outreach.RData") + +############################### END #############################