diff --git a/.Rhistory b/.Rhistory
index bb846a4..9b42b6c 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,341 +1,3 @@
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Change in Standardized Indicator Scores Over Time by Community", region),
-x = "Year",
-y = "Score",
-color = "Community")
-ggsave(paste0("indicator_plots/CSVI_plots/Line_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Create Heatmaps for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Community, fill = Score)) +
-geom_tile() +
-facet_wrap(~ Indicator, scales = "free_y") +
-scale_fill_gradient(low = "blue", high = "red") +
-theme_minimal() +
-theme(panel.background = element_rect(fill = "white", color = NA),
-plot.background = element_rect(fill = "white", color = NA),
-axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Heatmap of Standardized Indicator Scores by Community and Year", region),
-x = "Year",
-y = "Community",
-fill = "Score")
-ggsave(paste0("indicator_plots/CSVI_plots/Heatmap_", region, ".png"), plot = p, width = 12, height = 8, bg = "white")
-}
-# Create Box Plots for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) +
-geom_boxplot() +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Distribution of Standardized Indicator Scores by Community and Year", region),
-x = "Year",
-y = "Score",
-fill = "Year")
-ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Create Box Plots for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) +
-geom_boxplot() +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Distribution of Standardized Indicator Scores by Year", region),
-x = "Year",
-y = "Score",
-fill = "Year")
-ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-library(readr)
-CSVI_2010 = read_csv("indicator_data/CaribCSVI_2010.csv", locale = locale(encoding = "UTF-8"))
-CSVI_2020 = read_csv("indicator_data/CaribCSVI_2020.csv", locale = locale(encoding = "UTF-8"))
-library(ggplot2)
-library(dplyr)
-library(tidyr)
-CSVI_2010$Year <- 2010
-CSVI_2020$Year <- 2020
-# Combine both dataframes
-df_combined <- bind_rows(CSVI_2010, CSVI_2020)
-#remove columns for raw data
-df_combined = df_combined[,-c(2:7)]
-# Convert to long format for ggplot
-df_long <- df_combined %>%
-pivot_longer(cols = ends_with("Cat"),
-names_to = "Indicator",
-values_to = "Score")
-# Remove rows with NA values
-df_long <- na.omit(df_long)
-df_long$Region = as.character(df_long$Region)
-df_long$Year = as.factor(df_long$Year)
-df_long$Indicator = as.character(df_long$Indicator)
-df_long$Score = as.integer(df_long$Score)
-# Rename and combine regions
-df_long <- df_long %>%
-mutate(Region = recode(Region,
-'STT' = 'St. Thomas & St. John',
-'STJ' = 'St. Thomas & St. John',
-'STX' = 'St. Croix',
-'W' = 'Puerto Rico West',
-'E' = 'Puerto Rico East'))
-# Verify the changes
-unique(df_long$Region)
-# Rename indicators
-df_long <- df_long %>%
-mutate(Indicator = recode(Indicator,
-'PerDisCat' = 'Personal Disruption',
-'PopComCat' = 'Pop Composition',
-'PovertyCat' = 'Poverty',
-'LabFrc_revCat' = 'Labor Force',
-'HsChr_rev_Cat' = 'Housing Charac',
-'RetMigCat' = 'Retiree Migration'))
-unique(df_long$Indicator)
-# Create Faceted Bar Plots for each region
-regions <- unique(df_long$Region)
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = Community, y = Score, fill = as.factor(Year))) +
-geom_bar(stat = "identity", position = "dodge") +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Standardized Indicator Scores by Community & Year", region),
-x = "Community",
-y = "Score",
-fill = "Year")
-ggsave(paste0("indicator_plots/CSVI_plots/Faceted_Bar_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Create Line Plots for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = Year, y = Score, color = Community, group = Community)) +
-geom_line() +
-geom_point() +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Change in Standardized Indicator Scores Over Time by Community", region),
-x = "Year",
-y = "Score",
-color = "Community")
-ggsave(paste0("indicator_plots/CSVI_plots/Line_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Create Heatmaps for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Community, fill = Score)) +
-geom_tile() +
-facet_wrap(~ Indicator, scales = "free_y") +
-scale_fill_gradient(low = "blue", high = "red") +
-theme_minimal() +
-theme(panel.background = element_rect(fill = "white", color = NA),
-plot.background = element_rect(fill = "white", color = NA),
-axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Heatmap of Standardized Indicator Scores by Community and Year", region),
-x = "Year",
-y = "Community",
-fill = "Score")
-ggsave(paste0("indicator_plots/CSVI_plots/Heatmap_", region, ".png"), plot = p, width = 12, height = 8, bg = "white")
-}
-# Create Box Plots for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) +
-geom_boxplot() +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Distribution of Standardized Indicator Scores by Year", region),
-x = "Year",
-y = "Score",
-fill = "Year")
-ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-library(readr)
-CSVI_2010 = read_csv("indicator_data/CaribCSVI_2010.csv", locale = locale(encoding = "UTF-8"))
-View(CSVI_2010)
-CSVI_2020 = read_csv("indicator_data/CaribCSVI_2020.csv", locale = locale(encoding = "UTF-8"))
-library(ggplot2)
-library(dplyr)
-library(tidyr)
-CSVI_2010$Year <- 2010
-CSVI_2020$Year <- 2020
-# Combine both dataframes
-df_combined <- bind_rows(CSVI_2010, CSVI_2020)
-#remove columns for raw data
-df_combined = df_combined[,-c(2:7)]
-# Convert to long format for ggplot
-df_long <- df_combined %>%
-pivot_longer(cols = ends_with("Cat"),
-names_to = "Indicator",
-values_to = "Score")
-# Remove rows with NA values
-df_long <- na.omit(df_long)
-df_long$Community = as.character(df_long$Community)
-df_long$Region = as.character(df_long$Region)
-df_long$Year = as.factor(df_long$Year)
-df_long$Indicator = as.character(df_long$Indicator)
-df_long$Score = as.integer(df_long$Score)
-# Rename and combine regions
-df_long <- df_long %>%
-mutate(Region = recode(Region,
-'STT' = 'St. Thomas & St. John',
-'STJ' = 'St. Thomas & St. John',
-'STX' = 'St. Croix',
-'W' = 'Puerto Rico West',
-'E' = 'Puerto Rico East'))
-# Verify the changes
-unique(df_long$Region)
-# Rename indicators
-df_long <- df_long %>%
-mutate(Indicator = recode(Indicator,
-'PerDisCat' = 'Personal Disruption',
-'PopComCat' = 'Pop Composition',
-'PovertyCat' = 'Poverty',
-'LabFrc_revCat' = 'Labor Force',
-'HsChr_rev_Cat' = 'Housing Charac',
-'RetMigCat' = 'Retiree Migration'))
-unique(df_long$Indicator)
-# Create Faceted Bar Plots for each region
-regions <- unique(df_long$Region)
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = Community, y = Score, fill = as.factor(Year))) +
-geom_bar(stat = "identity", position = "dodge") +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Standardized Indicator Scores by Community & Year", region),
-x = "Community",
-y = "Score",
-fill = "Year")
-ggsave(paste0("indicator_plots/CSVI_plots/Faceted_Bar_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Create Line Plots for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = Year, y = Score, color = Community, group = Community)) +
-geom_line() +
-geom_point() +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Change in Standardized Indicator Scores Over Time by Community", region),
-x = "Year",
-y = "Score",
-color = "Community")
-ggsave(paste0("indicator_plots/CSVI_plots/Line_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Create Heatmaps for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Community, fill = Score)) +
-geom_tile() +
-facet_wrap(~ Indicator, scales = "free_y") +
-scale_fill_gradient(low = "blue", high = "red") +
-theme_minimal() +
-theme(panel.background = element_rect(fill = "white", color = NA),
-plot.background = element_rect(fill = "white", color = NA),
-axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Heatmap of Standardized Indicator Scores by Community and Year", region),
-x = "Year",
-y = "Community",
-fill = "Score")
-ggsave(paste0("indicator_plots/CSVI_plots/Heatmap_", region, ".png"), plot = p, width = 12, height = 8, bg = "white")
-}
-# Create Box Plots for each region
-for (region in regions) {
-p <- ggplot(df_long %>% filter(Region == region), aes(x = as.factor(Year), y = Score, fill = as.factor(Year))) +
-geom_boxplot() +
-facet_wrap(~ Indicator, scales = "free_y") +
-theme_bw() +
-theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
-axis.text.y = element_text(size = 10),
-strip.text = element_text(size = 10),
-legend.title = element_text(size = 10),
-legend.text = element_text(size = 10)) +
-labs(title = paste("Distribution of Standardized Indicator Scores by Year", region),
-x = "Year",
-y = "Score",
-fill = "Year")
-ggsave(paste0("indicator_plots/CSVI_plots/Box_Plot_", region, ".png"), plot = p, width = 12, height = 8)
-}
-# Tally number of stocks from Maria's table:
-PR = 34
-STTSTJ = 20
-STX = 20
-PR_perc = 1/PR
-PR_perc
-USVI_perc = 1/STX
-USVI_perc
-USVI_perc = 3/STX
-USVI_perc
-PR_perc = 1/PR
-USVI_perc = 1/STX
-# Tally number of stocks from Maria's table:
-PR_stocks = 34
-STTSTJ_stocks = 20
-STX_stocks = 20
-PR_perc = 1/PR_stocks
-USVI_perc = 1/STX_stocks
-PR = as.vector(0,0,0,0,0,0,0,0,0,0,0,0,PR_perc)
-PR_perc
-PR = as.vector(0,0,0,0,0,0,0,0,0,0,0,0,3)
-PR = c(0,0,0,0,0,0,0,0,0,0,0,0,PR_perc)
-PR
-# Tally number of stocks from Maria's table:
-PR_stocks = 34
-STTSTJ_stocks = 20
-STX_stocks = 20
-PR_perc = (1/PR_stocks)*100
-USVI_perc = (1/STX_stocks)*100
-PR = c(0,0,0,0,0,0,0,0,0,0,0,0,PR_perc)
-PR
-USVI = c(0,0,0,0,0,0,0,0,0,0,0,0,USVI_perc)
-USVI
-# save as indicator object ----------------------
-datdata <- 2011:2023
-inddata <- data.frame(cbind(PR, USVI))
-labs <- c("Stocks/complexes with Tier 3 designation" , "Percent", "Puerto Rico",
-"Stocks/complexes with Tier 3 designation" , "Percent", "USVI")
-indnames <- data.frame(matrix(labs, nrow = 3, byrow = F))
-ind <- list(labels = indnames, indicators = inddata, datelist = datdata)
-class(inddata) <- "indicatordata"
 # plot and save ----------------------------------
 plotIndicatorTimeSeries(ind, plotrownum = 2, coltoplot = 1:2, sublabel = TRUE, dateformat = "%Y%b", trendAnalysis = T)
 library(plotTimeSeries)
@@ -510,3 +172,341 @@ require(tidyverse)
 require(ggplot2)
 url2 <- 'https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf'
 raw_text <- map(url2, pdf_text)
+url1<-'https://www.bde.pr.gov/BDE/PREDDOCS/I_CRUISE.XLS'
+library(readxl)
+library(httr)
+packageVersion("readxl")
+GET(url1, write_disk(tf <- tempfile(fileext = ".xls")))
+df <- read_excel(tf, 2L)
+str(df)
+df2 = df[c(52,63),]
+df2t = as.data.frame(t(df2))
+df2t = df2t[-c(1,2,12),]
+yrs_PR = as.integer(df2t$V1)
+cruise_PR = df2t$V2
+yrs_PR
+cruise_PR
+require(pdftools)
+require(tidyverse)
+require(ggplot2)
+url2 <- 'https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf'
+raw_text <- map(url2, pdf_text)
+clean_table1 <- function(raw) {
+#Split the single pages
+raw <- map(raw, ~str_split(.x, "\\n") %>% unlist())
+#Concatenate the split pages
+raw <- reduce(raw, c)
+#Specify the start and end of the table data
+table_start <- stringr::str_which(tolower(raw), 'total visitors')
+table_end <- stringr:str_which(tolower(raw), 'number of cruise ships')
+table_end <- table_end[min(which(table_end > table_start))]
+#Build the table and remove special characters
+table <- raw[(table_start):(table_end)]
+table <- str_replace_all(table, "\\s{2,}","|")
+text_con <- textConnection(table)
+data_table <- read.csv(text_con, sep = "|")
+#Create a list of column names
+colnames(data_table) <- c("x","xx","1990","2000","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022")
+data_table
+}
+results <- map_df(raw_text, clean_table1)
+require(stringr)
+url2 <- 'https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf'
+raw_text <- map(url2, pdf_text)
+clean_table1 <- function(raw) {
+#Split the single pages
+raw <- map(raw, ~str_split(.x, "\\n") %>% unlist())
+#Concatenate the split pages
+raw <- reduce(raw, c)
+#Specify the start and end of the table data
+table_start <- stringr::str_which(tolower(raw), 'total visitors')
+table_end <- stringr:str_which(tolower(raw), 'number of cruise ships')
+table_end <- table_end[min(which(table_end > table_start))]
+#Build the table and remove special characters
+table <- raw[(table_start):(table_end)]
+table <- str_replace_all(table, "\\s{2,}","|")
+text_con <- textConnection(table)
+data_table <- read.csv(text_con, sep = "|")
+#Create a list of column names
+colnames(data_table) <- c("x","xx","1990","2000","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022")
+data_table
+}
+results <- map_df(raw_text, clean_table1)
+raw_text
+# Load necessary libraries
+library(pdftools)
+library(dplyr)
+library(tidyr)
+# Load the PDF
+pdf_file <- "../../../Downloads/Tourism-indicator-2022-12-28-23-1.pdf"
+# Extract text from the PDF
+pdf_text <- pdf_text(pdf_file)
+# Extract the relevant page (assuming the table is on the first page)
+page_text <- pdf_text[1]
+# Split the text into lines
+lines <- strsplit(page_text, "\n")[[1]]
+# Identify the lines containing the Visitor Arrivals table
+# (assuming the table starts with "VISITOR ARRIVALS" and ends before "VISITOR EXPENDITURES")
+start_line <- grep("VISITOR ARRIVALS", lines)
+end_line <- grep("VISITOR EXPENDITURES", lines)
+table_lines <- lines[(start_line + 1):(end_line - 1)]
+# Combine lines into a single text block and split by spaces
+table_text <- paste(table_lines, collapse = " ")
+table_data <- strsplit(table_text, " +")[[1]]
+table_data
+# Extract the years and visitor numbers
+years <- c(1990:2022)  # Adjust the range based on actual column positions
+years
+# Extract the years and visitor numbers
+years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022)  # Adjust the range based on actual column positions
+table_data
+cruise_passengers <- as.numeric(table_data[85:101])  # Adjust the range based on actual column positions
+table_data[85:101]
+# Create years column
+years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022)
+# Extract and clean the total visitors data by removing commas
+cruise_passengers <- gsub(",", "", table_data[85:101])
+cruise_passengers <- as.numeric(cruise_passengers)
+cruise_passengers
+# Create years column
+years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022)
+# Extract and clean the total visitors data by removing commas
+passengers <- gsub(",", "", table_data[85:101])
+passengers <- as.numeric(passengers)
+# Create a data frame
+USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers)
+# Print the data frame
+print(USVI_cruise)
+# Specify the URL of the PDF
+url <- "https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf"
+# Specify the destination file path
+destfile <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf"
+# Download the PDF
+download.file(url, destfile, mode = "wb")
+# Check if the file has been downloaded
+file.exists(destfile)
+url1<-'https://www.bde.pr.gov/BDE/PREDDOCS/I_CRUISE.XLS'
+library(readxl)
+library(httr)
+packageVersion("readxl")
+GET(url1, write_disk(tf <- tempfile(fileext = ".xls")))
+df <- read_excel(tf, 2L)
+str(df)
+df2 = df[c(52,63),]
+df2t = as.data.frame(t(df2))
+df2t = df2t[-c(1,2,12),]
+yrs_PR = as.integer(df2t$V1)
+cruise_PR = df2t$V2
+yrs_PR
+cruise_PR
+yrs_PR = as.integer(df2t$V1)
+cruise_PR = as.numeric(df2t$V2)
+cruise_PR
+# Load necessary libraries
+library(pdftools)
+library(dplyr)
+library(tidyr)
+# Specify the URL of the PDF
+url2 <- "https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf"
+# Specify the destination file path
+destfile <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf"
+# Download the PDF
+download.file(url2, destfile, mode = "wb")
+# Check if the file has been downloaded
+file.exists(destfile)
+# Load the PDF
+pdf_file <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf"
+# Extract text from the PDF
+pdf_text <- pdf_text(pdf_file)
+# Extract the relevant page (assuming the table is on the first page)
+page_text <- pdf_text[1]
+# Split the text into lines
+lines <- strsplit(page_text, "\n")[[1]]
+# Identify the lines containing the Visitor Arrivals table
+# (assuming the table starts with "VISITOR ARRIVALS" and ends before "VISITOR EXPENDITURES")
+start_line <- grep("VISITOR ARRIVALS", lines)
+end_line <- grep("VISITOR EXPENDITURES", lines)
+table_lines <- lines[(start_line + 1):(end_line - 1)]
+# Combine lines into a single text block and split by spaces
+table_text <- paste(table_lines, collapse = " ")
+table_data <- strsplit(table_text, " +")[[1]]
+# Create years column
+years <- c(1990,2000,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022)
+# Extract and clean the total visitors data by removing commas
+passengers <- gsub(",", "", table_data[85:101])
+passengers <- as.numeric(passengers)
+# Create a data frame
+USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers)
+# Print the data frame
+print(USVI_cruise)
+yrs_USVI = as.integer(USVI_cruise$Year)
+cruise_USVI = as.numeric(USVI_cruise$Cruise_passengers)
+yrs <- min(yrs_PR, yrs_USVI) : max(yrs_PR, yrs_USVI)
+yrs
+# Combine lines into a single text block and split by spaces
+table_text <- paste(table_lines, collapse = " ")
+table_data <- strsplit(table_text, " +")[[1]]
+# Create years column
+years <- c(2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022)
+# Extract and clean the total visitors data by removing commas
+passengers <- gsub(",", "", table_data[87:101])
+passengers <- as.numeric(passengers)
+# Create a data frame
+USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers)
+# Print the data frame
+print(USVI_cruise)
+yrs_USVI = as.integer(USVI_cruise$Year)
+cruise_USVI = as.numeric(USVI_cruise$Cruise_passengers)
+yrs <- min(yrs_PR, yrs_USVI) : max(yrs_PR, yrs_USVI)
+yrs
+yrs <- min(yrs_PR, yrs_USVI) : max(yrs_PR, yrs_USVI)
+mat <- data.frame(matrix(data = NA, nrow = length(yrs), ncol = 2))
+rownames(mat) <- yrs
+mat[which(yrs %in% cruise_PR), 1] <- prtab[, 2]
+yrs_PR
+# Create a data frame
+PR_cruise <- data.frame(Year = yrs_PR, Cruise_passengers = cruise_PR)
+all_years <- min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers)
+# Merge the data frames with the complete data frame to fill in missing years with NA
+combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE)
+combined_df <- merge(combined_df, USVI_cruise, by = "Year", all.x = TRUE)
+all_years <- min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers)
+# Merge the data frames with the complete data frame to fill in missing years with NA
+combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE)
+PR_cruise
+all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers))
+all_years
+all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Year, USVI_cruise$Year))
+all_years
+# Merge the data frames with the complete data frame to fill in missing years with NA
+combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE)
+combined_df <- merge(combined_df, USVI_cruise, by = "Year", all.x = TRUE)
+combined_df
+# save as indicator object ----------------------
+datdata <- all_years
+inddata <- data.frame(cbind(PR_cruise$Cruise_passengers, USVI_cruise$Cruise_passengers))
+combined_df
+# save as indicator object ----------------------
+datdata <- all_years
+inddata <- data.frame(cbind(combined_df$Cruise_passengers.x, combined_df$Cruise_passengers.y))
+labs <- c("Cruise passengers" , "thousands of people", "Puerto Rico",
+"Cruise passengers" , "thousands of people", "USVI")
+indnames <- data.frame(matrix(labs, nrow = 3, byrow = F))
+inddata <- list(labels = indnames, indicators = inddata, datelist = datdata)
+class(inddata) <- "indicatordata"
+ind <- inddata
+plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2, sublabel = TRUE)
+library(plotTimeSeries)
+plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2, sublabel = TRUE)
+all_years
+# specification file and libraries -----------------------------
+rm(list = ls())
+dev.off()
+library(maps)
+library(plotTimeSeries)
+library(xml2)
+library(rvest)
+load("indicator_processing/spec_file.RData")
+url <- paste0("https://ds.iris.edu/ieb/evtable.phtml?caller=IEB&st=1970-01-01&et=2025-01-01&minmag=3.5&maxmag=10&mindepth=0&xde=900&orderby=time-desc&src=usgs&limit=5000&",
+"maxlat=", max_lat, "&minlat=", min_lat, "&maxlon=", max_lon, "&minlon=", min_lon,
+"&sbl=1&zm=8&mt=ter&title=IEB%20export%3A%201033%20earthquakes%20as%20a%20sortable%20table.&stitle=from%",
+"201970-01-01%20to%202025-01-01%2C%20with%20magnitudes%20from%203.5%20to%2010%2C%20depths%20from%200%20to",
+"%20900%20km%2C%20with%20priority%20for%20most%20recent%2C%20limited%20to%205000%2C%20%20showing%20data%20from%20USGS%2C%20")
+page <- read_html(url) #Creates an html document from URL
+table <- html_table(page, fill = TRUE) #Parses tables into data frames
+table
+dat <- data.frame(table[[1]])
+head(dat)
+dim(dat)
+min(dat$Year)
+min(dat$Year)
+max(dat$Year)
+hist(dat$Month)
+hist(dat$Depth)
+hist(dat$Mag)
+dat <- dat[which(dat$Year >= 2000), ]
+dat <- dat[which(dat$Year <= terminal_year), ]
+# check data download -----------------------------------------
+map('world', fill = 1, interior=F, col = gray(0.95), add=F, xlim = c(-80, -60), ylim = c(10, 30))
+points(dat$Lon, dat$Lat, pch=19, cex=0.5, col = 1)
+dev.off()
+tot_num <- table(dat$Year)
+tot_st <- tapply(dat$Mag, dat$Year, sum, na.rm = T)
+barplot(tot_num)
+barplot(tot_st)
+plot(as.numeric(tot_num), as.numeric(tot_st))
+datdata <- min(dat$Year):max(dat$Year)
+inddata <- data.frame(as.numeric(tot_num))
+labs <- c("Earthquake activity", "number of events per year", "")
+indnames <- data.frame(matrix(labs, nrow = 3, byrow = F))
+url1<-'https://www.bde.pr.gov/BDE/PREDDOCS/I_CRUISE.XLS'
+library(readxl)
+library(httr)
+packageVersion("readxl")
+GET(url1, write_disk(tf <- tempfile(fileext = ".xls")))
+df <- read_excel(tf, 2L)
+str(df)
+df2 = df[c(52,63),]
+df2t = as.data.frame(t(df2))
+df2t = df2t[-c(1,2,12),]
+yrs_PR = as.integer(df2t$V1)
+cruise_PR = as.numeric(df2t$V2)
+# Create a data frame
+PR_cruise <- data.frame(Year = yrs_PR, Cruise_passengers = cruise_PR)
+# Load necessary libraries
+library(pdftools)
+library(dplyr)
+library(tidyr)
+# Specify the URL of the PDF
+url2 <- "https://usviber.org/wp-content/uploads/2023/12/Tourism-indicator-2022-12-28-23-1.pdf"
+# Specify the destination file path
+destfile <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf"
+# Download the PDF
+download.file(url2, destfile, mode = "wb")
+# Check if the file has been downloaded
+file.exists(destfile)
+# Load the PDF
+pdf_file <- "indicator_data/Cruise-indicator-2022-12-28-23-1.pdf"
+# Extract text from the PDF
+pdf_text <- pdf_text(pdf_file)
+# Extract the relevant page (assuming the table is on the first page)
+page_text <- pdf_text[1]
+# Split the text into lines
+lines <- strsplit(page_text, "\n")[[1]]
+# Identify the lines containing the Visitor Arrivals table
+# (assuming the table starts with "VISITOR ARRIVALS" and ends before "VISITOR EXPENDITURES")
+start_line <- grep("VISITOR ARRIVALS", lines)
+end_line <- grep("VISITOR EXPENDITURES", lines)
+table_lines <- lines[(start_line + 1):(end_line - 1)]
+# Combine lines into a single text block and split by spaces
+table_text <- paste(table_lines, collapse = " ")
+table_data <- strsplit(table_text, " +")[[1]]
+# Create years column
+years <- c(2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022)
+# Extract and clean the total visitors data by removing commas
+passengers <- gsub(",", "", table_data[87:101])
+passengers <- as.numeric(passengers)
+# Create a data frame
+USVI_cruise <- data.frame(Year = years, Cruise_passengers = passengers)
+# Print the data frame
+print(USVI_cruise)
+all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Year, USVI_cruise$Year))
+# Extract and clean the total visitors data by removing commas
+passengers <- gsub(",", "", table_data[87:101])
+all_years <- data.frame(Year = min(PR_cruise$Year, USVI_cruise$Year) : max(PR_cruise$Year, USVI_cruise$Year))
+# Merge the data frames with the complete data frame to fill in missing years with NA
+combined_df <- merge(all_years, PR_cruise, by = "Year", all.x = TRUE)
+combined_df <- merge(combined_df, USVI_cruise, by = "Year", all.x = TRUE)
+# save as indicator object ----------------------
+datdata <- all_years
+# save as indicator object ----------------------
+datdata <- all_years$Year
+inddata <- data.frame(cbind(combined_df$Cruise_passengers.x, combined_df$Cruise_passengers.y))
+labs <- c("Cruise passengers" , "thousands of people", "Puerto Rico",
+"Cruise passengers" , "thousands of people", "USVI")
+indnames <- data.frame(matrix(labs, nrow = 3, byrow = F))
+inddata <- list(labels = indnames, indicators = inddata, datelist = datdata)
+class(inddata) <- "indicatordata"
+ind <- inddata
+plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2, sublabel = TRUE)
+save(ind, file = "indicator_objects/cruise.RData")
diff --git a/indicator_processing/non_automated/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2010 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2011 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2012 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2013 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2014 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2015 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2016 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2017 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2018 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2019 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_processing/non_automated/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf b/indicator_data/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf
similarity index 100%
rename from indicator_processing/non_automated/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf
rename to indicator_data/outreach/2020 Sea Grant_ PIER_ Annual Report.pdf
diff --git a/indicator_objects/outreach.RData b/indicator_objects/outreach.RData
new file mode 100644
index 0000000..c1de894
Binary files /dev/null and b/indicator_objects/outreach.RData differ
diff --git a/indicator_processing/non_automated/outreach.R b/indicator_processing/non_automated/outreach.R
new file mode 100644
index 0000000..3337e73
--- /dev/null
+++ b/indicator_processing/non_automated/outreach.R
@@ -0,0 +1,90 @@
+# Code to develop outreach indicators - including MREP workshop industry participants over time and SeaGrant workshop participants. 
+
+# Last updated 6/20/2024 by Carissa Gervasi
+
+rm(list = ls())  # clear the workspace before rebuilding the indicator
+if (!is.null(dev.list())) dev.off()  # bare dev.off() errors when no graphics device is open (e.g. fresh session)
+
+library(pdftools)
+library(stringr)
+library(dplyr)
+
+###############
+## MREP participants (not able to automate, numbers emailed from Courtney Pickett)
+
+# Yearly MREP participant counts, entered by hand (one value per year).
+years <- 2015:2024
+participants <- c(21, 0, 52, 0, 33, 0, 0, 32, 22, 33)
+
+# Assemble the table and append the running total of participants.
+MREP <- data.frame(year = years, part_MREP = participants)
+MREP$cumu_MREP <- cumsum(MREP$part_MREP)
+
+
+
+###################
+## SeaGrant meeting/workshop participants (PDF annual reports received by email). Extract the number of attendees in SeaGrant meetings/workshops for each year.
+
+
+# Read the "Attendees in SG Meetings/Workshops" count from the last page of one PDF; returns NA (with a warning) if absent.
+extract_attendees <- function(pdf_path) {
+  pages <- pdf_text(pdf_path)  # named `pages` so the local no longer shadows the pdf_text() function
+  count <- str_match(pages[length(pages)], "Attendees in SG Meetings/Workshops\\s+(\\d+)")[2]  # [2] = captured digits
+  if (is.na(count)) warning("No attendee count found in ", pdf_path, call. = FALSE)  # an NA would propagate through cumsum()
+  as.numeric(count)
+}
+
+# Parse the first run of four digits in a file name as a numeric year (NA when there is none).
+extract_year <- function(filename) {
+  first_four_digits <- str_extract(filename, "\\d{4}")[1]
+  as.numeric(first_four_digits)
+}
+
+# Collect SeaGrant attendee counts from every PDF report in the data folder.
+pdf_directory <- "indicator_data/outreach/"
+pdf_files <- list.files(pdf_directory, pattern = "\\.pdf$", full.names = TRUE)
+
+# One row per report; vapply() enforces a single numeric value from each
+# helper call and also copes with an empty folder (zero-row result).
+Seagrant <- data.frame(
+  year = vapply(basename(pdf_files), extract_year, numeric(1), USE.NAMES = FALSE),
+  attendees = vapply(pdf_files, extract_attendees, numeric(1), USE.NAMES = FALSE)
+)
+
+# cumsum() below is only meaningful in chronological order, so sort by year
+# explicitly rather than relying on the order list.files() returns.
+Seagrant <- Seagrant[order(Seagrant$year), ]
+rownames(Seagrant) <- NULL
+
+# Print the results
+print(Seagrant)
+
+Seagrant$cumu_Seagrant <- cumsum(Seagrant$attendees)
+Seagrant$cumu_Seagrant2 <- Seagrant$cumu_Seagrant / 1000  # cumulative total in thousands
+
+
+
+
+##### Combine MREP & SeaGrant participant data
+# Full outer join on year: years present in only one source are kept, with NA for the other.
+df <- merge(MREP, Seagrant, by = "year", all = TRUE)
+
+# save as indicator object ----------------------
+# Take the date list from the merged table so it always has exactly one entry per indicator row.
+datdata <- as.integer(df$year)
+inddata <- data.frame(cbind(df$cumu_MREP, df$cumu_Seagrant2))
+labs <- c("MREP industry participants" , "cumulative graduates", "",
+          "SeaGrant workshop/meeting participants" , "cumulative attendees (thousands)", "")
+indnames <- data.frame(matrix(labs, nrow = 3, byrow = FALSE))  # one column of three labels per indicator
+inddata <- list(labels = indnames, indicators = inddata, datelist = datdata)
+class(inddata) <- "indicatordata"
+
+
+# plot and save ----------------------------------
+# `ind` is the name under which the indicator object is stored in the .RData file.
+ind <- inddata
+plotIndicatorTimeSeries(ind, coltoplot = 1:2, plotrownum = 2)
+# NOTE(review): plotIndicatorTimeSeries() is not defined or loaded in this script - confirm where it comes from.
+save(ind, file = "indicator_objects/outreach.RData")
+
+###############################  END  #############################