p_ci_metodologie_vedy.Rmd

---
title: "P-values, CI"
output: html_notebook
---

From Cochrane: https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD003823.pub2/full
Baseline systolic BP 157.1. SD of change in SBP was 13.9. Mean placebo effect -3.2. The best estimate of the near maximal blood pressure lowering (difference from placebo) for the ACE inhibitor class of drugs is -7.68 (95% CI -8.45, -6.91).
Weighted mean SD of SBP: 16.6 (table 3)

```{r setup}
library(tidyverse)
library(cowplot)
theme_set(theme_cowplot())

# Simulation inputs taken from the Cochrane figures quoted above.
sbp_baseline <- 157.1  # baseline systolic BP
sbp_sd <- 16.6  # weighted mean SD of SBP
effect_ace_minus_placebo <- -7.68  # ACE inhibitor effect, difference from placebo

# Number of simulated replications used by the simulation chunks below.
n_sims <- 1000

# Registries filled by later chunks: plots by name, plus optional per-plot
# width/height overrides that the final export chunk reads when saving.
plots <- list()
widths <- list()
heights <- list()

```

P hodnoty dle velikosti efektu

```{r}
# Fixed seed so the simulated studies are reproducible.
set.seed(56684322)
# True effects in units of the SBP SD: no effect, -5 (scaled by sbp_sd), the
# ACE-vs-placebo estimate (scaled by sbp_sd), and a full -1 SD.
velikost_efektu <- tibble(efekt = c(0, -5 / sbp_sd, effect_ace_minus_placebo / sbp_sd, -1))
# Sample sizes; single_p_sim() draws this many observations for each arm.
n_vzorku <- tibble(n = c(15, 30, 60, 120))
# Disabled alternative: four sample sizes evenly spaced on the square-root scale.
#n_vzorku <- tibble(n = seq(sqrt(15), sqrt(120), length.out = 4) ^ 2 %>% round())

# Simulate one two-arm study and analyse it with t.test() (defaults: Welch
# two-sample test, two-sided, 95% confidence level).
#
# `data_row` is a named list (or one-row data frame) holding at least `n`
# (observations per arm) and `efekt` (true treatment-minus-control difference,
# in SD units because both arms are drawn with sd = 1).
#
# Returns a one-row data frame: the fields of `data_row` plus
#   p              two-sided p-value,
#   zmereny_rozdil observed mean(treatment) - mean(control),
#   conf_low       lower bound of the CI for treatment - control,
#   conf_high      upper bound of the CI for treatment - control.
single_p_sim <- function(data_row) {
  # Control is drawn first; the draw order is part of the seeded random stream.
  control <- rnorm(data_row$n, 0, 1)
  treatment <- rnorm(data_row$n, data_row$efekt, 1)

  # Testing treatment against control orients the interval the same way as
  # `zmereny_rozdil`, so conf_low <= zmereny_rozdil <= conf_high. (Negating the
  # bounds of the control - treatment interval one by one, as done previously,
  # stored the upper bound in conf_low and the lower bound in conf_high.)
  # The two-sided p-value does not depend on the argument order.
  test_res <- t.test(treatment, control)

  cbind(
    data_row,
    data.frame(
      p = test_res$p.value,
      zmereny_rozdil = mean(treatment) - mean(control),
      conf_low = test_res$conf.int[1],
      conf_high = test_res$conf.int[2]
    )
  )
}

# Every combination of true effect, sample size and replication index.
sim_grid <- velikost_efektu %>%
  crossing(n_vzorku) %>%
  crossing(sim = 1:n_sims)

# Run one simulated study per grid row (in row order) and stack the results.
sim_results <- map_df(transpose(sim_grid), single_p_sim)

# Plotting helpers: the sign of the measured difference, and the true effect as
# a factor whose levels follow the order given in `velikost_efektu`.
p_velikost_efektu <- mutate(
  sim_results,
  druh_rozdilu = factor(
    zmereny_rozdil > 0,
    levels = c(TRUE, FALSE),
    labels = c("Pozitivní", "Negativní")
  ),
  efekt_f = factor(efekt, levels = velikost_efektu$efekt)
)


```



```{r}
# Label for an effect expressed in SD units: the difference converted to mmHg
# via `sbp_sd` (one decimal) followed by its absolute size in SDs (two
# decimals). The input is passed through as.numeric(), so numeric strings are
# accepted as well as numbers.
hg_sd_label <- function(sd_val) {
  effect_in_sd <- as.numeric(sd_val)
  effect_in_mmhg <- round(sbp_sd * effect_in_sd, 1)
  paste0("Rozdíl: ", effect_in_mmhg, " mmHg (", abs(round(effect_in_sd, 2)), " sd)")
}


# Facet a plot by the true effect (`efekt_f`), with panel titles produced by
# hg_sd_label(). `scales` is forwarded to facet_wrap(); the default frees both
# axes.
facet_efekt <- function(scales = "free") {
  effect_labeller <- labeller(efekt_f = hg_sd_label)
  facet_wrap(~efekt_f, labeller = effect_labeller, scales = scales)
}

# Build a y scale that presents counts as a share of `n_sims`: the scale uses a
# custom "Relative" transformation (count / n_sims, inverted by multiplying
# back) and formats the axis labels as whole percentages of `n_sims`.
make_scale_pocet <- function(n_sims) {
  to_share <- function(x) {
    x / n_sims
  }
  to_count <- function(x) {
    x * n_sims
  }
  percent_of_sims <- function(x) {
    scales::percent(x / n_sims, accuracy = 1)
  }
  relative_trans <- scales::trans_new("Relative", transform = to_share, inverse = to_count)

  scale_y_continuous("Frekvence", labels = percent_of_sims, trans = relative_trans)
}
# Shared y scale: counts shown as a share of all `n_sims` simulations.
scale_pocet <- make_scale_pocet(n_sims)

# Histogram bin edges of width 0.05 across [0, 1].
p_breaks <- seq(0, 1, by = 0.05)
# Sample size used for the introductory plots (second entry of the grid).
n_first_plot <- n_vzorku$n[2]

# Tutorial sequence: the p-value histogram under zero effect, built from the
# first 1, 2, 3, ... simulated studies up to all of them.
for (kolik_zobrazit in c(1, 2, 3, 10, 50, n_sims)) {
  plot_name <- paste0("p_tutorial_histogram_", kolik_zobrazit)
  plots[[plot_name]] <- p_velikost_efektu %>%
    filter(n == n_first_plot, efekt == 0) %>%
    head(n = kolik_zobrazit) %>%
    ggplot(aes(x = p)) +
    geom_histogram(breaks = p_breaks) +
    # Percentages are relative to the number of studies shown in this step.
    make_scale_pocet(kolik_zobrazit)
  print(plots[[plot_name]])
}

# All effect sizes at the introductory sample size; p < 0.05 is emphasised
# through alpha.
plots$p_fixni_efekt_prvni <- p_velikost_efektu %>%
  filter(n == n_first_plot) %>%
  ggplot(aes(x = p, alpha = p < 0.05)) +
  geom_histogram(breaks = p_breaks) +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_alpha_discrete(range = c(0.5, 1), guide = "none") +
  scale_pocet +
  facet_efekt() +
  ggtitle(paste0("N: ", n_first_plot))
plots$p_fixni_efekt_prvni





```

```{r}
# One figure per sample size: p-value histograms faceted by true effect. Bars
# are filled by the sign of the measured difference and p < 0.05 is emphasised
# through alpha.
for (n_k_zobrazeni in n_vzorku$n) {
  plot_name <- paste0("p_fixni_n_", n_k_zobrazeni)
  plots[[plot_name]] <- p_velikost_efektu %>%
    filter(n == n_k_zobrazeni) %>%
    ggplot(aes(x = p, fill = druh_rozdilu, alpha = p < 0.05)) +
    geom_histogram(breaks = p_breaks) +
    scale_pocet +
    facet_efekt() +
    scale_fill_discrete("Změřený rozdíl") +
    # guide = "none" replaces the deprecated guide = FALSE.
    scale_alpha_discrete(range = c(0.5, 1), guide = "none") +
    ggtitle(paste0("N: ", n_k_zobrazeni))
  print(plots[[plot_name]])
}


# One figure per true effect: the same histograms, faceted by sample size.
for (efekt_k_zobrazeni in velikost_efektu$efekt) {
  plot_name <- paste0("p_fixni_efekt_", round(efekt_k_zobrazeni, 2))
  plots[[plot_name]] <- p_velikost_efektu %>%
    filter(efekt == efekt_k_zobrazeni) %>%
    ggplot(aes(x = p, fill = druh_rozdilu, alpha = p < 0.05)) +
    geom_histogram(breaks = p_breaks) +
    facet_wrap(~n, labeller = label_both, scales = "free_y") +
    scale_fill_discrete("Změřený rozdíl") +
    scale_pocet +
    # guide = "none" replaces the deprecated guide = FALSE.
    scale_alpha_discrete(range = c(0.5, 1), guide = "none") +
    ggtitle(hg_sd_label(efekt_k_zobrazeni))
  print(plots[[plot_name]])
}
```


```{r}
# Solve for the sample size giving 80% power to detect the ACE-vs-placebo
# difference at a 5% significance level; all other power.t.test() arguments
# are left at their defaults.
power.t.test(delta = effect_ace_minus_placebo, sd = sbp_sd, sig.level = 0.05, power = 0.8)
```

```{r}
# Histograms of the measured difference among statistically significant
# results only, for two (effect, sample size) pairs matched by position.
efekty <- velikost_efektu$efekt[c(2, 3)]
nka <- n_vzorku$n[c(2, 4)]
# The two vectors are paired by index, so they must have the same length.
stopifnot(length(efekty) == length(nka))
for (i in seq_along(efekty)) {
  n_k_zobrazeni <- nka[i]
  efekt_k_zobrazeni <- efekty[i]
  plot_name <- paste0("m_and_s_error_", n_k_zobrazeni, "_eff_", round(abs(efekt_k_zobrazeni), 2))
  plots[[plot_name]] <- p_velikost_efektu %>%
    filter(n == n_k_zobrazeni, efekt == efekt_k_zobrazeni, p < 0.05) %>%
    ggplot(aes(x = zmereny_rozdil * sbp_sd)) +
    # width * density turns each bar into the proportion of studies in the bin.
    geom_histogram(aes(y = stat(width*density)), binwidth = 1) +
    # Dashed blue line marks the true effect in mmHg.
    geom_vline(xintercept = efekt_k_zobrazeni * sbp_sd, color = "blue", size = 2, linetype = "dashed") +
    facet_efekt() +
    scale_y_continuous("Podíl", labels = scales::percent) +
    scale_x_continuous("Změřený rozdíl [mmHg]") +
    ggtitle(paste0("Pouze p < 0.05, N: ", n_k_zobrazeni))
  print(plots[[plot_name]])

  widths[[plot_name]] <- 6
}
```


# Více studií

```{r}
set.seed(321685524)

# For each true effect, repeatedly draw `n_studii` already-simulated studies
# of size `n_vice_studii` and record the smallest/largest p-value and measured
# difference within each batch.
n_studii <- 8
n_vice_studii <- 30
efekty_pro_vice_studii <- c(0, velikost_efektu$efekt[3])
# One summary row per batch; preallocated rather than grown inside the loop.
studie_df_raw <- vector("list", length(efekty_pro_vice_studii) * n_sims)
next_id <- 1
for (efekt_pro_simulace in efekty_pro_vice_studii) {
  p_pro_sim <- p_velikost_efektu %>% filter(n == n_vice_studii, efekt == efekt_pro_simulace)
  for (i in seq_len(n_sims)) {
    studie_df_raw[[next_id]] <- p_pro_sim %>%
      sample_n(n_studii) %>%
      group_by(n, efekt, efekt_f) %>%
      summarise(min.p = min(p), max.p = max(p), min.zmereny_rozdil = min(zmereny_rozdil), max.zmereny_rozdil = max(zmereny_rozdil), .groups = "drop")
      # Disabled alternative: extremes of the measured difference among significant studies only.
      #summarise(min.p = min(p), max.p = max(p), min.zmereny_rozdil = min(c(Inf,zmereny_rozdil[p < 0.05])), max.zmereny_rozdil = max(c(-Inf,zmereny_rozdil[p < 0.05])))
    next_id <- next_id + 1
  }
}

p_vice_studii <- bind_rows(studie_df_raw)

# Long format: the summary column names are split at the dot into `typ`
# (min/max) and `meritko` (p / zmereny_rozdil).
p_vice_studii_long <- p_vice_studii %>%
  pivot_longer(
    c("min.p", "max.p", "min.zmereny_rozdil", "max.zmereny_rozdil"),
    names_to = c("typ", "meritko"),
    names_sep = "\\.",
    values_to = "hodnota"
  )

```

```{r}
# For each simulated true effect, draw two frequency polygons over the batches
# of studies: min/max p-value and min/max measured difference.
for (efekt_k_zobrazeni in efekty_pro_vice_studii) {
  plot_name_base <- paste0("vice_studii_", abs(round(efekt_k_zobrazeni, 2)), "_")
  title <- ggtitle(paste0(n_studii, " studií, N: ", n_vice_studii))

  # Extreme p-values; the dashed green line marks p = 0.05.
  plot_name_p <- paste0(plot_name_base, "_p")
  plots[[plot_name_p]] <-
    ggplot(
      filter(p_vice_studii_long, meritko == "p", efekt == efekt_k_zobrazeni),
      aes(x = hodnota, color = typ)
    ) +
    geom_freqpoly(breaks = p_breaks) +
    geom_vline(xintercept = 0.05, color = "green", size = 2, linetype = "dashed") +
    scale_x_continuous("p-hodnota") +
    scale_pocet +
    facet_efekt() +
    title
  widths[[plot_name_p]] <- 6
  print(plots[[plot_name_p]])

  # Extreme measured differences in mmHg; the dashed blue line marks the true
  # effect.
  plot_name_zmereny <- paste0(plot_name_base, "_zmereny_rozdil")
  plots[[plot_name_zmereny]] <-
    ggplot(
      filter(
        p_vice_studii_long,
        meritko == "zmereny_rozdil",
        efekt == efekt_k_zobrazeni,
        !is.infinite(hodnota)
      ),
      aes(x = hodnota * sbp_sd, color = typ)
    ) +
    geom_freqpoly(bins = 10) +
    geom_vline(xintercept = efekt_k_zobrazeni * sbp_sd, color = "blue", size = 2, linetype = "dashed") +
    scale_x_continuous("Změřený rozdíl") +
    scale_pocet +
    facet_efekt() +
    title
  widths[[plot_name_zmereny]] <- 6
  print(plots[[plot_name_zmereny]])
}
```

## Malá změna v datech

```{r}
set.seed(3494994)
# Group 1: 20 simulated SBP readings, rounded to whole numbers and printed
# tab-separated.
g1 <- round(rnorm(n = 20, mean = sbp_baseline, sd = sbp_sd))
cat(paste(g1, collapse = "\t"), "\n\n", sep = "")
# Group 2: the same readings in shuffled order, each lowered by 13.
g2 <- sample(g1, size = 20) - 13
cat(paste(g2, collapse = "\t"), "\n", sep = "")

t.test(g1, g2)

# Same comparison after raising only the last observation of group 2 by 10.
g2_mod <- replace(g2, 20, g2[20] + 10)
t.test(g1, g2_mod)
```

## Konfidenční interval

```{r}
# Text annotations for the CI plots: a printable p-value (three decimals, or
# "p < 0.001" below that) and a fixed-width significance-star column.
p_velikost_efektu <- mutate(
  p_velikost_efektu,
  p_formatted = if_else(
    p < 0.001,
    "p < 0.001",
    paste0("p = ", (round(p, 3)))
  ),
  stars = case_when(
    p > 0.05 ~ "   ",
    p > 0.01 ~ "  *",
    p > 0.001 ~ " **",
    TRUE ~ "***"
  )
)
    #p >= 0.0001 ~ as.cha
  

# Step-by-step build-up of confidence intervals for the introductory sample
# size and the third effect size: an empty frame first, then 1, 2, 3 and 15
# simulated studies.
for (ci_to_show in c(0, 1, 2, 3, 15)) {
  plot_name <- paste0("ci_tanec_prvni_", ci_to_show)

  # A request for 0 intervals produces the frame only: no interval, point or
  # text layers are added, but one data row is still kept for the plot.
  show_intervals <- ci_to_show != 0
  geom1 <- if (show_intervals) geom_linerange() else NULL
  geom2 <- if (show_intervals) geom_point(size = 3) else NULL
  geom3 <- if (show_intervals) {
    geom_text(
      aes(label = paste0(stars, " ", p_formatted)),
      y = 0.5 * sbp_sd, hjust = 0, vjust = 0.5, size = 5, family = "mono"
    )
  } else {
    NULL
  }
  if (!show_intervals) {
    ci_to_show <- 1
  }

  plots[[plot_name]] <- p_velikost_efektu %>%
    filter(n == n_first_plot, efekt == velikost_efektu$efekt[3]) %>%
    head(ci_to_show) %>%
    mutate(id = seq(from = 1, length.out = n())) %>%
    ggplot(aes(x = id, y = zmereny_rozdil * sbp_sd, ymin = conf_low * sbp_sd, ymax = conf_high * sbp_sd)) +
    # Red line: zero difference; dashed blue line: the true effect.
    geom_hline(yintercept = 0, color = "red", size = 2) +
    geom_hline(aes(yintercept = efekt * sbp_sd), color = "blue", linetype = "dashed", size = 2) +
    geom1 +
    geom2 +
    geom3 +
    # Fixed limits so every step of the sequence shares the same frame.
    expand_limits(x = c(1, 15), y = c(-1.5, 1.1) * sbp_sd) +
    facet_efekt() +
    coord_flip() +
    scale_x_continuous("") +
    scale_y_continuous("Změřený rozdíl [mmHg], 95% CI") +
    ggtitle(paste0("N: ", n_first_plot)) +
    theme(
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.line.y = element_blank()
    )

  print(plots[[plot_name]])

}

# One figure per sample size: 15 simulated confidence intervals for every true
# effect, annotated with significance stars and the p-value.
for (n_k_zobrazeni in n_vzorku$n) {
  plot_name <- paste0("ci_tanec_n_", n_k_zobrazeni)

  # Keep the 15 rows with the highest running index within each effect and
  # renumber them 1..15 for the vertical position.
  ci_data <- p_velikost_efektu %>%
    filter(n == n_k_zobrazeni) %>%
    mutate(id_raw = 1:n()) %>%
    group_by(efekt) %>%
    top_n(15, id_raw) %>%
    mutate(id = 1:n()) %>%
    ungroup()

  plots[[plot_name]] <-
    ggplot(
      ci_data,
      aes(x = id, y = zmereny_rozdil * sbp_sd, ymin = conf_low * sbp_sd, ymax = conf_high * sbp_sd)
    ) +
    # Red line: zero difference; dashed blue line: the true effect.
    geom_hline(yintercept = 0, color = "red", size = 2) +
    geom_hline(aes(yintercept = efekt * sbp_sd), color = "blue", linetype = "dashed", size = 2) +
    geom_linerange() +
    geom_point(size = 2) +
    geom_text(
      aes(label = paste0(stars, " ", p_formatted)),
      y = 1.1 * sbp_sd, hjust = 0, vjust = 0.5, size = 2.5, family = "mono"
    ) +
    expand_limits(y = c(-2, 2) * sbp_sd) +
    facet_efekt(scales = "fixed") +
    coord_flip() +
    scale_x_continuous("") +
    scale_y_continuous("Změřený rozdíl [mmHg], 95% CI") +
    ggtitle(paste0("N: ", n_k_zobrazeni)) +
    theme(
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.line.y = element_blank()
    )

  print(plots[[plot_name]])
}


```


## Lineární modely


```{r}
set.seed(3494949)
n <- 200
# Simulated cohort: age uniform on [40, 65]; SBP is normal around a line that
# starts at 130 for age 40 and rises by 9/20 per year, with SD `sbp_sd`.
data_linear_continous <- data.frame(age = runif(n, min = 40, max = 65))
data_linear_continous$SBP <- rnorm(
  n,
  mean = 9/20 * (data_linear_continous$age - 40) + 130,
  sd = sbp_sd
)

# Scatter plot only.
plots$linear_continuous1 <-
  ggplot(data_linear_continous, aes(x = age, y = SBP)) +
  geom_point() +
  scale_y_continuous("SBP [mm Hg]") +
  scale_x_continuous("Věk")

plots$linear_continuous1

# The same scatter plot with a fitted regression line.
plots$linear_continuous2 <- plots$linear_continuous1 + geom_smooth(method = "lm")
plots$linear_continuous2

# Axes swapped: age regressed on SBP.
plots$linear_continuous_inverted <-
  ggplot(data_linear_continous, aes(y = age, x = SBP)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_continuous("SBP [mm Hg]") +
  scale_y_continuous("Věk")
plots$linear_continuous_inverted
```

```{r}
set.seed(234234342)
n_per_group <- 50
# Two simulated arms: control around the baseline SBP, treatment shifted by
# the ACE-vs-placebo effect. Control is drawn first.
sbp_control <- rnorm(n_per_group, mean = sbp_baseline, sd = sbp_sd)
sbp_treatment <- rnorm(n_per_group, mean = sbp_baseline + effect_ace_minus_placebo, sd = sbp_sd)
data_linear_binary <- data.frame(
  Skupina = rep(c(0, 1), each = n_per_group),
  SBP = c(sbp_control, sbp_treatment)
)

# A linear fit on a 0/1 group indicator.
plots$linear_binary <-
  ggplot(data_linear_binary, aes(x = Skupina, y = SBP)) +
  geom_jitter(width = 0.01, height = 0) +
  geom_smooth(method = "lm") +
  scale_y_continuous("SBP [mm Hg]") +
  scale_x_continuous(breaks = c(0, 1), labels = c("Control", "Treatment"))

plots$linear_binary
```



## RCT vs. biasnutá studie


TODO: raději fakt simulovat RCT (dvě skupiny)

```{r}
set.seed(321355422)

# Simulated population of individual changes, centred on the ACE-vs-placebo
# effect with SD `sbp_sd`.
full_population <- rnorm(1e6, effect_ace_minus_placebo, sbp_sd)
# Sampling weight given to members with a non-negative value, relative to
# weight 1 for members with a negative value.
increase_biases <- c(3/2, 4/3, 10/9)
increase_biases_labels <- c("3:2", "4:3", "10:9")

# Biased studies: for every bias level, draw `n_sims` large weighted samples
# and store the squared error of the sample mean against the true effect.
biased_study_population_size <- 1e5
biased_study_errors <- numeric(n_sims * length(increase_biases))
all_biases <- numeric(n_sims * length(increase_biases))
for (bias_index in seq_along(increase_biases)) {
  bias <- increase_biases[bias_index]
  full_population_probs <- if_else(full_population < 0, 1, bias)
  for (i in seq_len(n_sims)) {
    biased_sample <- sample(full_population, size = biased_study_population_size, replace = TRUE, prob = full_population_probs)
    index <- (bias_index - 1) * n_sims + i
    biased_study_errors[index] <- (mean(biased_sample) - effect_ace_minus_placebo)^2
    all_biases[index] <- bias
  }
}

# Unbiased studies: squared error of the mean of `sample_size` zero-centred
# draws, for sample sizes 5, 10, ..., 1280; summarised per sample size as the
# root of the 2.5% quantile, mean and 97.5% quantile of the squared error.
rct_results <- data.frame(sample_size = 5 * (2 ^ (0:8))) %>%
  crossing(sim = 1:n_sims) %>%
  rowwise() %>%
  mutate(error = mean(rnorm(sample_size, 0, sbp_sd)) ^ 2) %>%
  group_by(sample_size) %>%
  summarise(
    low_error = sqrt(quantile(error, 0.025)),
    mean_error = sqrt(mean(error)),
    high_error = sqrt(quantile(error, 0.975)),
    .groups = "drop"
  )

# The same three summaries for the biased studies, per bias level.
biased_results <- data.frame(bias = all_biases, study_error = biased_study_errors) %>%
  group_by(bias) %>%
  summarise(
    low_error = sqrt(quantile(study_error, 0.025)),
    mean_error = sqrt(mean(study_error)),
    high_error = sqrt(quantile(study_error, 0.975)),
    .groups = "drop"
  )
# %>% 
#   crossing(data.frame(sample_size = unique(rct_results$sample_size)))


```


```{r}
# Root-mean-square error of the unbiased study as a function of sample size.
plots$rct_error <-
  rct_results %>%
  ggplot(aes(x = sample_size, y = mean_error)) + #, ymin = low_error, ymax = high_error
  #geom_ribbon(alpha = 0.3) +
  geom_line() +
  #  geom_ribbon(data = biased_results, fill = "blue", alpha = 0.3) +
  # Axis label typo fixed: "Půměrná" -> "Průměrná".
  scale_y_continuous("Průměrná chyba RCT [mmHg]") +
  scale_x_continuous("Velikost vzorku RCT")

plots$rct_error

hline_size <- 2
# NOTE(review): `pal` is not referenced by any code visible in this notebook.
pal <- viridisLite::plasma(3)

# Overlay one horizontal line per bias level at the biased studies'
# root-mean-square error.
plots$rct_error_biased <-
  plots$rct_error +
  geom_hline(
    data = biased_results,
    aes(
      yintercept = mean_error,
      color = factor(bias, levels = increase_biases, labels = increase_biases_labels)
    ),
    size = hline_size
  ) +
  scale_color_viridis_d("Zhoršení:zlepšení")

plots$rct_error_biased

# For every bias level: the largest unbiased sample size whose error is still
# above the biased studies' error.
biased_results %>%
  transmute(bias = bias, biased_mean_error = mean_error) %>%
  crossing(rct_results) %>%
  filter(biased_mean_error < mean_error) %>%
  group_by(bias) %>%
  summarise(max(sample_size))

```

Láme se to u cca 80 pacientů v RCT

## Conditioning on a collider

```{r, fig.width= 6, fig.height=5}
# Fixed seed so the scatter is reproducible.
set.seed(56852266)
# 200 simulated points with two independently drawn standard-normal columns:
# `pohledny` (drawn first) and `laskavy`.
data_pohledny <- data.frame(pohledny = rnorm(200), laskavy = rnorm(200))

# Scatter plot of `data_pohledny` (pohledny vs. laskavy), optionally
# highlighting one half of the data split by the sign of pohledny + laskavy.
#
# split  "none" shows all points at (almost) full opacity; "up" highlights
#        points with pohledny + laskavy > 0; "down" highlights points with
#        pohledny + laskavy < 0. Other points are faded. Any other value is
#        rejected by match.arg() with an error listing the valid choices.
# trend  if TRUE, add a linear fit computed on the highlighted points only.
#
# Returns the ggplot object.
plot_pohledny <- function(split = "none", trend = FALSE) {
  split <- match.arg(split, c("none", "up", "down"))

  # `group` is 1 for highlighted points and 0 for faded ones.
  data_to_plot <- switch(split,
    none = data_pohledny %>% mutate(group = 1),
    up = data_pohledny %>% mutate(group = if_else(pohledny + laskavy > 0, 1, 0)),
    down = data_pohledny %>% mutate(group = if_else(pohledny + laskavy < 0, 1, 0))
  )

  # Without a split the alpha range is practically constant; guide = "none"
  # replaces the deprecated guide = FALSE.
  alpha_range <- if (split == "none") c(0.999, 1) else c(0.2, 1)
  alpha_scale <- scale_alpha_continuous(range = alpha_range, guide = "none")

  if (trend) {
    smooth <- geom_smooth(
      aes(x = pohledny, y = laskavy),
      inherit.aes = FALSE,
      data = data_to_plot %>% filter(group > 0),
      method = "lm",
      formula = y ~ x
    )
  } else {
    smooth <- NULL
  }

  data_to_plot %>%
    ggplot(aes(x = pohledny, y = laskavy, alpha = group)) +
    scale_x_continuous("Pohledný", breaks = c(-1.5, 1.5), labels = c("Málo", "Hodně")) +
    scale_y_continuous("Laskavý", breaks = c(-1.5, 1.5), labels = c("Málo", "Hodně")) +
    geom_vline(xintercept = 0) +
    geom_hline(yintercept = 0) +
    geom_point(color = viridisLite::plasma(3)[2]) +
    smooth +
    alpha_scale +
    theme(axis.line = element_blank(), axis.ticks = element_blank())
}

# Six collider figures, numbered in presentation order: each split ("none",
# "up", "down") first without and then with the trend line. Every figure is
# registered, printed and given a 6 x 5 export size.
collider_variants <- list(
  list(split = "none", trend = FALSE),
  list(split = "none", trend = TRUE),
  list(split = "up", trend = FALSE),
  list(split = "up", trend = TRUE),
  list(split = "down", trend = FALSE),
  list(split = "down", trend = TRUE)
)
for (variant_id in seq_along(collider_variants)) {
  variant <- collider_variants[[variant_id]]
  plot_name <- paste0("laskavy_pohledny_", variant_id)
  plots[[plot_name]] <- plot_pohledny(variant$split, variant$trend)
  print(plots[[plot_name]])
  widths[[plot_name]] <- 6
  heights[[plot_name]] <- 5
}
```

## Regression to the mean

```{r}
set.seed(99852665)
n_obs <- 100
# Two independent BMI measurements per person, drawn from the same
# distribution (mean 30, sd 2): there is no real change between them.
# `selected` flags people whose first measurement exceeded 32; `improved`
# records whether the second measurement was lower than the first.
data_regression_to_the_mean <- data.frame(
  id = seq_len(n_obs),
  before = rnorm(n_obs, mean = 30, sd = 2),
  after = rnorm(n_obs, mean = 30, sd = 2)
) %>%
  mutate(
    selected = as.numeric(before > 32),
    improved = factor(before > after, levels = c(TRUE, FALSE), labels = c("Ano", "Ne"))
  ) %>%
  pivot_longer(c("before", "after"), names_to = "group", values_to = "BMI") %>%
  mutate(group = factor(group, levels = c("before", "after"), labels = c("Před zásahem", "Po zásahu")))

# One line per person connecting the two measurements.
base_rtm_plot <- data_regression_to_the_mean %>%
  ggplot(aes(x = group, y = BMI, alpha = selected, color = improved, group = id)) +
  geom_line() +
  geom_point() +
  scale_x_discrete("Skupina") +
  scale_color_discrete("Snížení BMI")

# Everyone at (almost) equal opacity. guide = "none" replaces the deprecated
# guide = FALSE.
plots$regression_to_the_mean <- base_rtm_plot + scale_alpha(range = c(0.9999, 1), guide = "none")
plots$regression_to_the_mean

# Only the people selected for a high first measurement stand out.
plots$regression_to_the_mean_highlight <- base_rtm_plot + scale_alpha(range = c(0.2, 1), guide = "none")
plots$regression_to_the_mean_highlight



```


```{r}
# Export every registered plot as a 300 dpi PNG into the "local_tmp_data"
# directory resolved by here::here(), creating the directory when missing.
tmp_dir <- here::here("local_tmp_data")
if (!dir.exists(tmp_dir)) {
  dir.create(tmp_dir)
}

for (plot_name in names(plots)) {
  # Per-plot overrides from `widths` / `heights`; otherwise the width is 8 and
  # the height is roughly two thirds of the width.
  width <- widths[[plot_name]]
  if (is.null(width)) {
    width <- 8
  }
  height <- heights[[plot_name]]
  if (is.null(height)) {
    height <- width * 0.6666666666667
  }
  ggsave(
    filename = file.path(tmp_dir, paste0(plot_name, ".png")),
    plot = plots[[plot_name]],
    width = width,
    height = height,
    dpi = 300
  )
}
```