From 200e5435d37680c7a80dc3ca564f78924ca0d9fc Mon Sep 17 00:00:00 2001
From: Nathan Gaddis <ngaddis@rti.org>
Date: Thu, 21 Nov 2024 19:26:32 +0000
Subject: [PATCH] Updated run_gxe.R to remove chunking

---
 pulmonary_longitudinal_gxe/v1/Dockerfile |   1 +
 pulmonary_longitudinal_gxe/v1/run_gxe.R  | 379 +++++++++++------------
 2 files changed, 175 insertions(+), 205 deletions(-)

diff --git a/pulmonary_longitudinal_gxe/v1/Dockerfile b/pulmonary_longitudinal_gxe/v1/Dockerfile
index a7bacc3..211d6f4 100644
--- a/pulmonary_longitudinal_gxe/v1/Dockerfile
+++ b/pulmonary_longitudinal_gxe/v1/Dockerfile
@@ -20,3 +20,4 @@ RUN Rscript -e 'BiocManager::install(c("dbplyr"), ask = FALSE)'
 # Set wrkdir
 RUN mkdir /data
 WORKDIR /data
+
diff --git a/pulmonary_longitudinal_gxe/v1/run_gxe.R b/pulmonary_longitudinal_gxe/v1/run_gxe.R
index 58e3860..6e9d409 100644
--- a/pulmonary_longitudinal_gxe/v1/run_gxe.R
+++ b/pulmonary_longitudinal_gxe/v1/run_gxe.R
@@ -9,25 +9,18 @@ library(optparse)
 
 option_list = list(
     make_option(
-        c('--work-dir'),
+        c('--file-bgen'),
         action='store',
         default=NULL,
         type='character',
-        help="Path to working directory (required)"
-    ),
-    make_option(
-        c('--file-rds'),
-        action='store',
-        default=NULL,
-        type='character',
-        help="Path to rds file containing imputed genotypes (required)"
+        help="Path to bgen file containing imputed genotypes (required)"
     ),
     make_option(
         c('--file-bgi'),
         action='store',
         default=NULL,
         type='character',
-        help="Path to bgen info file (required)"
+        help="Path to bgen index file (required)"
     ),
     make_option(
         c('--file-sample'),
@@ -43,13 +36,6 @@ option_list = list(
         type='character',
         help="Path to phenotype file (required)"
     ),
-    make_option(
-        c('--chr'),
-        action='store',
-        default=NULL,
-        type='character',
-        help="Label to use for chromosome being analyzed (optional)"
-    ),
     make_option(
         c('--ancestry'),
         action='store',
@@ -71,19 +57,19 @@ option_list = list(
         type='character',
         help="Omega-3 to use for interaction (required)"
     ),
-    make_option(
-        c('--chunk-size'),
-        action='store',
-        default=10,
-        type='integer',
-        help="# of variants in each chunk"
-    ),
     make_option(
         c('--file-out-prefix'),
         action='store',
         default=NULL,
         type='character',
         help="Path to output file (required)"
+    ),
+    make_option(
+        c('--ncores'),
+        action='store',
+        default=1,
+        type='integer',
+        help="# of cores to use"
     )
 )
 
@@ -93,220 +79,203 @@ get_arg = function(args, parameter) {
     return(args[parameter][[1]])
 }
 
-work_dir = get_arg(args, 'work-dir')
-if (nchar(work_dir) > 0 && substr(work_dir, nchar(work_dir), nchar(work_dir)) != '/') {
-    work_dir = paste0(work_dir, '/')
-}
-chr = get_arg(args, "chr")
-pheno = get_arg(args, "pheno")
+file_bgen = get_arg(args, "file-bgen")
+file_bgi = get_arg(args, "file-bgi")
+file_sample = get_arg(args, "file-sample")
+file_pheno = get_arg(args, "file-pheno")
 ancestry = get_arg(args, "ancestry")
+pheno = get_arg(args, "pheno")
 omega3 = get_arg(args, "omega3")
 file_out_prefix = get_arg(args, "file-out-prefix")
+ncores = get_arg(args, "ncores")
 
 # Read sample file
 print("Reading sample file")
 samp_file = read.table(
-    get_arg(args, "file-sample"),
+    file_sample,
     header = TRUE
 )
 
 # Read pheno file
 print("Reading pheno file")
 pheno_file = read.table(
-    get_arg(args, "file-pheno"),
+    file_pheno,
     header = TRUE
 )
+
+# Get list of samples in both pheno and geno files
 with_geno_pheno = subset(pheno_file, pheno_file$IID %in% samp_file$ID_1)
 with_geno_pheno$FID_IID = paste0(with_geno_pheno$FID, "_", with_geno_pheno$IID)
 
-## Import bigsnp file 
-print("Reading rds file")
-ukb.bigSNP = bigsnpr::snp_attach(get_arg(args, "file-rds"))
-
-## Split imputed data into chunks for parallelizing
-info = bigsnpr::snp_readBGI(get_arg(args, "file-bgi"))
+## Read index file
+print("Reading bgi file")
+info = snp_readBGI(file_bgi)
 all_snp_ids = list(with(info, paste(chromosome, position, allele1, allele2, sep = "_")))
-snp_count = length(all_snp_ids[[1]])
-num_snps_vector = seq(1:snp_count)
-chunk_count = ceiling(snp_count / get_arg(args, "chunk-size"))
-get_chunk = function(x,n){
-    asdf = split(x, cut(seq_along(x), n, labels = FALSE))
-    return(asdf)
-}
-
-list_chunks = get_chunk(num_snps_vector, chunk_count)
-for (chunk in 1:chunk_count) {
-    chunk_to_run = list_chunks[[chunk]]
-    start = chunk_to_run[1]
-    end = chunk_to_run[length(chunk_to_run)]
-    index_snps = start:end
 
-    ## Subset bigsnpr file to chunk. Note this creates a _sub file so make sure to delete afterwards
-    rdsfile = bigsnpr::snp_subset(
-        ukb.bigSNP,
-        ind.col = index_snps,
-        backingfile = sprintf("%schr_%s_chunk_%s", work_dir, chr, chunk)
-    )
-    chunk_subset.bigSNP = bigsnpr::snp_attach(rdsfile)
+## Read bgen file
+print("Reading bgen file")
+rds = snp_readBGEN(
+    bgenfiles = file_bgen,
+    backingfile = sprintf("%s.bgen", file_out_prefix),
+    list_snp_id = all_snp_ids,
+    ncores = ncores
+)
+ukb.bigSNP = bigsnpr::snp_attach(rds)
 
-    # Get chunk genotype matrix
-    snp_ids = chunk_subset.bigSNP$map$rsid
-    chunk_genotypes = as.data.frame(chunk_subset.bigSNP$genotypes[])
-    colnames(chunk_genotypes) = snp_ids
-    chunk_genotypes$IID = samp_file[-1,]$ID_1
+# Get genotype matrix
+snp_ids = ukb.bigSNP$map$rsid
+genotypes = as.data.frame(ukb.bigSNP$genotypes[])
+colnames(genotypes) = snp_ids
+genotypes$IID = samp_file[-1,]$ID_1
 
-    ## Create df with phenotype and chunk of genotypes to run
-    merged = merge(with_geno_pheno, chunk_genotypes, by = 'IID')
+## Create df with phenotype and chunk of genotypes to run
+merged = merge(with_geno_pheno, genotypes, by = 'IID')
 
-    ## Convert time to time in years, as well as PFT to ML rather than L (this is how Bonnie had run things for her PFT GWAS paper)
-    merged$time_years = merged$time_since_baseline / 365
-    merged$FEV1 = merged$FEV1 * 1000
-    merged$FVC = merged$FVC * 1000
-    merged$FEV1_FVC_ratio = merged$FEV1/merged$FVC
+## Convert time to time in years, as well as PFT to ML rather than L (this is how Bonnie had run things for her PFT GWAS paper)
+merged$time_years = merged$time_since_baseline / 365
+merged$FEV1 = merged$FEV1 * 1000
+merged$FVC = merged$FVC * 1000
+merged$FEV1_FVC_ratio = merged$FEV1/merged$FVC
 
-    ## Subset to ancestry
-    final_data = subset(merged, merged$anc == ancestry) #Subset to eurs for now
+## Subset to ancestry
+final_data = subset(merged, merged$anc == ancestry) #Subset to eurs for now
 
-    #Fit models
-    gwas_1df_inter = as.data.frame(matrix(nrow=length(snp_ids), ncol=6))
-    gwas_2df_time = as.data.frame(matrix(nrow=length(snp_ids), ncol=3))
-    gwas_2df_n3 = as.data.frame(matrix(nrow=length(snp_ids), ncol=3))
+#Fit models
+gwas_1df_inter = as.data.frame(matrix(nrow=length(snp_ids), ncol=6))
+gwas_2df_time = as.data.frame(matrix(nrow=length(snp_ids), ncol=3))
+gwas_2df_n3 = as.data.frame(matrix(nrow=length(snp_ids), ncol=3))
 
-    i=1
-    for(snp in snp_ids){
-        # Compute maf
-        maf = sum(final_data[[snp]]) / (2*length(final_data[[snp]]))
+i=1
+for(snp in snp_ids){
+    # Compute maf
+    maf = sum(final_data[[snp]]) / (2*length(final_data[[snp]]))
+    if(maf < 0.001){
+        i = i + 1
+        print(sprintf("%s maf %s too low, skipped", snp, maf))
+        next
+    } else {
         print(snp)
-        if(maf < 0.001){
-          i = i + 1
-          print(sprintf("%s maf %s too low, skipped", snp, maf))
-          next
-        }
-        
-        # 0) Fit model that will be used to test each hypothesis. If pheno is FVC then include weight as a covariate
-        skip_to_next = FALSE
-        
-        if(pheno == 'FVC'){
-            tryCatch(
-                {
-                    mod = lmerTest::lmer(
-                        sprintf('%s ~ CURRENT_SMOKER + FORMER_SMOKER + AGE + AGE2 + SEX + HEIGHT + HEIGHT2 + WEIGHT +
-                                                time_years + `%s` + %s + `%s`*time_years + `%s`*%s + `%s`*time_years*%s + 
-                                                PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
-                                                (1+time_years|IID)', pheno, snp, omega3, snp, snp, omega3, snp, omega3),
-                        data = final_data,
-                        control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
-                    )
-                },
-                error = function(e) { skip_to_next = TRUE}
-            )
-        } else {
-            tryCatch(
-                {
-                    mod = lmerTest::lmer(
-                        sprintf('%s ~ CURRENT_SMOKER + FORMER_SMOKER + AGE + AGE2 + SEX + HEIGHT + HEIGHT2 +
-                                                time_years + `%s` + %s + `%s`*time_years + `%s`*%s + `%s`*time_years*%s + 
-                                                PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
-                                                (1+time_years|IID)', pheno, snp, omega3, snp, snp, omega3, snp, omega3), 
-                        data = final_data,
-                        control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5))
-                    )
-                },
-                error = function(e) { skip_to_next = TRUE}
-            )
-        }
-        
-        if(skip_to_next == TRUE) {
-            print("Can't fit initial model. Skipping SNP")
-            gwas_1df_inter[i,] = c(snp, NA, NA, NA, NA, NA)
-            gwas_2df_time[i,] = c(snp, NA, NA)
-            gwas_2df_n3[i,] = c(snp, NA, NA)
-            next
-        } 
-        
-        mod_coefs = summary(mod)$coefficients
-        cov = vcov(mod)
-        
-        # 1) test SNP:w3:time using lmerTest/lmer function
-        if((sprintf("time_years:`%s`:%s",snp, omega3) %in% rownames(mod_coefs)) | (sprintf("time_years:%s:%s",snp, omega3) %in% rownames(mod_coefs))){
-            snp_n3_time_coefs = mod_coefs[grep(sprintf("time_years:`%s`:%s|time_years:%s:%s",snp, omega3, snp, omega3), rownames(mod_coefs)), ] #5 cols
-            gwas_1df_inter[i,] = c(snp, snp_n3_time_coefs)
-        } else {
-            gwas_1df_inter[i,] = c(snp, NA, NA, NA, NA, NA)
-            print("Skipped interaction test")
-        }
-        
-        # 2) test SNP:time and SNP:w3:time 
-        #Get fixed effect estimates
-        snp_time_coefs = as.matrix(
-            mod_coefs[grep(sprintf("time_years:%s:%s|time_years:%s|time_years:`%s`:%s|time_years:`%s`",snp, omega3, snp, snp, omega3, snp), rownames(mod_coefs)), ]
+    }
+    
+    # 0) Fit model that will be used to test each hypothesis. If pheno is FVC then include weight as a covariate
+    skip_to_next = FALSE
+    
+    if(pheno == 'FVC'){
+        tryCatch(
+            {
+                mod = lmerTest::lmer(
+                    sprintf('%s ~ CURRENT_SMOKER + FORMER_SMOKER + AGE + AGE2 + SEX + HEIGHT + HEIGHT2 + WEIGHT +
+                                            time_years + `%s` + %s + `%s`*time_years + `%s`*%s + `%s`*time_years*%s + 
+                                            PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
+                                            (1+time_years|IID)', pheno, snp, omega3, snp, snp, omega3, snp, omega3),
+                    data = final_data,
+                    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
+                )
+            },
+            error = function(e) { skip_to_next = TRUE}
         )
-        
-        if(dim(snp_time_coefs)[1] == 2){ #Check if two terms. If not, sometimes will be 5x1 or other
-            snp_time_betas = snp_time_coefs[,1]
-            snp_time_cov_betas = cov[grep(sprintf("time_years:%s:%s|time_years:%s|time_years:`%s`:%s|time_years:`%s`",snp, omega3, snp, snp, omega3, snp), rownames(cov)), 
-                                      grep(sprintf("time_years:%s:%s|time_years:%s|time_years:`%s`:%s|time_years:`%s`",snp, omega3, snp, snp, omega3, snp), colnames(cov))]
-            
-            #Compute wald test 
-            snp_time_wald_stat = t(snp_time_betas) %*% solve(snp_time_cov_betas) %*% snp_time_betas
-            snp_time_wald_pval = pchisq(as.numeric(snp_time_wald_stat), 2, lower.tail = FALSE) #2 df test
-            gwas_2df_time[i,] = c(snp, as.numeric(snp_time_wald_stat), as.numeric(snp_time_wald_pval))
-        } else {
-            gwas_2df_time[i,] = c(snp, NA, NA)
-            print("Skipped joint time test")
-        }
-        
-        # 3) test SNP:w3 and SNP:w3:time 
-        snp_n3_coefs = as.matrix(
-            mod_coefs[grep(sprintf("time_years:%s:%s|%s:%s|time_years:`%s`:%s|`%s`:%s", snp, omega3, snp, omega3, snp, omega3, snp, omega3), rownames(mod_coefs)), ]
+    } else {
+        tryCatch(
+            {
+                mod = lmerTest::lmer(
+                    sprintf('%s ~ CURRENT_SMOKER + FORMER_SMOKER + AGE + AGE2 + SEX + HEIGHT + HEIGHT2 +
+                                            time_years + `%s` + %s + `%s`*time_years + `%s`*%s + `%s`*time_years*%s + 
+                                            PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
+                                            (1+time_years|IID)', pheno, snp, omega3, snp, snp, omega3, snp, omega3), 
+                    data = final_data,
+                    control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5))
+                )
+            },
+            error = function(e) { skip_to_next = TRUE}
         )
+    }
+    
+    if(skip_to_next == TRUE) {
+        print("Can't fit initial model. Skipping SNP")
+        gwas_1df_inter[i,] = c(snp, NA, NA, NA, NA, NA)
+        gwas_2df_time[i,] = c(snp, NA, NA)
+        gwas_2df_n3[i,] = c(snp, NA, NA)
+        next
+    } 
+    
+    mod_coefs = summary(mod)$coefficients
+    cov = vcov(mod)
+    
+    # 1) test SNP:w3:time using lmerTest/lmer function
+    if((sprintf("time_years:`%s`:%s",snp, omega3) %in% rownames(mod_coefs)) | (sprintf("time_years:%s:%s",snp, omega3) %in% rownames(mod_coefs))){
+        snp_n3_time_coefs = mod_coefs[grep(sprintf("time_years:`%s`:%s|time_years:%s:%s",snp, omega3, snp, omega3), rownames(mod_coefs)), ] #5 cols
+        gwas_1df_inter[i,] = c(snp, snp_n3_time_coefs)
+    } else {
+        gwas_1df_inter[i,] = c(snp, NA, NA, NA, NA, NA)
+        print("Skipped interaction test")
+    }
+    
+    # 2) test SNP:time and SNP:w3:time 
+    #Get fixed effect estimates
+    snp_time_coefs = as.matrix(
+        mod_coefs[grep(sprintf("time_years:%s:%s|time_years:%s|time_years:`%s`:%s|time_years:`%s`",snp, omega3, snp, snp, omega3, snp), rownames(mod_coefs)), ]
+    )
+    
+    if(dim(snp_time_coefs)[1] == 2){ #Check if two terms. If not, sometimes will be 5x1 or other
+        snp_time_betas = snp_time_coefs[,1]
+        snp_time_cov_betas = cov[grep(sprintf("time_years:%s:%s|time_years:%s|time_years:`%s`:%s|time_years:`%s`",snp, omega3, snp, snp, omega3, snp), rownames(cov)), 
+                                    grep(sprintf("time_years:%s:%s|time_years:%s|time_years:`%s`:%s|time_years:`%s`",snp, omega3, snp, snp, omega3, snp), colnames(cov))]
         
-        if(dim(snp_n3_coefs)[1] == 2){
-            snp_n3_betas = (snp_n3_coefs[,1])
-            snp_n3_cov_betas = cov[grep(sprintf("time_years:%s:%s|%s:%s|time_years:`%s`:%s|`%s`:%s",snp, omega3, snp, omega3, snp, omega3, snp, omega3), rownames(cov)), 
-                                    grep(sprintf("time_years:%s:%s|%s:%s|time_years:`%s`:%s|`%s`:%s",snp, omega3, snp, omega3, snp, omega3, snp, omega3), colnames(cov))]
-            
-            #Compute wald test 
-            snp_n3_wald_stat = t(snp_n3_betas) %*% solve(snp_n3_cov_betas) %*% snp_n3_betas
-            snp_n3_wald_pval = pchisq(as.numeric(snp_n3_wald_stat), 2, lower.tail = FALSE) #2 df test
-            gwas_2df_n3[i,] = c(snp, as.numeric(snp_n3_wald_stat), as.numeric(snp_n3_wald_pval))
-        } else{
-            gwas_2df_n3[i,] = c(snp, NA, NA)
-            print("Skipped joint omega3 test")
-        }
-        
-        i = i + 1
+        #Compute wald test 
+        snp_time_wald_stat = t(snp_time_betas) %*% solve(snp_time_cov_betas) %*% snp_time_betas
+        snp_time_wald_pval = pchisq(as.numeric(snp_time_wald_stat), 2, lower.tail = FALSE) #2 df test
+        gwas_2df_time[i,] = c(snp, as.numeric(snp_time_wald_stat), as.numeric(snp_time_wald_pval))
+    } else {
+        gwas_2df_time[i,] = c(snp, NA, NA)
+        print("Skipped joint time test")
     }
-
-    #Add on column names and p value
-    colnames(gwas_1df_inter) = c("rsid", "est", "std_err", "df", "t_value", "p")
-    colnames(gwas_2df_time) = c("rsid", "time_2df_chi_sq", "time_2df_p")
-    colnames(gwas_2df_n3) = c("rsid", "n3_2df_chi_sq", "n3_2df_p")
-
-    #Combine output into one df 
-    all_output = merge(
-        merge(
-            gwas_1df_inter[complete.cases(gwas_1df_inter), ],
-            gwas_2df_time[complete.cases(gwas_2df_time), ],
-            by = 'rsid')
-        ,
-        gwas_2df_n3[complete.cases(gwas_2df_n3), ], by = 'rsid'
+    
+    # 3) test SNP:w3 and SNP:w3:time 
+    snp_n3_coefs = as.matrix(
+        mod_coefs[grep(sprintf("time_years:%s:%s|%s:%s|time_years:`%s`:%s|`%s`:%s", snp, omega3, snp, omega3, snp, omega3, snp, omega3), rownames(mod_coefs)), ]
     )
+    
+    if(dim(snp_n3_coefs)[1] == 2){
+        snp_n3_betas = (snp_n3_coefs[,1])
+        snp_n3_cov_betas = cov[grep(sprintf("time_years:%s:%s|%s:%s|time_years:`%s`:%s|`%s`:%s",snp, omega3, snp, omega3, snp, omega3, snp, omega3), rownames(cov)), 
+                                grep(sprintf("time_years:%s:%s|%s:%s|time_years:`%s`:%s|`%s`:%s",snp, omega3, snp, omega3, snp, omega3, snp, omega3), colnames(cov))]
+        
+        #Compute wald test 
+        snp_n3_wald_stat = t(snp_n3_betas) %*% solve(snp_n3_cov_betas) %*% snp_n3_betas
+        snp_n3_wald_pval = pchisq(as.numeric(snp_n3_wald_stat), 2, lower.tail = FALSE) #2 df test
+        gwas_2df_n3[i,] = c(snp, as.numeric(snp_n3_wald_stat), as.numeric(snp_n3_wald_pval))
+    } else{
+        gwas_2df_n3[i,] = c(snp, NA, NA)
+        print("Skipped joint omega3 test")
+    }
+    
+    i = i + 1
+}
 
-    # Merge back on chr and pos from map
-    final_output = merge(chunk_subset.bigSNP$map[,c("chromosome", "rsid", "physical.pos", "allele1", "allele2", "freq", "info")], all_output)
+#Add on column names and p value
+colnames(gwas_1df_inter) = c("rsid", "est", "std_err", "df", "t_value", "p")
+colnames(gwas_2df_time) = c("rsid", "time_2df_chi_sq", "time_2df_p")
+colnames(gwas_2df_n3) = c("rsid", "n3_2df_chi_sq", "n3_2df_p")
 
-    # Output 
-    write.table(
-        final_output,
-        sprintf("%s_chunk_%s.tsv", get_arg(args, "file-out-prefix"), chunk),
-        col.names=TRUE,
-        row.names=FALSE,
-        quote=FALSE,
-        sep="\t"
-    )
+#Combine output into one df 
+all_output = merge(
+    merge(
+        gwas_1df_inter[complete.cases(gwas_1df_inter), ],
+        gwas_2df_time[complete.cases(gwas_2df_time), ],
+        by = 'rsid')
+    ,
+    gwas_2df_n3[complete.cases(gwas_2df_n3), ], by = 'rsid'
+)
 
-    # Remove temp file  
-    system(sprintf("rm %schr_%s_chunk_%s.*", work_dir, chr, chunk))
+# Merge back on chr and pos from map
+final_output = merge(ukb.bigSNP$map[,c("chromosome", "rsid", "physical.pos", "allele1", "allele2", "freq", "info")], all_output)
 
-}
+# Output 
+write.table(
+    final_output,
+    sprintf("%s.tsv", file_out_prefix),
+    col.names=TRUE,
+    row.names=FALSE,
+    quote=FALSE,
+    sep="\t"
+)