From d0ecc46c1d6ef8132eba6bc3796768bf396fd818 Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 10:19:05 -0700
Subject: [PATCH 1/7] estimate_sigma now uses Pan-UKB approach

---
 src/make_hdf5.jl | 49 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index 36ad2807..cc4962b7 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -1,24 +1,39 @@
-function estimate_sigma(X::AbstractMatrix; min_eigval=1e-5)
-    # n, p = size(X)
-    # if n > p
-    #     Sigma = cor(X)
-    # else
-    #     error("p > n case not handled yet!")
-    # end
-
-    # shrinkage based covariance, handles high dimensional case
-    Sigma = cov(LinearShrinkage(DiagonalUnequalVariance(), :lw), X)
-
-    # ensure Sigma is PSD
-    evals, evecs = eigen(Sigma)
-    evals[findall(x -> x < min_eigval, evals)] .= min_eigval
-    Sigma = evecs * Diagonal(evals) * evecs'
-    Statistics.cov2cor!(Sigma, sqrt.(diag(Sigma))) # scale to correlation matrix
+"""
+    estimate_sigma(X::AbstractMatrix, Z::AbstractMatrix; [enforce_psd=true],
+    [min_eigval=1e-5])
+
+Estimate LD matrices from data `X` and covariates `C`. We adopt the method
+for Pan-UKB described here:
+https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices.
+If `enforce_psd=true`, then the correlation matrix will be scaled so that the
+minimum eigenvalue is `min_eigval`.
+"""
+function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
+    enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
+    # check for errors
+    n = size(X, 1)
+    n == size(C, 1) || error("Samples in X and C should be the same")
+    all(x -> isapprox(x, 0, atol=1e-12), mean(X, dims=1)) || error(
+        "Columns of X must be scaled to mean 0 variance 1.")
+
+    # pan-ukb routine
+    Mc = I - C * inv(Symmetric(C' * C)) * C'
+    Xadj = Mc * X
+    Sigma = Xadj' * Xadj / n
+
+    # numerical stability
+    if enforce_psd
+        evals, evecs = eigen(Sigma)
+        evals[findall(x -> x < min_eigval, evals)] .= min_eigval
+        Sigma = evecs * Diagonal(evals) * evecs'
+    end
+
+    # scale to correlation matrix
+    StatsBase.cov2cor!(Sigma, sqrt.(diag(Sigma)))
     return Sigma
 end
-
 """
     get_block(file::String, chr::Int, start_bp::Int, end_bp::Int;
     [min_maf::Float64=0.01], [min_hwe::Float64=0.0],

From 98f6f748567f475b0fbed45e721a1bb268a81e5e Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 10:29:16 -0700
Subject: [PATCH 2/7] estimate_sigma handles the case with no covariates

---
 src/make_hdf5.jl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index cc4962b7..3c0457d5 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -1,10 +1,11 @@
 """
-    estimate_sigma(X::AbstractMatrix, Z::AbstractMatrix; [enforce_psd=true],
+    estimate_sigma(X::AbstractMatrix; [enforce_psd=true], [min_eigval=1e-5])
+    estimate_sigma(X::AbstractMatrix, C::AbstractMatrix; [enforce_psd=true],
     [min_eigval=1e-5])
 
-Estimate LD matrices from data `X` and covariates `C`. We adopt the method
-for Pan-UKB described here:
-https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices.
+Estimate LD matrices from data `X`, accounting for covariates `C` if there are any.
+We adopt the method for Pan-UKB described in
+`https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices`.
 If `enforce_psd=true`, then the correlation matrix will be scaled so that the
 minimum eigenvalue is `min_eigval`.
 """
@@ -17,7 +18,7 @@ function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
         "Columns of X must be scaled to mean 0 variance 1.")
 
     # pan-ukb routine
-    Mc = I - C * inv(Symmetric(C' * C)) * C'
+    Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
     Xadj = Mc * X
     Sigma = Xadj' * Xadj / n
 
@@ -33,6 +34,8 @@
     return Sigma
 end
 
+estimate_sigma(X; enforce_psd::Bool=true, min_eigval::Float64 = 1e-5) =
+    estimate_sigma(X, zeros(size(X, 1), 0); enforce_psd=enforce_psd, min_eigval=min_eigval)
 """
     get_block(file::String, chr::Int, start_bp::Int, end_bp::Int;

From 03e7b967e5ab7af34c510245979f953b59a5b0ae Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 10:32:13 -0700
Subject: [PATCH 3/7] estimate_sigma now center/scales X internally

---
 src/make_hdf5.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index 3c0457d5..696cd2d1 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -13,11 +13,10 @@ function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
     enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
     # check for errors
     n = size(X, 1)
-    n == size(C, 1) || error("Samples in X and C should be the same")
-    all(x -> isapprox(x, 0, atol=1e-12), mean(X, dims=1)) || error(
-        "Columns of X must be scaled to mean 0 variance 1.")
+    n == size(C, 1) || error("Sample size in X and C should be the same")
 
     # pan-ukb routine
+    zscore!(X, mean(X, dims=1), std(X, dims=1))
     Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
     Xadj = Mc * X
     Sigma = Xadj' * Xadj / n

From 2d2a81ee34dfc26b3ad07f0829a18d027c8f74f7 Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 13:39:55 -0700
Subject: [PATCH 4/7] new function get_covariates

---
 src/app.jl       | 22 ++++++++++++++++----
 src/make_hdf5.jl | 54 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/src/app.jl b/src/app.jl
index 1fc79ccf..7822d3db 100644
--- a/src/app.jl
+++ b/src/app.jl
@@ -186,13 +186,14 @@ end
 function julia_solveblock()::Cint
     try
         # read command line arguments
-        file, chr, start_bp, end_bp, outdir, hg_build, tol, min_maf,
+        file, chr, start_bp, end_bp, outdir, hg_build, covfile, tol, min_maf,
             min_hwe, method, linkage, force_contiguous, group_cor_cutoff,
             group_rep_cutoff, verbose = parse_solveblock_commandline(true)
 
         println("\n\nWelcome to the `solve_block` module of GhostKnockoffGWAS!")
         println("You have specified the following options:")
         println("genotype file = ", abspath(file))
+        println("covariate file = ", covfile == "" ? "" : abspath(covfile))
         println("chr = ", chr)
         println("start_bp = ", start_bp)
         println("end_bp = ", end_bp)
@@ -209,7 +210,7 @@ function julia_solveblock()::Cint
         println("verbose = ", verbose)
         println("\n")
 
-        solve_blocks(file, chr, start_bp, end_bp, outdir,
+        solve_blocks(file, covfile, chr, start_bp, end_bp, outdir,
             hg_build, tol=tol, min_maf=min_maf, min_hwe=min_hwe,
             method=method, linkage=linkage, force_contiguous=force_contiguous,
             group_cor_cutoff=group_cor_cutoff,
@@ -256,6 +257,18 @@ function parse_solveblock_commandline(parseargs::Bool)
                 "38 (hg38)"
             required = true
             arg_type = Int
+        "--covfile"
+            help = "An optional comma- or tab-separated file containing sample" *
+                " covariates (e.g. sex, age, PCs). These will be used to " *
+                " improve LD estimation. The first row should be a header row." *
+                " The first column should be sample IDs " *
+                " (not necessary to be in the sample order as genotype files" *
+                " ) and all other columns will be used as additional covariates." *
+                " Note if genotypes are stored in binary PLINK format, then the" *
+                " sample ID column in the covariate file should be FID_IID (that " *
+                " is, the first 2 columns of the .fam file merged by an underscore)"
+            arg_type = String
+            default = ""
         "--tol"
             help = "Convergence tolerlance for coordinate descent algorithm " *
                 "(default `0.0001`)"
@@ -320,7 +333,7 @@ function parse_solveblock_commandline(parseargs::Bool)
             "--min_maf","0.01","--min_hwe","0.01","--method","maxent",
             "--linkage","average","--force_contiguous","false",
             "--group_cor_cutoff","0.5","--group_rep_cutoff","0.5",
-            "--verbose","true"], s
+            "--verbose","true","--covfile",""], s
         )
         _useless = parse_args(["--help"], s)
         return nothing
@@ -333,6 +346,7 @@ function parse_solveblock_commandline(parseargs::Bool)
     end_bp = parsed_args["end_bp"]
     outdir = parsed_args["outdir"]
     hg_build = parsed_args["genome-build"]
+    covfile = parsed_args["covfile"]
     tol = parsed_args["tol"]
     min_maf = parsed_args["min_maf"]
     min_hwe = parsed_args["min_hwe"]
@@ -344,6 +358,6 @@ function parse_solveblock_commandline(parseargs::Bool)
     verbose = parsed_args["verbose"]
 
     return file, chr, start_bp, end_bp, outdir,
-        hg_build, tol, min_maf, min_hwe, method, linkage, force_contiguous,
+        hg_build, covfile, tol, min_maf, min_hwe, method, linkage, force_contiguous,
         group_cor_cutoff, group_rep_cutoff, verbose
 end
diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index 696cd2d1..c506faaa 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -267,6 +267,48 @@ function rearrange_snps!(groups, group_reps, Sigma, Sigma_info)
     return nothing
 end
 
+"""
+    get_covariates(covfile::String, genotype_file::String)
+
+Read covariates from `covfile` and reorders the rows so that the rows come in
+the same order as sample IDs specified in (VCF or binary PLINK) `genotype_file`.
+
+# Inputs
++ `covfile`: A comma- or tab-separated file containing sample covariates. The
+    first row should be a header row. The first column should be sample IDs (not
+    necessary to be in the sample order as genotype files and all other columns
+    will be used as additional covariates. Note if genotypes are stored in binary
+    PLINK format, then the sample ID column in the covariate file should be
+    FID_IID (that is, the first 2 columns of the .fam file merged by an
+    underscore).
++ `genotype_file`: A VCF or binary PLINK file storing individual level genotypes.
+    Must end in `.vcf`, `.vcf.gz`, or `.bed`.
+"""
+function get_covariates(covfile::String, genotype_file::String)
+    if endswith(genotype_file, ".vcf") || endswith(genotype_file, ".vcf.gz")
+        sampleIDs = sampleID(genotype_file)
+    elseif endswith(genotype_file, ".bed")
+        famfile = genotype_file[1:end-4] * ".fam"
+        fam_df = CSV.read(famfile, DataFrame, header=false)
+        sampleIDs = string.(fam_df[!, 1], "_", fam_df[!, 2])
+    else
+        error("Genotype file should be in VCF (ends in .vcf " *
+              "or .vcf.gz) or binary PLINK (ends in .bed) format.")
+    end
+
+    # read covariate data and match sample IDs
+    covdata = CSV.read(covfile, DataFrame)
+    cov_sampleIDs = string.(covdata[!, 1])
+    idx = indexin(sampleIDs, cov_sampleIDs)
+    if length(idx) != length(sampleIDs)
+        error("A covariate file was supplied but >=1 genotyped sample(s)" *
+              " does not have covariate data. Please check if the covariate" *
+              " file has the correct sample IDs.")
+    end
+    C = Matrix(covdata[idx, 2:end])
+    return C::Matrix{Float64}
+end
+
 """
     solve_blocks(file::String, chr::Int, start_bp::Int, end_bp::Int,
     outdir::String, hg_build::Int; [m=5], [tol=0.0001], [min_maf=0.01],
@@ -288,6 +330,11 @@ it is indexed). Thus, we *strongly recommend* one convert to binary PLINK format
     end in `.vcf`, `.vcf.gz`, or `.bed`. If a VCF file is used, the ALT field
     for each record must be unique, i.e. multiallelic records must be split
     first. Missing genotypes will be imputed by column mean.
+ `covfile`: An optional comma- or tab-separated file containing sample covariates
+    (e.g. sex, age, PCs). This argument can be an empty string. The supplied
+    covariates will be used to improve LD estimation. The first column should be
+    sample IDs (not necessary to be in the sample order as VCF or PLINK files)
+    and all other columns will be used as additional covariates.
 + `chr`: Target chromosome. This MUST be an integer and it must match the `CHROM`
     field in your VCF/PLINK file. For example, if your VCF file has CHROM field
     like `chr1`, `CHR1`, or `CHROM1` etc, they must be renamed into `1`.
@@ -354,6 +401,7 @@ Calling `solve_blocks` will create 3 files in the directory `outdir/chr`:
 """
 function solve_blocks(
     file::String,
+    covfile::String,
     chr::Int,
     start_bp::Int,
     end_bp::Int,
@@ -391,7 +439,11 @@ function solve_blocks(
     size(X, 1) ≥ 10 ||
         error("Detected less than 10 samples, not recommended")
 
-    Sigma = estimate_sigma(X)
+    # read covariates, if any
+    C = covfile == "" ? zeros(size(X, 1), 0) : get_covariates(covfile, file)
+
+    # estimate correlation matrix
+    Sigma = estimate_sigma(X, C)
     rename!(data_info, "pos" => "pos_hg$hg_build") # associate pos with hg_build
 end

From fad9f3405197e3a07a20d7bdbbd5ad969fe13dfd Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 13:41:59 -0700
Subject: [PATCH 5/7] up project version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 142dda95..c7154cb8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "GhostKnockoffGWAS"
 uuid = "28dc8d00-4921-4061-9921-3f423e4be5cc"
 authors = ["Benjamin Chu "]
-version = "0.2.2"
+version = "0.2.3"
 
 [deps]
 ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"

From e636485178fcd61aa5bf774b2505598fd5c037c0 Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 16:00:06 -0700
Subject: [PATCH 6/7] stop mutationg X in estimate_sigma

---
 src/app.jl       | 6 +++---
 src/make_hdf5.jl | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/app.jl b/src/app.jl
index 7822d3db..34ab1f03 100644
--- a/src/app.jl
+++ b/src/app.jl
@@ -259,13 +259,13 @@ function parse_solveblock_commandline(parseargs::Bool)
             arg_type = Int
         "--covfile"
             help = "An optional comma- or tab-separated file containing sample" *
-                " covariates (e.g. sex, age, PCs). These will be used to " *
+                " covariates (e.g. sex, age, PCs). These will be used to" *
                 " improve LD estimation. The first row should be a header row." *
-                " The first column should be sample IDs " *
+                " The first column should be sample IDs" *
                 " (not necessary to be in the sample order as genotype files" *
                 " ) and all other columns will be used as additional covariates." *
                 " Note if genotypes are stored in binary PLINK format, then the" *
-                " sample ID column in the covariate file should be FID_IID (that " *
+                " sample ID column in the covariate file should be FID_IID (that" *
                 " is, the first 2 columns of the .fam file merged by an underscore)"
             arg_type = String
             default = ""
diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index c506faaa..b96e4bac 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -16,9 +16,9 @@ function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
     n == size(C, 1) || error("Sample size in X and C should be the same")
 
     # pan-ukb routine
-    zscore!(X, mean(X, dims=1), std(X, dims=1))
+    Xc = StatsBase.zscore(X, mean(X, dims=1), std(X, dims=1))
     Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
-    Xadj = Mc * X
+    Xadj = Mc * Xc
     Sigma = Xadj' * Xadj / n
 
     # numerical stability

From b548ca6d1be7915c7b51f5478591bd2308a3466a Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Wed, 30 Oct 2024 10:56:22 -0700
Subject: [PATCH 7/7] update unit tests

---
 src/make_hdf5.jl | 2 +-
 test/runtests.jl | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index b96e4bac..7f6c97fd 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -310,7 +310,7 @@ function get_covariates(covfile::String, genotype_file::String)
 end
 
 """
-    solve_blocks(file::String, chr::Int, start_bp::Int, end_bp::Int,
+    solve_blocks(file::String, covfile::String, chr::Int, start_bp::Int, end_bp::Int,
     outdir::String, hg_build::Int; [m=5], [tol=0.0001], [min_maf=0.01],
     [min_hwe=0.0], [force_block_diag=true],
     [method::String = "maxent"], [linkage::String="average"],
diff --git a/test/runtests.jl b/test/runtests.jl
index 3d1cfff7..0168bad1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -220,7 +220,8 @@ end
     outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files")
     isdir(outdir) || mkpath(outdir)
     hg_build = 19
-    @time solve_blocks(vcffile, chr, start_bp, end_bp, outdir, hg_build,
+    covfile = ""
+    @time solve_blocks(vcffile, covfile, chr, start_bp, end_bp, outdir, hg_build,
         min_maf = 0.01, min_hwe = 0.0)
 
     # test basic output structure
@@ -289,7 +290,8 @@ end
    outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files2")
    isdir(outdir) || mkpath(outdir)
    hg_build = 19
-    @time solve_blocks(plinkfile * ".bed", chr, start_bp, end_bp, outdir,
+    covfile = ""
+    @time solve_blocks(plinkfile * ".bed", covfile, chr, start_bp, end_bp, outdir,
        hg_build, min_maf = 0.01, min_hwe = 0.0)
 
    # check PLINK vs VCF output is the same