Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Estimate LD based on Pan-UKB routine #37

Merged
merged 7 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "GhostKnockoffGWAS"
uuid = "28dc8d00-4921-4061-9921-3f423e4be5cc"
authors = ["Benjamin Chu <[email protected]>"]
version = "0.2.2"
version = "0.2.3"

[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Expand Down
22 changes: 18 additions & 4 deletions src/app.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,14 @@ end
function julia_solveblock()::Cint
try
# read command line arguments
file, chr, start_bp, end_bp, outdir, hg_build, tol, min_maf,
file, chr, start_bp, end_bp, outdir, hg_build, covfile, tol, min_maf,
min_hwe, method, linkage, force_contiguous, group_cor_cutoff,
group_rep_cutoff, verbose = parse_solveblock_commandline(true)

println("\n\nWelcome to the `solve_block` module of GhostKnockoffGWAS!")
println("You have specified the following options:")
println("genotype file = ", abspath(file))
println("covariate file = ", covfile == "" ? "" : abspath(covfile))
println("chr = ", chr)
println("start_bp = ", start_bp)
println("end_bp = ", end_bp)
Expand All @@ -209,7 +210,7 @@ function julia_solveblock()::Cint
println("verbose = ", verbose)
println("\n")

solve_blocks(file, chr, start_bp, end_bp, outdir,
solve_blocks(file, covfile, chr, start_bp, end_bp, outdir,
hg_build, tol=tol, min_maf=min_maf, min_hwe=min_hwe,
method=method, linkage=linkage, force_contiguous=force_contiguous,
group_cor_cutoff=group_cor_cutoff,
Expand Down Expand Up @@ -256,6 +257,18 @@ function parse_solveblock_commandline(parseargs::Bool)
"38 (hg38)"
required = true
arg_type = Int
"--covfile"
help = "An optional comma- or tab-separated file containing sample" *
" covariates (e.g. sex, age, PCs). These will be used to" *
" improve LD estimation. The first row should be a header row." *
" The first column should be sample IDs" *
" (not necessary to be in the sample order as genotype files" *
" ) and all other columns will be used as additional covariates." *
" Note if genotypes are stored in binary PLINK format, then the" *
" sample ID column in the covariate file should be FID_IID (that" *
" is, the first 2 columns of the .fam file merged by an underscore)"
arg_type = String
default = ""
"--tol"
help = "Convergence tolerlance for coordinate descent algorithm " *
"(default `0.0001`)"
Expand Down Expand Up @@ -320,7 +333,7 @@ function parse_solveblock_commandline(parseargs::Bool)
"--min_maf","0.01","--min_hwe","0.01","--method","maxent",
"--linkage","average","--force_contiguous","false",
"--group_cor_cutoff","0.5","--group_rep_cutoff","0.5",
"--verbose","true"], s
"--verbose","true","--covfile",""], s
)
_useless = parse_args(["--help"], s)
return nothing
Expand All @@ -333,6 +346,7 @@ function parse_solveblock_commandline(parseargs::Bool)
end_bp = parsed_args["end_bp"]
outdir = parsed_args["outdir"]
hg_build = parsed_args["genome-build"]
covfile = parsed_args["covfile"]
tol = parsed_args["tol"]
min_maf = parsed_args["min_maf"]
min_hwe = parsed_args["min_hwe"]
Expand All @@ -344,6 +358,6 @@ function parse_solveblock_commandline(parseargs::Bool)
verbose = parsed_args["verbose"]

return file, chr, start_bp, end_bp, outdir,
hg_build, tol, min_maf, min_hwe, method, linkage, force_contiguous,
hg_build, covfile, tol, min_maf, min_hwe, method, linkage, force_contiguous,
group_cor_cutoff, group_rep_cutoff, verbose
end
107 changes: 88 additions & 19 deletions src/make_hdf5.jl
Original file line number Diff line number Diff line change
@@ -1,23 +1,40 @@
function estimate_sigma(X::AbstractMatrix; min_eigval=1e-5)
# n, p = size(X)
# if n > p
# Sigma = cor(X)
# else
# error("p > n case not handled yet!")
# end

# shrinkage based covariance, handles high dimensional case
Sigma = cov(LinearShrinkage(DiagonalUnequalVariance(), :lw), X)

# ensure Sigma is PSD
evals, evecs = eigen(Sigma)
evals[findall(x -> x < min_eigval, evals)] .= min_eigval
Sigma = evecs * Diagonal(evals) * evecs'
Statistics.cov2cor!(Sigma, sqrt.(diag(Sigma))) # scale to correlation matrix
"""
estimate_sigma(X::AbstractMatrix; [enforce_psd=true], [min_eigval=1e-5])
estimate_sigma(X::AbstractMatrix, C::AbstractMatrix; [enforce_psd=true],
[min_eigval=1e-5])

Estimate LD matrices from data `X`, accounting for covariates `C` if there are any.
We adopt the method for Pan-UKB described in
`https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices`.
If `enforce_psd=true`, then the correlation matrix will be scaled so that the
minimum eigenvalue is `min_eigval`.
"""
function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
# check for errors
n = size(X, 1)
n == size(C, 1) || error("Sample size in X and C should be the same")

# pan-ukb routine
Xc = StatsBase.zscore(X, mean(X, dims=1), std(X, dims=1))
Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
Xadj = Mc * Xc
Sigma = Xadj' * Xadj / n

# numerical stability
if enforce_psd
evals, evecs = eigen(Sigma)
evals[findall(x -> x < min_eigval, evals)] .= min_eigval
Sigma = evecs * Diagonal(evals) * evecs'
end

# scale to correlation matrix
StatsBase.cov2cor!(Sigma, sqrt.(diag(Sigma)))

return Sigma
end

# Covariate-free convenience method: forwards to the two-argument version with
# an empty n × 0 covariate matrix, so no covariate adjustment is performed.
function estimate_sigma(X; enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
    return estimate_sigma(X, zeros(size(X, 1), 0);
        enforce_psd=enforce_psd, min_eigval=min_eigval)
end

"""
get_block(file::String, chr::Int, start_bp::Int, end_bp::Int;
Expand Down Expand Up @@ -251,7 +268,49 @@ function rearrange_snps!(groups, group_reps, Sigma, Sigma_info)
end

"""
solve_blocks(file::String, chr::Int, start_bp::Int, end_bp::Int,
get_covariates(covfile::String, genotype_file::String)

Read covariates from `covfile` and reorders the rows so that the rows come in
the same order as sample IDs specified in (VCF or binary PLINK) `genotype_file`.

# Inputs
+ `covfile`: A comma- or tab-separated file containing sample covariates. The
first row should be a header row. The first column should be sample IDs (not
necessary to be in the sample order as genotype files and all other columns
will be used as additional covariates. Note if genotypes are stored in binary
PLINK format, then the sample ID column in the covariate file should be
FID_IID (that is, the first 2 columns of the .fam file merged by an
underscore).
+ `genotype_file`: A VCF or binary PLINK file storing individual level genotypes.
Must end in `.vcf`, `.vcf.gz`, or `.bed`.
"""
function get_covariates(covfile::String, genotype_file::String)
if endswith(genotype_file, ".vcf") || endswith(genotype_file, ".vcf.gz")
sampleIDs = sampleID(genotype_file)
elseif endswith(genotype_file, ".bed")
famfile = genotype_file[1:end-4] * ".fam"
fam_df = CSV.read(famfile, DataFrame, header=false)
sampleIDs = string.(fam_df[!, 1], "_", fam_df[!, 2])
else
error("Genotype file should be in VCF (ends in .vcf " *
"or .vcf.gz) or binary PLINK (ends in .bed) format.")
end

# read covariate data and match sample IDs
covdata = CSV.read(covfile, DataFrame)
cov_sampleIDs = string.(covdata[!, 1])
idx = indexin(sampleIDs, cov_sampleIDs)
if length(idx) != length(sampleIDs)
error("A covariate file was supplied but >=1 genotyped sample(s)" *
" does not have covariate data. Please check if the covariate" *
" file has the correct sample IDs.")
end
C = Matrix(covdata[idx, 2:end])
return C::Matrix{Float64}
end

"""
solve_blocks(file::String, covfile::String, chr::Int, start_bp::Int, end_bp::Int,
outdir::String, hg_build::Int; [m=5], [tol=0.0001], [min_maf=0.01],
[min_hwe=0.0], [force_block_diag=true],
[method::String = "maxent"], [linkage::String="average"],
Expand All @@ -271,6 +330,11 @@ it is indexed). Thus, we *strongly recommend* one convert to binary PLINK format
end in `.vcf`, `.vcf.gz`, or `.bed`. If a VCF file is used, the ALT field for
each record must be unique, i.e. multiallelic records must be split first.
Missing genotypes will be imputed by column mean.
+ `covfile`: An optional comma- or tab-separated file containing sample covariates
(e.g. sex, age, PCs). This argument can be an empty string. The supplied
covariates will be used to improve LD estimation. The first column should be
sample IDs (not necessary to be in the sample order as VCF or PLINK files)
and all other columns will be used as additional covariates.
+ `chr`: Target chromosome. This MUST be an integer and it must match the `CHROM`
field in your VCF/PLINK file. For example, if your VCF file has CHROM field
like `chr1`, `CHR1`, or `CHROM1` etc, they must be renamed into `1`.
Expand Down Expand Up @@ -337,6 +401,7 @@ Calling `solve_blocks` will create 3 files in the directory `outdir/chr`:
"""
function solve_blocks(
file::String,
covfile::String,
chr::Int,
start_bp::Int,
end_bp::Int,
Expand Down Expand Up @@ -374,7 +439,11 @@ function solve_blocks(
size(X, 1) ≥ 10 ||
error("Detected less than 10 samples, not recommended")

Sigma = estimate_sigma(X)
# read covariates, if any
C = covfile == "" ? zeros(size(X, 1), 0) : get_covariates(covfile, file)

# estimate correlation matrix
Sigma = estimate_sigma(X, C)
rename!(data_info, "pos" => "pos_hg$hg_build") # associate pos with hg_build
end

Expand Down
6 changes: 4 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ end
outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files")
isdir(outdir) || mkpath(outdir)
hg_build = 19
@time solve_blocks(vcffile, chr, start_bp, end_bp, outdir, hg_build,
covfile = ""
@time solve_blocks(vcffile, covfile, chr, start_bp, end_bp, outdir, hg_build,
min_maf = 0.01, min_hwe = 0.0)

# test basic output structure
Expand Down Expand Up @@ -289,7 +290,8 @@ end
outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files2")
isdir(outdir) || mkpath(outdir)
hg_build = 19
@time solve_blocks(plinkfile * ".bed", chr, start_bp, end_bp, outdir,
covfile = ""
@time solve_blocks(plinkfile * ".bed", covfile, chr, start_bp, end_bp, outdir,
hg_build, min_maf = 0.01, min_hwe = 0.0)

# check PLINK vs VCF output is the same
Expand Down
Loading