From d0ecc46c1d6ef8132eba6bc3796768bf396fd818 Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 10:19:05 -0700
Subject: [PATCH 1/7] estimate_sigma now uses Pan-UKB approach

---
 src/make_hdf5.jl | 49 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index 36ad2807..cc4962b7 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -1,24 +1,39 @@
-function estimate_sigma(X::AbstractMatrix; min_eigval=1e-5)
-    # n, p = size(X)
-    # if n > p
-    #     Sigma = cor(X)
-    # else
-    #     error("p > n case not handled yet!")
-    # end
-
-    # shrinkage based covariance, handles high dimensional case
-    Sigma = cov(LinearShrinkage(DiagonalUnequalVariance(), :lw), X)
-
-    # ensure Sigma is PSD
-    evals, evecs = eigen(Sigma)
-    evals[findall(x -> x < min_eigval, evals)] .= min_eigval
-    Sigma = evecs * Diagonal(evals) * evecs'
-    Statistics.cov2cor!(Sigma, sqrt.(diag(Sigma))) # scale to correlation matrix
+"""
+    estimate_sigma(X::AbstractMatrix, Z::AbstractMatrix; [enforce_psd=true],
+    [min_eigval=1e-5])
+
+Estimate LD matrices from data `X` and covariates `C`. We adopt the method
+for Pan-UKB described here:
+https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices.
+If `enforce_psd=true`, then the correlation matrix will be scaled so that the
+minimum eigenvalue is `min_eigval`.
+"""
+function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
+    enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
+    # check for errors
+    n = size(X, 1)
+    n == size(C, 1) || error("Samples in X and C should be the same")
+    all(x -> isapprox(x, 0, atol=1e-12), mean(X, dims=1)) || error(
+        "Columns of X must be scaled to mean 0 variance 1.")
+
+    # pan-ukb routine
+    Mc = I - C * inv(Symmetric(C' * C)) * C'
+    Xadj = Mc * X
+    Sigma = Xadj' * Xadj / n
+
+    # numerical stability
+    if enforce_psd
+        evals, evecs = eigen(Sigma)
+        evals[findall(x -> x < min_eigval, evals)] .= min_eigval
+        Sigma = evecs * Diagonal(evals) * evecs'
+    end
+
+    # scale to correlation matrix
+    StatsBase.cov2cor!(Sigma, sqrt.(diag(Sigma)))
     return Sigma
 end
-
 """
     get_block(file::String, chr::Int, start_bp::Int, end_bp::Int;
     [min_maf::Float64=0.01], [min_hwe::Float64=0.0],

From 98f6f748567f475b0fbed45e721a1bb268a81e5e Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 10:29:16 -0700
Subject: [PATCH 2/7] estimate_sigma handles the case with no covariates

---
 src/make_hdf5.jl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index cc4962b7..3c0457d5 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -1,10 +1,11 @@
 """
-    estimate_sigma(X::AbstractMatrix, Z::AbstractMatrix; [enforce_psd=true],
+    estimate_sigma(X::AbstractMatrix; [enforce_psd=true], [min_eigval=1e-5])
+    estimate_sigma(X::AbstractMatrix, C::AbstractMatrix; [enforce_psd=true],
     [min_eigval=1e-5])
 
-Estimate LD matrices from data `X` and covariates `C`. We adopt the method
-for Pan-UKB described here:
-https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices.
+Estimate LD matrices from data `X`, accounting for covariates `C` if there are any.
+We adopt the method for Pan-UKB described in
+`https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices`.
 If `enforce_psd=true`, then the correlation matrix will be scaled so that the
 minimum eigenvalue is `min_eigval`.
 """
@@ -17,7 +18,7 @@ function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
         "Columns of X must be scaled to mean 0 variance 1.")
 
     # pan-ukb routine
-    Mc = I - C * inv(Symmetric(C' * C)) * C'
+    Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
     Xadj = Mc * X
     Sigma = Xadj' * Xadj / n
 
@@ -33,6 +34,8 @@
     return Sigma
 end
 
+estimate_sigma(X; enforce_psd::Bool=true, min_eigval::Float64 = 1e-5) =
+    estimate_sigma(X, zeros(size(X, 1), 0); enforce_psd=enforce_psd, min_eigval=min_eigval)
 """
     get_block(file::String, chr::Int, start_bp::Int, end_bp::Int;

From 03e7b967e5ab7af34c510245979f953b59a5b0ae Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 10:32:13 -0700
Subject: [PATCH 3/7] estimate_sigma now center/scales X internally

---
 src/make_hdf5.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index 3c0457d5..696cd2d1 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -13,11 +13,10 @@ function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
     enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
     # check for errors
     n = size(X, 1)
-    n == size(C, 1) || error("Samples in X and C should be the same")
-    all(x -> isapprox(x, 0, atol=1e-12), mean(X, dims=1)) || error(
-        "Columns of X must be scaled to mean 0 variance 1.")
+    n == size(C, 1) || error("Sample size in X and C should be the same")
 
     # pan-ukb routine
+    zscore!(X, mean(X, dims=1), std(X, dims=1))
     Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
     Xadj = Mc * X
     Sigma = Xadj' * Xadj / n

From 2d2a81ee34dfc26b3ad07f0829a18d027c8f74f7 Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 13:39:55 -0700
Subject: [PATCH 4/7] new function get_covariates

---
 src/app.jl       | 22 ++++++++++++++++----
 src/make_hdf5.jl | 54 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/src/app.jl b/src/app.jl
index 1fc79ccf..7822d3db 100644
--- a/src/app.jl
+++ b/src/app.jl
@@ -186,13 +186,14 @@ end
 function julia_solveblock()::Cint
     try
         # read command line arguments
-        file, chr, start_bp, end_bp, outdir, hg_build, tol, min_maf,
+        file, chr, start_bp, end_bp, outdir, hg_build, covfile, tol, min_maf,
             min_hwe, method, linkage, force_contiguous, group_cor_cutoff,
             group_rep_cutoff, verbose = parse_solveblock_commandline(true)
 
         println("\n\nWelcome to the `solve_block` module of GhostKnockoffGWAS!")
         println("You have specified the following options:")
         println("genotype file = ", abspath(file))
+        println("covariate file = ", covfile == "" ? "" : abspath(covfile))
         println("chr = ", chr)
         println("start_bp = ", start_bp)
         println("end_bp = ", end_bp)
@@ -209,7 +210,7 @@ function julia_solveblock()::Cint
         println("verbose = ", verbose)
         println("\n")
 
-        solve_blocks(file, chr, start_bp, end_bp, outdir,
+        solve_blocks(file, covfile, chr, start_bp, end_bp, outdir,
             hg_build, tol=tol, min_maf=min_maf, min_hwe=min_hwe,
             method=method, linkage=linkage, force_contiguous=force_contiguous,
             group_cor_cutoff=group_cor_cutoff,
@@ -256,6 +257,18 @@ function parse_solveblock_commandline(parseargs::Bool)
                 "38 (hg38)"
             required = true
             arg_type = Int
+        "--covfile"
+            help = "An optional comma- or tab-separated file containing sample" *
+                " covariates (e.g. sex, age, PCs). These will be used to " *
+                " improve LD estimation. The first row should be a header row." *
+                " The first column should be sample IDs " *
+                " (not necessary to be in the sample order as genotype files" *
+                " ) and all other columns will be used as additional covariates." *
+                " Note if genotypes are stored in binary PLINK format, then the" *
+                " sample ID column in the covariate file should be FID_IID (that " *
+                " is, the first 2 columns of the .fam file merged by an underscore)"
+            arg_type = String
+            default = ""
         "--tol"
             help = "Convergence tolerlance for coordinate descent algorithm " *
                 "(default `0.0001`)"
@@ -320,7 +333,7 @@ function parse_solveblock_commandline(parseargs::Bool)
             "--min_maf","0.01","--min_hwe","0.01","--method","maxent",
             "--linkage","average","--force_contiguous","false",
             "--group_cor_cutoff","0.5","--group_rep_cutoff","0.5",
-            "--verbose","true"], s
+            "--verbose","true","--covfile",""], s
         )
         _useless = parse_args(["--help"], s)
         return nothing
@@ -333,6 +346,7 @@ function parse_solveblock_commandline(parseargs::Bool)
     end_bp = parsed_args["end_bp"]
     outdir = parsed_args["outdir"]
     hg_build = parsed_args["genome-build"]
+    covfile = parsed_args["covfile"]
     tol = parsed_args["tol"]
     min_maf = parsed_args["min_maf"]
     min_hwe = parsed_args["min_hwe"]
@@ -344,6 +358,6 @@ function parse_solveblock_commandline(parseargs::Bool)
     verbose = parsed_args["verbose"]
 
     return file, chr, start_bp, end_bp, outdir,
-        hg_build, tol, min_maf, min_hwe, method, linkage, force_contiguous,
+        hg_build, covfile, tol, min_maf, min_hwe, method, linkage, force_contiguous,
         group_cor_cutoff, group_rep_cutoff, verbose
 end
diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index 696cd2d1..c506faaa 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -267,6 +267,48 @@ function rearrange_snps!(groups, group_reps, Sigma, Sigma_info)
     return nothing
 end
 
+"""
+    get_covariates(covfile::String, genotype_file::String)
+
+Read covariates from `covfile` and reorders the rows so that the rows come in
+the same order as sample IDs specified in (VCF or binary PLINK) `genotype_file`.
+
+# Inputs
++ `covfile`: A comma- or tab-separated file containing sample covariates. The
+    first row should be a header row. The first column should be sample IDs (not
+    necessary to be in the sample order as genotype files and all other columns
+    will be used as additional covariates. Note if genotypes are stored in binary
+    PLINK format, then the sample ID column in the covariate file should be
+    FID_IID (that is, the first 2 columns of the .fam file merged by an
+    underscore).
++ `genotype_file`: A VCF or binary PLINK file storing individual level genotypes.
+    Must end in `.vcf`, `.vcf.gz`, or `.bed`.
+"""
+function get_covariates(covfile::String, genotype_file::String)
+    if endswith(genotype_file, ".vcf") || endswith(genotype_file, ".vcf.gz")
+        sampleIDs = sampleID(genotype_file)
+    elseif endswith(genotype_file, ".bed")
+        famfile = genotype_file[1:end-4] * ".fam"
+        fam_df = CSV.read(famfile, DataFrame, header=false)
+        sampleIDs = string.(fam_df[!, 1], "_", fam_df[!, 2])
+    else
+        error("Genotype file should be in VCF (ends in .vcf " *
+              "or .vcf.gz) or binary PLINK (ends in .bed) format.")
+    end
+
+    # read covariate data and match sample IDs
+    covdata = CSV.read(covfile, DataFrame)
+    cov_sampleIDs = string.(covdata[!, 1])
+    idx = indexin(sampleIDs, cov_sampleIDs)
+    if length(idx) != length(sampleIDs)
+        error("A covariate file was supplied but >=1 genotyped sample(s)" *
+              " does not have covariate data. Please check if the covariate" *
+              " file has the correct sample IDs.")
+    end
+    C = Matrix(covdata[idx, 2:end])
+    return C::Matrix{Float64}
+end
+
 """
     solve_blocks(file::String, chr::Int, start_bp::Int, end_bp::Int,
     outdir::String, hg_build::Int; [m=5], [tol=0.0001], [min_maf=0.01],
@@ -288,6 +330,11 @@ it is indexed). Thus, we *strongly recommend* one convert to binary PLINK format
     end in `.vcf`, `.vcf.gz`, or `.bed`. If a VCF file is used, the ALT field
     for each record must be unique, i.e. multiallelic records must be split
     first. Missing genotypes will be imputed by column mean.
+ `covfile`: An optional comma- or tab-separated file containing sample covariates
+    (e.g. sex, age, PCs). This argument can be an empty string. The supplied
+    covariates will be used to improve LD estimation. The first column should be
+    sample IDs (not necessary to be in the sample order as VCF or PLINK files)
+    and all other columns will be used as additional covariates.
 + `chr`: Target chromosome. This MUST be an integer and it must match the `CHROM`
     field in your VCF/PLINK file. For example, if your VCF file has CHROM field
     like `chr1`, `CHR1`, or `CHROM1` etc, they must be renamed into `1`.
@@ -354,6 +401,7 @@ Calling `solve_blocks` will create 3 files in the directory `outdir/chr`:
 """
 function solve_blocks(
     file::String,
+    covfile::String,
     chr::Int,
     start_bp::Int,
     end_bp::Int,
@@ -391,7 +439,11 @@ function solve_blocks(
     size(X, 1) ≥ 10 ||
         error("Detected less than 10 samples, not recommended")
 
-    Sigma = estimate_sigma(X)
+    # read covariates, if any
+    C = covfile == "" ? zeros(size(X, 1), 0) : get_covariates(covfile, file)
+
+    # estimate correlation matrix
+    Sigma = estimate_sigma(X, C)
     rename!(data_info, "pos" => "pos_hg$hg_build") # associate pos with hg_build
 end

From fad9f3405197e3a07a20d7bdbbd5ad969fe13dfd Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 13:41:59 -0700
Subject: [PATCH 5/7] up project version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 142dda95..c7154cb8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "GhostKnockoffGWAS"
 uuid = "28dc8d00-4921-4061-9921-3f423e4be5cc"
 authors = ["Benjamin Chu "]
-version = "0.2.2"
+version = "0.2.3"
 
 [deps]
 ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"

From e636485178fcd61aa5bf774b2505598fd5c037c0 Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Tue, 29 Oct 2024 16:00:06 -0700
Subject: [PATCH 6/7] stop mutationg X in estimate_sigma

---
 src/app.jl       | 6 +++---
 src/make_hdf5.jl | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/app.jl b/src/app.jl
index 7822d3db..34ab1f03 100644
--- a/src/app.jl
+++ b/src/app.jl
@@ -259,13 +259,13 @@ function parse_solveblock_commandline(parseargs::Bool)
             arg_type = Int
         "--covfile"
             help = "An optional comma- or tab-separated file containing sample" *
-                " covariates (e.g. sex, age, PCs). These will be used to " *
+                " covariates (e.g. sex, age, PCs). These will be used to" *
                 " improve LD estimation. The first row should be a header row." *
-                " The first column should be sample IDs " *
+                " The first column should be sample IDs" *
                 " (not necessary to be in the sample order as genotype files" *
                 " ) and all other columns will be used as additional covariates." *
                 " Note if genotypes are stored in binary PLINK format, then the" *
-                " sample ID column in the covariate file should be FID_IID (that " *
+                " sample ID column in the covariate file should be FID_IID (that" *
                 " is, the first 2 columns of the .fam file merged by an underscore)"
             arg_type = String
             default = ""
diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index c506faaa..b96e4bac 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -16,9 +16,9 @@ function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
     n == size(C, 1) || error("Sample size in X and C should be the same")
 
     # pan-ukb routine
-    zscore!(X, mean(X, dims=1), std(X, dims=1))
+    Xc = StatsBase.zscore(X, mean(X, dims=1), std(X, dims=1))
     Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
-    Xadj = Mc * X
+    Xadj = Mc * Xc
     Sigma = Xadj' * Xadj / n
 
     # numerical stability

From b548ca6d1be7915c7b51f5478591bd2308a3466a Mon Sep 17 00:00:00 2001
From: Benjamin Chu
Date: Wed, 30 Oct 2024 10:56:22 -0700
Subject: [PATCH 7/7] update unit tests

---
 src/make_hdf5.jl | 2 +-
 test/runtests.jl | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/make_hdf5.jl b/src/make_hdf5.jl
index b96e4bac..7f6c97fd 100644
--- a/src/make_hdf5.jl
+++ b/src/make_hdf5.jl
@@ -310,7 +310,7 @@ function get_covariates(covfile::String, genotype_file::String)
 end
 
 """
-    solve_blocks(file::String, chr::Int, start_bp::Int, end_bp::Int,
+    solve_blocks(file::String, covfile::String, chr::Int, start_bp::Int, end_bp::Int,
     outdir::String, hg_build::Int; [m=5], [tol=0.0001], [min_maf=0.01],
     [min_hwe=0.0], [force_block_diag=true],
     [method::String = "maxent"], [linkage::String="average"],
diff --git a/test/runtests.jl b/test/runtests.jl
index 3d1cfff7..0168bad1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -220,7 +220,8 @@ end
     outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files")
     isdir(outdir) || mkpath(outdir)
     hg_build = 19
-    @time solve_blocks(vcffile, chr, start_bp, end_bp, outdir, hg_build,
+    covfile = ""
+    @time solve_blocks(vcffile, covfile, chr, start_bp, end_bp, outdir, hg_build,
         min_maf = 0.01, min_hwe = 0.0)
 
     # test basic output structure
@@ -289,7 +290,8 @@ end
    outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files2")
    isdir(outdir) || mkpath(outdir)
    hg_build = 19
-    @time solve_blocks(plinkfile * ".bed", chr, start_bp, end_bp, outdir,
+    covfile = ""
+    @time solve_blocks(plinkfile * ".bed", covfile, chr, start_bp, end_bp, outdir,
        hg_build, min_maf = 0.01, min_hwe = 0.0)
 
    # check PLINK vs VCF output is the same