Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Estimate LD based on Pan-UKB routine #37

Merged
merged 7 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "GhostKnockoffGWAS"
uuid = "28dc8d00-4921-4061-9921-3f423e4be5cc"
authors = ["Benjamin Chu <[email protected]>"]
version = "0.2.2"
version = "0.2.3"

[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Expand Down
22 changes: 18 additions & 4 deletions src/app.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,14 @@ end
function julia_solveblock()::Cint
try
# read command line arguments
file, chr, start_bp, end_bp, outdir, hg_build, tol, min_maf,
file, chr, start_bp, end_bp, outdir, hg_build, covfile, tol, min_maf,
min_hwe, method, linkage, force_contiguous, group_cor_cutoff,
group_rep_cutoff, verbose = parse_solveblock_commandline(true)

println("\n\nWelcome to the `solve_block` module of GhostKnockoffGWAS!")
println("You have specified the following options:")
println("genotype file = ", abspath(file))
println("covariate file = ", covfile == "" ? "" : abspath(covfile))
println("chr = ", chr)
println("start_bp = ", start_bp)
println("end_bp = ", end_bp)
Expand All @@ -209,7 +210,7 @@ function julia_solveblock()::Cint
println("verbose = ", verbose)
println("\n")

solve_blocks(file, chr, start_bp, end_bp, outdir,
solve_blocks(file, covfile, chr, start_bp, end_bp, outdir,
hg_build, tol=tol, min_maf=min_maf, min_hwe=min_hwe,
method=method, linkage=linkage, force_contiguous=force_contiguous,
group_cor_cutoff=group_cor_cutoff,
Expand Down Expand Up @@ -256,6 +257,18 @@ function parse_solveblock_commandline(parseargs::Bool)
"38 (hg38)"
required = true
arg_type = Int
"--covfile"
help = "An optional comma- or tab-separated file containing sample" *
" covariates (e.g. sex, age, PCs). These will be used to" *
" improve LD estimation. The first row should be a header row." *
" The first column should be sample IDs" *
" (not necessary to be in the sample order as genotype files" *
" ) and all other columns will be used as additional covariates." *
" Note if genotypes are stored in binary PLINK format, then the" *
" sample ID column in the covariate file should be FID_IID (that" *
" is, the first 2 columns of the .fam file merged by an underscore)"
arg_type = String
default = ""
"--tol"
help = "Convergence tolerlance for coordinate descent algorithm " *
"(default `0.0001`)"
Expand Down Expand Up @@ -320,7 +333,7 @@ function parse_solveblock_commandline(parseargs::Bool)
"--min_maf","0.01","--min_hwe","0.01","--method","maxent",
"--linkage","average","--force_contiguous","false",
"--group_cor_cutoff","0.5","--group_rep_cutoff","0.5",
"--verbose","true"], s
"--verbose","true","--covfile",""], s
)
_useless = parse_args(["--help"], s)
return nothing
Expand All @@ -333,6 +346,7 @@ function parse_solveblock_commandline(parseargs::Bool)
end_bp = parsed_args["end_bp"]
outdir = parsed_args["outdir"]
hg_build = parsed_args["genome-build"]
covfile = parsed_args["covfile"]
tol = parsed_args["tol"]
min_maf = parsed_args["min_maf"]
min_hwe = parsed_args["min_hwe"]
Expand All @@ -344,6 +358,6 @@ function parse_solveblock_commandline(parseargs::Bool)
verbose = parsed_args["verbose"]

return file, chr, start_bp, end_bp, outdir,
hg_build, tol, min_maf, min_hwe, method, linkage, force_contiguous,
hg_build, covfile, tol, min_maf, min_hwe, method, linkage, force_contiguous,
group_cor_cutoff, group_rep_cutoff, verbose
end
107 changes: 88 additions & 19 deletions src/make_hdf5.jl
Original file line number Diff line number Diff line change
@@ -1,23 +1,40 @@
function estimate_sigma(X::AbstractMatrix; min_eigval=1e-5)
# n, p = size(X)
# if n > p
# Sigma = cor(X)
# else
# error("p > n case not handled yet!")
# end

# shrinkage based covariance, handles high dimensional case
Sigma = cov(LinearShrinkage(DiagonalUnequalVariance(), :lw), X)

# ensure Sigma is PSD
evals, evecs = eigen(Sigma)
evals[findall(x -> x < min_eigval, evals)] .= min_eigval
Sigma = evecs * Diagonal(evals) * evecs'
Statistics.cov2cor!(Sigma, sqrt.(diag(Sigma))) # scale to correlation matrix
"""
estimate_sigma(X::AbstractMatrix; [enforce_psd=true], [min_eigval=1e-5])
estimate_sigma(X::AbstractMatrix, C::AbstractMatrix; [enforce_psd=true],
[min_eigval=1e-5])

Estimate LD matrices from data `X`, accounting for covariates `C` if there are any.
We adopt the method for Pan-UKB described in
`https://pan-dev.ukbb.broadinstitute.org/docs/ld/index.html#ld-matrices`.
If `enforce_psd=true`, then the correlation matrix will be scaled so that the
minimum eigenvalue is `min_eigval`.
"""
function estimate_sigma(X::AbstractMatrix, C::AbstractMatrix;
enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
# check for errors
n = size(X, 1)
n == size(C, 1) || error("Sample size in X and C should be the same")

# pan-ukb routine
Xc = StatsBase.zscore(X, mean(X, dims=1), std(X, dims=1))
Mc = size(C, 2) > 1 ? I - C * inv(Symmetric(C' * C)) * C' : Diagonal(ones(n))
Xadj = Mc * Xc
Sigma = Xadj' * Xadj / n

# numerical stability
if enforce_psd
evals, evecs = eigen(Sigma)
evals[findall(x -> x < min_eigval, evals)] .= min_eigval
Sigma = evecs * Diagonal(evals) * evecs'
end

# scale to correlation matrix
StatsBase.cov2cor!(Sigma, sqrt.(diag(Sigma)))

return Sigma
end

# Covariate-free convenience method: forwards to the two-argument version with
# an empty n × 0 covariate matrix, so no covariate adjustment is performed.
function estimate_sigma(X; enforce_psd::Bool=true, min_eigval::Float64 = 1e-5)
    return estimate_sigma(X, zeros(size(X, 1), 0);
        enforce_psd=enforce_psd, min_eigval=min_eigval)
end

"""
get_block(file::String, chr::Int, start_bp::Int, end_bp::Int;
Expand Down Expand Up @@ -251,7 +268,49 @@ function rearrange_snps!(groups, group_reps, Sigma, Sigma_info)
end

"""
solve_blocks(file::String, chr::Int, start_bp::Int, end_bp::Int,
get_covariates(covfile::String, genotype_file::String)

Read covariates from `covfile` and reorders the rows so that the rows come in
the same order as sample IDs specified in (VCF or binary PLINK) `genotype_file`.

# Inputs
+ `covfile`: A comma- or tab-separated file containing sample covariates. The
first row should be a header row. The first column should be sample IDs (not
necessary to be in the sample order as genotype files and all other columns
will be used as additional covariates. Note if genotypes are stored in binary
PLINK format, then the sample ID column in the covariate file should be
FID_IID (that is, the first 2 columns of the .fam file merged by an
underscore).
+ `genotype_file`: A VCF or binary PLINK file storing individual level genotypes.
Must end in `.vcf`, `.vcf.gz`, or `.bed`.
"""
function get_covariates(covfile::String, genotype_file::String)
if endswith(genotype_file, ".vcf") || endswith(genotype_file, ".vcf.gz")
sampleIDs = sampleID(genotype_file)
elseif endswith(genotype_file, ".bed")
famfile = genotype_file[1:end-4] * ".fam"
fam_df = CSV.read(famfile, DataFrame, header=false)
sampleIDs = string.(fam_df[!, 1], "_", fam_df[!, 2])
else
error("Genotype file should be in VCF (ends in .vcf " *
"or .vcf.gz) or binary PLINK (ends in .bed) format.")
end

# read covariate data and match sample IDs
covdata = CSV.read(covfile, DataFrame)
cov_sampleIDs = string.(covdata[!, 1])
idx = indexin(sampleIDs, cov_sampleIDs)
if length(idx) != length(sampleIDs)
error("A covariate file was supplied but >=1 genotyped sample(s)" *
" does not have covariate data. Please check if the covariate" *
" file has the correct sample IDs.")
end
C = Matrix(covdata[idx, 2:end])
return C::Matrix{Float64}
end

"""
solve_blocks(file::String, covfile::String, chr::Int, start_bp::Int, end_bp::Int,
outdir::String, hg_build::Int; [m=5], [tol=0.0001], [min_maf=0.01],
[min_hwe=0.0], [force_block_diag=true],
[method::String = "maxent"], [linkage::String="average"],
Expand All @@ -271,6 +330,11 @@ it is indexed). Thus, we *strongly recommend* one convert to binary PLINK format
end in `.vcf`, `.vcf.gz`, or `.bed`. If a VCF file is used, the ALT field for
each record must be unique, i.e. multiallelic records must be split first.
Missing genotypes will be imputed by column mean.
+ `covfile`: An optional comma- or tab-separated file containing sample covariates
(e.g. sex, age, PCs). This argument can be an empty string. The supplied
covariates will be used to improve LD estimation. The first column should be
sample IDs (not necessary to be in the sample order as VCF or PLINK files)
and all other columns will be used as additional covariates.
+ `chr`: Target chromosome. This MUST be an integer and it must match the `CHROM`
field in your VCF/PLINK file. For example, if your VCF file has CHROM field
like `chr1`, `CHR1`, or `CHROM1` etc, they must be renamed into `1`.
Expand Down Expand Up @@ -337,6 +401,7 @@ Calling `solve_blocks` will create 3 files in the directory `outdir/chr`:
"""
function solve_blocks(
file::String,
covfile::String,
chr::Int,
start_bp::Int,
end_bp::Int,
Expand Down Expand Up @@ -374,7 +439,11 @@ function solve_blocks(
size(X, 1) ≥ 10 ||
error("Detected less than 10 samples, not recommended")

Sigma = estimate_sigma(X)
# read covariates, if any
C = covfile == "" ? zeros(size(X, 1), 0) : get_covariates(covfile, file)

# estimate correlation matrix
Sigma = estimate_sigma(X, C)
rename!(data_info, "pos" => "pos_hg$hg_build") # associate pos with hg_build
end

Expand Down
6 changes: 4 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ end
outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files")
isdir(outdir) || mkpath(outdir)
hg_build = 19
@time solve_blocks(vcffile, chr, start_bp, end_bp, outdir, hg_build,
covfile = ""
@time solve_blocks(vcffile, covfile, chr, start_bp, end_bp, outdir, hg_build,
min_maf = 0.01, min_hwe = 0.0)

# test basic output structure
Expand Down Expand Up @@ -289,7 +290,8 @@ end
outdir = joinpath(dirname(pathof(GhostKnockoffGWAS)), "..", "test/LD_files2")
isdir(outdir) || mkpath(outdir)
hg_build = 19
@time solve_blocks(plinkfile * ".bed", chr, start_bp, end_bp, outdir,
covfile = ""
@time solve_blocks(plinkfile * ".bed", covfile, chr, start_bp, end_bp, outdir,
hg_build, min_maf = 0.01, min_hwe = 0.0)

# check PLINK vs VCF output is the same
Expand Down
Loading