Skip to content

Commit

Permalink
Merge branch 'dev' of github.com:poseidon-framework/poseidon-eager in…
Browse files Browse the repository at this point in the history
…to dev
  • Loading branch information
TCLamnidis committed Mar 26, 2024
2 parents 69138f9 + 47a9138 commit 379d829
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 3 deletions.
39 changes: 37 additions & 2 deletions scripts/minotaur_packager.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
VERSION='0.2.1dev'
VERSION='0.2.2dev'
set -o pipefail ## Pipefail, complain on new unassigned variables.
# set -x ## Debugging

Expand Down Expand Up @@ -165,6 +165,36 @@ function add_versions_file() {
echo " - Minotaur-packager version: ${VERSION}" >> ${version_fn}
}

## Function to add SSF file to minotaur package
## usage add_ssf_file <ssf_file_path> <package_dir>
function add_ssf_file() {
local ssf_file_path
local ssf_name
local package_dir
local package_name

ssf_file_path=${1}
ssf_name=${ssf_file_path##*/}
package_dir=${2}
package_name=${package_dir##*/}

## Check that the SSF file exists.
if [[ ! -f ${ssf_file_path} ]]; then
errecho -r "[${package_name}]: SSF file '${ssf_file_path}' not found."
exit 1
fi

## Ensure the provided package dir exists
if [[ ! -d ${package_dir} ]]; then
errecho -r "[${package_name}]: Package directory '${package_dir}' not found."
exit 1
fi

## Copy the SSF file to the package directory
errecho -y "[${package_name}]: Adding SSF file to package directory."
cp ${ssf_file_path} ${package_dir}/${ssf_name}
}

## Parse CLI args.
TEMP=`getopt -q -o dhfv --long debug,help,force,version -n "${0}" -- "$@"`
eval set -- "${TEMP}"
Expand Down Expand Up @@ -205,6 +235,7 @@ package_oven_dir="/mnt/archgen/poseidon/minotaur/minotaur-package-oven/" ## Hard
output_package_dir="${package_oven_dir}/${package_name}" ## Hard-coded path for EVA
finalisedtsv_fn="${package_minotaur_directory}/${package_name}.finalised.tsv"
root_results_dir="${package_minotaur_directory}/results"
minotaur_recipe_dir="/mnt/archgen/poseidon/minotaur/minotaur-recipes/packages/${package_name}" ## Hard-coded path for EVA

## Get current date for versioning
errecho -y "[minotaur_packager.sh]: version ${VERSION}"
Expand Down Expand Up @@ -260,14 +291,18 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p

## Fill in janno
errecho -y "Populating janno file"
${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml
${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml -s ${minotaur_recipe_dir}/${package_name}.ssf

## TODO-dev Infer genetic sex from janno and mirror to ind file.

## Add Minotaur version info to README of package
add_versions_file ${root_results_dir} ${tmp_dir}/package/README.md
echo "readmeFile: README.md" >> ${tmp_dir}/package/POSEIDON.yml

## Add SSF file to package
add_ssf_file ${minotaur_recipe_dir}/${package_name}.ssf ${tmp_dir}/package
echo "sequencingSourceFile: ${package_name}.ssf" >> ${tmp_dir}/package/POSEIDON.yml

## Convert data to PLINK format
errecho -y "[${package_name}] Converting data to PLINK format"
trident genoconvert -d ${tmp_dir}/package \
Expand Down
45 changes: 44 additions & 1 deletion scripts/populate_janno.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,13 @@ def weighted_mean(
action="store_true",
help="Activate safe mode. The package's janno and ind files will not be updated, but instead new files will be created with the '.new' suffix. Only useful for testing.",
)
parser.add_argument(
"-s",
"--ssf_path",
metavar="<SSF>",
required=True,
help="The path to the SSF file of the recipe for the minotaur package.",
)
parser.add_argument("-v", "--version", action="version", version=VERSION)

args = parser.parse_args()
Expand Down Expand Up @@ -194,6 +201,8 @@ def weighted_mean(
tsv_table, run_trim_bam=True, skip_deduplication=False
)

ssf_table = pd.read_table(args.ssf_path, dtype=str)

## Read poseidon yaml, infer path to janno file and read janno file.
poseidon_yaml_data = PoseidonYaml(args.poseidon_yml_path)
janno_table = pd.read_table(poseidon_yaml_data.janno_file, dtype=str)
Expand All @@ -214,6 +223,16 @@ def weighted_mean(
endogenous_table = endogenous_table[["id", "endogenous_dna"]].rename(
columns={"id": "Library_ID", "endogenous_dna": "endogenous"}
)
## Get df with minotaur_library_ids that are WGS. Used to decide on which libraries to keep the endogenous results for.
library_strategy_table = ssf_table[["poseidon_IDs","library_name", "library_strategy"]].drop_duplicates()
library_strategy_table = library_strategy_table[library_strategy_table.library_strategy == "WGS"]
library_strategy_table['poseidon_IDs'] = library_strategy_table.poseidon_IDs.apply(lambda x: x.split(';'))
library_strategy_table = library_strategy_table.explode('poseidon_IDs')
library_strategy_table['minotaur_library_ID'] = library_strategy_table.poseidon_IDs+"_"+library_strategy_table.library_name
library_strategy_table = library_strategy_table[["minotaur_library_ID", "library_strategy"]]

## Merge the two tables, only keeping endogenous values for WGS libraries.
endogenous_table = endogenous_table.merge(library_strategy_table, left_on="Library_ID", right_on="minotaur_library_ID", how='right').drop(columns=['minotaur_library_ID', 'library_strategy'])

## Prepare SNP coverage table for joining. Should always be on the sample level, so only need to fix column names.
snp_coverage_table = snp_coverage_table.drop("Total_Snps", axis=1).rename(
Expand Down Expand Up @@ -264,6 +283,13 @@ def weighted_mean(
on="Library_ID",
validate="many_to_one",
)
.merge(
## Add endogenous DNA results per Library_ID
endogenous_table,
on="Library_ID",
validate="one_to_one",
how='left',
)
.merge(
## Add sex determination results per Sample_ID
sex_determination_table,
Expand Down Expand Up @@ -299,6 +325,7 @@ def weighted_mean(

summarised_stats = pd.DataFrame()
summarised_stats["Sample_Name"] = compound_eager_table["Sample_Name"].unique()
## Contamination_Note: Add note about contamination estimation in libraries with more SNPs than the cutoff.
summarised_stats = (
compound_eager_table.astype("string")
.groupby("Sample_Name")[["Contamination_Nr_SNPs"]]
Expand All @@ -322,13 +349,15 @@ def weighted_mean(
.merge(summarised_stats, on="Sample_Name", validate="one_to_one")
)

## Nr_Libraries: Count number of libraries per sample
summarised_stats = (
compound_eager_table.groupby("Sample_Name")[["Library_ID"]]
.agg("nunique")
.rename(columns={"Library_ID": "Nr_Libraries"})
.merge(summarised_stats, on="Sample_Name", validate="one_to_one")
)

## Contamination_Est: Calculated weighted mean across libraries of a sample.
summarised_stats = (
compound_eager_table.groupby("Sample_Name")[
["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"]
Expand All @@ -345,6 +374,7 @@ def weighted_mean(
.merge(summarised_stats, on="Sample_Name", validate="one_to_one")
)

## Contamination_SE: Calculated weighted mean across libraries of a sample.
summarised_stats = (
compound_eager_table.groupby("Sample_Name")[
["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"]
Expand All @@ -361,11 +391,12 @@ def weighted_mean(
.merge(summarised_stats, on="Sample_Name", validate="one_to_one")
)

## If Contamination column is not empty, add the contamination measure
## Contamination_Meas: If Contamination column is not empty, add the contamination measure
summarised_stats["Contamination_Meas"] = summarised_stats.apply(
set_contamination_measure, axis=1
)

## Damage: Calculated weighted mean across libraries of a sample.
summarised_stats = (
compound_eager_table.groupby("Sample_Name")[["damage", "n_reads"]]
.apply(
Expand All @@ -380,6 +411,17 @@ def weighted_mean(
.merge(summarised_stats, on="Sample_Name", validate="one_to_one")
)

## Endogenous: The maximum value of endogenous DNA across WGS libraries of a sample.
summarised_stats = (
compound_eager_table.groupby("Sample_Name")["endogenous"]
.apply(
max,
)
.reset_index("Sample_Name")
.rename(columns={"endogenous": "Endogenous"})
.merge(summarised_stats, on="Sample_Name", validate="one_to_one")
)

final_eager_table = compound_eager_table.merge(
summarised_stats, on="Sample_Name", validate="many_to_one"
).drop(
Expand All @@ -390,6 +432,7 @@ def weighted_mean(
"Contamination_SE",
"n_reads",
"damage",
"endogenous",
"Original_library_names",
],
)
Expand Down

0 comments on commit 379d829

Please sign in to comment.