From 4fc4bc57a20ac626517296c45524f937422f1b2c Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Thu, 14 Mar 2024 16:19:01 +0100 Subject: [PATCH 1/3] add SSF to packages --- scripts/minotaur_packager.sh | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh index 0a73b52..59018d7 100755 --- a/scripts/minotaur_packager.sh +++ b/scripts/minotaur_packager.sh @@ -165,6 +165,36 @@ function add_versions_file() { echo " - Minotaur-packager version: ${VERSION}" >> ${version_fn} } +## Function to add SSF file to minotaur package +## usage add_ssf_file +function add_ssf_file() { + local ssf_file_path + local ssf_name + local package_dir + local package_name + + ssf_file_path=${1} + ssf_name=${ssf_file_path##*/} + package_dir=${2} + package_name=${package_dir##*/} + + ## Check that the SSF file exists. + if [[ ! -f ${ssf_file_path} ]]; then + errecho -r "[${package_name}]: SSF file '${ssf_file_path}' not found." + exit 1 + fi + + ## Ensure the provided package dir exists + if [[ ! -d ${package_dir} ]]; then + errecho -r "[${package_name}]: Package directory '${package_dir}' not found." + exit 1 + fi + + ## Copy the SSF file to the package directory + errecho -y "[${package_name}]: Adding SSF file to package directory." + cp ${ssf_file_path} ${package_dir}/${ssf_name} +} + ## Parse CLI args. TEMP=`getopt -q -o dhfv --long debug,help,force,version -n "${0}" -- "$@"` eval set -- "${TEMP}" @@ -205,6 +235,7 @@ package_oven_dir="/mnt/archgen/poseidon/minotaur/minotaur-package-oven/" ## Hard output_package_dir="${package_oven_dir}/${package_name}" ## Hard-coded path for EVA finalisedtsv_fn="${package_minotaur_directory}/${package_name}.finalised.tsv" root_results_dir="${package_minotaur_directory}/results" +minotaur_recipe_dir="/mnt/archgen/poseidon/minotaur/minotaur-recipes/packages/${package_name}" ## Hard-coded path for EVA ## Get current date for versioning errecho -y "[minotaur_packager.sh]: version ${VERSION}" @@ -268,6 +299,10 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p add_versions_file ${root_results_dir} ${tmp_dir}/package/README.md echo "readmeFile: README.md" >> ${tmp_dir}/package/POSEIDON.yml + ## Add SSF file to package + add_ssf_file ${minotaur_recipe_dir}/${package_name}.ssf ${tmp_dir}/package + echo "sequencingSourceFile: ${package_name}.ssf" >> ${tmp_dir}/package/POSEIDON.yml + ## Convert data to PLINK format errecho -y "[${package_name}] Converting data to PLINK format" trident genoconvert -d ${tmp_dir}/package \ From a2a5b3213d43b82226542e86d1578dd75141eb82 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Thu, 14 Mar 2024 16:19:50 +0100 Subject: [PATCH 2/3] bump version --- scripts/minotaur_packager.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh index 59018d7..6025bd2 100755 --- a/scripts/minotaur_packager.sh +++ b/scripts/minotaur_packager.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -VERSION='0.2.1dev' +VERSION='0.2.2dev' set -o pipefail ## Pipefail, complain on new unassigned variables. # set -x ## Debugging From 36d4f1aa7b1f7f999da9744d19bf5c556261efdb Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Thu, 14 Mar 2024 16:22:42 +0100 Subject: [PATCH 3/3] add endogenous DNA inference --- scripts/minotaur_packager.sh | 2 +- scripts/populate_janno.py | 47 ++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh index 6025bd2..ad9f43e 100755 --- a/scripts/minotaur_packager.sh +++ b/scripts/minotaur_packager.sh @@ -291,7 +291,7 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p ## Fill in janno errecho -y "Populating janno file" - ${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml + ${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml -s ${minotaur_recipe_dir}/${package_name}.ssf ## TODO-dev Infer genetic sex from janno and mirror to ind file. diff --git a/scripts/populate_janno.py b/scripts/populate_janno.py index 90c5164..9accdc9 100755 --- a/scripts/populate_janno.py +++ b/scripts/populate_janno.py @@ -10,7 +10,7 @@ import numpy as np from collections import namedtuple -VERSION = "0.2.0dev" +VERSION = "0.2.1dev" def camel_to_snake(name): @@ -174,6 +174,13 @@ def weighted_mean( action="store_true", help="Activate safe mode. The package's janno and ind files will not be updated, but instead new files will be created with the '.new' suffix. Only useful for testing.", ) +parser.add_argument( + "-s", + "--ssf_path", + metavar="", + required=True, + help="The path to the SSF file of the recipe for the minotaur package.", +) parser.add_argument("-v", "--version", action="version", version=VERSION) args = parser.parse_args() @@ -215,6 +222,8 @@ def weighted_mean( tsv_table, run_trim_bam=True, skip_deduplication=False ) +ssf_table = pd.read_table(args.ssf_path, dtype=str) + ## Read poseidon yaml, infer path to janno file and read janno file. poseidon_yaml_data = PoseidonYaml(args.poseidon_yml_path) janno_table = pd.read_table(poseidon_yaml_data.janno_file, dtype=str) @@ -235,6 +244,16 @@ def weighted_mean( endogenous_table = endogenous_table[["id", "endogenous_dna"]].rename( columns={"id": "Library_ID", "endogenous_dna": "endogenous"} ) +## Get df with minotaur_library_ids that are WGS. Used to decide on which libraries to keep the endogenous results for. +library_strategy_table = ssf_table[["poseidon_IDs","library_name", "library_strategy"]].drop_duplicates() +library_strategy_table = library_strategy_table[library_strategy_table.library_strategy == "WGS"] +library_strategy_table['poseidon_IDs'] = library_strategy_table.poseidon_IDs.apply(lambda x: x.split(';')) +library_strategy_table = library_strategy_table.explode('poseidon_IDs') +library_strategy_table['minotaur_library_ID'] = library_strategy_table.poseidon_IDs+"_"+library_strategy_table.library_name +library_strategy_table = library_strategy_table[["minotaur_library_ID", "library_strategy"]] + +## Merge the two tables, only keeping endogenous values for WGS libraries. +endogenous_table = endogenous_table.merge(library_strategy_table, left_on="Library_ID", right_on="minotaur_library_ID", how='right').drop(columns=['minotaur_library_ID', 'library_strategy']) ## Prepare SNP coverage table for joining. Should always be on the sample level, so only need to fix column names. snp_coverage_table = snp_coverage_table.drop("Total_Snps", axis=1).rename( @@ -285,6 +304,13 @@ def weighted_mean( on="Library_ID", validate="many_to_one", ) + .merge( + ## Add endogenous DNA results per Library_ID + endogenous_table, + on="Library_ID", + validate="one_to_one", + how='left', + ) .merge( ## Add sex determination results per Sample_ID sex_determination_table, @@ -320,6 +346,7 @@ def weighted_mean( summarised_stats = pd.DataFrame() summarised_stats["Sample_Name"] = compound_eager_table["Sample_Name"].unique() +## Contamination_Note: Add note about contamination estimation in libraries with more SNPs than the cutoff. summarised_stats = ( compound_eager_table.astype("string") .groupby("Sample_Name")[["Contamination_Nr_SNPs"]] @@ -343,6 +370,7 @@ def weighted_mean( .merge(summarised_stats, on="Sample_Name", validate="one_to_one") ) +## Nr_Libraries: Count number of libraries per sample summarised_stats = ( compound_eager_table.groupby("Sample_Name")[["Library_ID"]] .agg("nunique") @@ -350,6 +378,7 @@ def weighted_mean( .merge(summarised_stats, on="Sample_Name", validate="one_to_one") ) +## Contamination_Est: Calculated weighted mean across libraries of a sample. summarised_stats = ( compound_eager_table.groupby("Sample_Name")[ ["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"] @@ -366,6 +395,7 @@ def weighted_mean( .merge(summarised_stats, on="Sample_Name", validate="one_to_one") ) +## Contamination_SE: Calculated weighted mean across libraries of a sample. summarised_stats = ( compound_eager_table.groupby("Sample_Name")[ ["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"] @@ -382,11 +412,12 @@ def weighted_mean( .merge(summarised_stats, on="Sample_Name", validate="one_to_one") ) -## If Contamination column is not empty, add the contamination measure +## Contamination_Meas: If Contamination column is not empty, add the contamination measure summarised_stats["Contamination_Meas"] = summarised_stats.apply( set_contamination_measure, axis=1 ) +## Damage: Calculated weighted mean across libraries of a sample. summarised_stats = ( compound_eager_table.groupby("Sample_Name")[["damage", "n_reads"]] .apply( @@ -401,6 +432,17 @@ def weighted_mean( .merge(summarised_stats, on="Sample_Name", validate="one_to_one") ) +## Endogenous: The maximum value of endogenous DNA across WGS libraries of a sample. +summarised_stats = ( + compound_eager_table.groupby("Sample_Name")["endogenous"] + .apply( + max, + ) + .reset_index("Sample_Name") + .rename(columns={"endogenous": "Endogenous"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + final_eager_table = compound_eager_table.merge( summarised_stats, on="Sample_Name", validate="many_to_one" ).drop( @@ -411,6 +453,7 @@ def weighted_mean( "Contamination_SE", "n_reads", "damage", + "endogenous", "Original_library_names", ], )