From 4fc4bc57a20ac626517296c45524f937422f1b2c Mon Sep 17 00:00:00 2001
From: "Thiseas C. Lamnidis" <thisseass@gmail.com>
Date: Thu, 14 Mar 2024 16:19:01 +0100
Subject: [PATCH 1/3] add SSF to packages

---
 scripts/minotaur_packager.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh
index 0a73b52..59018d7 100755
--- a/scripts/minotaur_packager.sh
+++ b/scripts/minotaur_packager.sh
@@ -165,6 +165,36 @@ function add_versions_file() {
   echo " - Minotaur-packager version: ${VERSION}"                 >> ${version_fn}
 }
 
+## Function to add the SSF file of a recipe to a minotaur package directory.
+## usage: add_ssf_file <ssf_file_path> <package_dir>
+function add_ssf_file() {
+  local ssf_file_path
+  local ssf_name
+  local package_dir
+  local package_name
+
+  ssf_file_path="${1}"
+  ssf_name="${ssf_file_path##*/}" ## File name without the leading directories.
+  package_dir="${2}"
+  package_name="${package_dir##*/}" ## Last path component, used as the log prefix.
+
+  ## Check that the SSF file exists.
+  if [[ ! -f "${ssf_file_path}" ]]; then
+    errecho -r "[${package_name}]: SSF file '${ssf_file_path}' not found."
+    exit 1
+  fi
+
+  ## Ensure the provided package dir exists
+  if [[ ! -d "${package_dir}" ]]; then
+    errecho -r "[${package_name}]: Package directory '${package_dir}' not found."
+    exit 1
+  fi
+
+  ## Copy the SSF file to the package directory. Quoted to avoid word splitting/globbing; abort if the copy fails.
+  errecho -y "[${package_name}]: Adding SSF file to package directory."
+  cp -- "${ssf_file_path}" "${package_dir}/${ssf_name}" || exit 1
+}
+
 ## Parse CLI args.
 TEMP=`getopt -q -o dhfv --long debug,help,force,version -n "${0}" -- "$@"`
 eval set -- "${TEMP}"
@@ -205,6 +235,7 @@ package_oven_dir="/mnt/archgen/poseidon/minotaur/minotaur-package-oven/" ## Hard
 output_package_dir="${package_oven_dir}/${package_name}" ## Hard-coded path for EVA
 finalisedtsv_fn="${package_minotaur_directory}/${package_name}.finalised.tsv"
 root_results_dir="${package_minotaur_directory}/results"
+minotaur_recipe_dir="/mnt/archgen/poseidon/minotaur/minotaur-recipes/packages/${package_name}" ## Hard-coded path for EVA
 
 ## Get current date for versioning
 errecho -y "[minotaur_packager.sh]: version ${VERSION}"
@@ -268,6 +299,10 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p
   add_versions_file ${root_results_dir} ${tmp_dir}/package/README.md
   echo "readmeFile: README.md" >> ${tmp_dir}/package/POSEIDON.yml
 
+  ## Add SSF file to package. Paths are quoted so they are passed as single arguments.
+  add_ssf_file "${minotaur_recipe_dir}/${package_name}.ssf" "${tmp_dir}/package"
+  echo "sequencingSourceFile: ${package_name}.ssf" >> "${tmp_dir}/package/POSEIDON.yml"
+
   ## Convert data to PLINK format
   errecho -y "[${package_name}] Converting data to PLINK format"
   trident genoconvert -d ${tmp_dir}/package \

From a2a5b3213d43b82226542e86d1578dd75141eb82 Mon Sep 17 00:00:00 2001
From: "Thiseas C. Lamnidis" <thisseass@gmail.com>
Date: Thu, 14 Mar 2024 16:19:50 +0100
Subject: [PATCH 2/3] bump version

---
 scripts/minotaur_packager.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh
index 59018d7..6025bd2 100755
--- a/scripts/minotaur_packager.sh
+++ b/scripts/minotaur_packager.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-VERSION='0.2.1dev'
+VERSION='0.2.2dev'
 set -o pipefail ## Pipefail, complain on new unassigned variables.
 # set -x ## Debugging
 

From 36d4f1aa7b1f7f999da9744d19bf5c556261efdb Mon Sep 17 00:00:00 2001
From: "Thiseas C. Lamnidis" <thisseass@gmail.com>
Date: Thu, 14 Mar 2024 16:22:42 +0100
Subject: [PATCH 3/3] add endogenous DNA inference

---
 scripts/minotaur_packager.sh |  2 +-
 scripts/populate_janno.py    | 47 ++++++++++++++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh
index 6025bd2..ad9f43e 100755
--- a/scripts/minotaur_packager.sh
+++ b/scripts/minotaur_packager.sh
@@ -291,7 +291,7 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p
 
   ## Fill in janno
   errecho -y "Populating janno file"
-  ${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml
+  ${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml -s ${minotaur_recipe_dir}/${package_name}.ssf
 
   ## TODO-dev Infer genetic sex from janno and mirror to ind file.
 
diff --git a/scripts/populate_janno.py b/scripts/populate_janno.py
index 90c5164..9accdc9 100755
--- a/scripts/populate_janno.py
+++ b/scripts/populate_janno.py
@@ -10,7 +10,7 @@
 import numpy as np
 from collections import namedtuple
 
-VERSION = "0.2.0dev"
+VERSION = "0.2.1dev"
 
 
 def camel_to_snake(name):
@@ -174,6 +174,13 @@ def weighted_mean(
     action="store_true",
     help="Activate safe mode. The package's janno and ind files will not be updated, but instead new files will be created with the '.new' suffix. Only useful for testing.",
 )
+parser.add_argument(
+    "-s",
+    "--ssf_path",
+    required=True,
+    metavar="<SSF>",
+    help="The path to the SSF file of the recipe for the minotaur package.",
+)
 parser.add_argument("-v", "--version", action="version", version=VERSION)
 
 args = parser.parse_args()
@@ -215,6 +222,8 @@ def weighted_mean(
     tsv_table, run_trim_bam=True, skip_deduplication=False
 )
 
+ssf_table = pd.read_table(args.ssf_path, dtype=str)
+
 ## Read poseidon yaml, infer path to janno file and read janno file.
 poseidon_yaml_data = PoseidonYaml(args.poseidon_yml_path)
 janno_table = pd.read_table(poseidon_yaml_data.janno_file, dtype=str)
@@ -235,6 +244,16 @@ def weighted_mean(
 endogenous_table = endogenous_table[["id", "endogenous_dna"]].rename(
     columns={"id": "Library_ID", "endogenous_dna": "endogenous"}
 )
+## Get df with minotaur_library_ids that are WGS. Used to decide on which libraries to keep the endogenous results for.
+library_strategy_table = ssf_table.loc[ssf_table.library_strategy == "WGS", ["poseidon_IDs", "library_name", "library_strategy"]]
+## Rows lacking an ID or a library name cannot form a library ID (and a missing poseidon_IDs value cannot be split), so drop them.
+library_strategy_table = library_strategy_table.dropna(subset=["poseidon_IDs", "library_name"])
+## poseidon_IDs may hold several ';'-separated IDs: make one row per ID, then build '<poseidon_ID>_<library_name>'.
+library_strategy_table = library_strategy_table.assign(poseidon_IDs=library_strategy_table.poseidon_IDs.str.split(";")).explode("poseidon_IDs")
+library_strategy_table["minotaur_library_ID"] = library_strategy_table.poseidon_IDs + "_" + library_strategy_table.library_name
+library_strategy_table = library_strategy_table[["minotaur_library_ID", "library_strategy"]].drop_duplicates()  ## Deduplicate AFTER exploding.
+## Only keep endogenous values for WGS libraries. A filter (unlike a join) can neither duplicate rows nor add rows for libraries without results.
+endogenous_table = endogenous_table[endogenous_table.Library_ID.isin(library_strategy_table.minotaur_library_ID)]
 
 ## Prepare SNP coverage table for joining. Should always be on the sample level, so only need to fix column names.
 snp_coverage_table = snp_coverage_table.drop("Total_Snps", axis=1).rename(
@@ -285,6 +304,13 @@ def weighted_mean(
         on="Library_ID",
         validate="many_to_one",
     )
+    .merge(
+        ## Add endogenous DNA results per Library_ID
+        endogenous_table,
+        on="Library_ID",
+        validate="one_to_one",
+        how='left',
+    )
     .merge(
         ## Add sex determination results per Sample_ID
         sex_determination_table,
@@ -320,6 +346,7 @@ def weighted_mean(
 
 summarised_stats = pd.DataFrame()
 summarised_stats["Sample_Name"] = compound_eager_table["Sample_Name"].unique()
+## Contamination_Note: Add note about contamination estimation in libraries with more SNPs than the cutoff.
 summarised_stats = (
     compound_eager_table.astype("string")
     .groupby("Sample_Name")[["Contamination_Nr_SNPs"]]
@@ -343,6 +370,7 @@ def weighted_mean(
     .merge(summarised_stats, on="Sample_Name", validate="one_to_one")
 )
 
+## Nr_Libraries: Count number of libraries per sample
 summarised_stats = (
     compound_eager_table.groupby("Sample_Name")[["Library_ID"]]
     .agg("nunique")
@@ -350,6 +378,7 @@ def weighted_mean(
     .merge(summarised_stats, on="Sample_Name", validate="one_to_one")
 )
 
+## Contamination_Est: Calculate the weighted mean across libraries of a sample.
 summarised_stats = (
     compound_eager_table.groupby("Sample_Name")[
         ["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"]
@@ -366,6 +395,7 @@ def weighted_mean(
     .merge(summarised_stats, on="Sample_Name", validate="one_to_one")
 )
 
+## Contamination_SE: Calculate the weighted mean across libraries of a sample.
 summarised_stats = (
     compound_eager_table.groupby("Sample_Name")[
         ["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"]
@@ -382,11 +412,12 @@ def weighted_mean(
     .merge(summarised_stats, on="Sample_Name", validate="one_to_one")
 )
 
-## If Contamination column is not empty, add the contamination measure
+## Contamination_Meas: If Contamination column is not empty, add the contamination measure
 summarised_stats["Contamination_Meas"] = summarised_stats.apply(
     set_contamination_measure, axis=1
 )
 
+## Damage: Calculate the weighted mean across libraries of a sample.
 summarised_stats = (
     compound_eager_table.groupby("Sample_Name")[["damage", "n_reads"]]
     .apply(
@@ -401,6 +432,17 @@ def weighted_mean(
     .merge(summarised_stats, on="Sample_Name", validate="one_to_one")
 )
 
+## Endogenous: The maximum value of endogenous DNA across WGS libraries of a sample.
+## Values are coerced to numbers and the NaN-skipping groupby max is used: the builtin max() is order-dependent when missing values (non-WGS libraries) are present.
+summarised_stats = (
+    pd.to_numeric(compound_eager_table["endogenous"], errors="coerce")
+    .groupby(compound_eager_table["Sample_Name"])
+    .max()
+    .reset_index()
+    .rename(columns={"endogenous": "Endogenous"})
+    .merge(summarised_stats, on="Sample_Name", validate="one_to_one")
+)
+
 final_eager_table = compound_eager_table.merge(
     summarised_stats, on="Sample_Name", validate="many_to_one"
 ).drop(
@@ -411,6 +453,7 @@ def weighted_mean(
         "Contamination_SE",
         "n_reads",
         "damage",
+        "endogenous",
         "Original_library_names",
     ],
 )