diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..db6f505 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +## All line endings should be linux file endings +* text eol=lf +## git lfs support for bed and pos files +*.pos filter=lfs diff=lfs merge=lfs -text +*.bed filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 8eb7182..32facfb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,6 @@ eager/ *.finalised.tsv array_Logs/ array_tempfiles/ -testing_assets/ \ No newline at end of file +testing_assets/ +poseidon_packages/ +.tmp/ diff --git a/CHANGELOG.md b/CHANGELOG.md index cf9cad8..9f4bfa4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,26 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v0.2.1dev - 02/11/2023 + +### `Added` + +- `scripts/minotaur_packager.sh`: Script to create poseidon half-packages and fill in the janno file from eager results. +- `scripts/populate_janno.py`: Script to fill in janno files with poseidon metadata from nf-core/eager results. + +### `Fixed` + +- `scripts/validate_downloaded_data.sh`: Added helptext. +- `scripts/run_eager.sh`: Now uses the `big_data` profile. +- Updates to templates for packages and configs. These are now defunct as they are pulled from the minotaur-recipes repo, and will be removed in the next release. +- `scripts/download_and_localise_package_files.sh`: Now distinguishes between `symlink_dir` and `package_eager_dir`. + +### `Dependencies` + +- nf-core/eager=2.4.6 + +### `Deprecated` + ## v0.2.0dev - 25/04/2023 ### `Added` diff --git a/assets/template.config b/assets/template.config index 1b03016..0ba9aec 100644 --- a/assets/template.config +++ b/assets/template.config @@ -1,6 +1,6 @@ // Keep track of config versions -config_template_version='0.2.0dev' -package_config_version='0.2.0dev' +config_template_version='0.2.2dev' +package_config_version='0.2.2dev' // This configuration file is designed to be a used with the nf-core/eager pipeline. // Instead of having to specify all other configurations for the Minotaur pipeline @@ -22,7 +22,7 @@ includeConfig '../../conf/CaptureType_profiles/1240K.config' params { // Keep track of config file versions used when processing - config_profile_description = "${config_profile_description}\nconfig_template_version: ${config_template_version}\npackage_config_version: ${package_config_version}" + config_profile_description = "${config_profile_description}, config_template_version: ${config_template_version}, package_config_version: ${package_config_version}" config_profile_contact = "Thiseas C. Lamnidis (@TCLamnidis)" /* diff --git a/conf/CaptureType_profiles/1240K.config b/conf/CaptureType_profiles/1240K.config index 508c3b8..7154e93 100644 --- a/conf/CaptureType_profiles/1240K.config +++ b/conf/CaptureType_profiles/1240K.config @@ -1,12 +1,12 @@ // Keep track of config versions -capturetype_config_version='0.2.0dev' +capturetype_config_version='0.2.2dev' //TODO Backup all bed/snp files in the repo somewhere and use from central location instead of hard-coding paths. // Profile specifying capture-specific parameters for packages of the '1240K' CaptureType.
params{ // Keep track of config file versions used when processing config_profile_name = "${config_profile_name}, on 1240K sites" - config_profile_description="${config_profile_description}\n1240K.config: ${capturetype_config_version}" + config_profile_description="${config_profile_description}\n - CaptureType.1240K.config: ${capturetype_config_version}" // Qualimap bedfile for on-target coverage calculation snpcapture_bed = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' diff --git a/conf/Minotaur.config b/conf/Minotaur.config index d11accd..df2308f 100644 --- a/conf/Minotaur.config +++ b/conf/Minotaur.config @@ -1,30 +1,32 @@ // Keep track of config versions -minotaur_config_version='0.2.0dev' +minotaur_config_version='0.2.1dev' // Default parameters for processing of data through Minotaur workflow. params{ // Keep track of config file versions used when processing config_profile_name = "Minotaur pipeline config" - config_profile_description = "Minortaur.config: ${minotaur_config_version}" + config_profile_description = "Minotaur.config: ${minotaur_config_version}" config_profile_url = 'https://github.com/poseidon-framework/poseidon-eager' + // Skip steps + skip_preseq = true // This data is published so won't be sequencing it more. // Mapping bwaalnn = 0.01 - + // BAM filtering run_bam_filtering = true // Filter out unmapped reads, so barplots in MultiQC are not completely overtaken by unmapped reads. bam_unmapped_type = 'fastq' // Keep unmapped reads as a separate fastq file. Preferred format for possible future pathogen screening. bam_mapping_quality_threshold = 30 // Keep MapQ 30+ (together with snpcapture_bed is needed for poseidon "coverage on target SNPs" field) // The above also means that reads that are mapped with MapQ below 30 are lost after filtering, not present in the fastq OR the filtered bam! - + // mtDNA to nuclear ratio run_mtnucratio = true mtnucratio_header = "MT" - + // Bam Trimming run_trim_bam = true - + // Double-stranded library clipping parameters bamutils_clip_double_stranded_half_udg_left = 2 // Trim 2 bp of either side for dsDNA half-UDG libraries. bamutils_clip_double_stranded_half_udg_right = 2 // Trim 2 bp of either side for dsDNA half-UDG libraries. 
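The layered configs above each append their version string to `config_profile_description`, which is how `minotaur_packager.sh` later recovers the exact config versions from `pipeline_info/pipeline_report.txt`. A rough sketch of how the layers come together for a run, assuming a hypothetical package name and local paths (the real invocation is assembled by `scripts/run_eager.sh`, which also sets the profiles shown further down in this diff):

# Sketch only: package name and paths are illustrative placeholders.
package='2024_ExampleStudy'
nextflow run nf-core/eager -r 2.4.6 \
    -profile eva,archgen,big_data,eva_local_paths \
    -c "packages/${package}/${package}.config" \
    --input "eager/${package}/${package}.finalised.tsv" \
    --outdir "eager/${package}/results"
# The per-package config layers on conf/Minotaur.config and a CaptureType profile
# (e.g. conf/CaptureType_profiles/1240K.config), so all of their version strings
# end up concatenated in config_profile_description.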
diff --git a/packages/2021_PattersonNature/2021_PattersonNature.config b/packages/2021_PattersonNature/2021_PattersonNature.config index 229a3c2..1b03016 100644 --- a/packages/2021_PattersonNature/2021_PattersonNature.config +++ b/packages/2021_PattersonNature/2021_PattersonNature.config @@ -34,6 +34,6 @@ params { https://nf-co.re/eager/2.4.6/parameters You can see the default values for parameters within poseidon-eager at: - https://github.com/poseidon-framework/poseidon-eager/blob/main/conf/Poseidon.config + https://github.com/poseidon-framework/poseidon-eager/blob/main/conf/Minotaur.config */ } diff --git a/packages/2021_PattersonNature/2021_PattersonNature.tsv b/packages/2021_PattersonNature/2021_PattersonNature.tsv index 0b5c617..17fe08d 100644 --- a/packages/2021_PattersonNature/2021_PattersonNature.tsv +++ b/packages/2021_PattersonNature/2021_PattersonNature.tsv @@ -253,7 +253,6 @@ I13714 I13714_ERS8208145 1 4 SE Homo sapiens (modern human) double half /I13713_ERS8208143_L1_R1.fastq.gz NA NA ERR7194848.fastq.gz NA I13711 I13711_ERS8208139 1 4 SE Homo sapiens (modern human) double half /I13711_ERS8208139_L1_R1.fastq.gz NA NA ERR7194844.fastq.gz NA I1389 I1389_ERS8208234 1 4 SE Homo sapiens (modern human) double half /I1389_ERS8208234_L1_R1.fastq.gz NA NA ERR7194939.fastq.gz NA -I1389 I1389_ERS8208233 1 4 SE Homo sapiens (modern human) double half /I1389_ERS8208233_L1_R1.fastq.gz NA NA 0 1 I14192 I14192_ERS8208274 1 4 SE Homo sapiens (modern human) double half /I14192_ERS8208274_L1_R1.fastq.gz NA NA ERR7194979.fastq.gz NA I15045 I15045_ERS8208427 1 4 SE Homo sapiens (modern human) double half /I15045_ERS8208427_L1_R1.fastq.gz NA NA ERR7195132.fastq.gz NA I14190 I14190_ERS8208270 1 4 SE Homo sapiens (modern human) double half /I14190_ERS8208270_L1_R1.fastq.gz NA NA ERR7194975.fastq.gz NA @@ -281,7 +280,6 @@ I14188 I14188_ERS8208265 1 4 SE Homo sapiens (modern human) double half /I14191_ERS8208272_L1_R1.fastq.gz NA NA ERR7194977.fastq.gz NA I13890 I13890_ERS8208219 1 4 SE Homo sapiens (modern human) double half /I13890_ERS8208219_L1_R1.fastq.gz NA NA ERR7194924.fastq.gz NA I13795 I13795_ERS8208209 1 4 SE Homo sapiens (modern human) double half /I13795_ERS8208209_L1_R1.fastq.gz NA NA ERR7194914.fastq.gz NA -I1391 I1391_ERS8208235 1 4 SE Homo sapiens (modern human) double half /I1391_ERS8208235_L1_R1.fastq.gz NA NA 0 1 I13898 I13898_ERS8208229 1 4 SE Homo sapiens (modern human) double half /I13898_ERS8208229_L1_R1.fastq.gz NA NA ERR7194934.fastq.gz NA I13899 I13899_ERS8208232 1 4 SE Homo sapiens (modern human) double half /I13899_ERS8208232_L1_R1.fastq.gz NA NA ERR7194937.fastq.gz NA I13897 I13897_ERS8208228 1 4 SE Homo sapiens (modern human) double half /I13897_ERS8208228_L1_R1.fastq.gz NA NA ERR7194933.fastq.gz NA @@ -750,7 +748,6 @@ I16612 I16612_ERS8208683 1 4 SE Homo sapiens (modern human) double half /I11154_ERS8207914_L1_R1.fastq.gz NA NA ERR7194619.fastq.gz NA I11156 I11156_ERS8207915 1 4 SE Homo sapiens (modern human) double half /I11156_ERS8207915_L1_R1.fastq.gz NA NA ERR7194620.fastq.gz NA I11156 I11156_ERS8207916 1 4 SE Homo sapiens (modern human) double half /I11156_ERS8207916_L1_R1.fastq.gz NA NA ERR7194621.fastq.gz NA -I11158 I11158_ERS8207917 1 4 SE Homo sapiens (modern human) double half /I11158_ERS8207917_L1_R1.fastq.gz NA NA 0 1 I11158 I11158_ERS8207918 1 4 SE Homo sapiens (modern human) double half /I11158_ERS8207918_L1_R1.fastq.gz NA NA ERR7194623.fastq.gz NA I12097_ss I12097_ss_ERS8207978_ss 1 4 SE Homo sapiens (modern human) single half 
/I12097_ss_ERS8207978_ss_L1_R1.fastq.gz NA NA ERR7194683.fastq.gz NA I17139_ss I17139_ss_ERS8208715_ss 1 4 SE Homo sapiens (modern human) single half /I17139_ss_ERS8208715_ss_L1_R1.fastq.gz NA NA ERR7195420.fastq.gz NA @@ -761,12 +758,10 @@ I17017 I17017_ERS8208711 1 4 SE Homo sapiens (modern human) double half /I17143_ss_ERS8208717_ss_L1_R1.fastq.gz NA NA ERR7195422.fastq.gz NA I17139_ss I17139_ss_ERS8208716_ss 1 4 SE Homo sapiens (modern human) single half /I17139_ss_ERS8208716_ss_L1_R1.fastq.gz NA NA ERR7195421.fastq.gz NA I12098 I12098_ERS8207979 1 4 SE Homo sapiens (modern human) double half /I12098_ERS8207979_L1_R1.fastq.gz NA NA ERR7194684.fastq.gz NA -I11159 I11159_ERS8207919 1 4 SE Homo sapiens (modern human) double half /I11159_ERS8207919_L1_R1.fastq.gz NA NA 0 1 I12098 I12098_ERS8207980 1 4 SE Homo sapiens (modern human) double half /I12098_ERS8207980_L1_R1.fastq.gz NA NA ERR7194685.fastq.gz NA I11152 I11152_ERS8207909 1 4 SE Homo sapiens (modern human) double half /I11152_ERS8207909_L1_R1.fastq.gz NA NA ERR7194614.fastq.gz NA I11151 I11151_ERS8207908 1 4 SE Homo sapiens (modern human) double half /I11151_ERS8207908_L1_R1.fastq.gz NA NA ERR7194613.fastq.gz NA I11159 I11159_ERS8207920 1 4 SE Homo sapiens (modern human) double half /I11159_ERS8207920_L1_R1.fastq.gz NA NA ERR7194625.fastq.gz NA -I11160 I11160_ERS8207921 1 4 SE Homo sapiens (modern human) double half /I11160_ERS8207921_L1_R1.fastq.gz NA NA 0 1 I11160 I11160_ERS8207922 1 4 SE Homo sapiens (modern human) double half /I11160_ERS8207922_L1_R1.fastq.gz NA NA ERR7194627.fastq.gz NA I11629 I11629_ERS8207923 1 4 SE Homo sapiens (modern human) double half /I11629_ERS8207923_L1_R1.fastq.gz NA NA ERR7194628.fastq.gz NA I12099 I12099_ERS8207981 1 4 SE Homo sapiens (modern human) double half /I12099_ERS8207981_L1_R1.fastq.gz NA NA ERR7194686.fastq.gz NA @@ -1353,13 +1348,9 @@ I23995 I23995_ERS8209257 1 4 SE Homo sapiens (modern human) double half /I23995_ERS8209258_L1_R1.fastq.gz NA NA ERR7195963.fastq.gz NA I23996 I23996_ERS8209259 1 4 SE Homo sapiens (modern human) double half /I23996_ERS8209259_L1_R1.fastq.gz NA NA ERR7195964.fastq.gz NA I23996 I23996_ERS8209260 1 4 SE Homo sapiens (modern human) double half /I23996_ERS8209260_L1_R1.fastq.gz NA NA ERR7195965.fastq.gz NA -I2405 I2405_ERS8209261 1 4 SE Homo sapiens (modern human) double half /I2405_ERS8209261_L1_R1.fastq.gz NA NA 0 1 I2405 I2405_ERS8209262 1 4 SE Homo sapiens (modern human) double half /I2405_ERS8209262_L1_R1.fastq.gz NA NA ERR7195967.fastq.gz NA -I2407 I2407_ERS8209263 1 4 SE Homo sapiens (modern human) double half /I2407_ERS8209263_L1_R1.fastq.gz NA NA 0 1 I2407 I2407_ERS8209264 1 4 SE Homo sapiens (modern human) double half /I2407_ERS8209264_L1_R1.fastq.gz NA NA ERR7195969.fastq.gz NA -I2413 I2413_ERS8209265 1 4 SE Homo sapiens (modern human) double half /I2413_ERS8209265_L1_R1.fastq.gz NA NA 0 1 I2413 I2413_ERS8209266 1 4 SE Homo sapiens (modern human) double half /I2413_ERS8209266_L1_R1.fastq.gz NA NA ERR7195971.fastq.gz NA -I2419 I2419_ERS8209267 1 4 SE Homo sapiens (modern human) double half /I2419_ERS8209267_L1_R1.fastq.gz NA NA 0 1 I2419 I2419_ERS8209268 1 4 SE Homo sapiens (modern human) double half /I2419_ERS8209268_L1_R1.fastq.gz NA NA ERR7195973.fastq.gz NA I24342 I24342_ERS8209269 1 4 SE Homo sapiens (modern human) double half /I24342_ERS8209269_L1_R1.fastq.gz NA NA ERR7195974.fastq.gz NA I24342 I24342_ERS8209270 1 4 SE Homo sapiens (modern human) double half /I24342_ERS8209270_L1_R1.fastq.gz NA NA ERR7195975.fastq.gz NA @@ -1367,7 +1358,6 @@ 
I24344 I24344_ERS8209271 1 4 SE Homo sapiens (modern human) double half /I24344_ERS8209272_L1_R1.fastq.gz NA NA ERR7195977.fastq.gz NA I24345 I24345_ERS8209273 1 4 SE Homo sapiens (modern human) double half /I24345_ERS8209273_L1_R1.fastq.gz NA NA ERR7195978.fastq.gz NA I24345 I24345_ERS8209274 1 4 SE Homo sapiens (modern human) double half /I24345_ERS8209274_L1_R1.fastq.gz NA NA ERR7195979.fastq.gz NA -I2435 I2435_ERS8209275 1 4 SE Homo sapiens (modern human) double half /I2435_ERS8209275_L1_R1.fastq.gz NA NA 0 1 I2435 I2435_ERS8209276 1 4 SE Homo sapiens (modern human) double half /I2435_ERS8209276_L1_R1.fastq.gz NA NA ERR7195981.fastq.gz NA I2446 I2446_ERS8209277 1 4 SE Homo sapiens (modern human) double half /I2446_ERS8209277_L1_R1.fastq.gz NA NA ERR7195982.fastq.gz NA I2446 I2446_ERS8209278 1 4 SE Homo sapiens (modern human) double half /I2446_ERS8209278_L1_R1.fastq.gz NA NA ERR7195983.fastq.gz NA @@ -1419,19 +1409,15 @@ I25525 I25525_ERS8209323 1 4 SE Homo sapiens (modern human) double half /I25525_ERS8209324_L1_R1.fastq.gz NA NA ERR7196029.fastq.gz NA I2565 I2565_ERS8209325 1 4 SE Homo sapiens (modern human) double half /I2565_ERS8209325_L1_R1.fastq.gz NA NA ERR7196030.fastq.gz NA I2565 I2565_ERS8209326 1 4 SE Homo sapiens (modern human) double half /I2565_ERS8209326_L1_R1.fastq.gz NA NA ERR7196031.fastq.gz NA -I2569 I2569_ERS8209327 1 4 SE Homo sapiens (modern human) double half /I2569_ERS8209327_L1_R1.fastq.gz NA NA 0 1 I2569 I2569_ERS8209328 1 4 SE Homo sapiens (modern human) double half /I2569_ERS8209328_L1_R1.fastq.gz NA NA ERR7196033.fastq.gz NA I2574 I2574_ERS8209329 1 4 SE Homo sapiens (modern human) double half /I2574_ERS8209329_L1_R1.fastq.gz NA NA ERR7196034.fastq.gz NA I2574 I2574_ERS8209330 1 4 SE Homo sapiens (modern human) double half /I2574_ERS8209330_L1_R1.fastq.gz NA NA ERR7196035.fastq.gz NA -I2575 I2575_ERS8209331 1 4 SE Homo sapiens (modern human) double half /I2575_ERS8209331_L1_R1.fastq.gz NA NA 0 1 I2575 I2575_ERS8209332 1 4 SE Homo sapiens (modern human) double half /I2575_ERS8209332_L1_R1.fastq.gz NA NA ERR7196037.fastq.gz NA I2598 I2598_ERS8209333 1 4 SE Homo sapiens (modern human) double half /I2598_ERS8209333_L1_R1.fastq.gz NA NA ERR7196038.fastq.gz NA I2598 I2598_ERS8209334 1 4 SE Homo sapiens (modern human) double half /I2598_ERS8209334_L1_R1.fastq.gz NA NA ERR7196039.fastq.gz NA I2611 I2611_ERS8209335 1 4 SE Homo sapiens (modern human) double half /I2611_ERS8209335_L1_R1.fastq.gz NA NA ERR7196040.fastq.gz NA I2611 I2611_ERS8209336 1 4 SE Homo sapiens (modern human) double half /I2611_ERS8209336_L1_R1.fastq.gz NA NA ERR7196041.fastq.gz NA -I2629 I2629_ERS8209337 1 4 SE Homo sapiens (modern human) double half /I2629_ERS8209337_L1_R1.fastq.gz NA NA 0 1 I2629 I2629_ERS8209338 1 4 SE Homo sapiens (modern human) double half /I2629_ERS8209338_L1_R1.fastq.gz NA NA ERR7196043.fastq.gz NA -I2656 I2656_ERS8209339 1 4 SE Homo sapiens (modern human) double half /I2656_ERS8209339_L1_R1.fastq.gz NA NA 0 1 I2656 I2656_ERS8209340 1 4 SE Homo sapiens (modern human) double half /I2656_ERS8209340_L1_R1.fastq.gz NA NA ERR7196045.fastq.gz NA I2658 I2658_ERS8209341 1 4 SE Homo sapiens (modern human) double half /I2658_ERS8209341_L1_R1.fastq.gz NA NA ERR7196046.fastq.gz NA I2658 I2658_ERS8209342 1 4 SE Homo sapiens (modern human) double half /I2658_ERS8209342_L1_R1.fastq.gz NA NA ERR7196047.fastq.gz NA @@ -1477,7 +1463,6 @@ I2695 I2695_ERS8209381 1 4 SE Homo sapiens (modern human) double half /I2695_ERS8209382_L1_R1.fastq.gz NA NA ERR7196087.fastq.gz NA I2696 I2696_ERS8209383 
1 4 SE Homo sapiens (modern human) double half /I2696_ERS8209383_L1_R1.fastq.gz NA NA ERR7196088.fastq.gz NA I2696 I2696_ERS8209384 1 4 SE Homo sapiens (modern human) double half /I2696_ERS8209384_L1_R1.fastq.gz NA NA ERR7196089.fastq.gz NA -I2699 I2699_ERS8209385 1 4 SE Homo sapiens (modern human) double half /I2699_ERS8209385_L1_R1.fastq.gz NA NA 0 1 I2699 I2699_ERS8209386 1 4 SE Homo sapiens (modern human) double half /I2699_ERS8209386_L1_R1.fastq.gz NA NA ERR7196091.fastq.gz NA I27379 I27379_ERS8209387 1 4 SE Homo sapiens (modern human) double half /I27379_ERS8209387_L1_R1.fastq.gz NA NA ERR7196092.fastq.gz NA I27379 I27379_ERS8209388 1 4 SE Homo sapiens (modern human) double half /I27379_ERS8209388_L1_R1.fastq.gz NA NA ERR7196093.fastq.gz NA @@ -1493,11 +1478,9 @@ I27384 I27384_ERS8209397 1 4 SE Homo sapiens (modern human) double half /I27384_ERS8209398_L1_R1.fastq.gz NA NA ERR7196103.fastq.gz NA I27385 I27385_ERS8209399 1 4 SE Homo sapiens (modern human) double half /I27385_ERS8209399_L1_R1.fastq.gz NA NA ERR7196104.fastq.gz NA I27385 I27385_ERS8209400 1 4 SE Homo sapiens (modern human) double half /I27385_ERS8209400_L1_R1.fastq.gz NA NA ERR7196105.fastq.gz NA -I2796 I2796_ERS8209401 1 4 SE Homo sapiens (modern human) double half /I2796_ERS8209401_L1_R1.fastq.gz NA NA 0 1 I2796 I2796_ERS8209402 1 4 SE Homo sapiens (modern human) double half /I2796_ERS8209402_L1_R1.fastq.gz NA NA ERR7196107.fastq.gz NA I2799 I2799_ERS8209403 1 4 SE Homo sapiens (modern human) double half /I2799_ERS8209403_L1_R1.fastq.gz NA NA ERR7196108.fastq.gz NA I2799 I2799_ERS8209404 1 4 SE Homo sapiens (modern human) double half /I2799_ERS8209404_L1_R1.fastq.gz NA NA ERR7196109.fastq.gz NA -I2824 I2824_ERS8209405 1 4 SE Homo sapiens (modern human) double half /I2824_ERS8209405_L1_R1.fastq.gz NA NA 0 1 I2824 I2824_ERS8209406 1 4 SE Homo sapiens (modern human) double half /I2824_ERS8209406_L1_R1.fastq.gz NA NA ERR7196111.fastq.gz NA I2982 I2982_ERS8209407 1 4 SE Homo sapiens (modern human) double half /I2982_ERS8209407_L1_R1.fastq.gz NA NA ERR7196112.fastq.gz NA I2982 I2982_ERS8209408 1 4 SE Homo sapiens (modern human) double half /I2982_ERS8209408_L1_R1.fastq.gz NA NA ERR7196113.fastq.gz NA @@ -1511,7 +1494,6 @@ I3035 I3035_ERS8209415 1 4 SE Homo sapiens (modern human) double half /I3035_ERS8209416_L1_R1.fastq.gz NA NA ERR7196121.fastq.gz NA I3083 I3083_ERS8209417 1 4 SE Homo sapiens (modern human) double full /I3083_ERS8209417_L1_R1.fastq.gz NA NA ERR7196122.fastq.gz NA I3083 I3083_ERS8209418 1 4 SE Homo sapiens (modern human) double full /I3083_ERS8209418_L1_R1.fastq.gz NA NA ERR7196123.fastq.gz NA -I3137 I3137_ERS8209419 1 4 SE Homo sapiens (modern human) double half /I3137_ERS8209419_L1_R1.fastq.gz NA NA 0 1 I3137 I3137_ERS8209420 1 4 SE Homo sapiens (modern human) double half /I3137_ERS8209420_L1_R1.fastq.gz NA NA ERR7196125.fastq.gz NA I3139 I3139_ERS8209421 1 4 SE Homo sapiens (modern human) double half /I3139_ERS8209421_L1_R1.fastq.gz NA NA ERR7196126.fastq.gz NA I3139 I3139_ERS8209422 1 4 SE Homo sapiens (modern human) double half /I3139_ERS8209422_L1_R1.fastq.gz NA NA ERR7196127.fastq.gz NA @@ -1529,7 +1511,6 @@ I3567 I3567_ERS8209433 1 4 SE Homo sapiens (modern human) double half /I3567_ERS8209434_L1_R1.fastq.gz NA NA ERR7196139.fastq.gz NA I3568 I3568_ERS8209435 1 4 SE Homo sapiens (modern human) double half /I3568_ERS8209435_L1_R1.fastq.gz NA NA ERR7196140.fastq.gz NA I3568 I3568_ERS8209436 1 4 SE Homo sapiens (modern human) double half /I3568_ERS8209436_L1_R1.fastq.gz NA NA ERR7196141.fastq.gz NA 
-I4067 I4067_ERS8209437 1 4 SE Homo sapiens (modern human) double half /I4067_ERS8209437_L1_R1.fastq.gz NA NA 0 1 I4067 I4067_ERS8209438 1 4 SE Homo sapiens (modern human) double half /I4067_ERS8209438_L1_R1.fastq.gz NA NA ERR7196143.fastq.gz NA I4188 I4188_ERS8209439 1 4 SE Homo sapiens (modern human) double half /I4188_ERS8209439_L1_R1.fastq.gz NA NA ERR7196144.fastq.gz NA I4188 I4188_ERS8209440 1 4 SE Homo sapiens (modern human) double half /I4188_ERS8209440_L1_R1.fastq.gz NA NA ERR7196145.fastq.gz NA @@ -1547,9 +1528,7 @@ I5365 I5365_ERS8209451 1 4 SE Homo sapiens (modern human) double half /I5365_ERS8209452_L1_R1.fastq.gz NA NA ERR7196157.fastq.gz NA I5440 I5440_ERS8209453 1 4 SE Homo sapiens (modern human) double half /I5440_ERS8209453_L1_R1.fastq.gz NA NA ERR7196158.fastq.gz NA I5440 I5440_ERS8209454 1 4 SE Homo sapiens (modern human) double half /I5440_ERS8209454_L1_R1.fastq.gz NA NA ERR7196159.fastq.gz NA -I5471 I5471_ERS8209455 1 4 SE Homo sapiens (modern human) double half /I5471_ERS8209455_L1_R1.fastq.gz NA NA 0 1 I5471 I5471_ERS8209456 1 4 SE Homo sapiens (modern human) double half /I5471_ERS8209456_L1_R1.fastq.gz NA NA ERR7196161.fastq.gz NA -I5474 I5474_ERS8209457 1 4 SE Homo sapiens (modern human) double half /I5474_ERS8209457_L1_R1.fastq.gz NA NA 0 1 I5474 I5474_ERS8209458 1 4 SE Homo sapiens (modern human) double half /I5474_ERS8209458_L1_R1.fastq.gz NA NA ERR7196163.fastq.gz NA I5502 I5502_ERS8209459 1 4 SE Homo sapiens (modern human) double half /I5502_ERS8209459_L1_R1.fastq.gz NA NA ERR7196164.fastq.gz NA I5502 I5502_ERS8209460 1 4 SE Homo sapiens (modern human) double half /I5502_ERS8209460_L1_R1.fastq.gz NA NA ERR7196165.fastq.gz NA @@ -1601,23 +1580,14 @@ I7629 I7629_ERS8209505 1 4 SE Homo sapiens (modern human) double half /I7629_ERS8209506_L1_R1.fastq.gz NA NA ERR7196211.fastq.gz NA I7632 I7632_ERS8209507 1 4 SE Homo sapiens (modern human) double half /I7632_ERS8209507_L1_R1.fastq.gz NA NA ERR7196212.fastq.gz NA I7632 I7632_ERS8209508 1 4 SE Homo sapiens (modern human) double half /I7632_ERS8209508_L1_R1.fastq.gz NA NA ERR7196213.fastq.gz NA -I7950 I7950_ERS8209509 1 4 SE Homo sapiens (modern human) double half /I7950_ERS8209509_L1_R1.fastq.gz NA NA 0 1 I7950 I7950_ERS8209510 1 4 SE Homo sapiens (modern human) double half /I7950_ERS8209510_L1_R1.fastq.gz NA NA ERR7196215.fastq.gz NA -I7951 I7951_ERS8209511 1 4 SE Homo sapiens (modern human) double half /I7951_ERS8209511_L1_R1.fastq.gz NA NA 0 1 I7951 I7951_ERS8209512 1 4 SE Homo sapiens (modern human) double half /I7951_ERS8209512_L1_R1.fastq.gz NA NA ERR7196217.fastq.gz NA -I7952 I7952_ERS8209513 1 4 SE Homo sapiens (modern human) double half /I7952_ERS8209513_L1_R1.fastq.gz NA NA 0 1 I7952 I7952_ERS8209514 1 4 SE Homo sapiens (modern human) double half /I7952_ERS8209514_L1_R1.fastq.gz NA NA ERR7196219.fastq.gz NA -I7953 I7953_ERS8209515 1 4 SE Homo sapiens (modern human) double half /I7953_ERS8209515_L1_R1.fastq.gz NA NA 0 1 I7953 I7953_ERS8209516 1 4 SE Homo sapiens (modern human) double half /I7953_ERS8209516_L1_R1.fastq.gz NA NA ERR7196221.fastq.gz NA -I7958 I7958_ERS8209517 1 4 SE Homo sapiens (modern human) double half /I7958_ERS8209517_L1_R1.fastq.gz NA NA 0 1 I7958 I7958_ERS8209518 1 4 SE Homo sapiens (modern human) double half /I7958_ERS8209518_L1_R1.fastq.gz NA NA ERR7196223.fastq.gz NA -I7959 I7959_ERS8209519 1 4 SE Homo sapiens (modern human) double half /I7959_ERS8209519_L1_R1.fastq.gz NA NA 0 1 I7959 I7959_ERS8209520 1 4 SE Homo sapiens (modern human) double half /I7959_ERS8209520_L1_R1.fastq.gz NA 
NA ERR7196225.fastq.gz NA -I7960 I7960_ERS8209521 1 4 SE Homo sapiens (modern human) double half /I7960_ERS8209521_L1_R1.fastq.gz NA NA 0 1 I7960 I7960_ERS8209522 1 4 SE Homo sapiens (modern human) double half /I7960_ERS8209522_L1_R1.fastq.gz NA NA ERR7196227.fastq.gz NA -I7962 I7962_ERS8209523 1 4 SE Homo sapiens (modern human) double half /I7962_ERS8209523_L1_R1.fastq.gz NA NA 0 1 I7962 I7962_ERS8209524 1 4 SE Homo sapiens (modern human) double half /I7962_ERS8209524_L1_R1.fastq.gz NA NA ERR7196229.fastq.gz NA -I7963 I7963_ERS8209525 1 4 SE Homo sapiens (modern human) double half /I7963_ERS8209525_L1_R1.fastq.gz NA NA 0 1 I7963 I7963_ERS8209526 1 4 SE Homo sapiens (modern human) double half /I7963_ERS8209526_L1_R1.fastq.gz NA NA ERR7196231.fastq.gz NA I7964 I7964_ERS8209527 1 4 SE Homo sapiens (modern human) double half /I7964_ERS8209527_L1_R1.fastq.gz NA NA ERR7196232.fastq.gz NA I7964 I7964_ERS8209528 1 4 SE Homo sapiens (modern human) double half /I7964_ERS8209528_L1_R1.fastq.gz NA NA ERR7196233.fastq.gz NA @@ -1637,17 +1607,11 @@ I0525 I0525_ERS8207822 1 4 SE Homo sapiens (modern human) double half /I0525_ERS8207823_L1_R1.fastq.gz NA NA ERR7194575.fastq.gz NA I0527 I0527_ERS8207824 1 4 SE Homo sapiens (modern human) double half /I0527_ERS8207824_L1_R1.fastq.gz NA NA ERR7194576.fastq.gz NA I0527 I0527_ERS8207825 1 4 SE Homo sapiens (modern human) double half /I0527_ERS8207825_L1_R1.fastq.gz NA NA ERR7194577.fastq.gz NA -I10342 I10342_ERS8207826 1 4 SE Homo sapiens (modern human) double half /I10342_ERS8207826_L1_R1.fastq.gz NA NA 0 1 I10342 I10342_ERS8207827 1 4 SE Homo sapiens (modern human) double half /I10342_ERS8207827_L1_R1.fastq.gz NA NA ERR7194579.fastq.gz NA -I10343 I10343_ERS8207828 1 4 SE Homo sapiens (modern human) double half /I10343_ERS8207828_L1_R1.fastq.gz NA NA 0 1 I10343 I10343_ERS8207829 1 4 SE Homo sapiens (modern human) double half /I10343_ERS8207829_L1_R1.fastq.gz NA NA ERR7194581.fastq.gz NA -I10344 I10344_ERS8207830 1 4 SE Homo sapiens (modern human) double half /I10344_ERS8207830_L1_R1.fastq.gz NA NA 0 1 I10344 I10344_ERS8207831 1 4 SE Homo sapiens (modern human) double half /I10344_ERS8207831_L1_R1.fastq.gz NA NA ERR7194583.fastq.gz NA -I10345 I10345_ERS8207832 1 4 SE Homo sapiens (modern human) double half /I10345_ERS8207832_L1_R1.fastq.gz NA NA 0 1 I10345 I10345_ERS8207833 1 4 SE Homo sapiens (modern human) double half /I10345_ERS8207833_L1_R1.fastq.gz NA NA ERR7194585.fastq.gz NA -I10347 I10347_ERS8207834 1 4 SE Homo sapiens (modern human) double half /I10347_ERS8207834_L1_R1.fastq.gz NA NA 0 1 I10347 I10347_ERS8207835 1 4 SE Homo sapiens (modern human) double half /I10347_ERS8207835_L1_R1.fastq.gz NA NA ERR7194587.fastq.gz NA -I10348 I10348_ERS8207836 1 4 SE Homo sapiens (modern human) double half /I10348_ERS8207836_L1_R1.fastq.gz NA NA 0 1 I10348 I10348_ERS8207837 1 4 SE Homo sapiens (modern human) double half /I10348_ERS8207837_L1_R1.fastq.gz NA NA ERR7194589.fastq.gz NA I11033 I11033_ERS8207838 1 4 SE Homo sapiens (modern human) double half /I11033_ERS8207838_L1_R1.fastq.gz NA NA ERR7194590.fastq.gz NA I11033 I11033_ERS8207839 1 4 SE Homo sapiens (modern human) double half /I11033_ERS8207839_L1_R1.fastq.gz NA NA ERR7194591.fastq.gz NA diff --git a/packages/2021_PattersonNature/script_versions.txt b/packages/2021_PattersonNature/script_versions.txt index fc73d50..e2cf8ab 100644 --- a/packages/2021_PattersonNature/script_versions.txt +++ b/packages/2021_PattersonNature/script_versions.txt @@ -1,2 +1,7 @@ -create_eager_input.sh: 0.2.0dev 
+create_eager_input.sh: 0.2.1dev source_me.sh for initial TSV: 0.2.0dev +download_ena_data.py: 0.2.0dev +validate_downloaded_data.sh: 0.2.1dev +source_me.sh for data validation: 0.2.0dev +2021_PattersonNature.tsv_patch.sh: 0.2.0dev +source_me.sh for final TSV: 0.2.0dev diff --git a/scripts/download_and_localise_package_files.sh b/scripts/download_and_localise_package_files.sh index fee662f..b63d3bf 100755 --- a/scripts/download_and_localise_package_files.sh +++ b/scripts/download_and_localise_package_files.sh @@ -46,7 +46,8 @@ ssf_file="${package_dir}/${package_name}.ssf" download_dir="${repo_dir}/raw_sequencing_data/" download_log_dir="${download_dir}/download_logs" local_data_dir="${repo_dir}/raw_sequencing_data/${package_name}" -symlink_dir="${repo_dir}/eager/${package_name}/data" +package_eager_dir="${repo_dir}/eager/${package_name}/" +symlink_dir="${package_eager_dir}/data" tsv_patch_fn="${package_dir}/${package_name}.tsv_patch.sh" original_tsv="${package_dir}/${package_name}.tsv" @@ -59,7 +60,7 @@ check_fail $? "${script_debug_string} Downloads did not finish completely. Try a ## STEP 2: Validate downloaded files. mkdir -p ${symlink_dir} -${repo_dir}/scripts/validate_downloaded_data.sh ${ssf_file} ${local_data_dir} ${symlink_dir} +${repo_dir}/scripts/validate_downloaded_data.sh ${ssf_file} ${local_data_dir} ${package_eager_dir} check_fail $? "${script_debug_string} Validation and symlink creation failed." ## STEP 3: Localise TSV file. diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh new file mode 100755 index 0000000..cc26c49 --- /dev/null +++ b/scripts/minotaur_packager.sh @@ -0,0 +1,305 @@ +#!/usr/bin/env bash +VERSION='0.2.1dev' +set -o pipefail ## Fail a pipeline if any command within it fails. +# set -x ## Debugging + +## Helptext function +function Helptext() { + echo -ne "\t usage: ${0} [options] Package_Minotaur_Directory\n\n" + echo -ne "This script collects the genotype data and metadata from Minotaur-processing and creates/updates the requested poseidon package if needed.\n\n" + echo -ne "Options:\n" + echo -ne "-d, --debug\t\tActivates debug mode, and keeps temporary directories for troubleshooting.\n" + echo -ne "-h, --help\t\tPrint this text and exit.\n" + echo -ne "-v, --version\t\tPrint version and exit.\n" +} + +## Source helper functions +repo_dir=$(dirname $(readlink -f ${0}))/.. ## The repository will be the original position of this script. If a user copies instead of symlink, this will fail. +source ${repo_dir}/scripts/source_me.sh ## Source helper functions + +## A function to make a genotype dataset for the output package out of an array of genotype files. +## Usage: make_genotype_dataset_out_of_genotypes <format> <out_name> <tempdir> <geno_fn1> [<geno_fn2> ...] +## format: The format of the genotype files. Either 'PLINK' or 'EIGENSTRAT'. +## out_name: The name of the output genotype dataset. +## tempdir: The temporary directory to use for mixing the genotypes. +## geno_fn*: The genotype files to merge together. +## NOTE: This function uses the errecho() function defined in source_me.sh +function make_genotype_dataset_out_of_genotypes() { + local format + local tempdir + local out_name + local input_fns + local input_fn + local base_fn + local ind_fns + local geno_top_length + local geno_bot_length + + format=${1} + out_name=${2} + tempdir=${3} + shift 3 + input_fns=("${@}") + + ## Check that the format is valid. + if [[ ${format} != "PLINK" && ${format} != "EIGENSTRAT" ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: Invalid format '${format}' provided."
+ exit 1 + fi + + ## Ensure the provided temp dir exists + if [[ ! -d ${tempdir} ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: Temporary directory '${tempdir}' not found." + exit 1 + fi + + ## Check that the genotype files exist. + for input_fn in ${input_fns[@]}; do + if [[ ! -f ${input_fn} ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: Genotype file '${input_fn}' not found." + exit 1 + fi + done + + ## Merge eigenstrat genotypes + if [[ ${format} == "EIGENSTRAT" ]]; then + ## Paste genos together. If only one is there, then it's simply a copy of it + paste -d '\0' ${input_fns[@]} > ${tempdir}/${out_name}.geno + + ## Copy the snp file + cp ${input_fns[0]%.geno}.snp ${tempdir}/${out_name}.snp + + ## And cat the ind files (this needs a bit of variable expansion to work) + ind_fns='' + for base_fn in ${input_fns[@]%.geno}; do + ind_fns+="${base_fn}.ind " + done + ## Also add '_MNT' suffix to individual IDs. Set input and output field sep to TAB. + cat ${ind_fns} | awk 'BEGIN {OFS=FS="\t"}; {$1=$1"_MNT"; print $0}' > ${tempdir}/${out_name}.ind + + ## Final sanity check, that the file dimensions are correct. + if [[ $(wc -l ${tempdir}/${out_name}.geno | cut -f 1 -d ' ') != $(wc -l ${tempdir}/${out_name}.snp | cut -f 1 -d ' ') ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: Genotype file '${out_name}.geno' has a different number of lines than the snp file." + exit 1 + fi + + ## Check that the genotype dataset has a consistent length across first and last snp. + geno_top_length=$(bc <<< "$(head -n1 ${tempdir}/${out_name}.geno | wc -c | cut -f 1 -d ' ') - 1") + geno_bot_length=$(bc <<< "$(tail -n1 ${tempdir}/${out_name}.geno | wc -c | cut -f 1 -d ' ') - 1") + if [[ ${geno_top_length} != ${geno_bot_length} ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: Genotype file '${out_name}.geno' has inconsistent line lengths. Check the input datasets and try again." + exit 1 + fi + + ## The number of characters (excluding the new line character) in the last row of the genotype file should match the number of individuals. + if [[ ${geno_bot_length} != $(wc -l ${tempdir}/${out_name}.ind | cut -f 1 -d ' ') ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: Genotype file '${out_name}.geno' has a different number of individuals (columns) than the ind file." + exit 1 + fi + errecho "[make_genotype_dataset_out_of_genotypes()]: Successfully created genotype dataset '${out_name}.{geno,snp,ind}'." + + ## Merge plink genotypes + elif [[ ${format} == 'PLINK' ]]; then + errecho -r "[make_genotype_dataset_out_of_genotypes()]: PLINK genotype merging not yet implemented." + exit 1 + fi +} + +## Helper function to pull minotaur versions from Config Profile Description and add them to a file.
+## Usage: add_versions_file <package_eager_result_dir> <version_fn> +## Will create a file with the following information: +## nf-core/eager version +## Minotaur config version +## CaptureType config version +## Package config version +## Minotaur-packager version +function add_versions_file() { + local package_eager_result_dir + local capture_type_config + local version_fn + local minotaur_version + local eager_version + local minotaur_versioning_string + local config_version + local capture_type_version + local capture_type_version_string + local pipeline_report_fn + + ## Read in function params + package_eager_result_dir=${1} + version_fn=${2} + + pipeline_report_fn=${package_eager_result_dir}/pipeline_info/pipeline_report.txt ## The pipeline report file from nf-core/eager + eager_version=$(grep "Pipeline Release:" ${pipeline_report_fn} | awk -F ":" '{print $NF}') + + ## Each attribute now comes in its own line. (0.2.0dev +) + minotaur_version=$(grep "Minortaur.config" ${pipeline_report_fn} | awk -F ' ' '{print $NF}') + ## TODO-dev un-hard-code 1240K config (future packages will have the keyword "CaptureType.XXXX.config") + capture_type_version_string=$(grep "1240K.config" ${pipeline_report_fn}) + ## If the grep above returned nothing, then there is no Capture Type profile. + if [[ -z ${capture_type_version_string} ]]; then + capture_type_version='' + capture_type_config='' + else + capture_type_version=$(echo ${capture_type_version_string} | awk -F ' ' '{print $NF}') + capture_type_config="1240K" ## TODO-dev un-hard-code 1240K config (future packages will have the keyword "CaptureType.XXXX.config") + fi + + config_version=$(grep "config_template_version" ${pipeline_report_fn} | awk -F ' ' '{print $NF}') + package_config_version=$(grep "package_config_version" ${pipeline_report_fn} | awk -F ' ' '{print $NF}') + + ## Create the versions file. Flush any old file contents if the file exists. + echo "nf-core/eager version: ${eager_version}" > ${version_fn} + echo "Minotaur config version: ${minotaur_version}" >> ${version_fn} + if [[ ! -z ${capture_type_version_string} ]]; then + echo "CaptureType profile: ${capture_type_config} ${capture_type_version}" >> ${version_fn} + fi + echo "Config template version: ${config_version}" >> ${version_fn} + echo "Package config version: ${package_config_version}" >> ${version_fn} + echo "Minotaur-packager version: ${VERSION}" >> ${version_fn} +} + +## Parse CLI args. +TEMP=`getopt -q -o dhv --long debug,help,version -n "${0}" -- "$@"` +eval set -- "${TEMP}" + +## Parameter defaults +## The package_minotaur_directory is the directory where the minotaur output is stored. +## It should look something like this: +## /path/to/minotaur/outputs/2021_my_package/ +## ├── 2021_my_package.finalised.tsv ## The finalised tsv file for processing with nf-core/eager +## ├── data/ ## The symlinks to the raw data used as eager input +## ├── results/ ## nf-core/eager results directory +## └── work/ ## Nextflow work directory +package_minotaur_directory='' + +## Print helptext and exit when no positional argument is provided.
+if [[ "${#@}" == "1" ]]; then + Helptext + exit 0 +fi + +## Read in CLI arguments +while true ; do + case "$1" in + -h|--help) Helptext; exit 0 ;; + -v|--version) echo ${VERSION}; exit 0;; + -d|--debug) errecho -y "[minotaur_packager.sh]: Debug mode activated."; debug_mode=1; shift ;; + --) package_minotaur_directory="${2%/}"; break ;; + *) echo -e "invalid option provided.\n"; Helptext; exit 1;; + esac +done + +## Infer other variables from the package_minotaur_directory provided. +package_name="${package_minotaur_directory##*/}" +package_oven_dir="/mnt/archgen/poseidon/minotaur/minotaur-package-oven/" ## Hard-coded path for EVA +output_package_dir="${package_oven_dir}/${package_name}" ## Hard-coded path for EVA +finalisedtsv_fn="${package_minotaur_directory}/${package_name}.finalised.tsv" +root_results_dir="${package_minotaur_directory}/results" + +## Get current date for versioning +errecho -y "[minotaur_packager.sh]: version ${VERSION}" +date_stamp=$(date +'%Y-%M-%d') + +## Check that the finalised tsv file exists. +if [[ ! -f ${finalisedtsv_fn} ]]; then + errecho -r "[${package_name}]: Finalised tsv file not found in '${package_minotaur_directory}'." + errecho -r "[${package_name}]: Please check the provided directory and try again." + exit 1 +fi + +## Quick sanity check that the minotaur run has completed (i.e. genotypes are not newer than multiQC). +newest_genotype_fn=$(ls -Art -1 ${root_results_dir}/genotyping/*geno | tail -n 1) ## Reverse order and tail to avoid broken pipe errors +if [[ -z newest_genotype_fn && -f ${newest_genotype_fn} && ${root_results_dir}/multiqc/multiqc_report.html && ${newest_genotype_fn} -nt ${root_results_dir}/multiqc/multiqc_report.html ]]; then + errecho -r "Minotaur run has not completed. Please ensure that processing has finished." + exit 1 +fi + +## Create a temporary directory to mix and rename the genotype datasets in. +## 'tmp_dir' outside function, 'tempdir' in make_genotype_dataset_out_of_genotypes function +tmp_dir=$(mktemp -d ${package_oven_dir}/.tmp/MNT_${package_name}.XXXXXXXXXX) +check_fail $? "[${package_name}]: Failed to create temporary directory. Aborting.\nCheck your permissions in ${package_oven_dir}, and that directory ${package_oven_dir}/.tmp/ exists." + +genotype_fns=($(ls -1 ${root_results_dir}/genotyping/*geno)) ## List of genotype files. + +## Infer the SNP set from the config activated in the minotaur run from the config description. +## TODO-dev Currently hard-coded to 1240K since all data thus far is 1240K. This should grep "CaptureType" from the pipeline report, and infer the SNP set from that. +# snp_set=$( grep 'Config Profile Description' ${root_results_dir}/pipeline_info/pipeline_report.txt | cut -f 2 -d ',' | cut -d ':' -f 1 | tr -d ' ' | cut -d '.' -f 1 ) +snp_set="1240K" +errecho -y "[${package_name}]: SNP set inferred as '${snp_set}'." + +## Check that the inferred snp set is supported. Should trigger if the inference somehow breaks. +supported_snpsets=($(ls -1 ${repo_dir}/conf/CaptureType_profiles/ | cut -d "." -f 1)) +if [[ ! -z $(all_x_in_y 1 ${snp_set} ${#supported_snpsets[@]} ${supported_snpsets[@]}) ]]; then + errecho -r "[${package_name}]: Inferred SNP set '${snp_set}' is not supported. SNP set inference might have gone wrong." + exit 1 +fi + +## If the package exists and the genotypes are not newer than the package, then print a message and do nothing. +if [[ -d ${output_package_dir} ]] && [[ ! 
${newest_genotype_fn} -nt ${output_package_dir}/${package_name}.geno ]]; then + errecho -y "[${package_name}]: Package is up to date." + exit 0 + +## If genotypes are new or the package does not exist, then create/update the package genotypes. +elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_package_dir}/${package_name}.geno ]]; then + errecho -y "[${package_name}]: Genotypes are new or package does not exist. Creating/Updating package genotypes." + make_genotype_dataset_out_of_genotypes "EIGENSTRAT" "${package_name}" "${tmp_dir}" ${genotype_fns[@]} + + ## Create a new package with the given genotypes. + trident init -p ${tmp_dir}/${package_name}.geno -o ${tmp_dir}/package/ -n ${package_name} --snpSet ${snp_set} + check_fail $? "[${package_name}]: Failed to initialise package. Aborting." + + ## Fill in janno + errecho -y "Populating janno file" + ${repo_dir}/scripts/populate_janno.py -r ${package_minotaur_directory}/results/ -t ${finalisedtsv_fn} -p ${tmp_dir}/package/POSEIDON.yml + + ## TODO-dev Infer genetic sex from janno and mirror to ind file. + + ## Convert data to PLINK format + trident genoconvert -d ${tmp_dir}/package \ + --genoFile ${tmp_dir}/package/${package_name}.geno \ + --snpFile ${tmp_dir}/package/${package_name}.snp \ + --indFile ${tmp_dir}/package/${package_name}.ind \ + --inFormat EIGENSTRAT \ + --outFormat PLINK \ + --removeOld + + ## Update the package yaml to account for the changes in the janno (update renamed to rectify) + trident rectify -d ${tmp_dir}/package \ + --packageVersion Patch \ + --logText "Automatic update of janno file from Minotaur processing." \ + --checksumAll + ## TODO Add poseidon core team, or Minotaur as contributors? + + ## Validate the resulting package + trident validate -d ${tmp_dir}/package + + ## Only move package dir to "package oven" if validation passed + if [[ $? == 0 ]] && [[ ${debug_mode} -ne 1 ]]; then + errecho -y "## Moving '${package_name}' to package oven ##" + ## Create directory for poseidon package in package oven + ## Only created now to not trip up the script if execution did not run through fully. + mkdir -p $(dirname ${output_package_dir}) + + ## Add Minotaur versioning file to package + add_versions_file ${root_results_dir} ${tmp_dir}/package/versions.txt + + ## Move package contents to the oven + mv ${tmp_dir}/package/ ${output_package_dir} + + ## Then remove remaining temp files + errecho -y "## Removing temp directory ##" + + ## Paranoid of removing in root, so extra check for tmp_dir + if [[ ! -z ${tmp_dir} ]]; then + ## Playing it safe by avoiding rm -r + rm ${tmp_dir}/* + rmdir ${tmp_dir} + fi + fi + + ## Partially fill empty fields in janno. +# elif [[ 1 ]]; then + ## TODO Add package updating once that is needed. For now, assume each package will be made once and that's it.
+fi \ No newline at end of file diff --git a/scripts/populate_janno.py b/scripts/populate_janno.py new file mode 100755 index 0000000..90c5164 --- /dev/null +++ b/scripts/populate_janno.py @@ -0,0 +1,507 @@ +#!/usr/bin/env python3 + +import pyEager +import argparse +import os +import glob +import pandas as pd +import yaml +import re +import numpy as np +from collections import namedtuple + +VERSION = "0.2.0dev" + + +def camel_to_snake(name): + name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + +def infer_library_name(row, prefix_col=None, target_col=None): + strip_me = "{}_".format(row[prefix_col]) + from_me = row[target_col] + + ## First, remove the added prefix + if from_me.startswith(strip_me): + inferred_name = from_me.replace(strip_me, "", 1) + else: + inferred_name = from_me + + ## Finally, strip the hard-coded suffix if there. + if inferred_name.endswith("_ss"): + inferred_name = inferred_name.replace("_ss", "", 1) + else: + inferred_name = inferred_name + + return inferred_name + +def set_contamination_measure(row): + ## If the row's contamination is not NaN, then set to "ANGSD", otherwise NaN + if not pd.isna(row["Contamination"]): + return "ANGSD[v0.935]" ## TODO-dev infer the version from eager software_versions.txt + else: + return np.nan + +## Function that takes a pd.DataFrame and the name of a column, and applies the format to it to add a new column named poseidon_id +class PoseidonYaml: + def __init__(self, path_poseidon_yml): + ## Check that path_poseidon_yml exists. Throw error if not. + if not os.path.exists(path_poseidon_yml): + raise ValueError( + "The path to the poseidon yml file does not exist. Provided path '{}'.".format( + path_poseidon_yml + ) + ) + ## Read in yaml file and set attributes. + self.yaml_data = yaml.safe_load(open(path_poseidon_yml)) + self.package_dir = os.path.dirname(path_poseidon_yml) + self.poseidon_version = self.yaml_data["poseidonVersion"] + self.title = self.yaml_data["title"] + self.description = self.yaml_data["description"] + self.contributor = self.yaml_data["contributor"] + self.package_version = self.yaml_data["packageVersion"] + self.last_modified = self.yaml_data["lastModified"] + self.janno_file = os.path.join(self.package_dir, self.yaml_data["jannoFile"]) + ## For each key-value pair in dict, check if key is genoFile. If so, join the package_dir to the value. Else, just add the value. + ## Also convert keys from camelCase to snake_case. + genotype_data = {} + for key, value in self.yaml_data["genotypeData"].items(): + if key.endswith("File"): + genotype_data[camel_to_snake(key)] = os.path.join( + self.package_dir, value + ) + else: + genotype_data[camel_to_snake(key)] = value + ## Genotype data is a dictionary in itself + GenotypeDict = namedtuple("GenotypeDict", " ".join(genotype_data.keys())) + self.genotype_data = GenotypeDict(**genotype_data) + ## Optional attributes + for attribute in [ + "janno_file_chk_sum", + "sequencing_source_file", + "sequencing_source_file_chk_sum", + "bib_file", + "bib_file_chk_sum", + "changelog_file", + ]: + try: + if attribute.endswith("File"): + setattr( + self, + attribute, + os.path.join(self.package_dir, self.yaml_data[attribute]), + ) + else: + setattr(self, attribute, self.yaml_data[attribute]) + except: + setattr(self, attribute, None) + + +## Function to calculate weighted mean of a group from the weight and value columns specified. 
+def weighted_mean( + group, wt_col="wt", val_col="val", filter_col="filter_col", min_val=100 +): + non_nan_indices = ~group[val_col].isna() + filter_indices = group[filter_col] >= min_val ## Remove values where 'filter_col' is below the cutoff + valid_indices = non_nan_indices & filter_indices + if valid_indices.any(): + weighted_values = ( + group.loc[valid_indices, wt_col] * group.loc[valid_indices, val_col] + ) + total_weight = group.loc[ + valid_indices, wt_col + ].sum() # Calculate total weight without excluded weights + weighted_mean = weighted_values.sum() / total_weight + else: + weighted_mean = np.nan # Return NaN if no valid values left + return weighted_mean + + +parser = argparse.ArgumentParser( + prog="populate_janno", + description="This script reads in different nf-core/eager result files and " + "uses this information to populate the relevant fields in a " + "poseidon janno file. The janno file and .ind/.fam file of the " + "package are updated, unless the --safe option is provided, in " + "which case the output files get the suffix '.new'.", +) + +parser.add_argument( + "-r", + "--eager_result_dir", + metavar="", + required=True, + help="The nf-core/eager result directory for the minotaur package.", +) +parser.add_argument( + "-t", + "--eager_tsv_path", + metavar="", + required=True, + help="The path to the eager input TSV used to generate the nf-core/eager results.", +) +parser.add_argument( + "-p", + "--poseidon_yml_path", + metavar="", + required=True, + help="The poseidon yml file for the package.", +) +parser.add_argument( + "--safe", + action="store_true", + help="Activate safe mode. The package's janno and ind files will not be updated, but instead new files will be created with the '.new' suffix. Only useful for testing.", +) +parser.add_argument("-v", "--version", action="version", version=VERSION) + +args = parser.parse_args() + +## Collect paths for analyses with multiple jsons. +damageprofiler_json_paths = glob.glob( + os.path.join(args.eager_result_dir, "damageprofiler", "*", "*.json") +) +endorspy_json_paths = glob.glob( + os.path.join(args.eager_result_dir, "endorspy", "*.json") +) +snp_coverage_json_paths = glob.glob( + os.path.join(args.eager_result_dir, "genotyping", "*.json") +) + +## Collect paths for analyses with single json. +sexdeterrmine_json_path = os.path.join( + args.eager_result_dir, "sex_determination", "sexdeterrmine.json" +) +nuclear_contamination_json_path = os.path.join( + args.eager_result_dir, "nuclear_contamination", "nuclear_contamination_mqc.json" +) + +## Read in all JSONs into pandas DataFrames.
+damage_table = pyEager.wrappers.compile_damage_table(damageprofiler_json_paths) +endogenous_table = pyEager.wrappers.compile_endogenous_table(endorspy_json_paths) +snp_coverage_table = pyEager.wrappers.compile_snp_coverage_table( + snp_coverage_json_paths +) +contamination_table = pyEager.parsers.parse_nuclear_contamination_json( + nuclear_contamination_json_path +) +sex_determination_table = pyEager.parsers.parse_sexdeterrmine_json( + sexdeterrmine_json_path +) + +tsv_table = pyEager.parsers.parse_eager_tsv(args.eager_tsv_path) +tsv_table = pyEager.parsers.infer_merged_bam_names( + tsv_table, run_trim_bam=True, skip_deduplication=False +) + +## Read poseidon yaml, infer path to janno file and read janno file. +poseidon_yaml_data = PoseidonYaml(args.poseidon_yml_path) +janno_table = pd.read_table(poseidon_yaml_data.janno_file, dtype=str) +## Add Main_ID to janno table. That is the Poseidon_ID after removing minotaur processing related suffixes. +janno_table["Eager_ID"] = janno_table["Poseidon_ID"].str.replace(r"_MNT", "") +janno_table["Main_ID"] = janno_table["Eager_ID"].str.replace(r"_ss", "") + +## Prepare damage table for joining. Infer eager Library_ID from id column, by removing '_rmdup.bam' suffix +## TODO-dev Check if this is the correct way to infer Library_ID from id column when the results are on the sample level. +damage_table["Library_ID"] = damage_table["id"].str.replace(r"_rmdup.bam", "") +damage_table = damage_table[["Library_ID", "n_reads", "dmg_5p_1bp"]].rename( + columns={"dmg_5p_1bp": "damage"} +) + +## Prepare endogenous table for joining. Should be max value in cases where multiple libraries are merged. But also, should be SG data ONLY, which is unlikely to work well with ENA datasets where TF and SG reads might be merged. +## TODO-dev Decide on how to get keep only SG data. The SSF Could be used to filter. +## NOTE: for now, endogenous DNA is ignored, as inference of SG/TF is difficult. +endogenous_table = endogenous_table[["id", "endogenous_dna"]].rename( + columns={"id": "Library_ID", "endogenous_dna": "endogenous"} +) + +## Prepare SNP coverage table for joining. Should always be on the sample level, so only need to fix column names. +snp_coverage_table = snp_coverage_table.drop("Total_Snps", axis=1).rename( + columns={"id": "Sample_ID", "Covered_Snps": "Nr_SNPs"} +) + +## Prepare contamination table for joining. Always at library level. Only need to fix column names here. +contamination_table = contamination_table[ + ["id", "Num_SNPs", "Method1_ML_estimate", "Method1_ML_SE"] +].rename( + columns={ + "id": "Library_ID", + "Num_SNPs": "Contamination_Nr_SNPs", + "Method1_ML_estimate": "Contamination_Est", + "Method1_ML_SE": "Contamination_SE", + } +) +contamination_table["Contamination_Est"] = pd.to_numeric( + contamination_table["Contamination_Est"], errors="coerce" +) +contamination_table["Contamination_SE"] = pd.to_numeric( + contamination_table["Contamination_SE"], errors="coerce" +) + +## Prepare sex determination table for joining. Naming is sometimes at library and sometimes at sample-level, but results are always at sample level. +sex_determination_table = sex_determination_table[ + ["id", "RateX", "RateY", "RateErrX", "RateErrY"] +] + +## Merge all eager tables together (exclude endogenous for now). 
+compound_eager_table = ( + pd.DataFrame.merge( + tsv_table, + snp_coverage_table, + left_on="Sample_Name", + right_on="Sample_ID", + validate="many_to_one", + ) + .merge( + ## Add contamination results per Library_ID + contamination_table, + on="Library_ID", + validate="many_to_one", + ) + .merge( + ## Add 5p1 damage results per Library_ID + damage_table, + on="Library_ID", + validate="many_to_one", + ) + .merge( + ## Add sex determination results per Sample_ID + sex_determination_table, + left_on="sexdet_bam_name", + right_on="id", + validate="many_to_one", + ) + .drop( + ## Drop columns that are not relevant anymore + [ + "Lane", + "Colour_Chemistry", + "SeqType", + "Organism", + "Strandedness", + "UDG_Treatment", + "R1", + "R2", + "BAM", + "initial_merge", + "additional_merge", + "strandedness_clash", + "initial_bam_name", + "additional_bam_name", + "sexdet_bam_name", + "Sample_ID", + "id", + ], + axis=1, + ) + .drop_duplicates() +) + +summarised_stats = pd.DataFrame() +summarised_stats["Sample_Name"] = compound_eager_table["Sample_Name"].unique() +summarised_stats = ( + compound_eager_table.astype("string") + .groupby("Sample_Name")[["Contamination_Nr_SNPs"]] + .agg( + lambda x: "Nr Snps (per library): {}. Estimate and error are weighted means of values per library. Libraries with fewer than 100 SNPs used in contamination estimation were excluded.".format( + ";".join(x) + ) + ) + .rename(columns={"Contamination_Nr_SNPs": "Contamination_Note"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + +## Add library names +compound_eager_table["Original_library_names"] = compound_eager_table.apply( + infer_library_name, axis=1, args=("Sample_Name", "Library_ID") +) +summarised_stats = ( + compound_eager_table.groupby("Sample_Name")[["Original_library_names"]] + .agg(lambda x: ";".join(x)) + .rename(columns={"Original_library_names": "Library_Names"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + +summarised_stats = ( + compound_eager_table.groupby("Sample_Name")[["Library_ID"]] + .agg("nunique") + .rename(columns={"Library_ID": "Nr_Libraries"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + +summarised_stats = ( + compound_eager_table.groupby("Sample_Name")[ + ["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"] + ] + .apply( + weighted_mean, + wt_col="n_reads", + val_col="Contamination_Est", + filter_col="Contamination_Nr_SNPs", + min_val=100, + ) + .reset_index("Sample_Name") + .rename(columns={0: "Contamination"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + +summarised_stats = ( + compound_eager_table.groupby("Sample_Name")[ + ["Contamination_Nr_SNPs", "Contamination_Est", "Contamination_SE", "n_reads"] + ] + .apply( + weighted_mean, + wt_col="n_reads", + val_col="Contamination_SE", + filter_col="Contamination_Nr_SNPs", + min_val=100, + ) + .reset_index("Sample_Name") + .rename(columns={0: "Contamination_Err"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + +## If Contamination column is not empty, add the contamination measure +summarised_stats["Contamination_Meas"] = summarised_stats.apply( + set_contamination_measure, axis=1 +) + +summarised_stats = ( + compound_eager_table.groupby("Sample_Name")[["damage", "n_reads"]] + .apply( + weighted_mean, + wt_col="n_reads", + val_col="damage", + filter_col="n_reads", + min_val=0, + ) ## filter on n_reads >= 0, i.e. no filtering. 
+ .reset_index("Sample_Name") + .rename(columns={0: "Damage"}) + .merge(summarised_stats, on="Sample_Name", validate="one_to_one") +) + +final_eager_table = compound_eager_table.merge( + summarised_stats, on="Sample_Name", validate="many_to_one" +).drop( + columns=[ + "Library_ID", + "Contamination_Nr_SNPs", + "Contamination_Est", + "Contamination_SE", + "n_reads", + "damage", + "Original_library_names", + ], +) + +filled_janno_table = janno_table.merge( + final_eager_table, left_on="Eager_ID", right_on="Sample_Name" +) +## TODO-dev need to infer Genetic_Sex from 'RateX', 'RateY', 'RateErrX', 'RateErrY' +for col in [ + "Nr_SNPs", + "Damage", + "Contamination_Err", + "Contamination", + "Nr_Libraries", + "Contamination_Note", + "Library_Names", + "Contamination_Meas", +]: + filled_janno_table[col] = ( + filled_janno_table[[col + "_x", col + "_y"]].bfill(axis=1).iloc[:, 0] + ) + +## Drop columns duplicated from merges, and columns that are not relevant anymore. +filled_janno_table = filled_janno_table.drop( + list(filled_janno_table.filter(regex=r".*_(x|y)")), axis=1 +).drop("Sample_Name", axis=1) + +## Replace NAs with "n/a" +filled_janno_table.replace(np.nan, "n/a", inplace=True) + +final_column_order = [ + "Poseidon_ID", + "Genetic_Sex", + "Group_Name", + "Alternative_IDs", + "Main_ID", ## Added + "Relation_To", + "Relation_Degree", + "Relation_Type", + "Relation_Note", + "Collection_ID", + "Country", + "Country_ISO", + "Location", + "Site", + "Latitude", + "Longitude", + "Date_Type", + "Date_C14_Labnr", + "Date_C14_Uncal_BP", + "Date_C14_Uncal_BP_Err", + "Date_BC_AD_Start", + "Date_BC_AD_Median", + "Date_BC_AD_Stop", + "Date_Note", + "MT_Haplogroup", + "Y_Haplogroup", + "Source_Tissue", + "Nr_Libraries", + "Library_Names", + "Capture_Type", + "UDG", + "Library_Built", + "Genotype_Ploidy", + "Data_Preparation_Pipeline_URL", + "Endogenous", + "Nr_SNPs", + "Coverage_on_Target_SNPs", + "Damage", + "Contamination", + "Contamination_Err", + "Contamination_Meas", + "Contamination_Note", + "Genetic_Source_Accession_IDs", + "Primary_Contact", + "Publication", + "Note", + "Keywords", + "Eager_ID", ## Added + "RateX", ## Added + "RateY", ## Added + "RateErrX", ## Added + "RateErrY", ## Added +] + +## Reorder columns to match desired order +filled_janno_table = filled_janno_table[final_column_order] + +if args.safe: + out_fn = f"{poseidon_yaml_data.janno_file}.new" + print(f"Safe mode is activated. 
Results saved in: {out_fn}") + filled_janno_table.to_csv(out_fn, sep="\t", index=False) +else: + filled_janno_table.to_csv(poseidon_yaml_data.janno_file, sep="\t", index=False) diff --git a/scripts/run_eager.sh b/scripts/run_eager.sh index 6e2ea49..84653df 100755 --- a/scripts/run_eager.sh +++ b/scripts/run_eager.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -uo pipefail -VERSION='0.2.0dev' +VERSION='0.2.1dev' TEMP=`getopt -q -o hvadD --long help,version,test,array,dry_run,debug: -n 'run_eager.sh' -- "$@"` eval set -- "$TEMP" @@ -51,7 +51,7 @@ nxf_path="/mnt/archgen/tools/nextflow/22.04.5.5708/" ## Use centarlly installed eager_version='2.4.6' root_eager_dir="${repo_dir}/eager/" root_package_dir="${repo_dir}/packages/" -nextflow_profiles="eva,archgen,medium_data,eva_local_paths" +nextflow_profiles="eva,archgen,big_data,eva_local_paths" tower_config="${repo_dir}/.nextflow_tower.conf" ## Optional tower.nf configuration array_temp_fn_dir="${repo_dir}/array_tempfiles" array_logs_dir="${repo_dir}/array_Logs" diff --git a/scripts/validate_downloaded_data.sh b/scripts/validate_downloaded_data.sh index c9dd6b4..6a8c748 100755 --- a/scripts/validate_downloaded_data.sh +++ b/scripts/validate_downloaded_data.sh @@ -1,12 +1,35 @@ #!/usr/bin/env bash set -uo pipefail ## Pipefail, complain on new unassigned variables. -VERSION='0.2.1dev' +VERSION='0.2.2dev' ## Load helper bash functions source $(dirname ${0})/source_me.sh +function Helptext() { + echo -ne "\t usage: ${0} [options] \n\n" + echo -ne "This validates that the md5sums for downloaded FastQ files match the ones in the SSF for the package, and creates symlinks for each line in the eager input TSV.\n\n" + echo -ne "Options:\n" + echo -ne "-h, --help\t\tPrint this text and exit.\n" + echo -ne "-v, --version\t\tPrint version and exit.\n" +} + +## Show helptext and exit if no arguments are provided +if [[ ${#@} -eq 0 ]]; then + Helptext + exit 0 +fi + +if [[ ${1} == '-v' || ${1} == '--version' ]]; then + echo "validate_downloaded_data.sh version: ${VERSION}" + exit 0 +elif [[ ${1} == '-h' || ${1} == '--help' ]]; then + Helptext + exit 0 +fi + ssf_file=$(readlink -f ${1}) download_dir=$(readlink -f ${2}) -symlink_dir=$(readlink -f ${3}) +package_eager_dir=$(readlink -f ${3}) +symlink_dir=${package_eager_dir}/data md5sum_file="${download_dir}/expected_md5sums.txt" newest_fastq=$(ls -Art -1 ${download_dir}/*q.gz | tail -n 1) ## Reverse order and tail to avoid broken pipe errors script_debug_string="[validate_downloaded_data.sh]:"